From bb53102c10df403307ecd8f795ffb3c6f23a0e77 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Thu, 5 Mar 2026 15:10:27 +0800 Subject: [PATCH 01/18] feat(storage): add transaction support with journal, undo, and crash recovery Implement a full transaction system for VikingFS storage operations including write-ahead journal, path locking, undo/rollback, context manager API, and crash recovery. Includes comprehensive tests and documentation. Co-Authored-By: Claude Opus 4.6 --- docs/en/concepts/09-transaction.md | 330 ++++++++++ docs/en/guides/01-configuration.md | 30 +- docs/zh/concepts/09-transaction.md | 367 ++++++++--- docs/zh/guides/01-configuration.md | 32 +- openviking/agfs_manager.py | 15 + openviking/parse/tree_builder.py | 62 +- openviking/service/core.py | 22 +- openviking/session/session.py | 145 ++++- openviking/storage/errors.py | 12 + openviking/storage/queuefs/named_queue.py | 32 +- openviking/storage/queuefs/queue_manager.py | 21 +- openviking/storage/queuefs/semantic_dag.py | 41 +- .../storage/queuefs/semantic_processor.py | 61 +- openviking/storage/transaction/__init__.py | 9 +- .../storage/transaction/context_manager.py | 146 +++++ openviking/storage/transaction/journal.py | 114 ++++ openviking/storage/transaction/path_lock.py | 609 ++++++++++++------ .../transaction/transaction_manager.py | 267 +++++++- .../storage/transaction/transaction_record.py | 71 +- openviking/storage/transaction/undo.py | 147 +++++ openviking/storage/viking_fs.py | 145 ++++- .../storage/viking_vector_index_backend.py | 16 +- openviking/utils/agfs_utils.py | 4 + openviking_cli/utils/config/storage_config.py | 6 + .../utils/config/transaction_config.py | 37 ++ tests/transaction/__init__.py | 0 tests/transaction/conftest.py | 56 ++ tests/transaction/test_concurrent_lock.py | 103 +++ tests/transaction/test_context_manager.py | 224 +++++++ tests/transaction/test_crash_recovery.py | 385 +++++++++++ tests/transaction/test_e2e.py | 238 +++++++ tests/transaction/test_journal.py | 215 
+++++++ tests/transaction/test_path_lock.py | 334 ++++++++++ tests/transaction/test_post_actions.py | 112 ++++ tests/transaction/test_rm_rollback.py | 233 +++++++ tests/transaction/test_transaction_manager.py | 323 ++++++++++ tests/transaction/test_undo.py | 163 +++++ .../pkg/plugins/queuefs/backend.go | 31 +- .../pkg/plugins/queuefs/db_backend.go | 6 + .../pkg/plugins/queuefs/queuefs.go | 22 +- .../pkg/plugins/queuefs/sqlite_backend.go | 321 +++++++++ 41 files changed, 5061 insertions(+), 446 deletions(-) create mode 100644 docs/en/concepts/09-transaction.md create mode 100644 openviking/storage/transaction/context_manager.py create mode 100644 openviking/storage/transaction/journal.py create mode 100644 openviking/storage/transaction/undo.py create mode 100644 openviking_cli/utils/config/transaction_config.py create mode 100644 tests/transaction/__init__.py create mode 100644 tests/transaction/conftest.py create mode 100644 tests/transaction/test_concurrent_lock.py create mode 100644 tests/transaction/test_context_manager.py create mode 100644 tests/transaction/test_crash_recovery.py create mode 100644 tests/transaction/test_e2e.py create mode 100644 tests/transaction/test_journal.py create mode 100644 tests/transaction/test_path_lock.py create mode 100644 tests/transaction/test_post_actions.py create mode 100644 tests/transaction/test_rm_rollback.py create mode 100644 tests/transaction/test_transaction_manager.py create mode 100644 tests/transaction/test_undo.py create mode 100644 third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md new file mode 100644 index 00000000..65ec4c3b --- /dev/null +++ b/docs/en/concepts/09-transaction.md @@ -0,0 +1,330 @@ +# Transaction Mechanism + +OpenViking's transaction mechanism protects the consistency of core write operations (`rm`, `mv`, `add_resource`, `session.commit`), ensuring that VikingFS, VectorDB, and QueueManager remain 
consistent even when failures occur. + +## Design Philosophy + +OpenViking is a context database where FS is the source of truth and VectorDB is a derived index. A lost index can be rebuilt from source data, but lost source data is unrecoverable. Therefore: + +> **Better to miss a search result than to return a bad one.** + +## Design Principles + +1. **Transactions cover synchronous operations only**: FS + VectorDB operations run inside transactions; SemanticQueue/EmbeddingQueue enqueue runs after commit (as post_actions) — they are idempotent and retriable +2. **On by default**: All data operations automatically use transactions; no extra configuration needed +3. **Write-exclusive**: Path locks ensure only one write transaction can operate on a path at a time +4. **Undo Log model**: Record reverse operations before each change; replay them in reverse order on failure +5. **Persistent journal**: Each transaction writes a journal file to AGFS for crash recovery + +## Architecture + +``` +Service Layer (rm / mv / add_resource / session.commit) + | + v ++--[TransactionContext async context manager]--+ +| | +| 1. Create transaction + write journal | +| 2. Acquire path lock (poll + timeout) | +| 3. Execute operations (FS + VectorDB) | +| 4. Record Undo Log (mark completed) | +| 5. Commit / Rollback | +| 6. Execute post_actions (enqueue etc) | +| 7. Release lock + clean up journal | +| | +| On exception: reverse Undo Log + unlock | ++----------------------------------------------+ + | + v +Storage Layer (VikingFS, VectorDB, QueueManager) +``` + +## Consistency Issues and Solutions + +### rm(uri) + +| Problem | Solution | +|---------|----------| +| Delete file first, then index -> file gone but index remains -> search returns non-existent file | **Reverse order**: delete index first, then file. Index deletion failure -> both file and index intact | + +Transaction flow: + +``` +1. Begin transaction, acquire lock (lock_mode="subtree") +2. 
Snapshot VectorDB records (for rollback recovery) +3. Delete VectorDB index -> immediately invisible to search +4. Delete FS file +5. Commit -> release lock -> delete journal +``` + +Rollback: Step 4 fails -> restore VectorDB records from snapshot. + +### mv(old_uri, new_uri) + +| Problem | Solution | +|---------|----------| +| File moved to new path but index points to old path -> search returns old path (doesn't exist) | Transaction wrapper; rollback on failure | + +Transaction flow: + +``` +1. Begin transaction, acquire lock (lock_mode="mv", SUBTREE on source + POINT on destination) +2. Move FS file +3. Update VectorDB URIs +4. Commit -> release lock -> delete journal +``` + +Rollback: Step 3 fails -> move file back to original location. + +### add_resource (TreeBuilder.finalize_from_temp) + +| Problem | Solution | +|---------|----------| +| File moved from temp to final directory, then crash -> file exists but never searchable | Transaction wrapper for mv + post_action protects enqueue | + +Transaction flow: + +``` +1. Begin transaction, lock final_uri (lock_mode="point") +2. mv temp directory -> final location +3. Register post_action: enqueue SemanticQueue +4. Commit -> execute post_action -> release lock -> delete journal +``` + +Crash recovery: Journal records the post_action; replayed automatically on restart. + +### session.commit() + +| Problem | Solution | +|---------|----------| +| Messages cleared but archive not written -> conversation data lost | Split into two transactions + checkpoint | + +LLM calls have unpredictable latency (5s~60s+), so they cannot be inside a transaction. Split into: + +``` +Transaction 1 (Archive): + 1. Write archive (history/archive_N/messages.jsonl + summaries) + 2. Clear messages.jsonl + 3. Write checkpoint (status="archived") + 4. Commit + +LLM call (no transaction): + Extract memories from archived messages + +Transaction 2 (Memory write): + 1. Write memory files + 2. Write relations + 3. 
Update checkpoint (status="completed") + 4. Register post_action: enqueue SemanticQueue + 5. Commit +``` + +Crash recovery: Read checkpoint, resume from the appropriate step based on status. + +## TransactionContext + +`TransactionContext` is an **async** context manager that encapsulates the full transaction lifecycle: + +```python +from openviking.storage.transaction import TransactionContext, get_transaction_manager + +tx_manager = get_transaction_manager() + +async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: + # Record undo (call before making changes) + seq = tx.record_undo("vectordb_delete", {"record_ids": ids, "records_snapshot": snapshot}) + # Execute change + delete_from_vector_store(uris) + # Mark completed + tx.mark_completed(seq) + + # Register post-commit action (optional) + tx.add_post_action("enqueue_semantic", {"uri": uri, ...}) + + # Commit + await tx.commit() +# Auto-rollback if commit() not called +``` + +**Lock modes**: + +| lock_mode | Use case | Behavior | +|-----------|----------|----------| +| `point` | Write operations | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | +| `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path and any lock on descendants | +| `mv` | Move operations | Acquire SUBTREE lock on source path, then POINT lock on destination path | + +## Lock Types (POINT vs SUBTREE) + +The lock mechanism uses two lock types to handle different conflict patterns: + +| | POINT on same path | SUBTREE on same path | POINT on descendant | SUBTREE on ancestor | +|---|---|---|---|---| +| **POINT** | Conflict | Conflict | — | Conflict | +| **SUBTREE** | Conflict | Conflict | Conflict | — | + +- **POINT (P)**: Used for write and semantic-processing operations. Only locks a single directory. Blocks if any ancestor holds a SUBTREE lock. +- **SUBTREE (S)**: Used for rm and mv-source operations. 
Logically covers the entire subtree but only writes **one lock file** at the root. Before acquiring, scans all descendants for conflicting locks. + +## Undo Log + +Each transaction maintains an Undo Log recording the reverse action for each step: + +| op_type | Forward operation | Rollback action | +|---------|-------------------|-----------------| +| `fs_mv` | Move file | Move back | +| `fs_rm` | Delete file | Skip (irreversible; rm is always the last step by design) | +| `fs_write_new` | Create new file/directory | Delete | +| `fs_mkdir` | Create directory | Delete | +| `vectordb_delete` | Delete index records | Restore from snapshot | +| `vectordb_upsert` | Insert index records | Delete | +| `vectordb_update_uri` | Update URI | Restore old value | + +Rollback rules: Only entries with `completed=True` are rolled back, in **reverse order**. Each step has independent try-catch (best-effort). During crash recovery, `recover_all=True` also reverses uncompleted entries to clean up partial operations. + +## Lock Mechanism + +### Lock Protocol + +Lock file path: `{path}/.path.ovlock` + +Lock file content (Fencing Token): +``` +{transaction_id}:{time_ns}:{lock_type} +``` + +Where `lock_type` is `P` (POINT) or `S` (SUBTREE). + +### Lock Acquisition (POINT mode) + +``` +loop until timeout (poll interval: 200ms): + 1. Check target directory exists + 2. Check if target directory is locked by another transaction + - Stale lock? -> remove and retry + - Active lock? -> wait + 3. Check all ancestor directories for SUBTREE locks + - Stale lock? -> remove and retry + - Active lock? -> wait + 4. Write POINT (P) lock file + 5. TOCTOU double-check: re-scan ancestors for SUBTREE locks + - Conflict found: compare (timestamp, tx_id) + - Later one (larger timestamp/tx_id) backs off (removes own lock) to prevent livelock + - Wait and retry + 6. Verify lock file ownership (fencing token matches) + 7. 
Success + +Timeout (default 0 = no-wait) raises LockAcquisitionError +``` + +### Lock Acquisition (SUBTREE mode) + +``` +loop until timeout (poll interval: 200ms): + 1. Check target directory exists + 2. Check if target directory is locked by another transaction + 3. Scan all descendant directories for any locks by other transactions + 4. Write SUBTREE (S) lock file (only one file, at the root path) + 5. TOCTOU double-check: re-scan descendants for new locks + - Conflict found: later one backs off (livelock prevention) + 6. Verify lock file ownership + 7. Success +``` + +### Lock Expiry Cleanup + +**Stale lock detection**: PathLock checks the fencing token timestamp. Locks older than `lock_expire` (default 300s) are considered stale and are removed automatically during acquisition. + +**Transaction timeout**: TransactionManager checks active transactions every 60 seconds. Transactions with `updated_at` exceeding the transaction timeout (default 3600s) are rolled back. + +## Transaction Journal + +Each transaction persists a journal in AGFS: + +``` +/local/_system/transactions/{tx_id}/journal.json +``` + +Contains: transaction ID, status, lock paths, init_info, undo_log, post_actions. 
+ +### Lifecycle + +``` +Create transaction -> write journal (INIT) +Acquire lock -> update journal (AQUIRE -> EXEC) +Execute changes -> update journal per step (mark undo entry completed) +Commit -> update journal (COMMIT + post_actions) + -> execute post_actions -> release locks -> delete journal +Rollback -> execute undo log -> release locks -> delete journal +``` + +## Crash Recovery + +`TransactionManager.start()` automatically scans for residual journals on startup: + +| Journal status at crash | Recovery action | +|------------------------|----------------| +| `COMMIT` + non-empty post_actions | Replay post_actions -> release locks -> delete journal | +| `COMMIT` + empty post_actions / `RELEASED` | Release locks -> delete journal | +| `EXEC` / `FAIL` / `RELEASING` | Execute undo log rollback (`recover_all=True`) -> release locks -> delete journal | +| `INIT` / `AQUIRE` | Clean up orphan locks (using init_info.lock_paths) -> delete journal (no changes were made) | + +### Defense Summary + +| Failure scenario | Defense | Recovery timing | +|-----------------|--------|-----------------| +| Crash during transaction | Journal + undo log rollback | On restart | +| Crash after commit, before enqueue | Journal post_actions replay | On restart | +| Crash after enqueue, before worker processes | QueueFS SQLite persistence | Worker auto-pulls after restart | +| Crash during session.commit LLM call | Checkpoint file recovery | On restart, re-invoke LLM | +| Orphan index | Cleaned on L2 on-demand load | When user accesses | +| Crash between lock creation and journal update | init_info records intended lock paths; recovery checks and cleans orphan locks | On restart | + +## Transaction State Machine + +``` +INIT -> AQUIRE -> EXEC -> COMMIT -> RELEASING -> RELEASED + | + FAIL -> RELEASING -> RELEASED +``` + +- `INIT`: Transaction created, waiting for lock +- `AQUIRE`: Acquiring lock +- `EXEC`: Transaction operations executing +- `COMMIT`: Committed, post_actions may be 
pending +- `FAIL`: Execution failed, entering rollback +- `RELEASING`: Releasing locks +- `RELEASED`: Locks released, transaction complete + +## Configuration + +The transaction mechanism is enabled by default with no extra configuration needed. **The default behavior is no-wait**: if the path is locked, `LockAcquisitionError` is raised immediately. To allow wait/retry, configure the `storage.transaction` section: + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + } + } +} +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `lock_timeout` | float | Lock acquisition timeout (seconds). `0` = fail immediately if locked (default). `> 0` = wait/retry up to this many seconds. | `0.0` | +| `lock_expire` | float | Stale lock expiry threshold (seconds). Locks held longer than this by a crashed process are force-released. | `300.0` | +| `max_parallel_locks` | int | Max parallel locks for rm/mv operations | `8` | + +### QueueFS Persistence + +The transaction mechanism relies on QueueFS using the SQLite backend to ensure enqueued tasks survive process restarts. This is the default configuration and requires no manual setup. + +## Related Documentation + +- [Architecture](./01-architecture.md) - System architecture overview +- [Storage](./05-storage.md) - AGFS and vector store +- [Session Management](./08-session.md) - Session and memory management +- [Configuration](../guides/01-configuration.md) - Configuration reference diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index 764d7047..8c2c3420 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -479,7 +479,6 @@ Supports S3 storage in VirtualHostStyle mode, such as TOS. 
- #### vectordb Vector database storage configuration @@ -603,6 +602,30 @@ When `root_api_key` is configured, the server enables multi-tenant authenticatio For startup and deployment details see [Deployment](./03-deployment.md), for authentication see [Authentication](./04-authentication.md). +## storage.transaction Section + +The transaction mechanism is enabled by default and usually requires no configuration. **The default behavior is no-wait**: if the target path is already locked by another transaction, the operation fails immediately with `LockAcquisitionError`. Set `lock_timeout` to a positive value to allow polling/retry. + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + } + } +} +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `lock_timeout` | float | Path lock acquisition timeout (seconds). `0` = fail immediately if locked (default). `> 0` = wait/retry up to this many seconds, then raise `LockAcquisitionError`. | `0.0` | +| `lock_expire` | float | Stale lock expiry threshold (seconds). Locks held longer than this by a crashed process are force-released. | `300.0` | +| `max_parallel_locks` | int | Max parallel locks during recursive locking for rm/mv operations | `8` | + +For details on the transaction mechanism, see [Transaction Mechanism](../concepts/09-transaction.md). 
+ ## Full Schema ```json @@ -637,6 +660,11 @@ For startup and deployment details see [Deployment](./03-deployment.md), for aut "url": "string", "timeout": 10 }, + "transaction": { + "lock_timeout": 0.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + }, "vectordb": { "backend": "local|remote", "url": "string", diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 503ed683..99723042 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -1,167 +1,330 @@ # 事务机制 -OpenViking 的事务机制为 AI Agent 上下文数据库提供可靠的操作保障,解决数据一致性、并发控制和错误恢复等核心问题。 +OpenViking 的事务机制保护核心写操作(`rm`、`mv`、`add_resource`、`session.commit`)的一致性,确保 VikingFS、VectorDB、QueueManager 三个子系统在故障时不会出现数据不一致。 -## 概览 +## 设计哲学 -``` -操作请求 → TransactionManager → 锁保护 → 执行操作 → 状态更新 - ↓ ↓ ↓ - 事务ID分配和事务状态管理 路径锁校验和加锁 +OpenViking 是上下文数据库,FS 是源数据,VectorDB 是派生索引。索引丢了可从源数据重建,源数据丢失不可恢复。因此: + +> **宁可搜不到,不要搜到坏结果。** + +## 设计原则 +1. **事务只覆盖同步部分**:FS + VectorDB 操作在事务内;SemanticQueue/EmbeddingQueue 的 enqueue 在事务提交后执行(post_actions),它们是幂等的,失败可重试 +2. **默认生效**:所有数据操作命令自动开启事务机制,用户无需额外配置 +3. **写互斥**:通过路径锁保证同一路径同一时间只有一个写事务 +4. **Undo Log 模型**:变更前记录反向操作,失败时反序执行回滚 +5. **事务日志持久化**:每个事务在 AGFS 中写入 journal 文件,支持崩溃恢复 -事务生命周期:开始操作 → 创建事务 → 锁保护生效 → 文件系统同步操作 → 摘要和索引异步操作 → 移除锁保护 → 事务结束 +## 架构 + +``` +Service Layer (rm / mv / add_resource / session.commit) + │ + ▼ +┌──[TransactionContext 异步上下文管理器]──┐ +│ │ +│ 1. 创建事务 + 写 journal │ +│ 2. 获取路径锁(轮询 + 超时) │ +│ 3. 执行操作(FS + VectorDB) │ +│ 4. 记录 Undo Log(每步完成后标记) │ +│ 5. Commit / Rollback │ +│ 6. 执行 post_actions(enqueue 等) │ +│ 7. 释放锁 + 清理 journal │ +│ │ +│ 异常时:反序执行 Undo Log → 释放锁 │ +└─────────────────────────────────────────┘ + │ + ▼ +Storage Layer (VikingFS, VectorDB, QueueManager) ``` -**设计原则**: -1. **最小化锁粒度**:仅支持路径锁机制,不实现复杂的 MVCC 等 -2. **写互斥优先**:暂不实现读锁(共享锁),先承诺写操作的互斥性 -3. **渐进式扩展**:避免过度设计,聚焦核心需求,未来需要时再添加更复杂的锁机制 -4. 
**默认生效**:所有数据操作命令均开启事务机制,用户无需额外配置 +## 一致性问题与解决方案 + +### rm(uri) -## 核心需求分析 +| 问题 | 方案 | +|------|------| +| 先删文件再删索引 → 文件已删但索引残留 → 搜索返回不存在的文件 | **调换顺序**:先删索引再删文件。索引删除失败 → 文件和索引都在,搜索正常 | -OpenViking 的数据操作命令(如 `add_resource`、`rm`、`mv` 等)存在以下无保护操作问题: +事务流程: -1. **并发冲突**:多个用户同时操作同一目录可能导致数据不一致 -2. **无原子性**:`add_resource` 多阶段操作中,某个阶段失败可能留下中间状态 -3. **无可观测性**:操作结果无法预测,用户无法直接观察到正在操作的状态 +``` +1. 开始事务,加锁(lock_mode="subtree") +2. 快照 VectorDB 中受影响的记录(用于回滚恢复) +3. 删除 VectorDB 索引 → 搜索立刻不可见 +4. 删除 FS 文件 +5. 提交 → 删锁 → 删 journal +``` + +回滚:第 4 步失败 → 从快照恢复 VectorDB 记录,文件和索引都在。 -## 系统一致性要求 +### mv(old_uri, new_uri) -从系统分析的角度,OpenViking 要求实现组件间的分布式一致性: +| 问题 | 方案 | +|------|------| +| 文件移到新路径但索引指向旧路径 → 搜索返回旧路径(不存在) | 事务包装,移动失败则回滚 | -1. **向量索引的最终一致**:所有上下文数据的向量表征依托独立的向量数据库或向量索引实现,要求确保在任何操作序列下,向量表示的更新都能实现最终一致 -2. **文件系统的读写一致性**:所有上下文数据的文件系统表示依托 VikingFS 实现,底层为 AGFS 桥接的分布式文件系统,要求确保在任何操作序列下,文件系统的更新都能保证数据不会损坏或丢失 -3. **队列和异步数据处理的一致性**:所有上下文数据的异步操作依托队列实现,要求确保在任何操作序列下,队列中的数据都能实现最终一致,即队列中的数据会最终被处理,不会丢失或重复 +事务流程: -## TransactionManager(事务管理器) +``` +1. 开始事务,加锁(lock_mode="mv",源路径 SUBTREE + 目标路径 POINT) +2. 移动 FS 文件 +3. 更新 VectorDB 中的 URI +4. 提交 → 删锁 → 删 journal +``` -TransactionManager 是全局单例,负责管理事务生命周期和锁机制实现。 +回滚:第 3 步失败 → 把文件移回原位。 -### 核心职责 +### add_resource (TreeBuilder.finalize_from_temp) -- 分配事务ID -- 管理事务生命周期(开始、提交、回滚) -- 提供事务的锁机制实现接口,防止死锁 +| 问题 | 方案 | +|------|------| +| 文件从临时目录移到正式目录后崩溃 → 文件存在但永远搜不到 | 事务包装 mv + post_action 保护 enqueue | -### 关键特性 +事务流程: ``` -路径锁 + 写互斥 = 并发冲突防护 +1. 开始事务,加锁(lock_mode="point",锁 final_uri) +2. mv 临时目录 → 正式位置 +3. 注册 post_action: enqueue SemanticQueue +4. 
提交 → 执行 post_action → 删锁 → 删 journal ``` -- **路径锁**:锁定目标目录,防止并发的目录级操作如目录删除、目录移动等 -- **写互斥**:同一时间只允许一个事务写操作,路径锁机制确保所有写操作的互斥性 -- **事务结束状态**:事务有明确的结束状态,包括完成、失败丢弃等 +崩溃恢复:journal 中记录了 post_action,重启时自动重放 enqueue。 + +### session.commit() -### 事务状态机 +| 问题 | 方案 | +|------|------| +| 消息已清空但 archive 未写入 → 对话数据丢失 | 拆为两段事务 + checkpoint | + +LLM 调用耗时不可控(5s~60s+),放在事务内会长时间持锁。因此拆为: ``` -INIT → AQUIRE → EXEC → COMMIT/FAIL → RELEASING → RELEASED +第一段事务(归档): + 1. 写 archive(history/archive_N/messages.jsonl + 摘要) + 2. 清空 messages.jsonl + 3. 写 checkpoint(status="archived") + 4. 提交 + +LLM 调用(无事务): + 从归档消息提取 memories + +第二段事务(memory 写入): + 1. 写 memory 文件 + 2. 写 relations + 3. 更新 checkpoint(status="completed") + 4. 注册 post_action: enqueue SemanticQueue + 5. 提交 ``` -**状态说明**: -- `INIT`:事务初始化完成,等待锁获取 -- `AQUIRE`:正在获取锁资源 -- `EXEC`:事务操作正在执行 -- `COMMIT/FAIL`:事务执行完成,进入最终状态 -- `RELEASING`:正在释放锁资源 -- `RELEASED`:锁资源已完全释放,事务结束 +崩溃恢复:读 checkpoint,根据 status 决定从哪一步继续。 + +## TransactionContext -### 事务记录属性 +`TransactionContext` 是**异步**上下文管理器,封装事务的完整生命周期: ```python -TransactionRecord( - id: str, # 事务ID,采用 uuid 格式,唯一标识一个事务 - locks: List[str], # 锁列表 - status: str, # 当前状态 - init_info: Dict, # 事务初始化信息 - rollback_info: Dict, # 回滚信息 - created_at: float, # 创建时间 - updated_at: float, # 更新时间 -) +from openviking.storage.transaction import TransactionContext, get_transaction_manager + +tx_manager = get_transaction_manager() + +async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: + # 记录 undo(变更前调用) + seq = tx.record_undo("vectordb_delete", {"record_ids": ids, "records_snapshot": snapshot}) + # 执行变更 + delete_from_vector_store(uris) + # 标记完成 + tx.mark_completed(seq) + + # 注册提交后动作(可选) + tx.add_post_action("enqueue_semantic", {"uri": uri, ...}) + + # 提交 + await tx.commit() +# 未 commit 时自动回滚 ``` -### 设计决策 +**锁模式**: -- 暂不实现共享锁(读锁),简化设计 -- 锁粒度仅限目录,不实现范围锁机制 -- 不实现复杂的死锁检测,通过超时机制防止死锁,事务超时后自动释放所有锁 -- 支持可选的自下而上并行加锁模式,提升大型目录树操作的性能和一致性 -- 事务状态机增加AQUIRE+RELEASING状态,明确跟踪锁释放过程,提高系统可观测性 +| lock_mode | 用途 
| 行为 | +|-----------|------|------| +| `point` | 写操作 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | +| `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁和后代目录的任何锁冲突 | +| `mv` | 移动操作 | 源路径加 SUBTREE 锁,目标路径加 POINT 锁 | -## 锁机制 +## 锁类型(POINT vs SUBTREE) + +锁机制使用两种锁类型来处理不同的冲突场景: + +| | 同路径 POINT | 同路径 SUBTREE | 后代 POINT | 祖先 SUBTREE | +|---|---|---|---|---| +| **POINT** | 冲突 | 冲突 | — | 冲突 | +| **SUBTREE** | 冲突 | 冲突 | 冲突 | — | + +- **POINT (P)**:用于写操作和语义处理。只锁单个目录。若祖先目录持有 SUBTREE 锁则阻塞。 +- **SUBTREE (S)**:用于删除和移动源操作。逻辑上覆盖整个子树,但只在根目录写**一个锁文件**。获取前扫描所有后代确认无冲突锁。 + +## Undo Log -锁机制是事务管理的核心组件,当前只提供路径锁类型。 +每个事务维护一个 Undo Log,记录每步操作的反向动作: -### 锁类型 +| op_type | 正向操作 | 回滚动作 | +|---------|---------|---------| +| `fs_mv` | 移动文件 | 移回原位 | +| `fs_rm` | 删除文件 | 跳过(不可逆,设计上 rm 是最后一步) | +| `fs_write_new` | 创建新文件/目录 | 删除 | +| `fs_mkdir` | 创建目录 | 删除 | +| `vectordb_delete` | 删除索引记录 | 从快照恢复 | +| `vectordb_upsert` | 插入索引记录 | 删除 | +| `vectordb_update_uri` | 更新 URI | 恢复旧值 | -| 锁类型 | 作用范围 | 用例 | -|--------|----------|------| -| 路径锁 | 整个目录 | 用于阻止目录被意外整体移动或删除,确保事务操作过程的路径合法性 +回滚规则:只回滚 `completed=True` 的条目,**反序执行**。每步独立 try-catch(best-effort)。崩溃恢复时使用 `recover_all=True`,也会回滚未完成的条目以清理部分操作残留。 + +## 锁机制 ### 锁协议 +锁文件路径:`{path}/.path.ovlock` + +锁文件内容(Fencing Token): ``` -viking://resources/github/volcengine/OpenViking/.path.ovlock +{transaction_id}:{time_ns}:{lock_type} ``` -- 锁文件存在即表示已加锁 -- 文件内容为事务ID,用于标识当前事务 -- 事务操作完成后,删除锁文件以释放锁 +其中 `lock_type` 为 `P`(POINT)或 `S`(SUBTREE)。 -### 加锁流程 +### 获取锁流程(POINT 模式) + +``` +循环直到超时(轮询间隔:200ms): + 1. 检查目标目录存在 + 2. 检查目标路径是否被其他事务锁定 + - 陈旧锁? → 移除后重试 + - 活跃锁? → 等待 + 3. 检查所有祖先目录是否有 SUBTREE 锁 + - 陈旧锁? → 移除后重试 + - 活跃锁? → 等待 + 4. 写入 POINT (P) 锁文件 + 5. TOCTOU 双重检查:重新扫描祖先目录的 SUBTREE 锁 + - 发现冲突:比较 (timestamp, tx_id) + - 后到者(更大的 timestamp/tx_id)主动让步(删除自己的锁),防止活锁 + - 等待后重试 + 6. 验证锁文件归属(fencing token 匹配) + 7. 成功 + +超时(默认 0 = 不等待)抛出 LockAcquisitionError +``` -#### 普通操作加锁流程 +### 获取锁流程(SUBTREE 模式) ``` -1. 检查目标目录是否存在 -2. 检查目标目录是否已被其他事务锁定 -3. 检查目标目录的父目录是否已被其他事务锁定 -4. 创建 .path.ovlock 文件,文件内容为事务ID -5. 
再次检查目标目录的父目录是否已被其他事务锁定 -6. 读取刚创建的 .path.ovlock 文件内容,确认为当前事务ID -7. 一切正常,则返回加锁成功 +循环直到超时(轮询间隔:200ms): + 1. 检查目标目录存在 + 2. 检查目标路径是否被其他事务锁定 + 3. 扫描所有后代目录,检查是否有其他事务持有的锁 + 4. 写入 SUBTREE (S) 锁文件(只写一个文件,在根路径) + 5. TOCTOU 双重检查:重新扫描后代目录 + - 发现冲突:后到者主动让步(活锁防止) + 6. 验证锁文件归属 + 7. 成功 ``` -#### rm 操作加锁流程 +### 锁过期清理 +**陈旧锁检测**:PathLock 检查 fencing token 中的时间戳。超过 `lock_expire`(默认 300s)的锁被视为陈旧锁,在加锁过程中自动移除。 + +**事务超时**:TransactionManager 每 60 秒检查活跃事务,`updated_at` 超过事务超时时间(默认 3600s)的事务强制回滚。 + +## 事务日志(Journal) + +每个事务在 AGFS 持久化一份 journal: + +``` +/local/_system/transactions/{tx_id}/journal.json ``` -# 传统串行模式:存在更大的竞态条件窗口 -1. 检查目标目录是否存在 -2. 检查目标目录是否已被其他事务锁定 -3. 检查目标目录的父目录是否已被其他事务锁定 -4. 在目标目录下创建 .path.ovlock 文件,文件内容为事务ID -5. 递归地在目标目录的所有子目录下创建 .path.ovlock 文件 -6. 如果发生加锁失败,移除所有已经创建的 .path.ovlock 文件 -7. 一切正常,则返回加锁成功 -# 自下而上并行模式 -1. 并行遍历整个目录树,收集所有子目录路径 -2. 按照目录层级从深到浅排序,从最深层子目录开始 -3. 以有限并行度(默认最大8)批量创建 .path.ovlock 文件 -4. 最后锁定目标目录 -5. 如果任一位置加锁失败,逆序移除所有已经创建的 .path.ovlock 文件 +内容包含:事务 ID、状态、锁路径、init_info、undo_log、post_actions。 + +### 生命周期 + +``` +创建事务 → 写 journal(INIT) +获取锁 → 更新 journal(AQUIRE → EXEC) +执行变更 → 每步更新 journal(标记 undo entry completed) +提交 → 更新 journal(COMMIT + post_actions) + → 执行 post_actions → 删锁 → 删 journal +回滚 → 执行 undo log → 删锁 → 删 journal ``` -#### mv 操作加锁流程 +## 崩溃恢复 + +`TransactionManager.start()` 启动时自动扫描残留 journal: + +| 崩溃时 journal 状态 | 恢复方式 | +|---------------------|---------| +| `COMMIT` + post_actions 非空 | 重放 post_actions → 删锁 → 删 journal | +| `COMMIT` + post_actions 为空 / `RELEASED` | 删锁 → 删 journal | +| `EXEC` / `FAIL` / `RELEASING` | 执行 undo log 回滚(`recover_all=True`) → 删锁 → 删 journal | +| `INIT` / `AQUIRE` | 通过 init_info.lock_paths 清理孤儿锁 → 删 journal(变更未执行) | + +### 防线总结 + +| 异常场景 | 防线 | 恢复时机 | +|---------|------|---------| +| 事务内崩溃 | journal + undo log 回滚 | 重启时 | +| 提交后 enqueue 前崩溃 | journal post_actions 重放 | 重启时 | +| enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后自动拉取 | +| session.commit LLM 调用中崩溃 | checkpoint 文件恢复 | 重启时重新调用 LLM | +| 孤儿索引 | L2 按需加载时清理 | 用户访问时 | 
+| 加锁后 journal 更新前崩溃 | init_info 记录预期锁路径,恢复时检查并清理孤儿锁 | 重启时 | + +## 事务状态机 ``` -1. 先参照 rm 操作对原目录进行加锁 -2. 再参照普通操作过程对新目录进行加锁 +INIT → AQUIRE → EXEC → COMMIT → RELEASING → RELEASED + ↓ + FAIL → RELEASING → RELEASED ``` -### 锁机制性能分析 +- `INIT`:事务已创建,等待锁获取 +- `AQUIRE`:正在获取锁 +- `EXEC`:事务操作执行中 +- `COMMIT`:已提交,可能有 post_actions 待执行 +- `FAIL`:执行失败,进入回滚 +- `RELEASING`:正在释放锁 +- `RELEASED`:锁已释放,事务结束 + +## 配置 + +事务机制默认启用,无需额外配置。**默认不等待**:若路径被锁定则立即抛出 `LockAcquisitionError`。如需允许等待重试,可通过 `storage.transaction` 段配置: + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + } + } +} +``` + +| 参数 | 类型 | 说明 | 默认值 | +|------|------|------|--------| +| `lock_timeout` | float | 获取锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间 | `0.0` | +| `lock_expire` | float | 锁过期时间(秒),超过此时间的事务锁将被视为陈旧锁并强制释放 | `300.0` | +| `max_parallel_locks` | int | rm/mv 操作的最大并行加锁数 | `8` | + +### QueueFS 持久化 -- 并行遍历采用广度优先策略,同时处理同一层级的所有目录 -- 并行加锁从最深层开始,逐层向上锁定,确保整个目录树的一致性 -- 有限并行度(默认最大8)避免AGFS服务过载 -- 加锁失败时采用逆序回滚,确保所有已加锁目录都能正确释放 -- 事务状态机明确区分锁管理过程(AQUIRE+RELEASING状态),提高系统可观测性和调试效率 +事务机制依赖 QueueFS 使用 SQLite 后端,确保 enqueue 的任务在进程重启后可恢复。这是默认配置,无需手动设置。 ## 相关文档 - [架构概述](./01-architecture.md) - 系统整体架构 - [存储架构](./05-storage.md) - AGFS 和向量库 -- [会话管理](./08-session.md) - 会话和记忆管理 \ No newline at end of file +- [会话管理](./08-session.md) - 会话和记忆管理 +- [配置](../guides/01-configuration.md) - 配置文件说明 diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md index 457738cc..ffe71183 100644 --- a/docs/zh/guides/01-configuration.md +++ b/docs/zh/guides/01-configuration.md @@ -487,10 +487,9 @@ AST 提取支持:Python、JavaScript/TypeScript、Rust、Go、Java、C/C++。 - #### vectordb -向量库存储的配置 +向量库存储的配置 | 参数 | 类型 | 说明 | 默认值 | |------|------|------|--------| @@ -612,6 +611,30 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 启动方式和部署详情见 [服务部署](./03-deployment.md),认证详情见 [认证](./04-authentication.md)。 +## storage.transaction 段 + 
+事务机制默认启用,通常无需配置。**默认行为是不等待**:若目标路径已被其他事务锁定,操作立即失败并抛出 `LockAcquisitionError`。若需要等待重试,请将 `lock_timeout` 设为正数。 + +```json +{ + "storage": { + "transaction": { + "lock_timeout": 5.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + } + } +} +``` + +| 参数 | 类型 | 说明 | 默认值 | +|------|------|------|--------| +| `lock_timeout` | float | 获取路径锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间后抛出 `LockAcquisitionError` | `0.0` | +| `lock_expire` | float | 锁过期时间(秒)。超过此时间的事务锁将被视为崩溃进程遗留的陈旧锁并强制释放 | `300.0` | +| `max_parallel_locks` | int | rm/mv 操作递归加锁时的最大并行数 | `8` | + +事务机制的详细说明见 [事务机制](../concepts/09-transaction.md)。 + ## 完整 Schema ```json @@ -646,6 +669,11 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 "url": "string", "timeout": 10 }, + "transaction": { + "lock_timeout": 0.0, + "lock_expire": 300.0, + "max_parallel_locks": 8 + }, "vectordb": { "backend": "local|remote", "url": "string", diff --git a/openviking/agfs_manager.py b/openviking/agfs_manager.py index 14ed124a..9ae796f2 100644 --- a/openviking/agfs_manager.py +++ b/openviking/agfs_manager.py @@ -133,9 +133,23 @@ def _generate_config(self) -> Path: "version": "1.0.0", }, }, + # TODO(multi-node): SQLite backend is single-node only. Each AGFS instance + # gets its own isolated queue.db under its own data_path, so messages + # enqueued on node A are invisible to node B. For multi-node deployments, + # switch backend to "tidb" or "mysql" so all nodes share the same queue. + # + # Additionally, the TiDB backend currently uses immediate soft-delete on + # Dequeue (no two-phase status='processing' transition), meaning there is + # no at-least-once guarantee: a worker crash loses the in-flight message. + # The TiDB backend's Ack() and RecoverStale() are both no-ops and must be + # implemented before it can be used safely in production. 
"queuefs": { "enabled": True, "path": "/queue", + "config": { + "backend": "sqlite", + "db_path": str(self.data_path / "_system" / "queue" / "queue.db"), + }, }, }, } @@ -196,6 +210,7 @@ def start(self) -> None: self._check_port_available() self.vikingfs_path.mkdir(parents=True, exist_ok=True) + (self.data_path / "_system" / "queue").mkdir(parents=True, exist_ok=True) # NOTICE: should use viking://temp/ instead of self.vikingfs_path / "temp" # Create temp directory for Parser use # (self.vikingfs_path / "temp").mkdir(exist_ok=True) diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index 820ca554..00e1e354 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -163,9 +163,53 @@ async def finalize_from_temp( else: logger.info(f"[TreeBuilder] Finalizing from temp: {final_uri}") - # 4. Move directory tree from temp to final location in AGFS - await self._move_temp_to_dest(viking_fs, temp_doc_uri, final_uri, ctx=ctx) - logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> {final_uri}") + # 4. 
Move directory tree from temp to final location in AGFS (transactional) + from openviking.storage.transaction import TransactionContext, get_transaction_manager + + tx_manager = get_transaction_manager() + final_path = viking_fs._uri_to_path(final_uri, ctx=ctx) + # Lock parent directory (final_path doesn't exist yet) + parent_path = final_path.rsplit("/", 1)[0] if "/" in final_path else final_path + + if tx_manager: + # Ensure parent directories exist before locking + await self._ensure_parent_dirs(final_uri, ctx=ctx) + + async with TransactionContext( + tx_manager, "finalize_from_temp", [parent_path], lock_mode="point" + ) as tx: + # Move temp to final + seq = tx.record_undo("fs_write_new", {"uri": final_path}) + await self._move_temp_to_dest(viking_fs, temp_doc_uri, final_uri, ctx=ctx) + tx.mark_completed(seq) + logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> {final_uri}") + + # Register semantic enqueue as post_action + tx.add_post_action( + "enqueue_semantic", + { + "uri": final_uri, + "context_type": "resource", + "account_id": ctx.account_id, + "user_id": ctx.user.user_id, + "agent_id": ctx.user.agent_id, + "role": ctx.role.value, + }, + ) + + await tx.commit() + else: + # Fallback: no transaction support + await self._move_temp_to_dest(viking_fs, temp_doc_uri, final_uri, ctx=ctx) + logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> {final_uri}") + + try: + await self._enqueue_semantic_generation(final_uri, "resource", ctx=ctx) + logger.info(f"[TreeBuilder] Enqueued semantic generation for: {final_uri}") + except Exception as e: + logger.error( + f"[TreeBuilder] Failed to enqueue semantic generation: {e}", exc_info=True + ) # 5. Cleanup temporary root directory try: @@ -174,21 +218,13 @@ async def finalize_from_temp( except Exception as e: logger.warning(f"[TreeBuilder] Failed to cleanup temp root: {e}") - # 6. 
Enqueue to SemanticQueue for async semantic generation - if trigger_semantic: - try: - await self._enqueue_semantic_generation(final_uri, "resource", ctx=ctx) - logger.info(f"[TreeBuilder] Enqueued semantic generation for: {final_uri}") - except Exception as e: - logger.error(f"[TreeBuilder] Failed to enqueue semantic generation: {e}", exc_info=True) - - # 7. Return simple BuildingTree (no scanning needed) + # 6. Return simple BuildingTree (no scanning needed) tree = BuildingTree( source_path=source_path, source_format=source_format, ) tree._root_uri = final_uri - + # Create a minimal Context object for the root so that tree.root is not None root_context = Context(uri=final_uri) tree.add_context(root_context) diff --git a/openviking/service/core.py b/openviking/service/core.py index 7b9a35c0..c1c62c73 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -139,12 +139,20 @@ def _init_storage( vectordb_config=config.vectordb, queue_manager=self._queue_manager ) - # Configure queues if QueueManager is available + # Configure queues if QueueManager is available. + # Workers are NOT started here — start() is called after VikingFS is initialized + # in initialize(), so that recovered tasks don't race against VikingFS init. 
if self._queue_manager: - self._queue_manager.setup_standard_queues(self._vikingdb_manager) + self._queue_manager.setup_standard_queues(self._vikingdb_manager, start=False) # Initialize TransactionManager - self._transaction_manager = init_transaction_manager(agfs_config=config.agfs) + tx_cfg = config.transaction + self._transaction_manager = init_transaction_manager( + agfs_config=config.agfs, + max_parallel_locks=tx_cfg.max_parallel_locks, + lock_timeout=tx_cfg.lock_timeout, + lock_expire=tx_cfg.lock_expire, + ) @property def viking_fs(self) -> Optional[VikingFS]: @@ -240,6 +248,14 @@ async def initialize(self) -> None: if enable_recorder: logger.info("VikingFS IO Recorder enabled") + # Start queue workers now that VikingFS is ready. + # Doing it here (rather than in _init_storage) ensures that any tasks + # recovered from a previous crash are not processed before VikingFS is + # initialized, which would cause "VikingFS not initialized" errors. + if self._queue_manager: + self._queue_manager.start() + logger.info("QueueManager workers started") + # Initialize directories directory_initializer = DirectoryInitializer(vikingdb=self._vikingdb_manager) self._directory_initializer = directory_initializer diff --git a/openviking/session/session.py b/openviking/session/session.py index 62679223..0bbc94aa 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -219,7 +219,14 @@ def update_tool_part( self._update_message_in_jsonl() def commit(self) -> Dict[str, Any]: - """Commit session: create archive, extract memories, persist.""" + """Commit session: two-phase transaction with checkpoint. + + Phase 1 (Archive): Lock session, write archive, clear messages, write checkpoint. + LLM call (no transaction): Extract long-term memories. + Phase 2 (Memory): Lock session, write memories + relations, update checkpoint. 
+ """ + from openviking.storage.transaction import get_transaction_manager + result = { "session_id": self.session_id, "status": "committed", @@ -231,7 +238,10 @@ def commit(self) -> Dict[str, Any]: if not self._messages: return result - # 1. Archive current messages + tx_manager = get_transaction_manager() + session_path = self._viking_fs._uri_to_path(self._session_uri, ctx=self.ctx) + + # ===== Phase 1: Archive ===== self._compression.compression_index += 1 messages_to_archive = self._messages.copy() @@ -239,22 +249,35 @@ def commit(self) -> Dict[str, Any]: archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary - self._write_archive( - index=self._compression.compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, - ) + if tx_manager: + run_async( + self._phase1_archive_async( + tx_manager, + session_path, + self._compression.compression_index, + messages_to_archive, + archive_abstract, + archive_overview, + ) + ) + else: + self._write_archive( + index=self._compression.compression_index, + messages=messages_to_archive, + abstract=archive_abstract, + overview=archive_overview, + ) + self._write_to_agfs(messages=[]) self._compression.original_count += len(messages_to_archive) result["archived"] = True - self._messages.clear() logger.info( - f"Archived: {len(messages_to_archive)} messages → history/archive_{self._compression.compression_index:03d}/" + f"Archived: {len(messages_to_archive)} messages → " + f"history/archive_{self._compression.compression_index:03d}/" ) - # 2. Extract long-term memories + # ===== LLM call (no transaction) ===== if self._session_compressor: logger.info( f"Starting memory extraction from {len(messages_to_archive)} archived messages" @@ -271,17 +294,18 @@ def commit(self) -> Dict[str, Any]: result["memories_extracted"] = len(memories) self._stats.memories_extracted += len(memories) - # 3. 
Write current messages to AGFS - self._write_to_agfs(self._messages) + # ===== Phase 2: Memory write ===== + if tx_manager: + run_async(self._phase2_memory_async(tx_manager, session_path)) + else: + self._write_to_agfs(self._messages) + self._write_relations() - # 4. Create relations - self._write_relations() - - # 5. Update active_count + # Update active_count active_count_updated = self._update_active_counts() result["active_count_updated"] = active_count_updated - # 6. Update statistics + # Update statistics self._stats.compression_count = self._compression.compression_index result["stats"] = { "total_turns": self._stats.total_turns, @@ -294,6 +318,58 @@ def commit(self) -> Dict[str, Any]: logger.info(f"Session {self.session_id} committed") return result + async def _phase1_archive_async( + self, + tx_manager: Any, + session_path: str, + compression_index: int, + messages_to_archive: list, + archive_abstract: str, + archive_overview: str, + ) -> None: + """Phase 1 of commit: archive messages inside a transaction.""" + from openviking.storage.transaction import TransactionContext + + async with TransactionContext( + tx_manager, "session_archive", [session_path], lock_mode="point" + ) as tx: + seq = tx.record_undo("fs_write_new", {"uri": session_path}) + self._write_archive( + index=compression_index, + messages=messages_to_archive, + abstract=archive_abstract, + overview=archive_overview, + ) + self._write_to_agfs(messages=[]) + self._write_checkpoint({"status": "archived", "archive_index": compression_index}) + tx.mark_completed(seq) + await tx.commit() + + async def _phase2_memory_async(self, tx_manager: Any, session_path: str) -> None: + """Phase 2 of commit: write memories inside a transaction.""" + from openviking.storage.transaction import TransactionContext + + async with TransactionContext( + tx_manager, "session_memory", [session_path], lock_mode="point" + ) as tx: + seq = tx.record_undo("fs_write_new", {"uri": session_path}) + 
self._write_to_agfs(self._messages) + self._write_relations() + self._write_checkpoint({"status": "completed"}) + tx.mark_completed(seq) + tx.add_post_action( + "enqueue_semantic", + { + "uri": self._session_uri, + "context_type": "memory", + "account_id": self.ctx.account_id, + "user_id": self.ctx.user.user_id, + "agent_id": self.ctx.user.agent_id, + "role": self.ctx.role.value, + }, + ) + await tx.commit() + def _update_active_counts(self) -> int: """Update active_count for used contexts/skills.""" if not self._vikingdb_manager: @@ -581,6 +657,39 @@ def _write_relations(self) -> None: except Exception as e: logger.warning(f"Failed to create relation to {usage.uri}: {e}") + def _write_checkpoint(self, data: Dict[str, Any]) -> None: + """Write a commit checkpoint file for crash recovery.""" + if not self._viking_fs: + return + + checkpoint = { + **data, + "session_id": self.session_id, + "compression_index": self._compression.compression_index, + "timestamp": get_current_timestamp(), + } + run_async( + self._viking_fs.write_file( + f"{self._session_uri}/.commit_checkpoint.json", + json.dumps(checkpoint, ensure_ascii=False), + ctx=self.ctx, + ) + ) + + def _read_checkpoint(self) -> Optional[Dict[str, Any]]: + """Read commit checkpoint file if it exists.""" + if not self._viking_fs: + return None + try: + content = run_async( + self._viking_fs.read_file( + f"{self._session_uri}/.commit_checkpoint.json", ctx=self.ctx + ) + ) + return json.loads(content) + except Exception: + return None + # ============= Properties ============= @property diff --git a/openviking/storage/errors.py b/openviking/storage/errors.py index bc3e36be..7f6a483b 100644 --- a/openviking/storage/errors.py +++ b/openviking/storage/errors.py @@ -29,3 +29,15 @@ class ConnectionError(StorageException): class SchemaError(StorageException): """Raised when schema validation fails.""" + + +class TransactionError(VikingDBException): + """Raised when a transaction operation fails.""" + + +class 
LockAcquisitionError(TransactionError): + """Raised when lock acquisition fails.""" + + +class TransactionRollbackError(TransactionError): + """Raised when transaction rollback fails.""" diff --git a/openviking/storage/queuefs/named_queue.py b/openviking/storage/queuefs/named_queue.py index ca0e9b29..495a284b 100644 --- a/openviking/storage/queuefs/named_queue.py +++ b/openviking/storage/queuefs/named_queue.py @@ -198,6 +198,21 @@ async def enqueue(self, data: Union[str, Dict[str, Any]]) -> str: msg_id = self._agfs.write(enqueue_file, data.encode("utf-8")) return msg_id if isinstance(msg_id, str) else str(msg_id) + async def ack(self, msg_id: str) -> None: + """Acknowledge successful processing of a message (deletes it from persistent storage). + + Must be called after the dequeue handler finishes processing a message. + If not called (e.g. process crashes), the message will be automatically + re-queued on the next startup via RecoverStale. + """ + if not msg_id: + return + ack_file = f"{self.path}/ack" + try: + self._agfs.write(ack_file, msg_id.encode("utf-8")) + except Exception as e: + logger.warning(f"[NamedQueue] Ack failed for {self.name} msg_id={msg_id}: {e}") + def _read_queue_message(self) -> Optional[Dict[str, Any]]: """Read and remove one message from the AGFS queue; return parsed dict or None. @@ -217,15 +232,30 @@ def _read_queue_message(self) -> Optional[Dict[str, Any]]: return json.loads(raw.decode("utf-8")) async def dequeue(self) -> Optional[Dict[str, Any]]: - """Get and remove message from queue, then invoke the dequeue handler.""" + """Dequeue a message, process it, then ack to confirm deletion. + + Flow (at-least-once delivery): + 1. Read from /dequeue → backend marks message as 'processing' (not deleted yet) + 2. Call on_dequeue() → actual processing + 3. 
Call ack() → backend deletes the message permanently + + If the process crashes between steps 1 and 3, the backend's RecoverStale + on the next startup resets the message back to 'pending' for retry. + """ await self._ensure_initialized() try: data = self._read_queue_message() if data is None: return None + # Capture message ID before passing data to handler (handler may modify it) + msg_id = data.get("id", "") if isinstance(data, dict) else "" if self._dequeue_handler: self._on_dequeue_start() data = await self._dequeue_handler.on_dequeue(data) + # Ack unconditionally after handler returns (success or handled error). + # If on_dequeue raises, the exception propagates and ack is skipped — + # the message will be recovered on next startup. + await self.ack(msg_id) return data except Exception as e: logger.debug(f"[NamedQueue] Dequeue failed for {self.name}: {e}") diff --git a/openviking/storage/queuefs/queue_manager.py b/openviking/storage/queuefs/queue_manager.py index 95e9aeb2..b5a68af4 100644 --- a/openviking/storage/queuefs/queue_manager.py +++ b/openviking/storage/queuefs/queue_manager.py @@ -107,16 +107,16 @@ def start(self) -> None: logger.info("[QueueManager] Started") - def setup_standard_queues(self, vector_store: Any) -> None: + def setup_standard_queues(self, vector_store: Any, start: bool = True) -> None: """ Setup standard queues (Embedding and Semantic) with their handlers. - This method initializes the EmbeddingQueue with TextEmbeddingHandler - and the SemanticQueue with SemanticProcessor, then ensures the - queue manager is started. - Args: vector_store: Vector store instance for handlers to write results. + start: Whether to start worker threads immediately (default True). + Pass False when the consumer depends on resources that are + not yet initialized (e.g. VikingFS); call start() manually + after those resources are ready. 
""" # Import handlers here to avoid circular dependencies from openviking.storage.collection_schemas import TextEmbeddingHandler @@ -140,8 +140,8 @@ def setup_standard_queues(self, vector_store: Any) -> None: ) logger.info("Semantic queue initialized with SemanticProcessor") - # Start QueueManager processing - self.start() + if start: + self.start() def _start_queue_worker(self, queue: NamedQueue) -> None: """Start a dedicated worker thread for a queue if not already running.""" @@ -207,10 +207,14 @@ async def _worker_async_concurrent( async def process_one(data: Dict[str, Any]) -> None: async with sem: + msg_id = data.get("id", "") if isinstance(data, dict) else "" try: await queue.process_dequeued(data) + # Ack after successful processing (delete from persistent storage). + await queue.ack(msg_id) except Exception as e: # Handler did not call report_error; decrement in_progress manually. + # Do NOT ack — let RecoverStale re-queue on next startup. queue._on_process_error(str(e), data) logger.error(f"[QueueManager] Concurrent worker error for {queue.name}: {e}") @@ -280,9 +284,6 @@ def get_queue( allow_create: bool = False, ) -> NamedQueue: """Get or create a named queue object.""" - if not self._started: - self.start() - if name not in self._queues: if not allow_create: raise RuntimeError(f"Queue {name} does not exist and allow_create is False") diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index 0307521f..0e894474 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -238,6 +238,9 @@ def _finalize_children_abstracts(self, node: DirNode) -> List[Dict[str, str]]: return results async def _overview_task(self, dir_uri: str) -> None: + from openviking.storage.errors import LockAcquisitionError + from openviking.storage.transaction import TransactionContext, get_transaction_manager + node = self._nodes.get(dir_uri) if not node: return @@ -246,26 +249,30 @@ async 
def _overview_task(self, dir_uri: str) -> None: file_summaries = self._finalize_file_summaries(node) children_abstracts = self._finalize_children_abstracts(node) - try: - async with self._llm_sem: - overview = await self._processor._generate_overview( - dir_uri, file_summaries, children_abstracts - ) - abstract = self._processor._extract_abstract_from_overview(overview) + abstract = "" + dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) - try: + try: + async with TransactionContext( + get_transaction_manager(), "semantic_dag", [dir_path], lock_mode="point" + ) as tx: + async with self._llm_sem: + overview = await self._processor._generate_overview( + dir_uri, file_summaries, children_abstracts + ) + abstract = self._processor._extract_abstract_from_overview(overview) await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx) await self._viking_fs.write_file(f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx) - except Exception as e: - logger.warning(f"Failed to write overview/abstract for {dir_uri}: {e}") - - try: - await self._processor._vectorize_directory_simple( - dir_uri, self._context_type, abstract, overview, ctx=self._ctx - ) - except Exception as e: - logger.error(f"Failed to vectorize directory {dir_uri}: {e}", exc_info=True) - + try: + await self._processor._vectorize_directory_simple( + dir_uri, self._context_type, abstract, overview, ctx=self._ctx + ) + except Exception as e: + logger.error(f"Failed to vectorize directory {dir_uri}: {e}", exc_info=True) + await tx.commit() + except LockAcquisitionError: + logger.info(f"[SemanticDag] {dir_uri} does not exist or is locked, skipping") + abstract = "" except Exception as e: logger.error(f"Failed to generate overview for {dir_uri}: {e}", exc_info=True) abstract = "" diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 59700783..24e485ed 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ 
b/openviking/storage/queuefs/semantic_processor.py @@ -237,33 +237,56 @@ async def _process_single_directory( file_paths: List[str], ) -> None: """Process single directory, generate .abstract.md and .overview.md.""" + from openviking.storage.errors import LockAcquisitionError + from openviking.storage.transaction import TransactionContext, get_transaction_manager + viking_fs = get_viking_fs() + dir_path = viking_fs._uri_to_path(uri, ctx=self._current_ctx) - # 1. Collect .abstract.md from subdirectories (already processed earlier) - children_abstracts = await self._collect_children_abstracts(children_uris) + try: + async with TransactionContext( + get_transaction_manager(), "semantic", [dir_path], lock_mode="point" + ) as tx: + # 1. Collect .abstract.md from subdirectories + children_abstracts = await self._collect_children_abstracts(children_uris) + + # 2. Generate file summaries (vectorize inline, not via enqueue) + file_summaries = await self._generate_file_summaries( + file_paths, context_type=context_type, parent_uri=uri, enqueue_files=False + ) - # 2. Concurrently generate summaries for files in directory - file_summaries = await self._generate_file_summaries( - file_paths, context_type=context_type, parent_uri=uri, enqueue_files=True - ) + # 3. Generate .overview.md + overview = await self._generate_overview(uri, file_summaries, children_abstracts) - # 3. Generate .overview.md (contains brief description) - overview = await self._generate_overview(uri, file_summaries, children_abstracts) + # 4. Extract abstract from overview + abstract = self._extract_abstract_from_overview(overview) - # 4. Extract abstract from overview - abstract = self._extract_abstract_from_overview(overview) + # 5. Write files + await viking_fs.write_file(f"{uri}/.overview.md", overview, ctx=self._current_ctx) + await viking_fs.write_file(f"{uri}/.abstract.md", abstract, ctx=self._current_ctx) - # 5. 
Write files - await viking_fs.write_file(f"{uri}/.overview.md", overview, ctx=self._current_ctx) - await viking_fs.write_file(f"{uri}/.abstract.md", abstract, ctx=self._current_ctx) + logger.debug(f"Generated overview and abstract for {uri}") - logger.debug(f"Generated overview and abstract for {uri}") + # 6. Vectorize directory and files (all inside the lock) + try: + await self._vectorize_directory_simple(uri, context_type, abstract, overview) + except Exception as e: + logger.error(f"Failed to vectorize directory {uri}: {e}", exc_info=True) + + for fp, summary in zip(file_paths, file_summaries): + try: + await self._vectorize_single_file( + parent_uri=uri, + context_type=context_type, + file_path=fp, + summary_dict=summary, + ) + except Exception as e: + logger.error(f"Failed to vectorize file {fp}: {e}", exc_info=True) - # 6. Vectorize directory - try: - await self._vectorize_directory_simple(uri, context_type, abstract, overview) - except Exception as e: - logger.error(f"Failed to vectorize directory {uri}: {e}", exc_info=True) + await tx.commit() + except LockAcquisitionError: + logger.info(f"[SemanticProcessor] {uri} does not exist or is locked, skipping") async def _collect_children_abstracts(self, children_uris: List[str]) -> List[Dict[str, str]]: """Collect .abstract.md from subdirectories.""" diff --git a/openviking/storage/transaction/__init__.py b/openviking/storage/transaction/__init__.py index b6c06d6e..2730cd2e 100644 --- a/openviking/storage/transaction/__init__.py +++ b/openviking/storage/transaction/__init__.py @@ -6,6 +6,8 @@ Provides transaction management and lock mechanisms for data operations. 
""" +from openviking.storage.transaction.context_manager import TransactionContext +from openviking.storage.transaction.journal import TransactionJournal from openviking.storage.transaction.path_lock import PathLock from openviking.storage.transaction.transaction_manager import ( TransactionManager, @@ -16,12 +18,17 @@ TransactionRecord, TransactionStatus, ) +from openviking.storage.transaction.undo import UndoEntry, execute_rollback __all__ = [ "PathLock", + "TransactionContext", + "TransactionJournal", "TransactionManager", "TransactionRecord", "TransactionStatus", - "init_transaction_manager", + "UndoEntry", + "execute_rollback", "get_transaction_manager", + "init_transaction_manager", ] diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py new file mode 100644 index 00000000..10107dde --- /dev/null +++ b/openviking/storage/transaction/context_manager.py @@ -0,0 +1,146 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Transaction context manager for OpenViking. + +Provides an async context manager that wraps a set of operations in a +transaction with automatic rollback on failure. +""" + +from typing import Any, Dict, List, Optional + +from openviking.storage.errors import LockAcquisitionError, TransactionError +from openviking.storage.transaction.transaction_record import TransactionRecord +from openviking.storage.transaction.undo import UndoEntry +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +class TransactionContext: + """Async context manager for transactional operations. + + Usage:: + + async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: + seq = tx.record_undo("fs_rm", {"uri": uri}) + # ... do work ... 
+ tx.mark_completed(seq) + await tx.commit() + """ + + def __init__( + self, + tx_manager: Any, + operation: str, + lock_paths: List[str], + lock_mode: str = "point", + mv_dst_path: Optional[str] = None, + ): + self._tx_manager = tx_manager + self._operation = operation + self._lock_paths = lock_paths + self._lock_mode = lock_mode + self._mv_dst_path = mv_dst_path + self._record: Optional[TransactionRecord] = None + self._committed = False + self._sequence = 0 + + @property + def record(self) -> TransactionRecord: + if self._record is None: + raise TransactionError("Transaction not started") + return self._record + + async def __aenter__(self) -> "TransactionContext": + self._record = self._tx_manager.create_transaction( + init_info={ + "operation": self._operation, + "lock_paths": self._lock_paths, + "lock_mode": self._lock_mode, + "mv_dst_path": self._mv_dst_path, + } + ) + tx_id = self._record.id + + # Write journal BEFORE acquiring locks so that crash recovery can + # find orphan locks via init_info even if the process dies between + # lock creation and journal update. 
+ try: + self._tx_manager.journal.write(self._record.to_journal()) + except Exception as e: + logger.warning(f"[Transaction] Failed to write journal for {tx_id}: {e}") + + success = False + if self._lock_mode == "subtree": + for path in self._lock_paths: + success = await self._tx_manager.acquire_lock_subtree(tx_id, path) + if not success: + break + elif self._lock_mode == "mv": + if len(self._lock_paths) < 1 or not self._mv_dst_path: + raise TransactionError("mv lock mode requires lock_paths[0] and mv_dst_path") + success = await self._tx_manager.acquire_lock_mv( + tx_id, self._lock_paths[0], self._mv_dst_path + ) + else: + # "point" mode (default) + for path in self._lock_paths: + success = await self._tx_manager.acquire_lock_point(tx_id, path) + if not success: + break + + if not success: + await self._tx_manager.rollback(tx_id) + raise LockAcquisitionError( + f"Failed to acquire {self._lock_mode} lock for {self._lock_paths}" + ) + + # Update journal with actual lock paths now populated in the record. 
+ try: + self._tx_manager.journal.update(self._record.to_journal()) + except Exception as e: + logger.warning(f"[Transaction] Failed to update journal for {tx_id}: {e}") + + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if not self._committed: + try: + await self._tx_manager.rollback(self._record.id) + except Exception as e: + logger.error(f"Rollback failed during __aexit__: {e}") + return False + + def record_undo(self, op_type: str, params: Dict[str, Any]) -> int: + seq = self._sequence + self._sequence += 1 + entry = UndoEntry(sequence=seq, op_type=op_type, params=params) + self.record.undo_log.append(entry) + + try: + self._tx_manager.journal.update(self.record.to_journal()) + except Exception: + pass + + return seq + + def mark_completed(self, sequence: int) -> None: + for entry in self.record.undo_log: + if entry.sequence == sequence: + entry.completed = True + break + + try: + self._tx_manager.journal.update(self.record.to_journal()) + except Exception: + pass + + def add_post_action(self, action_type: str, params: Dict[str, Any]) -> None: + self.record.post_actions.append({"type": action_type, "params": params}) + + async def commit(self) -> None: + self._committed = True + success = await self._tx_manager.commit(self._record.id) + if not success: + raise TransactionError(f"Failed to commit transaction {self._record.id}") diff --git a/openviking/storage/transaction/journal.py b/openviking/storage/transaction/journal.py new file mode 100644 index 00000000..d641e905 --- /dev/null +++ b/openviking/storage/transaction/journal.py @@ -0,0 +1,114 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Transaction journal for crash recovery. + +Persists transaction state to AGFS so that incomplete transactions can be +detected and recovered after a process restart. 
+""" + +import json +from typing import Any, Dict, List + +from pyagfs import AGFSClient + +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + +# Journal root path (global, not behind VikingFS URI mapping) +_JOURNAL_ROOT = "/local/_system/transactions" + + +class TransactionJournal: + """Persists transaction records to AGFS for crash recovery. + + Journal files live at ``/local/_system/transactions/{tx_id}/journal.json``. + """ + + def __init__(self, agfs: AGFSClient): + self._agfs = agfs + + def _tx_dir(self, tx_id: str) -> str: + return f"{_JOURNAL_ROOT}/{tx_id}" + + def _journal_path(self, tx_id: str) -> str: + return f"{_JOURNAL_ROOT}/{tx_id}/journal.json" + + def _ensure_dir(self, path: str) -> None: + """Create directory, ignoring already-exists errors.""" + try: + self._agfs.mkdir(path) + except Exception as e: + logger.warning(f"[Journal] mkdir {path}: {e}") + + def write(self, data: Dict[str, Any]) -> None: + """Create a new journal entry for a transaction. + + Args: + data: Serialized transaction record (from TransactionRecord.to_journal()). + """ + tx_id = data["id"] + self._ensure_dir("/local/_system") + self._ensure_dir(_JOURNAL_ROOT) + self._ensure_dir(self._tx_dir(tx_id)) + payload = json.dumps(data, ensure_ascii=False, default=str).encode("utf-8") + self._agfs.write(self._journal_path(tx_id), payload) + logger.info(f"[Journal] Written: {self._journal_path(tx_id)}") + + def update(self, data: Dict[str, Any]) -> None: + """Overwrite an existing journal entry. + + Args: + data: Updated serialized transaction record. + """ + tx_id = data["id"] + payload = json.dumps(data, ensure_ascii=False, default=str).encode("utf-8") + self._agfs.write(self._journal_path(tx_id), payload) + + def read(self, tx_id: str) -> Dict[str, Any]: + """Read a journal entry. + + Args: + tx_id: Transaction ID. + + Returns: + Parsed journal data. + + Raises: + FileNotFoundError: If journal does not exist. 
+ """ + content = self._agfs.cat(self._journal_path(tx_id)) + if isinstance(content, bytes): + content = content.decode("utf-8") + return json.loads(content) + + def delete(self, tx_id: str) -> None: + """Delete a transaction's journal directory. + + Args: + tx_id: Transaction ID. + """ + try: + self._agfs.rm(self._tx_dir(tx_id), recursive=True) + logger.debug(f"[Journal] Deleted journal for tx {tx_id}") + except Exception as e: + logger.warning(f"[Journal] Failed to delete journal for tx {tx_id}: {e}") + + def list_all(self) -> List[str]: + """List all transaction IDs that have journal entries. + + Returns: + List of transaction ID strings. + """ + try: + entries = self._agfs.ls(_JOURNAL_ROOT) + tx_ids = [] + if isinstance(entries, list): + for entry in entries: + name = entry.get("name", "") if isinstance(entry, dict) else str(entry) + if name and name not in (".", "..") and entry.get("isDir", True): + tx_ids.append(name) + return tx_ids + except Exception: + return [] diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index c4f959b6..d66bf4c5 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -5,10 +5,33 @@ Provides path-based locking mechanism to prevent concurrent directory operations. Lock protocol: viking://resources/.../.path.ovlock file exists = locked + +Lock files contain a fencing token in the format ``{tx_id}:{time_ns}:{lock_type}`` so that +stale locks (left by crashed processes) can be detected and removed. + +Two lock types: + POINT (P): Locks a specific directory for write/semantic operations. + Blocks if any ancestor holds a SUBTREE lock. + SUBTREE (S): Locks an entire directory subtree for rm/mv-source operations. + Blocks if any descendant holds any lock. + +Livelock prevention: after both parties write their lock files and detect a conflict, +the "later" one (larger (timestamp, tx_id)) backs off and retries. 
+ +# TODO(multi-node): File-based locks only work correctly when all nodes share the +# same AGFS backend with strong read-write consistency. For multi-node deployments +# with replicated or partitioned storage, replace this implementation with a +# distributed lock backend (e.g. etcd txn+lease, ZooKeeper ephemeral nodes). +# The PathLock interface should be extracted to allow swappable backends. +# Key requirements for a distributed backend: +# - Atomic compare-and-set (to avoid write-write races on lock acquisition) +# - Session-bound leases (so crashed nodes auto-release without TTL polling) +# - Monotonically increasing fencing tokens (etcd revision works well) """ import asyncio -from typing import List, Optional +import time +from typing import Optional, Tuple from pyagfs import AGFSClient @@ -20,301 +43,495 @@ # Lock file name LOCK_FILE_NAME = ".path.ovlock" +# Lock type constants +LOCK_TYPE_POINT = "P" +LOCK_TYPE_SUBTREE = "S" + +# Default poll interval when waiting for a lock (seconds) +_POLL_INTERVAL = 0.2 + + +def _make_fencing_token(tx_id: str, lock_type: str = LOCK_TYPE_POINT) -> str: + """Create a fencing token for a transaction. + + Format: ``{tx_id}:{time_ns}:{lock_type}`` where time_ns is the current + wall-clock time in nanoseconds and lock_type is P or S. + + Args: + tx_id: Transaction ID + lock_type: Lock type, either LOCK_TYPE_POINT ("P") or LOCK_TYPE_SUBTREE ("S") + + Returns: + Fencing token string + """ + return f"{tx_id}:{time.time_ns()}:{lock_type}" + + +def _parse_fencing_token(token: str) -> Tuple[str, int, str]: + """Parse a fencing token into (tx_id, timestamp_ns, lock_type). 
+ + Supports: + - New format: ``{tx_id}:{time_ns}:P`` or ``{tx_id}:{time_ns}:S`` + - Legacy format: ``{tx_id}:{time_ns}`` (defaults to POINT) + - Very legacy: plain tx_id (ts=0, defaults to POINT) + + Args: + token: Fencing token string + + Returns: + (tx_id, timestamp_ns, lock_type) — timestamp_ns is 0 for legacy tokens, + lock_type defaults to LOCK_TYPE_POINT for legacy tokens. + """ + # New format ends with ":P" or ":S" + if token.endswith(f":{LOCK_TYPE_POINT}") or token.endswith(f":{LOCK_TYPE_SUBTREE}"): + lock_type = token[-1] + rest = token[:-2] # strip ":{lock_type}" + idx = rest.rfind(":") + if idx >= 0: + tx_id_part = rest[:idx] + ts_part = rest[idx + 1 :] + try: + return tx_id_part, int(ts_part), lock_type + except ValueError: + pass + return rest, 0, lock_type + + # Legacy format: {tx_id}:{time_ns} + if ":" in token: + idx = token.rfind(":") + tx_id_part = token[:idx] + ts_part = token[idx + 1 :] + try: + return tx_id_part, int(ts_part), LOCK_TYPE_POINT + except ValueError: + pass + + return token, 0, LOCK_TYPE_POINT + class PathLock: """Path lock manager for transaction-based directory locking. Implements path-based locking using lock files (.path.ovlock) to prevent concurrent operations on the same directory tree. + + Two lock types: + POINT (P): Used for write and semantic processing operations. + SUBTREE (S): Used for rm and mv-source operations. """ - def __init__(self, agfs_client: AGFSClient): + def __init__(self, agfs_client: AGFSClient, lock_expire: float = 300.0): """Initialize path lock manager. Args: agfs_client: AGFS client for file system operations + lock_expire: Stale lock expiry threshold in seconds (default: 300s). + Locks held longer than this by a crashed process are force-released. """ self._agfs = agfs_client + self._lock_expire = lock_expire def _get_lock_path(self, path: str) -> str: - """Get lock file path for a directory. 
- - Args: - path: Directory path to lock - - Returns: - Lock file path (path/.path.ovlock) - """ - # Remove trailing slash if present + """Get lock file path for a directory.""" path = path.rstrip("/") return f"{path}/{LOCK_FILE_NAME}" def _get_parent_path(self, path: str) -> Optional[str]: - """Get parent directory path. - - Args: - path: Directory path - - Returns: - Parent directory path or None if at root - """ + """Get parent directory path.""" path = path.rstrip("/") if "/" not in path: return None parent = path.rsplit("/", 1)[0] return parent if parent else None - async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool: - """Check if path is locked by another transaction. - - Args: - lock_path: Lock file path - transaction_id: Current transaction ID - - Returns: - True if locked by another transaction, False otherwise - """ + def _read_token(self, lock_path: str) -> Optional[str]: + """Read fencing token from lock file, returning None if absent.""" try: content = self._agfs.cat(lock_path) if isinstance(content, bytes): - lock_owner = content.decode("utf-8").strip() - else: - lock_owner = str(content).strip() - return lock_owner != transaction_id + return content.decode("utf-8").strip() + return str(content).strip() except Exception: - # Lock file doesn't exist or can't be read - not locked + return None + + async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool: + """Check if path is locked by another transaction (any lock type).""" + token = self._read_token(lock_path) + if token is None: return False + lock_owner, _, _ = _parse_fencing_token(token) + return lock_owner != transaction_id - async def _create_lock_file(self, lock_path: str, transaction_id: str) -> None: - """Create lock file with transaction ID. 
- - Args: - lock_path: Lock file path - transaction_id: Transaction ID to write to lock file - """ - self._agfs.write(lock_path, transaction_id.encode("utf-8")) + async def _create_lock_file( + self, lock_path: str, transaction_id: str, lock_type: str = LOCK_TYPE_POINT + ) -> None: + """Create lock file with fencing token.""" + token = _make_fencing_token(transaction_id, lock_type) + self._agfs.write(lock_path, token.encode("utf-8")) async def _verify_lock_ownership(self, lock_path: str, transaction_id: str) -> bool: - """Verify lock file is owned by current transaction. + """Verify lock file is owned by current transaction.""" + token = self._read_token(lock_path) + if token is None: + return False + lock_owner, _, _ = _parse_fencing_token(token) + return lock_owner == transaction_id + + async def _remove_lock_file(self, lock_path: str) -> None: + """Remove lock file.""" + try: + self._agfs.rm(lock_path) + except Exception: + pass + + def is_lock_stale(self, lock_path: str, expire_seconds: float = 300.0) -> bool: + """Check if a lock file is stale (left by a crashed process). 
+ + A lock is considered stale if: + - The lock file does not exist (already cleaned up) + - The lock file contains a legacy token (no timestamp) + - The lock has been held longer than ``expire_seconds`` Args: lock_path: Lock file path - transaction_id: Current transaction ID + expire_seconds: Lock expiry threshold in seconds (default: 5 minutes) Returns: - True if lock is owned by current transaction, False otherwise + True if the lock is stale, False if it is still fresh """ - try: - content = self._agfs.cat(lock_path) - if isinstance(content, bytes): - lock_owner = content.decode("utf-8").strip() - else: - lock_owner = str(content).strip() - return lock_owner == transaction_id - except Exception: - return False + token = self._read_token(lock_path) + if token is None: + return True # No file = stale + _, ts, _ = _parse_fencing_token(token) + if ts == 0: + return True # Legacy format = consider stale + age = (time.time_ns() - ts) / 1e9 + return age > expire_seconds + + async def _check_ancestors_for_subtree(self, path: str, exclude_tx_id: str) -> Optional[str]: + """Walk all ancestor directories and return the first SUBTREE lock held by another tx. - async def _remove_lock_file(self, lock_path: str) -> None: - """Remove lock file. 
+ Args: + path: Starting directory path (its ancestors are checked, not itself) + exclude_tx_id: Transaction ID to exclude from conflict detection + + Returns: + Lock file path of the conflicting SUBTREE lock, or None if no conflict + """ + parent = self._get_parent_path(path) + while parent: + lock_path = self._get_lock_path(parent) + token = self._read_token(lock_path) + if token is not None: + owner_id, _, lock_type = _parse_fencing_token(token) + if owner_id != exclude_tx_id and lock_type == LOCK_TYPE_SUBTREE: + return lock_path + parent = self._get_parent_path(parent) + return None + + async def _scan_descendants_for_locks(self, path: str, exclude_tx_id: str) -> Optional[str]: + """Recursively scan all descendant directories for locks held by another tx. Args: - lock_path: Lock file path + path: Root directory path to scan (its own lock is NOT checked here) + exclude_tx_id: Transaction ID to exclude from conflict detection + + Returns: + Lock file path of the first conflicting lock found, or None if no conflict """ try: - self._agfs.rm(lock_path) - except Exception: - # Lock file might not exist, ignore - pass - - async def acquire_normal(self, path: str, transaction: TransactionRecord) -> bool: - """Acquire path lock for normal operations. 
+ entries = self._agfs.ls(path) + if not isinstance(entries, list): + return None + for entry in entries: + if not isinstance(entry, dict): + continue + name = entry.get("name", "") + if not name or name in (".", ".."): + continue + if not entry.get("isDir", False): + continue + subdir = f"{path.rstrip('/')}/{name}" + subdir_lock = self._get_lock_path(subdir) + token = self._read_token(subdir_lock) + if token is not None: + owner_id, _, _ = _parse_fencing_token(token) + if owner_id != exclude_tx_id: + return subdir_lock + # Recurse into subdir + result = await self._scan_descendants_for_locks(subdir, exclude_tx_id) + if result: + return result + except Exception as e: + logger.warning(f"Failed to scan descendants of {path}: {e}") + return None - Lock acquisition flow for normal operations: - 1. Check if target directory exists - 2. Check if target directory is locked by another transaction - 3. Check if parent directory is locked by another transaction - 4. Create .path.ovlock file with transaction ID - 5. Check again if parent directory is locked by another transaction - 6. Read lock file to confirm it contains current transaction ID - 7. Return success if all checks pass + async def acquire_point( + self, path: str, transaction: TransactionRecord, timeout: float = 0.0 + ) -> bool: + """Acquire POINT lock for write/semantic-processing operations. + + A POINT lock is placed on a single directory. It conflicts with: + - Any lock (P or S) on the same directory by another transaction + - Any SUBTREE (S) lock on any ancestor directory + + Lock acquisition flow: + 1. Check target directory exists + 2. Check if target directory is locked by another transaction → wait/stale-remove + 3. Check if any ancestor holds a SUBTREE lock → wait/stale-remove + 4. Write POINT(P) lock file + 5. TOCTOU double-check: re-scan ancestors for SUBTREE locks + - Conflict found: compare (ts, tx_id); later one backs off and retries + 6. Verify lock ownership + 7. 
Return success Args: path: Directory path to lock transaction: Transaction record + timeout: Maximum time to wait for the lock in seconds. + 0 (default) = fail immediately if locked. + > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. Returns: - True if lock acquired successfully, False otherwise + True if lock acquired successfully, False if timeout exceeded """ transaction_id = transaction.id lock_path = self._get_lock_path(path) - parent_path = self._get_parent_path(path) + deadline = asyncio.get_event_loop().time() + timeout - # Step 1: Check if target directory exists + # Step 1: Check target directory exists (once, before polling) try: self._agfs.stat(path) except Exception: - logger.warning(f"Directory does not exist: {path}") - return False - - # Step 2: Check if target directory is locked by another transaction - if await self._is_locked_by_other(lock_path, transaction_id): - logger.warning(f"Path already locked by another transaction: {path}") - return False - - # Step 3: Check if parent directory is locked by another transaction - if parent_path: - parent_lock_path = self._get_lock_path(parent_path) - if await self._is_locked_by_other(parent_lock_path, transaction_id): - logger.warning(f"Parent path locked by another transaction: {parent_path}") - return False - - # Step 4: Create lock file - try: - await self._create_lock_file(lock_path, transaction_id) - except Exception as e: - logger.error(f"Failed to create lock file: {e}") + logger.warning(f"[POINT] Directory does not exist: {path}") return False - # Step 5: Check again if parent directory is locked - if parent_path: - parent_lock_path = self._get_lock_path(parent_path) - if await self._is_locked_by_other(parent_lock_path, transaction_id): - logger.warning(f"Parent path locked after lock creation: {parent_path}") - await self._remove_lock_file(lock_path) + while True: + # Step 2: Check if target directory is locked by another transaction + if await 
self._is_locked_by_other(lock_path, transaction_id): + if self.is_lock_stale(lock_path, self._lock_expire): + logger.warning(f"[POINT] Removing stale lock: {lock_path}") + await self._remove_lock_file(lock_path) + continue + if asyncio.get_event_loop().time() >= deadline: + logger.warning(f"[POINT] Timeout waiting for lock on: {path}") + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 3: Check all ancestors for SUBTREE locks + ancestor_conflict = await self._check_ancestors_for_subtree(path, transaction_id) + if ancestor_conflict: + if self.is_lock_stale(ancestor_conflict, self._lock_expire): + logger.warning( + f"[POINT] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" + ) + await self._remove_lock_file(ancestor_conflict) + continue + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 4: Write POINT lock file + try: + await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_POINT) + except Exception as e: + logger.error(f"[POINT] Failed to create lock file: {e}") return False - # Step 6: Verify lock ownership - if not await self._verify_lock_ownership(lock_path, transaction_id): - logger.error(f"Lock ownership verification failed: {path}") - return False - - # Step 7: Success - add lock to transaction - transaction.add_lock(lock_path) - logger.debug(f"Lock acquired: {lock_path}") - return True - - async def _collect_subdirectories(self, path: str) -> List[str]: - """Collect all subdirectory paths recursively. 
- - Args: - path: Root directory path - - Returns: - List of all subdirectory paths - """ - subdirs = [] - try: - entries = self._agfs.ls(path) - if isinstance(entries, list): - for entry in entries: - if isinstance(entry, dict) and entry.get("isDir"): - entry_path = entry.get("name", "") - if entry_path: - subdirs.append(entry_path) - # Recursively collect subdirectories - subdirs.extend(await self._collect_subdirectories(entry_path)) - except Exception as e: - logger.warning(f"Failed to list directory {path}: {e}") - - return subdirs + # Step 5: TOCTOU double-check ancestors for SUBTREE locks + backed_off = False + conflict_after = await self._check_ancestors_for_subtree(path, transaction_id) + if conflict_after: + their_token = self._read_token(conflict_after) + if their_token: + their_tx_id, their_ts, _ = _parse_fencing_token(their_token) + my_token = self._read_token(lock_path) + _, my_ts, _ = ( + _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_POINT) + ) + # Later one (larger (ts, tx_id)) backs off + if (my_ts, transaction_id) > (their_ts, their_tx_id): + logger.debug(f"[POINT] Backing off (livelock guard) on {path}") + await self._remove_lock_file(lock_path) + backed_off = True + # Either: I backed off, or they will back off. + # In both cases restart the outer loop after a brief wait. 
+ if asyncio.get_event_loop().time() >= deadline: + if not backed_off: + await self._remove_lock_file(lock_path) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 6: Verify lock ownership + if not await self._verify_lock_ownership(lock_path, transaction_id): + logger.debug(f"[POINT] Lock ownership verification failed: {path}") + if asyncio.get_event_loop().time() >= deadline: + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Success + transaction.add_lock(lock_path) + logger.debug(f"[POINT] Lock acquired: {lock_path}") + return True - async def acquire_rm( - self, path: str, transaction: TransactionRecord, max_parallel: int = 8 + async def acquire_subtree( + self, path: str, transaction: TransactionRecord, timeout: float = 0.0 ) -> bool: - """Acquire path lock for rm operation using bottom-up parallel locking. - - Lock acquisition flow for rm operations (parallel bottom-up mode): - 1. Collect all subdirectory paths recursively - 2. Sort by depth (deepest first) - 3. Create lock files in batches with limited parallelism - 4. Lock the target directory last - 5. If any lock fails, release all acquired locks in reverse order + """Acquire SUBTREE lock for rm/mv-source operations. + + A SUBTREE lock is placed on a single directory (the root of the subtree). + It conflicts with: + - Any lock (P or S) on the same directory by another transaction + - Any lock (P or S) on any descendant directory by another transaction + + Lock acquisition flow: + 1. Check target directory exists + 2. Check if target directory is locked by another transaction → wait/stale-remove + 3. Scan all descendants for any locks → wait/stale-remove + 4. Write SUBTREE(S) lock file (only one file, at the root path) + 5. TOCTOU double-check: re-scan descendants for any new locks + - Conflict found: compare (ts, tx_id); later one backs off and retries + 6. Verify lock ownership + 7. 
Return success Args: - path: Directory path to lock + path: Directory path to lock (root of the subtree) transaction: Transaction record - max_parallel: Maximum number of parallel lock operations + timeout: Maximum time to wait for the lock in seconds. + 0 (default) = fail immediately if locked. + > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. Returns: - True if all locks acquired successfully, False otherwise + True if lock acquired successfully, False if timeout exceeded """ transaction_id = transaction.id lock_path = self._get_lock_path(path) - acquired_locks = [] + deadline = asyncio.get_event_loop().time() + timeout - # Step 1: Collect all subdirectories - subdirs = await self._collect_subdirectories(path) + # Step 1: Check target directory exists + try: + self._agfs.stat(path) + except Exception: + logger.warning(f"[SUBTREE] Directory does not exist: {path}") + return False - # Step 2: Sort by depth (deepest first) - subdirs.sort(key=lambda p: p.count("/"), reverse=True) + while True: + # Step 2: Check if target directory is locked by another transaction + if await self._is_locked_by_other(lock_path, transaction_id): + if self.is_lock_stale(lock_path, self._lock_expire): + logger.warning(f"[SUBTREE] Removing stale lock: {lock_path}") + await self._remove_lock_file(lock_path) + continue + if asyncio.get_event_loop().time() >= deadline: + logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 3: Scan all descendants for any locks by other transactions + desc_conflict = await self._scan_descendants_for_locks(path, transaction_id) + if desc_conflict: + if self.is_lock_stale(desc_conflict, self._lock_expire): + logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") + await self._remove_lock_file(desc_conflict) + continue + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[SUBTREE] Timeout waiting for descendant lock: 
{desc_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 4: Write SUBTREE lock file (only one file) + try: + await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_SUBTREE) + except Exception as e: + logger.error(f"[SUBTREE] Failed to create lock file: {e}") + return False - # Step 3: Create lock files in batches - try: - # Lock subdirectories in batches - for i in range(0, len(subdirs), max_parallel): - batch = subdirs[i : i + max_parallel] - tasks = [] - for subdir in batch: - subdir_lock_path = self._get_lock_path(subdir) - tasks.append(self._create_lock_file(subdir_lock_path, transaction_id)) - - # Execute batch in parallel - await asyncio.gather(*tasks) - acquired_locks.extend([self._get_lock_path(s) for s in batch]) - - # Step 4: Lock target directory - await self._create_lock_file(lock_path, transaction_id) - acquired_locks.append(lock_path) - - # Add all locks to transaction - for lock in acquired_locks: - transaction.add_lock(lock) - - logger.debug(f"RM locks acquired for {len(acquired_locks)} paths") + # Step 5: TOCTOU double-check descendants + backed_off = False + conflict_after = await self._scan_descendants_for_locks(path, transaction_id) + if conflict_after: + their_token = self._read_token(conflict_after) + if their_token: + their_tx_id, their_ts, _ = _parse_fencing_token(their_token) + my_token = self._read_token(lock_path) + _, my_ts, _ = ( + _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_SUBTREE) + ) + # Later one (larger (ts, tx_id)) backs off + if (my_ts, transaction_id) > (their_ts, their_tx_id): + logger.debug(f"[SUBTREE] Backing off (livelock guard) on {path}") + await self._remove_lock_file(lock_path) + backed_off = True + # Either: I backed off, or they will back off. + # In both cases restart the outer loop after a brief wait. 
+ if asyncio.get_event_loop().time() >= deadline: + if not backed_off: + await self._remove_lock_file(lock_path) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Step 6: Verify lock ownership + if not await self._verify_lock_ownership(lock_path, transaction_id): + logger.debug(f"[SUBTREE] Lock ownership verification failed: {path}") + if asyncio.get_event_loop().time() >= deadline: + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + + # Success + transaction.add_lock(lock_path) + logger.debug(f"[SUBTREE] Lock acquired: {lock_path}") return True - except Exception as e: - logger.error(f"Failed to acquire RM locks: {e}") - # Step 5: Release all acquired locks in reverse order - for lock in reversed(acquired_locks): - await self._remove_lock_file(lock) - return False - async def acquire_mv( self, src_path: str, dst_path: str, transaction: TransactionRecord, - max_parallel: int = 8, + timeout: float = 0.0, ) -> bool: """Acquire path lock for mv operation. Lock acquisition flow for mv operations: - 1. Lock source directory (using RM-style locking) - 2. Lock destination directory (using normal locking) + 1. Acquire SUBTREE lock on source directory + 2. Acquire POINT lock on destination parent directory Args: src_path: Source directory path - dst_path: Destination directory path + dst_path: Destination parent directory path transaction: Transaction record - max_parallel: Maximum number of parallel lock operations + timeout: Maximum time to wait for each lock in seconds. + 0 (default) = fail immediately if locked. + > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. 
Returns: True if all locks acquired successfully, False otherwise """ - # Step 1: Lock source directory - if not await self.acquire_rm(src_path, transaction, max_parallel): - logger.warning(f"Failed to lock source path: {src_path}") + # Step 1: Lock source directory with SUBTREE lock + if not await self.acquire_subtree(src_path, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") return False - # Step 2: Lock destination directory - if not await self.acquire_normal(dst_path, transaction): - logger.warning(f"Failed to lock destination path: {dst_path}") - # Release source locks + # Step 2: Lock destination parent directory with POINT lock + if not await self.acquire_point(dst_path, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire POINT lock on destination: {dst_path}") + # Release source lock await self.release(transaction) return False - logger.debug(f"MV locks acquired: {src_path} -> {dst_path}") + logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_path}") return True async def release(self, transaction: TransactionRecord) -> None: diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 32a24c44..58d9d4df 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -9,7 +9,7 @@ import asyncio import threading import time -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional from pyagfs import AGFSClient @@ -35,6 +35,7 @@ class TransactionManager: - Allocating transaction IDs - Managing transaction lifecycle (start, commit, rollback) - Providing transaction lock mechanism interface, preventing deadlocks + - Persisting transaction state to journal for crash recovery """ def __init__( @@ -42,6 +43,8 @@ def __init__( agfs_client: AGFSClient, timeout: int = 3600, max_parallel_locks: int = 8, + lock_timeout: float 
= 0.0, + lock_expire: float = 300.0, ): """Initialize transaction manager. @@ -49,11 +52,19 @@ def __init__( agfs_client: AGFS client for file system operations timeout: Transaction timeout in seconds (default: 3600) max_parallel_locks: Maximum number of parallel lock operations (default: 8) + lock_timeout: Path lock acquisition timeout in seconds. + 0 (default) = fail immediately if locked. + > 0 = wait/retry up to this many seconds. + lock_expire: Stale lock expiry threshold in seconds (default: 300s). """ + from openviking.storage.transaction.journal import TransactionJournal + self._agfs = agfs_client self._timeout = timeout self._max_parallel_locks = max_parallel_locks - self._path_lock = PathLock(agfs_client) + self._lock_timeout = lock_timeout + self._path_lock = PathLock(agfs_client, lock_expire=lock_expire) + self._journal = TransactionJournal(agfs_client) # Active transactions: {transaction_id: TransactionRecord} self._transactions: Dict[str, TransactionRecord] = {} @@ -66,10 +77,15 @@ def __init__( f"TransactionManager initialized (timeout={timeout}s, max_parallel_locks={max_parallel_locks})" ) + @property + def journal(self): + return self._journal + async def start(self) -> None: """Start transaction manager. - Starts the background cleanup task for timed-out transactions. + Starts the background cleanup task and recovers any pending transactions + left from a previous process crash. """ if self._running: logger.debug("TransactionManager already running") @@ -77,6 +93,12 @@ async def start(self) -> None: self._running = True self._cleanup_task = asyncio.create_task(self._cleanup_loop()) + + # Recover any transactions that were interrupted by a previous crash. + # Journal entries are written BEFORE lock acquisition, so every orphan + # lock has a corresponding journal entry that recovery can use to clean it up. 
+ await self._recover_pending_transactions() + logger.info("TransactionManager started") def stop(self) -> None: @@ -125,6 +147,121 @@ async def _cleanup_timed_out(self) -> None: logger.warning(f"Transaction timed out: {tx_id}") await self.rollback(tx_id) + async def _recover_pending_transactions(self) -> None: + """Recover pending transactions from journal after a crash. + + Reads all journal entries and rolls back any transactions that were + not cleanly committed or rolled back. + """ + try: + pending_ids = self._journal.list_all() + except Exception as e: + logger.warning(f"Failed to list journal entries for recovery: {e}") + return + + if not pending_ids: + return + + logger.info(f"Found {len(pending_ids)} pending transaction(s) to recover") + + for tx_id in pending_ids: + try: + await self._recover_one(tx_id) + except Exception as e: + logger.error(f"Failed to recover transaction {tx_id}: {e}") + + async def _recover_one(self, tx_id: str) -> None: + """Recover a single transaction from journal. 
+ + Recovery strategy by status: + COMMITTED + post_actions → replay post_actions (enqueue etc.), then clean up + COMMITTED, no post_actions / RELEASED → just clean up + EXEC / FAIL / RELEASING → rollback completed+partial ops, then clean up + INIT / ACQUIRE → nothing executed yet, just clean up + """ + from openviking.storage.transaction.undo import execute_rollback + + try: + data = self._journal.read(tx_id) + except Exception as e: + logger.warning(f"Cannot read journal for tx {tx_id}: {e}") + return + + tx = TransactionRecord.from_journal(data) + logger.info(f"Recovering transaction {tx_id} (status={tx.status})") + + if tx.status == TransactionStatus.COMMIT: + # Transaction was committed — replay any unfinished post_actions + if tx.post_actions: + logger.info( + f"Replaying {len(tx.post_actions)} post_action(s) for committed tx {tx_id}" + ) + try: + await self._execute_post_actions(tx.post_actions) + except Exception as e: + logger.warning(f"Post-action replay failed for tx {tx_id}: {e}") + elif tx.status in (TransactionStatus.INIT, TransactionStatus.AQUIRE): + # Transaction never executed any operations — nothing to rollback. + # However, locks may have been created before the journal was updated + # with the actual locks list. Use init_info.lock_paths to find and + # clean up orphan lock files owned by this transaction. + logger.info(f"Transaction {tx_id} never executed, cleaning up orphan locks") + if not tx.locks: + await self._cleanup_orphan_locks_from_init_info(tx_id, tx.init_info) + else: + # EXEC / FAIL / RELEASING: process crashed mid-operation — rollback + # Pass recover_all=True so partial (completed=False) ops are also reversed, + # e.g. a directory mv that started but never finished still leaves residue. 
+ try: + execute_rollback(tx.undo_log, self._agfs, recover_all=True) + except Exception as e: + logger.warning(f"Rollback during recovery failed for tx {tx_id}: {e}") + + # Release any lock files still present + await self._path_lock.release(tx) + + # Clean up journal + try: + self._journal.delete(tx_id) + except Exception: + pass + + logger.info(f"Recovered transaction {tx_id}") + + async def _cleanup_orphan_locks_from_init_info( + self, tx_id: str, init_info: Dict[str, Any] + ) -> None: + """Clean up orphan lock files using lock path hints from init_info. + + When a crash occurs between lock creation and journal update, the + journal's ``locks`` list is empty but ``init_info.lock_paths`` records + the paths that were intended to be locked. This method checks those + paths and removes any lock files still owned by this transaction. + """ + from openviking.storage.transaction.path_lock import LOCK_FILE_NAME, _parse_fencing_token + + lock_paths = init_info.get("lock_paths", []) + lock_mode = init_info.get("lock_mode", "point") + mv_dst_path = init_info.get("mv_dst_path") + + # Collect all candidate paths to check + paths_to_check = list(lock_paths) + if lock_mode == "mv" and mv_dst_path: + paths_to_check.append(mv_dst_path) + + for path in paths_to_check: + lock_file = f"{path.rstrip('/')}/{LOCK_FILE_NAME}" + try: + token = self._path_lock._read_token(lock_file) + if token is None: + continue + owner_id, _, _ = _parse_fencing_token(token) + if owner_id == tx_id: + await self._path_lock._remove_lock_file(lock_file) + logger.info(f"Removed orphan lock for tx {tx_id}: {lock_file}") + except Exception as e: + logger.warning(f"Failed to check orphan lock {lock_file}: {e}") + def create_transaction(self, init_info: Optional[Dict[str, Any]] = None) -> TransactionRecord: """Create a new transaction. @@ -171,6 +308,8 @@ async def begin(self, transaction_id: str) -> bool: async def commit(self, transaction_id: str) -> bool: """Commit a transaction. 
+ Executes post-actions, releases all locks, and removes the journal entry. + Args: transaction_id: Transaction ID @@ -185,6 +324,16 @@ async def commit(self, transaction_id: str) -> bool: # Update status to COMMIT tx.update_status(TransactionStatus.COMMIT) + # Persist final committed state before releasing + try: + self._journal.update(tx.to_journal()) + except Exception: + pass + + # Execute post-actions (best-effort, errors are logged but don't fail commit) + if tx.post_actions: + await self._execute_post_actions(tx.post_actions) + # Release all locks tx.update_status(TransactionStatus.RELEASING) await self._path_lock.release(tx) @@ -195,18 +344,29 @@ async def commit(self, transaction_id: str) -> bool: # Remove from active transactions self._transactions.pop(transaction_id, None) + # Clean up journal entry (last step — lock is already released) + try: + self._journal.delete(transaction_id) + except Exception as e: + logger.warning(f"Failed to delete journal on commit for {transaction_id}: {e}") + logger.debug(f"Transaction committed: {transaction_id}") return True async def rollback(self, transaction_id: str) -> bool: """Rollback a transaction. + Executes undo log entries in reverse order, releases all locks, + and removes the journal entry. 
+ Args: transaction_id: Transaction ID Returns: True if transaction rolled back successfully, False otherwise """ + from openviking.storage.transaction.undo import execute_rollback + tx = self.get_transaction(transaction_id) if not tx: logger.error(f"Transaction not found: {transaction_id}") @@ -215,6 +375,21 @@ async def rollback(self, transaction_id: str) -> bool: # Update status to FAIL tx.update_status(TransactionStatus.FAIL) + # Persist rollback state + try: + self._journal.update(tx.to_journal()) + except Exception: + pass + + # Execute undo log (best-effort) + if tx.undo_log: + try: + execute_rollback(tx.undo_log, self._agfs) + except Exception as e: + logger.warning( + f"Undo log execution failed during rollback of {transaction_id}: {e}" + ) + # Release all locks tx.update_status(TransactionStatus.RELEASING) await self._path_lock.release(tx) @@ -225,11 +400,57 @@ async def rollback(self, transaction_id: str) -> bool: # Remove from active transactions self._transactions.pop(transaction_id, None) + # Clean up journal entry (last step — lock is already released) + try: + self._journal.delete(transaction_id) + except Exception as e: + logger.warning(f"Failed to delete journal on rollback for {transaction_id}: {e}") + logger.debug(f"Transaction rolled back: {transaction_id}") return True - async def acquire_lock_normal(self, transaction_id: str, path: str) -> bool: - """Acquire path lock for normal (non-rm/mv) operations. + async def _execute_post_actions(self, post_actions: List[Dict[str, Any]]) -> None: + """Execute post-commit actions. + + Post-actions are executed after a successful commit. Errors are logged + but do not affect the commit outcome. 
+ + Args: + post_actions: List of post-action dicts with 'type' and 'params' keys + """ + for action in post_actions: + action_type = action.get("type", "") + params = action.get("params", {}) + try: + if action_type == "enqueue_semantic": + await self._post_enqueue_semantic(params) + else: + logger.warning(f"Unknown post-action type: {action_type}") + except Exception as e: + logger.warning(f"Post-action '{action_type}' failed: {e}") + + async def _post_enqueue_semantic(self, params: Dict[str, Any]) -> None: + """Execute enqueue_semantic post-action.""" + from openviking.storage.queuefs import get_queue_manager + from openviking.storage.queuefs.semantic_msg import SemanticMsg + + queue_manager = get_queue_manager() + if queue_manager is None: + logger.debug("No queue manager available, skipping enqueue_semantic post-action") + return + + uri = params.get("uri") + context_type = params.get("context_type", "resource") + account_id = params.get("account_id", "default") + if not uri: + return + + msg = SemanticMsg(uri=uri, context_type=context_type, account_id=account_id) + semantic_queue = queue_manager.get_queue(queue_manager.SEMANTIC) + await semantic_queue.enqueue(msg) + + async def acquire_lock_point(self, transaction_id: str, path: str) -> bool: + """Acquire POINT lock for write/semantic-processing operations. 
Args: transaction_id: Transaction ID @@ -244,7 +465,7 @@ async def acquire_lock_normal(self, transaction_id: str, path: str) -> bool: return False tx.update_status(TransactionStatus.AQUIRE) - success = await self._path_lock.acquire_normal(path, tx) + success = await self._path_lock.acquire_point(path, tx, timeout=self._lock_timeout) if success: tx.update_status(TransactionStatus.EXEC) @@ -253,15 +474,15 @@ async def acquire_lock_normal(self, transaction_id: str, path: str) -> bool: return success - async def acquire_lock_rm( - self, transaction_id: str, path: str, max_parallel: Optional[int] = None + async def acquire_lock_subtree( + self, transaction_id: str, path: str, timeout: Optional[float] = None ) -> bool: - """Acquire path lock for rm operation. + """Acquire SUBTREE lock for rm/mv-source operations. Args: transaction_id: Transaction ID - path: Directory path to lock - max_parallel: Maximum number of parallel lock operations (default: from config) + path: Directory path to lock (root of the subtree) + timeout: Maximum time to wait for the lock in seconds (default: from config) Returns: True if lock acquired successfully, False otherwise @@ -272,8 +493,8 @@ async def acquire_lock_rm( return False tx.update_status(TransactionStatus.AQUIRE) - parallel = max_parallel or self._max_parallel_locks - success = await self._path_lock.acquire_rm(path, tx, parallel) + effective_timeout = timeout if timeout is not None else self._lock_timeout + success = await self._path_lock.acquire_subtree(path, tx, timeout=effective_timeout) if success: tx.update_status(TransactionStatus.EXEC) @@ -287,15 +508,15 @@ async def acquire_lock_mv( transaction_id: str, src_path: str, dst_path: str, - max_parallel: Optional[int] = None, + timeout: Optional[float] = None, ) -> bool: """Acquire path lock for mv operation. 
Args: transaction_id: Transaction ID src_path: Source directory path - dst_path: Destination directory path - max_parallel: Maximum number of parallel lock operations (default: from config) + dst_path: Destination parent directory path + timeout: Maximum time to wait for each lock in seconds (default: from config) Returns: True if lock acquired successfully, False otherwise @@ -306,8 +527,10 @@ async def acquire_lock_mv( return False tx.update_status(TransactionStatus.AQUIRE) - parallel = max_parallel or self._max_parallel_locks - success = await self._path_lock.acquire_mv(src_path, dst_path, tx, parallel) + effective_timeout = timeout if timeout is not None else self._lock_timeout + success = await self._path_lock.acquire_mv( + src_path, dst_path, tx, timeout=effective_timeout + ) if success: tx.update_status(TransactionStatus.EXEC) @@ -337,6 +560,8 @@ def init_transaction_manager( agfs_config: Any, tx_timeout: int = 3600, max_parallel_locks: int = 8, + lock_timeout: float = 0.0, + lock_expire: float = 300.0, ) -> TransactionManager: """Initialize transaction manager singleton. @@ -344,6 +569,10 @@ def init_transaction_manager( agfs_config: AGFS configuration (url, timeout, etc.) tx_timeout: Transaction timeout in seconds (default: 3600) max_parallel_locks: Maximum number of parallel lock operations (default: 8) + lock_timeout: Path lock acquisition timeout in seconds. + 0 (default) = fail immediately if locked. + > 0 = wait/retry up to this many seconds. + lock_expire: Stale lock expiry threshold in seconds (default: 300s). 
Returns: TransactionManager instance @@ -367,6 +596,8 @@ def init_transaction_manager( agfs_client=agfs_client, timeout=tx_timeout, max_parallel_locks=max_parallel_locks, + lock_timeout=lock_timeout, + lock_expire=lock_expire, ) logger.info("TransactionManager initialized as singleton") diff --git a/openviking/storage/transaction/transaction_record.py b/openviking/storage/transaction/transaction_record.py index fba6480b..c73775de 100644 --- a/openviking/storage/transaction/transaction_record.py +++ b/openviking/storage/transaction/transaction_record.py @@ -41,6 +41,8 @@ class TransactionRecord: status: Current transaction status init_info: Transaction initialization information rollback_info: Information for rollback operations + undo_log: List of undo entries for rollback + post_actions: Actions to execute after successful commit created_at: Creation timestamp (Unix timestamp in seconds) updated_at: Last update timestamp (Unix timestamp in seconds) """ @@ -50,44 +52,30 @@ class TransactionRecord: status: TransactionStatus = field(default=TransactionStatus.INIT) init_info: Dict[str, Any] = field(default_factory=dict) rollback_info: Dict[str, Any] = field(default_factory=dict) + undo_log: List[Any] = field(default_factory=list) + post_actions: List[Dict[str, Any]] = field(default_factory=list) created_at: float = field(default_factory=time.time) updated_at: float = field(default_factory=time.time) def update_status(self, status: TransactionStatus) -> None: - """Update transaction status and timestamp. - - Args: - status: New transaction statusudi - """ + """Update transaction status and timestamp.""" self.status = status self.updated_at = time.time() def add_lock(self, lock_path: str) -> None: - """Add a lock to the transaction. 
- - Args: - lock_path: Path to be locked - """ + """Add a lock to the transaction.""" if lock_path not in self.locks: self.locks.append(lock_path) self.updated_at = time.time() def remove_lock(self, lock_path: str) -> None: - """Remove a lock from the transaction. - - Args: - lock_path: Path to be unlocked - """ + """Remove a lock from the transaction.""" if lock_path in self.locks: self.locks.remove(lock_path) self.updated_at = time.time() def to_dict(self) -> Dict[str, Any]: - """Convert transaction record to dictionary. - - Returns: - Dictionary representation of the transaction record - """ + """Convert transaction record to dictionary.""" return { "id": self.id, "locks": self.locks, @@ -98,16 +86,45 @@ def to_dict(self) -> Dict[str, Any]: "updated_at": self.updated_at, } + def to_journal(self) -> Dict[str, Any]: + """Serialize to journal format (includes undo_log and post_actions).""" + from openviking.storage.transaction.undo import UndoEntry + + return { + "id": self.id, + "locks": self.locks, + "status": str(self.status), + "init_info": self.init_info, + "undo_log": [e.to_dict() if isinstance(e, UndoEntry) else e for e in self.undo_log], + "post_actions": self.post_actions, + "created_at": self.created_at, + "updated_at": self.updated_at, + } + @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "TransactionRecord": - """Create transaction record from dictionary. 
+ def from_journal(cls, data: Dict[str, Any]) -> "TransactionRecord": + """Restore from journal format.""" + from openviking.storage.transaction.undo import UndoEntry + + status_str = data.get("status", "INIT") + status = TransactionStatus(status_str) if isinstance(status_str, str) else status_str + undo_log = [UndoEntry.from_dict(e) for e in data.get("undo_log", [])] - Args: - data: Dictionary representation of the transaction record + return cls( + id=data.get("id", str(uuid.uuid4())), + locks=data.get("locks", []), + status=status, + init_info=data.get("init_info", {}), + rollback_info={}, + undo_log=undo_log, + post_actions=data.get("post_actions", []), + created_at=data.get("created_at", time.time()), + updated_at=data.get("updated_at", time.time()), + ) - Returns: - TransactionRecord instance - """ + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "TransactionRecord": + """Create transaction record from dictionary.""" status_str = data.get("status", "INIT") status = TransactionStatus(status_str) if isinstance(status_str, str) else status_str diff --git a/openviking/storage/transaction/undo.py b/openviking/storage/transaction/undo.py new file mode 100644 index 00000000..d64d1619 --- /dev/null +++ b/openviking/storage/transaction/undo.py @@ -0,0 +1,147 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +Undo log and rollback executor for transaction management. + +Records operations performed within a transaction so they can be reversed +on rollback. Each UndoEntry captures one atomic sub-operation. +""" + +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +@dataclass +class UndoEntry: + """A single undo log entry representing one reversible sub-operation. + + Attributes: + sequence: Monotonically increasing index within the transaction. 
+ op_type: Operation type (fs_mv, fs_rm, fs_mkdir, fs_write_new, + vectordb_upsert, vectordb_delete, vectordb_update_uri). + params: Parameters needed to reverse the operation. + completed: Whether the forward operation completed successfully. + """ + + sequence: int + op_type: str + params: Dict[str, Any] = field(default_factory=dict) + completed: bool = False + + def to_dict(self) -> Dict[str, Any]: + return { + "sequence": self.sequence, + "op_type": self.op_type, + "params": self.params, + "completed": self.completed, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "UndoEntry": + return cls( + sequence=data.get("sequence", 0), + op_type=data.get("op_type", ""), + params=data.get("params", {}), + completed=data.get("completed", False), + ) + + +def execute_rollback( + undo_log: List[UndoEntry], + agfs: Any, + vector_store: Optional[Any] = None, + ctx: Optional[Any] = None, + recover_all: bool = False, +) -> None: + """Execute rollback by reversing operations in reverse order. + + Best-effort: each step is wrapped in try-except so a single failure + does not prevent subsequent undo steps from running. + + Args: + undo_log: List of undo entries to process. + agfs: AGFS client for filesystem operations. + vector_store: Optional vector store client. + ctx: Optional request context. + recover_all: If True, also attempt to reverse entries that were not + marked completed (used during crash recovery to clean up partial + operations such as a directory mv that only half-finished). 
+ """ + if recover_all: + entries = list(undo_log) + else: + entries = [e for e in undo_log if e.completed] + entries.sort(key=lambda e: e.sequence, reverse=True) + + for entry in entries: + try: + _rollback_entry(entry, agfs, vector_store, ctx) + logger.info(f"[Rollback] Reversed {entry.op_type} seq={entry.sequence}") + except Exception as e: + logger.warning( + f"[Rollback] Failed to reverse {entry.op_type} seq={entry.sequence}: {e}" + ) + + +def _rollback_entry( + entry: UndoEntry, + agfs: Any, + vector_store: Optional[Any], + ctx: Optional[Any], +) -> None: + """Dispatch rollback for a single undo entry.""" + from openviking_cli.utils import run_async + + op = entry.op_type + params = entry.params + + if op == "fs_mv": + agfs.mv(params["dst"], params["src"]) + + elif op == "fs_rm": + logger.debug("[Rollback] fs_rm is not reversible, skipping") + + elif op == "fs_mkdir": + try: + agfs.rm(params["uri"]) + except Exception: + pass + + elif op == "fs_write_new": + try: + agfs.rm(params["uri"], recursive=True) + except Exception: + pass + + elif op == "vectordb_upsert": + if vector_store: + record_id = params.get("record_id") + if record_id: + run_async(vector_store.delete([record_id])) + + elif op == "vectordb_delete": + if vector_store and ctx: + records_snapshot = params.get("records_snapshot", []) + for record in records_snapshot: + try: + run_async(vector_store.upsert(record)) + except Exception as e: + logger.warning(f"[Rollback] Failed to restore vector record: {e}") + + elif op == "vectordb_update_uri": + if vector_store and ctx: + run_async( + vector_store.update_uri_mapping( + ctx=ctx, + uri=params["new_uri"], + new_uri=params["old_uri"], + new_parent_uri=params.get("old_parent_uri", ""), + ) + ) + + else: + logger.warning(f"[Rollback] Unknown op_type: {op}") diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 5b5afbb6..200caef0 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -260,15 
+260,61 @@ async def rm( This method is idempotent: deleting a non-existent file succeeds after cleaning up any orphan index records. + + Wrapped in a transaction: deletes VectorDB records first, then FS files. + On rollback, VectorDB records are restored from snapshot. """ + from openviking.storage.transaction import TransactionContext, get_transaction_manager + self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) target_uri = self._path_to_uri(path, ctx=ctx) uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) uris_to_delete.append(target_uri) - result = self.agfs.rm(path, recursive=recursive) - await self._delete_from_vector_store(uris_to_delete, ctx=ctx) - return result + + tx_manager = get_transaction_manager() + if not tx_manager: + # Fallback: no transaction support + result = self.agfs.rm(path, recursive=recursive) + await self._delete_from_vector_store(uris_to_delete, ctx=ctx) + return result + + # Check existence and determine lock strategy + try: + stat = self.agfs.stat(path) + is_dir = stat.get("isDir", False) if isinstance(stat, dict) else False + except Exception: + # Path does not exist: clean up any orphan index records and return + await self._delete_from_vector_store(uris_to_delete, ctx=ctx) + logger.info(f"[VikingFS] rm target not found, cleaned orphan index: {uri}") + return {} + + if is_dir: + lock_paths = [path] + lock_mode = "subtree" + else: + parent = path.rsplit("/", 1)[0] if "/" in path else path + lock_paths = [parent] + lock_mode = "point" + + async with TransactionContext(tx_manager, "rm", lock_paths, lock_mode=lock_mode) as tx: + # Snapshot vector records for rollback + records_snapshot = await self._snapshot_vector_records(uris_to_delete, ctx=ctx) + + # Step 1: Delete from VectorDB first + seq_vdb = tx.record_undo( + "vectordb_delete", {"uris": uris_to_delete, "records_snapshot": records_snapshot} + ) + await self._delete_from_vector_store(uris_to_delete, ctx=ctx) + tx.mark_completed(seq_vdb) + + # Step 
2: Delete from FS + seq_fs = tx.record_undo("fs_rm", {"uri": path, "recursive": recursive}) + result = self.agfs.rm(path, recursive=recursive) + tx.mark_completed(seq_fs) + + await tx.commit() + return result async def mv( self, @@ -276,7 +322,13 @@ async def mv( new_uri: str, ctx: Optional[RequestContext] = None, ) -> Dict[str, Any]: - """Move file/directory + recursively update vector index.""" + """Move file/directory + recursively update vector index. + + Wrapped in a transaction: performs FS mv first, then VectorDB URI update. + On rollback, the file is moved back and VectorDB mappings are restored. + """ + from openviking.storage.transaction import TransactionContext, get_transaction_manager + self._ensure_access(old_uri, ctx) self._ensure_access(new_uri, ctx) old_path = self._uri_to_path(old_uri, ctx=ctx) @@ -285,15 +337,61 @@ async def mv( uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) uris_to_move.append(target_uri) + tx_manager = get_transaction_manager() + if not tx_manager: + # Fallback: no transaction support + try: + result = self.agfs.mv(old_path, new_path) + await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) + return result + except AGFSHTTPError as e: + if e.status_code == 404: + await self._delete_from_vector_store(uris_to_move, ctx=ctx) + logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") + raise + + # Verify source exists before locking try: - result = self.agfs.mv(old_path, new_path) + self.agfs.stat(old_path) + except Exception: + raise FileNotFoundError(f"mv source not found: {old_uri}") + + # Lock source and destination's parent (dst doesn't exist yet) + dst_parent = new_path.rsplit("/", 1)[0] if "/" in new_path else new_path + + async with TransactionContext( + tx_manager, "mv", [old_path], lock_mode="mv", mv_dst_path=dst_parent + ) as tx: + # Step 1: FS move + seq_mv = tx.record_undo("fs_mv", {"src": old_path, "dst": new_path}) + try: + result = 
self.agfs.mv(old_path, new_path) + except AGFSHTTPError as e: + if e.status_code == 404: + await self._delete_from_vector_store(uris_to_move, ctx=ctx) + logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") + raise + tx.mark_completed(seq_mv) + + # Step 2: Update VectorDB URIs + old_uri_stripped = old_uri.rstrip("/") + old_parent_uri = ( + old_uri_stripped.rsplit("/", 1)[0] + "/" if "/" in old_uri_stripped else "" + ) + seq_vdb = tx.record_undo( + "vectordb_update_uri", + { + "old_uri": old_uri, + "new_uri": new_uri, + "old_parent_uri": old_parent_uri, + "uris": uris_to_move, + }, + ) await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) + tx.mark_completed(seq_vdb) + + await tx.commit() return result - except AGFSHTTPError as e: - if e.status_code == 404: - await self._delete_from_vector_store(uris_to_move, ctx=ctx) - logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") - raise async def grep( self, @@ -1040,6 +1138,33 @@ def _infer_context_type(self, uri: str): # ========== Vector Sync Helper Methods ========== + async def _snapshot_vector_records( + self, uris: List[str], ctx: Optional[RequestContext] = None + ) -> List[Dict[str, Any]]: + """Snapshot vector records for the given URIs (for rollback). + + Queries VectorDB metadata (without embedding vectors) so that + records can be restored during rollback. 
+ """ + vector_store = self._get_vector_store() + if not vector_store: + return [] + + real_ctx = self._ctx_or_default(ctx) + snapshots = [] + for uri in uris: + try: + records = await vector_store.get_context_by_uri( + account_id=real_ctx.account_id, + uri=uri, + limit=10, + ) + if records: + snapshots.extend(records) + except Exception as e: + logger.debug(f"[VikingFS] Failed to snapshot vector record for {uri}: {e}") + return snapshots + async def _collect_uris( self, path: str, recursive: bool, ctx: Optional[RequestContext] = None ) -> List[str]: diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index c0627467..d6d043d9 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -442,14 +442,22 @@ async def update_uri_mapping( new_uri: str, new_parent_uri: str, ) -> bool: + # A directory URI may have multiple records (e.g. L0 abstract + L1 overview), + # so fetch and update all of them. 
records = await self.filter( filter=And([Eq("uri", uri), Eq("account_id", ctx.account_id)]), - limit=1, + limit=100, ) - if not records or "id" not in records[0]: + if not records: return False - updated = {**records[0], "uri": new_uri, "parent_uri": new_parent_uri} - return bool(await self.upsert(updated)) + success = False + for record in records: + if "id" not in record: + continue + updated = {**record, "uri": new_uri, "parent_uri": new_parent_uri} + if await self.upsert(updated): + success = True + return success async def increment_active_count(self, ctx: RequestContext, uris: List[str]) -> int: updated = 0 diff --git a/openviking/utils/agfs_utils.py b/openviking/utils/agfs_utils.py index 9b3d2d57..2c4f5b3c 100644 --- a/openviking/utils/agfs_utils.py +++ b/openviking/utils/agfs_utils.py @@ -99,6 +99,10 @@ def mount_agfs_backend(agfs: Any, agfs_config: Any) -> None: local_dir = plugin_config["config"]["local_dir"] os.makedirs(local_dir, exist_ok=True) logger.debug(f"[AGFSUtils] Ensured local directory exists: {local_dir}") + # Ensure queuefs db_path parent directory exists before mounting + if plugin_name == "queuefs" and "db_path" in plugin_config.get("config", {}): + db_path = plugin_config["config"]["db_path"] + os.makedirs(os.path.dirname(db_path), exist_ok=True) try: agfs.unmount(mount_path) diff --git a/openviking_cli/utils/config/storage_config.py b/openviking_cli/utils/config/storage_config.py index 8daf6a79..b8b4bfea 100644 --- a/openviking_cli/utils/config/storage_config.py +++ b/openviking_cli/utils/config/storage_config.py @@ -8,6 +8,7 @@ from openviking_cli.utils.logger import get_logger from .agfs_config import AGFSConfig +from .transaction_config import TransactionConfig from .vectordb_config import VectorDBBackendConfig logger = get_logger(__name__) @@ -25,6 +26,11 @@ class StorageConfig(BaseModel): agfs: AGFSConfig = Field(default_factory=lambda: AGFSConfig(), description="AGFS configuration") + transaction: TransactionConfig = Field( + 
default_factory=lambda: TransactionConfig(), + description="Transaction mechanism configuration", + ) + vectordb: VectorDBBackendConfig = Field( default_factory=lambda: VectorDBBackendConfig(), description="VectorDB backend configuration", diff --git a/openviking_cli/utils/config/transaction_config.py b/openviking_cli/utils/config/transaction_config.py new file mode 100644 index 00000000..fac8c2aa --- /dev/null +++ b/openviking_cli/utils/config/transaction_config.py @@ -0,0 +1,37 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +from pydantic import BaseModel, Field + + +class TransactionConfig(BaseModel): + """Configuration for the transaction mechanism. + + By default, lock acquisition does not wait (``lock_timeout=0``): if a + conflicting lock is held the operation fails immediately with + ``LockAcquisitionError``. Set ``lock_timeout`` to a positive value to + allow the caller to block and retry for up to that many seconds. + """ + + lock_timeout: float = Field( + default=0.0, + description=( + "Path lock acquisition timeout (seconds). " + "0 = fail immediately if locked (default). " + "> 0 = wait/retry up to this many seconds before raising LockAcquisitionError." + ), + ) + + lock_expire: float = Field( + default=300.0, + description=( + "Stale lock expiry threshold (seconds). " + "Locks held longer than this by a crashed process are force-released." + ), + ) + + max_parallel_locks: int = Field( + default=8, + description="Maximum parallel lock operations during recursive rm/mv.", + ) + + model_config = {"extra": "forbid"} diff --git a/tests/transaction/__init__.py b/tests/transaction/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/transaction/conftest.py b/tests/transaction/conftest.py new file mode 100644 index 00000000..db77bbdd --- /dev/null +++ b/tests/transaction/conftest.py @@ -0,0 +1,56 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Shared fixtures for transaction tests using real AGFS backend.""" + +import os +import shutil +import uuid + +import pytest + +from openviking.agfs_manager import AGFSManager +from openviking.utils.agfs_utils import create_agfs_client +from openviking_cli.utils.config.agfs_config import AGFSConfig + +AGFS_CONF = AGFSConfig( + path="/tmp/ov-tx-test", backend="local", port=1834, url="http://localhost:1834", timeout=10 +) + +# Clean slate before session starts +if os.path.exists(AGFS_CONF.path): + shutil.rmtree(AGFS_CONF.path) + + +@pytest.fixture(scope="session") +def agfs_manager(): + manager = AGFSManager(config=AGFS_CONF) + manager.start() + yield manager + manager.stop() + + +@pytest.fixture(scope="session") +def agfs_client(agfs_manager): + return create_agfs_client(AGFS_CONF) + + +def _mkdir_ok(agfs_client, path): + """Create directory, ignoring already-exists errors.""" + try: + agfs_client.mkdir(path) + except Exception: + pass # already exists + + +@pytest.fixture +def test_dir(agfs_client): + """每个测试独享隔离目录,自动清理。""" + path = f"/local/tx-tests/{uuid.uuid4().hex}" + _mkdir_ok(agfs_client, "/local") + _mkdir_ok(agfs_client, "/local/tx-tests") + _mkdir_ok(agfs_client, path) + yield path + try: + agfs_client.rm(path, recursive=True) + except Exception: + pass diff --git a/tests/transaction/test_concurrent_lock.py b/tests/transaction/test_concurrent_lock.py new file mode 100644 index 00000000..e98279e4 --- /dev/null +++ b/tests/transaction/test_concurrent_lock.py @@ -0,0 +1,103 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for concurrent lock acquisition using real AGFS backend.""" + +import asyncio +import uuid + +from openviking.storage.transaction.path_lock import PathLock +from openviking.storage.transaction.transaction_record import TransactionRecord + + +class TestConcurrentLock: + async def test_point_mutual_exclusion_same_path(self, agfs_client, test_dir): + """两个任务竞争同一路径的 POINT 锁,均最终成功(串行执行)。""" + lock = PathLock(agfs_client) + + results = {} + + async def holder(tx_id): + tx = TransactionRecord(id=tx_id) + ok = await lock.acquire_point(test_dir, tx, timeout=5.0) + if ok: + await asyncio.sleep(0.3) + await lock.release(tx) + results[tx_id] = ok + + await asyncio.gather( + holder("tx-conc-1"), + holder("tx-conc-2"), + ) + + # Both should eventually succeed (one waits for the other) + assert results["tx-conc-1"] is True + assert results["tx-conc-2"] is True + + async def test_subtree_blocks_concurrent_point_child(self, agfs_client, test_dir): + """SUBTREE on parent 持锁期间,子目录的 POINT 被阻塞,释放后成功。""" + child = f"{test_dir}/child-{uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + parent_acquired = asyncio.Event() + parent_released = asyncio.Event() + + child_result = {} + + async def parent_holder(): + tx = TransactionRecord(id="tx-sub-parent") + ok = await lock.acquire_subtree(test_dir, tx, timeout=5.0) + assert ok is True + parent_acquired.set() + await asyncio.sleep(0.5) + await lock.release(tx) + parent_released.set() + + async def child_worker(): + await parent_acquired.wait() + tx = TransactionRecord(id="tx-sub-child") + ok = await lock.acquire_point(child, tx, timeout=5.0) + child_result["ok"] = ok + child_result["after_release"] = parent_released.is_set() + if ok: + await lock.release(tx) + + await asyncio.gather(parent_holder(), child_worker()) + + assert child_result["ok"] is True + # Child should succeed only after parent released + assert child_result["after_release"] is True + + async def 
test_point_child_blocks_concurrent_subtree_parent(self, agfs_client, test_dir): + """POINT on child 持锁期间,父目录的 SUBTREE 被阻塞,释放后成功。""" + child = f"{test_dir}/child-{uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + child_acquired = asyncio.Event() + child_released = asyncio.Event() + + parent_result = {} + + async def child_holder(): + tx = TransactionRecord(id="tx-rev-child") + ok = await lock.acquire_point(child, tx, timeout=5.0) + assert ok is True + child_acquired.set() + await asyncio.sleep(0.5) + await lock.release(tx) + child_released.set() + + async def parent_worker(): + await child_acquired.wait() + tx = TransactionRecord(id="tx-rev-parent") + ok = await lock.acquire_subtree(test_dir, tx, timeout=5.0) + parent_result["ok"] = ok + parent_result["after_release"] = child_released.is_set() + if ok: + await lock.release(tx) + + await asyncio.gather(child_holder(), parent_worker()) + + assert parent_result["ok"] is True + assert parent_result["after_release"] is True diff --git a/tests/transaction/test_context_manager.py b/tests/transaction/test_context_manager.py new file mode 100644 index 00000000..f45a55cc --- /dev/null +++ b/tests/transaction/test_context_manager.py @@ -0,0 +1,224 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for TransactionContext.""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.context_manager import TransactionContext +from openviking.storage.transaction.transaction_record import TransactionRecord, TransactionStatus + + +def _make_tx_manager(lock_succeeds=True): + """Create a mock TransactionManager with async methods.""" + tx_manager = MagicMock() + record = TransactionRecord(id="tx-test", status=TransactionStatus.INIT) + + tx_manager.create_transaction.return_value = record + tx_manager.acquire_lock_point = AsyncMock(return_value=lock_succeeds) + tx_manager.acquire_lock_subtree = AsyncMock(return_value=lock_succeeds) + tx_manager.acquire_lock_mv = AsyncMock(return_value=lock_succeeds) + tx_manager.commit = AsyncMock(return_value=True) + tx_manager.rollback = AsyncMock(return_value=True) + + journal = MagicMock() + tx_manager.journal = journal + + return tx_manager, record + + +class TestTransactionContextNormal: + async def test_commit_success(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "test_op", ["/path"]) as tx: + seq = tx.record_undo("fs_write_new", {"uri": "/path/file"}) + tx.mark_completed(seq) + await tx.commit() + + tx_manager.commit.assert_called_once_with("tx-test") + tx_manager.rollback.assert_not_called() + + async def test_rollback_on_exception(self): + tx_manager, record = _make_tx_manager() + + with pytest.raises(ValueError): + async with TransactionContext(tx_manager, "test_op", ["/path"]) as tx: + seq = tx.record_undo("fs_write_new", {"uri": "/path/file"}) + tx.mark_completed(seq) + raise ValueError("something went wrong") + + tx_manager.rollback.assert_called_once_with("tx-test") + tx_manager.commit.assert_not_called() + + async def test_rollback_on_no_commit(self): + tx_manager, record = _make_tx_manager() + + async with 
TransactionContext(tx_manager, "test_op", ["/path"]) as tx: + tx.record_undo("fs_write_new", {"uri": "/path/file"}) + # Forgot to call tx.commit() + + tx_manager.rollback.assert_called_once_with("tx-test") + + async def test_lock_failure_raises(self): + tx_manager, record = _make_tx_manager(lock_succeeds=False) + + with pytest.raises(LockAcquisitionError): + async with TransactionContext(tx_manager, "test_op", ["/path"]) as _tx: + pass + + +class TestTransactionContextLockModes: + async def test_subtree_lock_mode(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "rm_op", ["/path"], lock_mode="subtree") as tx: + await tx.commit() + + tx_manager.acquire_lock_subtree.assert_called_once() + + async def test_mv_lock_mode(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext( + tx_manager, "mv_op", ["/src"], lock_mode="mv", mv_dst_path="/dst" + ) as tx: + await tx.commit() + + tx_manager.acquire_lock_mv.assert_called_once_with("tx-test", "/src", "/dst") + + async def test_point_lock_mode(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "write_op", ["/path"], lock_mode="point") as tx: + await tx.commit() + + tx_manager.acquire_lock_point.assert_called_once() + + +class TestTransactionContextUndoLog: + async def test_undo_entries_tracked(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "test", ["/path"]) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": "/a"}) + s1 = tx.record_undo("fs_write_new", {"uri": "/a/f.txt"}) + tx.mark_completed(s0) + tx.mark_completed(s1) + await tx.commit() + + assert len(record.undo_log) == 2 + assert record.undo_log[0].completed is True + assert record.undo_log[1].completed is True + + +class TestTransactionContextPostActions: + async def test_post_actions_added(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "test", ["/path"]) as tx: 
+ tx.add_post_action("enqueue_semantic", {"uri": "viking://test"}) + await tx.commit() + + assert len(record.post_actions) == 1 + assert record.post_actions[0]["type"] == "enqueue_semantic" + + +class TestTransactionContextEdgeCases: + async def test_commit_failure_raises_transaction_error(self): + """When TransactionManager.commit() returns False, TransactionError is raised.""" + from openviking.storage.errors import TransactionError + + tx_manager, record = _make_tx_manager() + tx_manager.commit = AsyncMock(return_value=False) + + with pytest.raises(TransactionError, match="Failed to commit"): + async with TransactionContext(tx_manager, "test", ["/path"]) as tx: + await tx.commit() + + async def test_mv_mode_missing_dst_raises(self): + """mv lock mode without mv_dst_path raises TransactionError.""" + from openviking.storage.errors import TransactionError + + tx_manager, record = _make_tx_manager() + + with pytest.raises(TransactionError, match="mv lock mode requires"): + async with TransactionContext( + tx_manager, "mv_op", ["/src"], lock_mode="mv", mv_dst_path=None + ) as _tx: + pass + + async def test_mark_completed_nonexistent_sequence_is_noop(self): + """mark_completed with a sequence not in undo_log doesn't crash.""" + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "test", ["/path"]) as tx: + seq = tx.record_undo("fs_mkdir", {"uri": "/a"}) + tx.mark_completed(999) # Nonexistent sequence + # Original entry should remain unmarked + assert record.undo_log[0].completed is False + tx.mark_completed(seq) + assert record.undo_log[0].completed is True + await tx.commit() + + async def test_journal_update_failure_does_not_break_transaction(self): + """Journal update failures during record_undo/mark_completed are silently ignored.""" + tx_manager, record = _make_tx_manager() + tx_manager.journal.update.side_effect = Exception("disk full") + + # Should not raise despite journal failures + async with TransactionContext(tx_manager, 
"test", ["/path"]) as tx: + seq = tx.record_undo("fs_mkdir", {"uri": "/a"}) + tx.mark_completed(seq) + await tx.commit() + + assert len(record.undo_log) == 1 + assert record.undo_log[0].completed is True + + async def test_record_property_before_enter_raises(self): + """Accessing tx.record before __aenter__ raises TransactionError.""" + from openviking.storage.errors import TransactionError + + tx_manager, _ = _make_tx_manager() + ctx = TransactionContext(tx_manager, "test", ["/path"]) + + with pytest.raises(TransactionError, match="Transaction not started"): + _ = ctx.record + + async def test_multiple_undo_entries_sequence_increments(self): + tx_manager, record = _make_tx_manager() + + async with TransactionContext(tx_manager, "test", ["/path"]) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": "/a"}) + s1 = tx.record_undo("fs_write_new", {"uri": "/a/f"}) + s2 = tx.record_undo("fs_mv", {"src": "/a", "dst": "/b"}) + assert s0 == 0 + assert s1 == 1 + assert s2 == 2 + await tx.commit() + + async def test_multiple_lock_paths_point_mode(self): + """Multiple lock_paths in point mode: each path gets acquire_lock_point called.""" + tx_manager, record = _make_tx_manager() + + async with TransactionContext( + tx_manager, "multi", ["/path1", "/path2"], lock_mode="point" + ) as tx: + await tx.commit() + + assert tx_manager.acquire_lock_point.call_count == 2 + + async def test_subtree_multiple_paths_stops_on_first_failure(self): + """If acquiring subtree lock on first path fails, second path is not attempted.""" + tx_manager, record = _make_tx_manager(lock_succeeds=False) + + with pytest.raises(LockAcquisitionError): + async with TransactionContext( + tx_manager, "rm", ["/path1", "/path2"], lock_mode="subtree" + ) as _tx: + pass + + # Only called once (failed on first path) + assert tx_manager.acquire_lock_subtree.call_count == 1 diff --git a/tests/transaction/test_crash_recovery.py b/tests/transaction/test_crash_recovery.py new file mode 100644 index 00000000..85384574 --- 
/dev/null +++ b/tests/transaction/test_crash_recovery.py @@ -0,0 +1,385 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Integration test: crash recovery from journal.""" + +import time +from unittest.mock import AsyncMock, MagicMock, patch + +from openviking.storage.transaction.transaction_manager import TransactionManager + + +class TestCrashRecovery: + def _make_manager(self, journal_entries=None): + """Create a TransactionManager with mocked AGFS and journal data.""" + agfs = MagicMock() + manager = TransactionManager(agfs_client=agfs, timeout=3600) + + if journal_entries: + manager._journal = MagicMock() + manager._journal.list_all.return_value = list(journal_entries.keys()) + manager._journal.read.side_effect = lambda tx_id: journal_entries[tx_id] + manager._journal.delete = MagicMock() + else: + manager._journal = MagicMock() + manager._journal.list_all.return_value = [] + + return manager, agfs + + async def test_recover_committed_with_post_actions(self): + """COMMIT + post_actions → replay post_actions, clean up.""" + entries = { + "tx-1": { + "id": "tx-1", + "status": "COMMIT", + "locks": ["/local/test/.path.ovlock"], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [ + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://test", + "context_type": "resource", + "account_id": "acc", + }, + } + ], + } + } + manager, agfs = self._make_manager(entries) + + with patch( + "openviking.storage.transaction.transaction_manager.TransactionManager._execute_post_actions", + new_callable=AsyncMock, + ) as mock_post: + await manager._recover_pending_transactions() + + mock_post.assert_called_once() + agfs.rm.assert_called_once_with("/local/test/.path.ovlock") + manager._journal.delete.assert_called_once_with("tx-1") + + async def test_recover_committed_no_post_actions(self): + """COMMIT + no post_actions → just clean up, no rollback.""" + entries = { + 
"tx-2": { + "id": "tx-2", + "status": "COMMIT", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [ + # Even if undo_log has entries, COMMIT should NOT rollback + { + "sequence": 0, + "op_type": "fs_mv", + "params": {"src": "/a", "dst": "/b"}, + "completed": True, + } + ], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.mv.assert_not_called() # No rollback for committed transactions + manager._journal.delete.assert_called_once_with("tx-2") + + async def test_recover_exec_triggers_rollback(self): + """EXEC status → execute rollback regardless of transaction age.""" + entries = { + "tx-3": { + "id": "tx-3", + "status": "EXEC", + "locks": ["/local/x/.path.ovlock"], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [ + { + "sequence": 0, + "op_type": "fs_mv", + "params": {"src": "/local/a", "dst": "/local/b"}, + "completed": True, + } + ], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.mv.assert_called_once_with("/local/b", "/local/a") + manager._journal.delete.assert_called_once_with("tx-3") + + async def test_recover_fail_triggers_rollback(self): + """FAIL status → execute rollback.""" + entries = { + "tx-fail": { + "id": "tx-fail", + "status": "FAIL", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [ + { + "sequence": 0, + "op_type": "fs_mkdir", + "params": {"uri": "/local/newdir"}, + "completed": True, + } + ], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.rm.assert_called_once_with("/local/newdir") + manager._journal.delete.assert_called_once_with("tx-fail") + + async def test_recover_exec_recover_all_includes_incomplete(self): + """EXEC recovery uses recover_all=True: also reverses incomplete entries.""" + entries 
= { + "tx-partial": { + "id": "tx-partial", + "status": "EXEC", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [ + { + "sequence": 0, + "op_type": "fs_mv", + "params": {"src": "/local/a", "dst": "/local/b"}, + "completed": False, # not completed, but recover_all=True should still reverse it + } + ], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.mv.assert_called_once_with("/local/b", "/local/a") + manager._journal.delete.assert_called_once_with("tx-partial") + + async def test_recover_init_just_cleans_up(self): + """INIT status → no rollback (nothing executed), just release locks and clean journal.""" + entries = { + "tx-4": { + "id": "tx-4", + "status": "INIT", + "locks": ["/local/y/.path.ovlock"], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.rm.assert_called_once_with("/local/y/.path.ovlock") + manager._journal.delete.assert_called_once_with("tx-4") + + async def test_recover_multiple_transactions(self): + """Multiple journals are all recovered.""" + entries = { + "tx-a": { + "id": "tx-a", + "status": "INIT", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + }, + "tx-b": { + "id": "tx-b", + "status": "COMMIT", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + }, + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + assert manager._journal.delete.call_count == 2 + + async def test_recover_init_empty_locks_cleans_orphan_via_init_info(self): + """INIT with empty locks but init_info.lock_paths → clean up orphan lock files.""" + entries = { + "tx-orphan": { + "id": "tx-orphan", + "status": "INIT", + "locks": 
[], # Empty: crash happened before journal recorded locks + "init_info": { + "operation": "rm", + "lock_paths": ["/local/orphan-dir"], + "lock_mode": "subtree", + }, + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + + # Simulate: the lock file exists and is owned by this transaction + from openviking.storage.transaction.path_lock import _make_fencing_token + + token = _make_fencing_token("tx-orphan", "S") + agfs.cat.return_value = token.encode("utf-8") + + await manager._recover_pending_transactions() + + # Should have removed the orphan lock file + agfs.rm.assert_called() + rm_paths = [call[0][0] for call in agfs.rm.call_args_list] + assert any(".path.ovlock" in p for p in rm_paths) + manager._journal.delete.assert_called_once_with("tx-orphan") + + async def test_recover_init_orphan_lock_owned_by_other_tx_not_removed(self): + """INIT with orphan lock path, but lock file owned by a different tx → not removed.""" + entries = { + "tx-innocent": { + "id": "tx-innocent", + "status": "INIT", + "locks": [], + "init_info": { + "operation": "rm", + "lock_paths": ["/local/shared-dir"], + "lock_mode": "subtree", + }, + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + + # Lock file owned by a different transaction + from openviking.storage.transaction.path_lock import _make_fencing_token + + token = _make_fencing_token("tx-OTHER-owner", "S") + agfs.cat.return_value = token.encode("utf-8") + + await manager._recover_pending_transactions() + + # rm should NOT be called for the lock file (only journal delete) + rm_calls = [call[0][0] for call in agfs.rm.call_args_list] if agfs.rm.called else [] + assert not any(".path.ovlock" in p for p in rm_calls) + manager._journal.delete.assert_called_once_with("tx-innocent") + + async def test_recover_aquire_status(self): + """AQUIRE 
status → same as INIT, clean up only.""" + entries = { + "tx-acq": { + "id": "tx-acq", + "status": "AQUIRE", + "locks": ["/local/z/.path.ovlock"], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + agfs.rm.assert_called_once_with("/local/z/.path.ovlock") + manager._journal.delete.assert_called_once_with("tx-acq") + + async def test_recover_releasing_status_triggers_rollback(self): + """RELEASING status → process crashed while releasing, rollback undo log.""" + entries = { + "tx-rel": { + "id": "tx-rel", + "status": "RELEASING", + "locks": ["/local/r/.path.ovlock"], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [ + { + "sequence": 0, + "op_type": "fs_mkdir", + "params": {"uri": "/local/tmpdir"}, + "completed": True, + } + ], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + await manager._recover_pending_transactions() + + # Should rollback the undo log + rm_paths = [call[0][0] for call in agfs.rm.call_args_list] + assert "/local/tmpdir" in rm_paths + manager._journal.delete.assert_called_once_with("tx-rel") + + async def test_recover_mv_orphan_locks_include_dst(self): + """INIT mv operation with init_info → check both lock_paths and mv_dst_path for orphan locks.""" + entries = { + "tx-mv-orphan": { + "id": "tx-mv-orphan", + "status": "INIT", + "locks": [], + "init_info": { + "operation": "mv", + "lock_paths": ["/local/src-dir"], + "lock_mode": "mv", + "mv_dst_path": "/local/dst-dir", + }, + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + } + manager, agfs = self._make_manager(entries) + + from openviking.storage.transaction.path_lock import _make_fencing_token + + token = _make_fencing_token("tx-mv-orphan", "P") + agfs.cat.return_value = token.encode("utf-8") + + await manager._recover_pending_transactions() 
+ + # Should check both src and dst paths for orphan locks + cat_paths = [call[0][0] for call in agfs.cat.call_args_list] + assert any("src-dir" in p for p in cat_paths) + assert any("dst-dir" in p for p in cat_paths) + + async def test_recover_journal_read_failure_skips_gracefully(self): + """If reading a journal entry fails, skip that tx and continue with others.""" + agfs = MagicMock() + manager = TransactionManager(agfs_client=agfs, timeout=3600) + manager._journal = MagicMock() + manager._journal.list_all.return_value = ["tx-bad", "tx-good"] + + def read_side_effect(tx_id): + if tx_id == "tx-bad": + raise Exception("corrupted journal") + return { + "id": "tx-good", + "status": "INIT", + "locks": [], + "created_at": time.time(), + "updated_at": time.time(), + "undo_log": [], + "post_actions": [], + } + + manager._journal.read.side_effect = read_side_effect + manager._journal.delete = MagicMock() + + await manager._recover_pending_transactions() + + # tx-good should still be cleaned up + manager._journal.delete.assert_called_once_with("tx-good") diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py new file mode 100644 index 00000000..88b6b5d6 --- /dev/null +++ b/tests/transaction/test_e2e.py @@ -0,0 +1,238 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""End-to-end transaction tests using real AGFS backend. + +These tests exercise the full stack: TransactionContext → TransactionManager → +PathLock → Journal → AGFS, verifying the complete acquire → operate → commit/rollback +→ release → journal cleanup lifecycle. 
+""" + +import uuid + +import pytest + +from openviking.storage.transaction.context_manager import TransactionContext +from openviking.storage.transaction.journal import TransactionJournal +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME +from openviking.storage.transaction.transaction_manager import TransactionManager + + +@pytest.fixture +def tx_manager(agfs_client): + """Create a real TransactionManager backed by the test AGFS.""" + manager = TransactionManager( + agfs_client=agfs_client, + timeout=3600, + max_parallel_locks=8, + lock_timeout=5.0, + lock_expire=300.0, + ) + return manager + + +class TestE2ECommit: + async def test_full_commit_lifecycle(self, agfs_client, tx_manager, test_dir): + """Full lifecycle: context enter → record undo → commit → locks released → journal cleaned.""" + async with TransactionContext( + tx_manager, "test_write", [test_dir], lock_mode="point" + ) as tx: + # Lock should be acquired + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + token = agfs_client.cat(lock_path) + assert token is not None + + # Record some operations + seq = tx.record_undo("fs_write_new", {"uri": f"{test_dir}/file.txt"}) + agfs_client.write(f"{test_dir}/file.txt", b"hello") + tx.mark_completed(seq) + + # Add post action + tx.add_post_action( + "enqueue_semantic", + {"uri": "viking://test", "context_type": "resource", "account_id": "default"}, + ) + + await tx.commit() + + # After commit: lock should be released + try: + agfs_client.cat(lock_path) + raise AssertionError("Lock file should be gone after commit") + except Exception: + pass # Expected + + # Transaction should be removed from manager + assert tx_manager.get_transaction(tx.record.id) is None + + async def test_commit_file_persists(self, agfs_client, tx_manager, test_dir): + """Files written inside a committed transaction persist.""" + file_path = f"{test_dir}/committed-file.txt" + + async with TransactionContext(tx_manager, "write_op", [test_dir], lock_mode="point") as tx: + seq = 
tx.record_undo("fs_write_new", {"uri": file_path}) + agfs_client.write(file_path, b"committed data") + tx.mark_completed(seq) + await tx.commit() + + content = agfs_client.cat(file_path) + assert content == b"committed data" + + +class TestE2ERollback: + async def test_explicit_exception_triggers_rollback(self, agfs_client, tx_manager, test_dir): + """Exception inside context → auto-rollback → undo operations reversed.""" + new_dir = f"{test_dir}/to-be-rolled-back-{uuid.uuid4().hex}" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "failing_op", [test_dir], lock_mode="point" + ) as tx: + seq = tx.record_undo("fs_mkdir", {"uri": new_dir}) + agfs_client.mkdir(new_dir) + tx.mark_completed(seq) + + raise RuntimeError("simulated failure") + + # Directory should be removed by rollback + try: + agfs_client.stat(new_dir) + raise AssertionError("Directory should be removed by rollback") + except Exception: + pass + + # Lock should be released + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + try: + agfs_client.cat(lock_path) + raise AssertionError("Lock should be released after rollback") + except Exception: + pass + + async def test_no_commit_triggers_rollback(self, agfs_client, tx_manager, test_dir): + """Exiting context without calling commit() triggers auto-rollback.""" + new_dir = f"{test_dir}/forgot-commit-{uuid.uuid4().hex}" + + async with TransactionContext(tx_manager, "no_commit", [test_dir], lock_mode="point") as tx: + seq = tx.record_undo("fs_mkdir", {"uri": new_dir}) + agfs_client.mkdir(new_dir) + tx.mark_completed(seq) + # Intentionally not calling tx.commit() + + # Directory should be removed by rollback + try: + agfs_client.stat(new_dir) + raise AssertionError("Directory should be removed by rollback") + except Exception: + pass + + +class TestE2EMvLock: + async def test_mv_lock_acquires_both_paths(self, agfs_client, tx_manager, test_dir): + """mv lock mode acquires SUBTREE on source and POINT on destination.""" + src = 
f"{test_dir}/mv-src-{uuid.uuid4().hex}" + dst = f"{test_dir}/mv-dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + async with TransactionContext( + tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst + ) as tx: + # Both lock files should exist + src_token = agfs_client.cat(f"{src}/{LOCK_FILE_NAME}") + dst_token = agfs_client.cat(f"{dst}/{LOCK_FILE_NAME}") + src_token_str = src_token.decode("utf-8") if isinstance(src_token, bytes) else src_token + dst_token_str = dst_token.decode("utf-8") if isinstance(dst_token, bytes) else dst_token + + assert ":S" in src_token_str # SUBTREE on source + assert ":P" in dst_token_str # POINT on destination + + await tx.commit() + + # Both locks released + for path in [f"{src}/{LOCK_FILE_NAME}", f"{dst}/{LOCK_FILE_NAME}"]: + try: + agfs_client.cat(path) + raise AssertionError(f"Lock {path} should be gone") + except Exception: + pass + + +class TestE2ESubtreeRollback: + async def test_subtree_lock_with_rollback(self, agfs_client, tx_manager, test_dir): + """Subtree lock + rollback: undo is executed and lock released.""" + target = f"{test_dir}/sub-rb-{uuid.uuid4().hex}" + agfs_client.mkdir(target) + + child = f"{target}/child-{uuid.uuid4().hex}" + + with pytest.raises(ValueError): + async with TransactionContext(tx_manager, "rm_op", [target], lock_mode="subtree") as tx: + seq = tx.record_undo("fs_mkdir", {"uri": child}) + agfs_client.mkdir(child) + tx.mark_completed(seq) + + raise ValueError("abort rm") + + # Child dir should be removed by rollback + try: + agfs_client.stat(child) + raise AssertionError("Child should be cleaned up") + except Exception: + pass + + # Lock released + try: + agfs_client.cat(f"{target}/{LOCK_FILE_NAME}") + raise AssertionError("Lock should be released") + except Exception: + pass + + +class TestE2EJournalCleanup: + async def test_journal_cleaned_after_commit(self, agfs_client, tx_manager, test_dir): + """After successful commit, the journal entry for the transaction is 
deleted.""" + journal = TransactionJournal(agfs_client) + + async with TransactionContext( + tx_manager, "journal_test", [test_dir], lock_mode="point" + ) as tx: + tx_id = tx.record.id + await tx.commit() + + # Journal should be cleaned up + all_ids = journal.list_all() + assert tx_id not in all_ids + + async def test_journal_cleaned_after_rollback(self, agfs_client, tx_manager, test_dir): + """After rollback, the journal entry is also cleaned up.""" + journal = TransactionJournal(agfs_client) + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "journal_rb", [test_dir], lock_mode="point" + ) as tx: + tx_id = tx.record.id + raise RuntimeError("force rollback") + + all_ids = journal.list_all() + assert tx_id not in all_ids + + +class TestE2ESequentialTransactions: + async def test_sequential_transactions_on_same_path(self, agfs_client, tx_manager, test_dir): + """Two sequential transactions on the same path both succeed.""" + for i in range(3): + async with TransactionContext( + tx_manager, f"seq_{i}", [test_dir], lock_mode="point" + ) as tx: + seq = tx.record_undo("fs_write_new", {"uri": f"{test_dir}/f{i}.txt"}) + agfs_client.write(f"{test_dir}/f{i}.txt", f"data-{i}".encode()) + tx.mark_completed(seq) + await tx.commit() + + # All files should exist + for i in range(3): + content = agfs_client.cat(f"{test_dir}/f{i}.txt") + assert content == f"data-{i}".encode() + + assert tx_manager.get_transaction_count() == 0 diff --git a/tests/transaction/test_journal.py b/tests/transaction/test_journal.py new file mode 100644 index 00000000..57f1e483 --- /dev/null +++ b/tests/transaction/test_journal.py @@ -0,0 +1,215 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for transaction journal.""" + +import json +import uuid +from unittest.mock import MagicMock + +from openviking.storage.transaction.journal import TransactionJournal + + +class TestTransactionJournal: + def _make_journal(self) -> tuple: + agfs = MagicMock() + journal = TransactionJournal(agfs) + return journal, agfs + + def test_write_calls_agfs_write_with_correct_data(self): + journal, agfs = self._make_journal() + data = {"id": "tx-1", "status": "INIT", "locks": []} + + journal.write(data) + + # Should call agfs.write with the journal path and serialized data + agfs.write.assert_called_once() + path, payload = agfs.write.call_args[0] + assert "tx-1" in path + assert path.endswith("journal.json") + parsed = json.loads(payload.decode("utf-8")) + assert parsed["id"] == "tx-1" + assert parsed["status"] == "INIT" + + def test_write_ensures_directories_exist(self): + journal, agfs = self._make_journal() + data = {"id": "tx-1", "status": "INIT", "locks": []} + + journal.write(data) + + # Should call mkdir at least once (for parent dirs) + assert agfs.mkdir.called + + def test_update_overwrites(self): + journal, agfs = self._make_journal() + data = {"id": "tx-2", "status": "EXEC", "locks": []} + + journal.update(data) + + agfs.write.assert_called_once() + path, payload = agfs.write.call_args[0] + assert json.loads(payload.decode("utf-8"))["status"] == "EXEC" + + def test_read_parses_json(self): + journal, agfs = self._make_journal() + agfs.cat.return_value = json.dumps({"id": "tx-3", "status": "EXEC"}).encode("utf-8") + + result = journal.read("tx-3") + assert result["id"] == "tx-3" + assert result["status"] == "EXEC" + + def test_read_handles_string_response(self): + """Some AGFS backends may return str instead of bytes.""" + journal, agfs = self._make_journal() + agfs.cat.return_value = json.dumps({"id": "tx-str", "status": "INIT"}) + + result = journal.read("tx-str") + assert result["id"] == "tx-str" + + def 
test_delete_removes_directory(self): + journal, agfs = self._make_journal() + journal.delete("tx-4") + agfs.rm.assert_called_once() + path = agfs.rm.call_args[0][0] + assert "tx-4" in path + + def test_list_all_returns_tx_ids(self): + journal, agfs = self._make_journal() + agfs.ls.return_value = [ + {"name": "tx-a", "isDir": True}, + {"name": "tx-b", "isDir": True}, + {"name": ".", "isDir": True}, + ] + + result = journal.list_all() + assert "tx-a" in result + assert "tx-b" in result + assert "." not in result + + def test_list_all_filters_dotdot(self): + journal, agfs = self._make_journal() + agfs.ls.return_value = [ + {"name": "..", "isDir": True}, + {"name": "tx-real", "isDir": True}, + ] + + result = journal.list_all() + assert ".." not in result + assert "tx-real" in result + + def test_list_all_empty_on_error(self): + journal, agfs = self._make_journal() + agfs.ls.side_effect = Exception("not found") + + result = journal.list_all() + assert result == [] + + def test_delete_tolerates_missing(self): + journal, agfs = self._make_journal() + agfs.rm.side_effect = Exception("not found") + # Should not raise + journal.delete("tx-missing") + + def test_write_with_post_actions(self): + journal, agfs = self._make_journal() + data = { + "id": "tx-5", + "status": "COMMIT", + "locks": [], + "post_actions": [ + {"type": "enqueue_semantic", "params": {"uri": "viking://test"}}, + ], + } + journal.write(data) + path, payload = agfs.write.call_args[0] + parsed = json.loads(payload.decode("utf-8")) + assert len(parsed["post_actions"]) == 1 + assert parsed["post_actions"][0]["type"] == "enqueue_semantic" + + def test_write_with_undo_log(self): + journal, agfs = self._make_journal() + data = { + "id": "tx-6", + "status": "EXEC", + "locks": [], + "undo_log": [ + { + "sequence": 0, + "op_type": "fs_mv", + "params": {"src": "/a", "dst": "/b"}, + "completed": True, + }, + ], + } + journal.write(data) + _, payload = agfs.write.call_args[0] + parsed = 
json.loads(payload.decode("utf-8")) + assert len(parsed["undo_log"]) == 1 + assert parsed["undo_log"][0]["op_type"] == "fs_mv" + + +class TestTransactionJournalIntegration: + """Integration tests using real AGFS backend to verify persistence behavior.""" + + def test_write_read_roundtrip(self, agfs_client): + journal = TransactionJournal(agfs_client) + tx_id = f"tx-int-{uuid.uuid4().hex}" + data = {"id": tx_id, "status": "INIT", "locks": [], "undo_log": []} + + journal.write(data) + result = journal.read(tx_id) + + assert result["id"] == tx_id + assert result["status"] == "INIT" + + journal.delete(tx_id) + + def test_update_overwrites(self, agfs_client): + journal = TransactionJournal(agfs_client) + tx_id = f"tx-int-{uuid.uuid4().hex}" + + journal.write({"id": tx_id, "status": "INIT", "locks": []}) + journal.update({"id": tx_id, "status": "EXEC", "locks": []}) + + result = journal.read(tx_id) + assert result["status"] == "EXEC" + + journal.delete(tx_id) + + def test_delete_removes_journal(self, agfs_client): + journal = TransactionJournal(agfs_client) + tx_id = f"tx-int-{uuid.uuid4().hex}" + + journal.write({"id": tx_id, "status": "INIT", "locks": []}) + journal.delete(tx_id) + + try: + journal.read(tx_id) + raise AssertionError("Should have raised after deletion") + except Exception: + pass # Expected + + def test_list_all_returns_written_ids(self, agfs_client): + journal = TransactionJournal(agfs_client) + tx_id_a = f"tx-int-{uuid.uuid4().hex}" + tx_id_b = f"tx-int-{uuid.uuid4().hex}" + + journal.write({"id": tx_id_a, "status": "INIT", "locks": []}) + journal.write({"id": tx_id_b, "status": "INIT", "locks": []}) + + result = journal.list_all() + assert tx_id_a in result + assert tx_id_b in result + + journal.delete(tx_id_a) + journal.delete(tx_id_b) + + def test_list_all_empty_when_none(self, agfs_client): + """After cleanup, list_all should not include previously deleted entries.""" + journal = TransactionJournal(agfs_client) + tx_id = 
f"tx-int-{uuid.uuid4().hex}" + + journal.write({"id": tx_id, "status": "INIT", "locks": []}) + journal.delete(tx_id) + + result = journal.list_all() + assert tx_id not in result diff --git a/tests/transaction/test_path_lock.py b/tests/transaction/test_path_lock.py new file mode 100644 index 00000000..e9af3fdc --- /dev/null +++ b/tests/transaction/test_path_lock.py @@ -0,0 +1,334 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for path lock with fencing tokens.""" + +import time +from unittest.mock import MagicMock + +from openviking.storage.transaction.path_lock import ( + LOCK_FILE_NAME, + LOCK_TYPE_POINT, + LOCK_TYPE_SUBTREE, + PathLock, + _make_fencing_token, + _parse_fencing_token, +) +from openviking.storage.transaction.transaction_record import TransactionRecord + + +class TestFencingToken: + def test_make_parse_roundtrip(self): + token = _make_fencing_token("tx-123") + tx_id, ts, lock_type = _parse_fencing_token(token) + assert tx_id == "tx-123" + assert ts > 0 + assert lock_type == LOCK_TYPE_POINT + + def test_make_parse_subtree_roundtrip(self): + token = _make_fencing_token("tx-456", LOCK_TYPE_SUBTREE) + tx_id, ts, lock_type = _parse_fencing_token(token) + assert tx_id == "tx-456" + assert ts > 0 + assert lock_type == LOCK_TYPE_SUBTREE + + def test_parse_legacy_format_two_part(self): + """Legacy two-part token "{tx_id}:{ts}" defaults to POINT.""" + tx_id, ts, lock_type = _parse_fencing_token("tx-old:1234567890") + assert tx_id == "tx-old" + assert ts == 1234567890 + assert lock_type == LOCK_TYPE_POINT + + def test_parse_legacy_format_plain(self): + """Plain tx_id (no colon) defaults to ts=0, lock_type=POINT.""" + tx_id, ts, lock_type = _parse_fencing_token("tx-bare") + assert tx_id == "tx-bare" + assert ts == 0 + assert lock_type == LOCK_TYPE_POINT + + def test_tokens_are_unique(self): + t1 = _make_fencing_token("tx-1") + time.sleep(0.001) + t2 = _make_fencing_token("tx-1") + assert t1 
!= t2 + + +class TestPathLockStale: + def test_is_lock_stale_no_file(self): + agfs = MagicMock() + agfs.cat.side_effect = Exception("not found") + lock = PathLock(agfs) + assert lock.is_lock_stale("/test/.path.ovlock") is True + + def test_is_lock_stale_legacy_token(self): + agfs = MagicMock() + agfs.cat.return_value = b"tx-old-format" + lock = PathLock(agfs) + assert lock.is_lock_stale("/test/.path.ovlock") is True + + def test_is_lock_stale_recent_token(self): + agfs = MagicMock() + token = _make_fencing_token("tx-1") + agfs.cat.return_value = token.encode("utf-8") + lock = PathLock(agfs) + assert lock.is_lock_stale("/test/.path.ovlock", expire_seconds=300.0) is False + + +class TestPathLockBehavior: + """Behavioral tests using real AGFS backend.""" + + async def test_acquire_point_creates_lock_file(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + tx = TransactionRecord(id="tx-point-1") + + ok = await lock.acquire_point(test_dir, tx, timeout=3.0) + assert ok is True + + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + content = agfs_client.cat(lock_path) + token = content.decode("utf-8") if isinstance(content, bytes) else content + assert ":P" in token + assert "tx-point-1" in token + + await lock.release(tx) + + async def test_acquire_subtree_creates_lock_file(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + tx = TransactionRecord(id="tx-subtree-1") + + ok = await lock.acquire_subtree(test_dir, tx, timeout=3.0) + assert ok is True + + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + content = agfs_client.cat(lock_path) + token = content.decode("utf-8") if isinstance(content, bytes) else content + assert ":S" in token + assert "tx-subtree-1" in token + + await lock.release(tx) + + async def test_acquire_point_dir_not_found(self, agfs_client): + lock = PathLock(agfs_client) + tx = TransactionRecord(id="tx-no-dir") + + ok = await lock.acquire_point("/local/nonexistent-path-xyz", tx, timeout=0.5) + assert ok is False + assert len(tx.locks) == 0 + 
+ async def test_release_removes_lock_file(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + tx = TransactionRecord(id="tx-release-1") + + await lock.acquire_point(test_dir, tx, timeout=3.0) + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + await lock.release(tx) + + # Lock file should be gone + try: + agfs_client.cat(lock_path) + raise AssertionError("Lock file should have been removed") + except Exception: + pass # Expected: file not found + + async def test_sequential_acquire_works(self, agfs_client, test_dir): + lock = PathLock(agfs_client) + + tx1 = TransactionRecord(id="tx-seq-1") + ok1 = await lock.acquire_point(test_dir, tx1, timeout=3.0) + assert ok1 is True + + await lock.release(tx1) + + tx2 = TransactionRecord(id="tx-seq-2") + ok2 = await lock.acquire_point(test_dir, tx2, timeout=3.0) + assert ok2 is True + + await lock.release(tx2) + + async def test_point_blocked_by_ancestor_subtree(self, agfs_client, test_dir): + """POINT on child blocked while ancestor holds SUBTREE lock.""" + import uuid as _uuid + + child = f"{test_dir}/child-{_uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + tx_parent = TransactionRecord(id="tx-parent-subtree") + ok = await lock.acquire_subtree(test_dir, tx_parent, timeout=3.0) + assert ok is True + + tx_child = TransactionRecord(id="tx-child-point") + blocked = await lock.acquire_point(child, tx_child, timeout=0.5) + assert blocked is False + + await lock.release(tx_parent) + + async def test_subtree_blocked_by_descendant_point(self, agfs_client, test_dir): + """SUBTREE on parent blocked while descendant holds POINT lock.""" + import uuid as _uuid + + child = f"{test_dir}/child-{_uuid.uuid4().hex}" + agfs_client.mkdir(child) + + lock = PathLock(agfs_client) + tx_child = TransactionRecord(id="tx-desc-point") + ok = await lock.acquire_point(child, tx_child, timeout=3.0) + assert ok is True + + tx_parent = TransactionRecord(id="tx-parent-sub") + blocked = await 
lock.acquire_subtree(test_dir, tx_parent, timeout=0.5) + assert blocked is False + + await lock.release(tx_child) + + async def test_acquire_mv_creates_subtree_and_point(self, agfs_client, test_dir): + """acquire_mv puts SUBTREE on src and POINT on dst.""" + import uuid as _uuid + + src = f"{test_dir}/src-{_uuid.uuid4().hex}" + dst = f"{test_dir}/dst-{_uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + lock = PathLock(agfs_client) + tx = TransactionRecord(id="tx-mv-1") + ok = await lock.acquire_mv(src, dst, tx, timeout=3.0) + assert ok is True + + src_token_bytes = agfs_client.cat(f"{src}/{LOCK_FILE_NAME}") + src_token = ( + src_token_bytes.decode("utf-8") + if isinstance(src_token_bytes, bytes) + else src_token_bytes + ) + assert ":S" in src_token + + dst_token_bytes = agfs_client.cat(f"{dst}/{LOCK_FILE_NAME}") + dst_token = ( + dst_token_bytes.decode("utf-8") + if isinstance(dst_token_bytes, bytes) + else dst_token_bytes + ) + assert ":P" in dst_token + + await lock.release(tx) + + async def test_point_does_not_block_sibling_point(self, agfs_client, test_dir): + """POINT locks on different directories do not conflict.""" + import uuid as _uuid + + dir_a = f"{test_dir}/sibling-a-{_uuid.uuid4().hex}" + dir_b = f"{test_dir}/sibling-b-{_uuid.uuid4().hex}" + agfs_client.mkdir(dir_a) + agfs_client.mkdir(dir_b) + + lock = PathLock(agfs_client) + tx_a = TransactionRecord(id="tx-sib-a") + tx_b = TransactionRecord(id="tx-sib-b") + + ok_a = await lock.acquire_point(dir_a, tx_a, timeout=3.0) + ok_b = await lock.acquire_point(dir_b, tx_b, timeout=3.0) + + assert ok_a is True + assert ok_b is True + + await lock.release(tx_a) + await lock.release(tx_b) + + async def test_stale_lock_auto_removed_on_acquire(self, agfs_client, test_dir): + """A stale lock (expired fencing token) is auto-removed, allowing a new acquire.""" + import uuid as _uuid + + target = f"{test_dir}/stale-{_uuid.uuid4().hex}" + agfs_client.mkdir(target) + + lock_path = 
f"{target}/{LOCK_FILE_NAME}" + + # Write a lock file with a very old timestamp (simulate crashed process) + old_ts = time.time_ns() - int(600 * 1e9) # 600 seconds ago + stale_token = f"tx-dead:{old_ts}:{LOCK_TYPE_POINT}" + agfs_client.write(lock_path, stale_token.encode("utf-8")) + + # New transaction should succeed by auto-removing the stale lock + lock = PathLock(agfs_client, lock_expire=300.0) + tx = TransactionRecord(id="tx-new-owner") + ok = await lock.acquire_point(target, tx, timeout=2.0) + assert ok is True + + # Verify new lock is owned by our transaction + content = agfs_client.cat(lock_path) + token = content.decode("utf-8") if isinstance(content, bytes) else content + assert "tx-new-owner" in token + + await lock.release(tx) + + async def test_stale_subtree_ancestor_auto_removed(self, agfs_client, test_dir): + """A stale SUBTREE lock on ancestor is auto-removed when child acquires POINT.""" + import uuid as _uuid + + child = f"{test_dir}/child-stale-{_uuid.uuid4().hex}" + agfs_client.mkdir(child) + + # Write stale SUBTREE lock on parent + parent_lock = f"{test_dir}/{LOCK_FILE_NAME}" + old_ts = time.time_ns() - int(600 * 1e9) + stale_token = f"tx-dead-parent:{old_ts}:{LOCK_TYPE_SUBTREE}" + agfs_client.write(parent_lock, stale_token.encode("utf-8")) + + lock = PathLock(agfs_client, lock_expire=300.0) + tx = TransactionRecord(id="tx-child-new") + ok = await lock.acquire_point(child, tx, timeout=2.0) + assert ok is True + + await lock.release(tx) + # Clean up stale parent lock if still present + try: + agfs_client.rm(parent_lock) + except Exception: + pass + + async def test_point_same_path_no_wait_fails_immediately(self, agfs_client, test_dir): + """With timeout=0, a conflicting lock fails immediately.""" + import uuid as _uuid + + target = f"{test_dir}/nowait-{_uuid.uuid4().hex}" + agfs_client.mkdir(target) + + lock = PathLock(agfs_client) + tx1 = TransactionRecord(id="tx-hold") + ok1 = await lock.acquire_point(target, tx1, timeout=3.0) + assert ok1 is 
True + + # Second acquire with timeout=0 should fail immediately + tx2 = TransactionRecord(id="tx-blocked") + t0 = time.monotonic() + ok2 = await lock.acquire_point(target, tx2, timeout=0.0) + elapsed = time.monotonic() - t0 + + assert ok2 is False + assert elapsed < 1.0 # Should not wait + + await lock.release(tx1) + + async def test_subtree_same_path_mutual_exclusion(self, agfs_client, test_dir): + """Two SUBTREE locks on the same path: second one blocked until first releases.""" + import uuid as _uuid + + target = f"{test_dir}/sub-excl-{_uuid.uuid4().hex}" + agfs_client.mkdir(target) + + lock = PathLock(agfs_client) + tx1 = TransactionRecord(id="tx-sub1") + ok1 = await lock.acquire_subtree(target, tx1, timeout=3.0) + assert ok1 is True + + tx2 = TransactionRecord(id="tx-sub2") + ok2 = await lock.acquire_subtree(target, tx2, timeout=0.5) + assert ok2 is False + + await lock.release(tx1) + + # Now tx2 should succeed + ok2_retry = await lock.acquire_subtree(target, tx2, timeout=3.0) + assert ok2_retry is True + await lock.release(tx2) diff --git a/tests/transaction/test_post_actions.py b/tests/transaction/test_post_actions.py new file mode 100644 index 00000000..2ae3c12b --- /dev/null +++ b/tests/transaction/test_post_actions.py @@ -0,0 +1,112 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for post_actions execution and replay.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +from openviking.storage.transaction.transaction_manager import TransactionManager + + +class TestPostActions: + def _make_manager(self): + agfs = MagicMock() + manager = TransactionManager(agfs_client=agfs, timeout=3600) + manager._journal = MagicMock() + return manager, agfs + + async def test_execute_enqueue_semantic(self): + manager, _ = self._make_manager() + + mock_queue = AsyncMock() + mock_queue_manager = MagicMock() + mock_queue_manager.get_queue.return_value = mock_queue + + with patch( + "openviking.storage.queuefs.get_queue_manager", + return_value=mock_queue_manager, + ): + await manager._execute_post_actions( + [ + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://resources/test", + "context_type": "resource", + "account_id": "acc-1", + }, + } + ] + ) + + mock_queue.enqueue.assert_called_once() + msg = mock_queue.enqueue.call_args[0][0] + assert msg.uri == "viking://resources/test" + assert msg.context_type == "resource" + assert msg.account_id == "acc-1" + + async def test_execute_unknown_action_logged(self): + manager, _ = self._make_manager() + # Should not raise, just log + await manager._execute_post_actions( + [ + {"type": "unknown_action", "params": {}}, + ] + ) + + async def test_execute_multiple_actions(self): + manager, _ = self._make_manager() + + mock_queue = AsyncMock() + mock_queue_manager = MagicMock() + mock_queue_manager.get_queue.return_value = mock_queue + + with patch( + "openviking.storage.queuefs.get_queue_manager", + return_value=mock_queue_manager, + ): + await manager._execute_post_actions( + [ + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://a", + "context_type": "resource", + "account_id": "acc-1", + }, + }, + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://b", + "context_type": "memory", + "account_id": "acc-2", + }, + }, + ] + 
) + + assert mock_queue.enqueue.call_count == 2 + + async def test_post_action_failure_does_not_crash(self): + manager, _ = self._make_manager() + + mock_queue_manager = MagicMock() + mock_queue_manager.get_queue.side_effect = Exception("queue not available") + + with patch( + "openviking.storage.queuefs.get_queue_manager", + return_value=mock_queue_manager, + ): + # Should not raise + await manager._execute_post_actions( + [ + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://test", + "context_type": "resource", + "account_id": "", + }, + }, + ] + ) diff --git a/tests/transaction/test_rm_rollback.py b/tests/transaction/test_rm_rollback.py new file mode 100644 index 00000000..ee28b7e7 --- /dev/null +++ b/tests/transaction/test_rm_rollback.py @@ -0,0 +1,233 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Integration tests: multi-step rollback covering FS + VectorDB coordination.""" + +from unittest.mock import AsyncMock, MagicMock + +from openviking.storage.transaction.undo import UndoEntry, execute_rollback + + +class TestRmRollback: + def test_vectordb_records_restored_on_fs_failure(self): + """When FS rm fails (incomplete), VectorDB delete is rolled back via snapshot.""" + agfs = MagicMock() + vector_store = AsyncMock() + ctx = MagicMock() + + snapshot = [{"id": "r1", "uri": "viking://a", "content": "data"}] + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_delete", + params={"uris": ["viking://a"], "records_snapshot": snapshot}, + completed=True, # VectorDB delete succeeded + ), + UndoEntry( + sequence=1, + op_type="fs_rm", + params={"uri": "/local/test", "recursive": True}, + completed=False, # FS rm never ran + ), + ] + + execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + + # Only vectordb_delete (completed=True) is reversed + vector_store.upsert.assert_called_once_with(snapshot[0]) + # fs_rm is incomplete, so it's skipped (also fs_rm is never 
reversible anyway) + agfs.rm.assert_not_called() + + def test_fs_rm_not_reversible_even_when_completed(self): + """fs_rm is intentionally irreversible: even completed=True is skipped.""" + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, + op_type="fs_rm", + params={"uri": "/local/test"}, + completed=True, + ), + ] + execute_rollback(undo_log, agfs) + agfs.rm.assert_not_called() + agfs.mv.assert_not_called() + + +class TestMvRollback: + def test_file_moved_back_on_vectordb_failure(self): + """When VectorDB update fails (incomplete), FS mv is reversed.""" + agfs = MagicMock() + + undo_log = [ + UndoEntry( + sequence=0, + op_type="fs_mv", + params={"src": "/local/a", "dst": "/local/b"}, + completed=True, # FS mv succeeded + ), + UndoEntry( + sequence=1, + op_type="vectordb_update_uri", + params={ + "old_uri": "viking://a", + "new_uri": "viking://b", + "old_parent_uri": "viking://", + }, + completed=False, # VectorDB update never ran + ), + ] + + execute_rollback(undo_log, agfs) + + # Only fs_mv (completed=True) is reversed + agfs.mv.assert_called_once_with("/local/b", "/local/a") + + +class TestRecoverAll: + def test_recover_all_reverses_incomplete_entries(self): + """recover_all=True (crash recovery mode) also reverses incomplete entries.""" + agfs = MagicMock() + + undo_log = [ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": "/local/newdir"}, + completed=True, + ), + UndoEntry( + sequence=1, + op_type="fs_mv", + params={"src": "/local/a", "dst": "/local/b"}, + completed=False, # Crash happened mid-operation + ), + ] + + execute_rollback(undo_log, agfs, recover_all=True) + + # Both entries should be reversed (in reverse sequence order) + assert agfs.mv.call_count == 1 + agfs.mv.assert_called_once_with("/local/b", "/local/a") + agfs.rm.assert_called_once_with("/local/newdir") + + def test_recover_all_false_skips_incomplete(self): + """recover_all=False (normal rollback) skips incomplete entries.""" + agfs = MagicMock() + + undo_log = [ + 
UndoEntry( + sequence=0, + op_type="fs_mv", + params={"src": "/local/a", "dst": "/local/b"}, + completed=False, + ), + ] + + execute_rollback(undo_log, agfs, recover_all=False) + agfs.mv.assert_not_called() + + +class TestVectorDBRollbackEdgeCases: + def test_multi_record_vectordb_delete_rollback(self): + """Multiple VectorDB records in snapshot should all be restored.""" + agfs = MagicMock() + vector_store = AsyncMock() + ctx = MagicMock() + + snapshot = [ + {"id": "r1", "uri": "viking://a", "content": "data1"}, + {"id": "r2", "uri": "viking://b", "content": "data2"}, + {"id": "r3", "uri": "viking://c", "content": "data3"}, + ] + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_delete", + params={ + "uris": ["viking://a", "viking://b", "viking://c"], + "records_snapshot": snapshot, + }, + completed=True, + ), + ] + execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + + assert vector_store.upsert.call_count == 3 + + def test_empty_snapshot_vectordb_delete_rollback(self): + """Empty snapshot → nothing to restore, no error.""" + agfs = MagicMock() + vector_store = AsyncMock() + ctx = MagicMock() + + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_delete", + params={"uris": [], "records_snapshot": []}, + completed=True, + ), + ] + execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + vector_store.upsert.assert_not_called() + + def test_vectordb_delete_partial_restore_failure(self): + """If restoring one record fails, others should still be attempted.""" + agfs = MagicMock() + vector_store = AsyncMock() + ctx = MagicMock() + + call_count = 0 + + async def upsert_side_effect(record): + nonlocal call_count + call_count += 1 + if record["id"] == "r2": + raise Exception("upsert failed") + + vector_store.upsert = AsyncMock(side_effect=upsert_side_effect) + + snapshot = [ + {"id": "r1", "uri": "viking://a"}, + {"id": "r2", "uri": "viking://b"}, # This one fails + {"id": "r3", "uri": "viking://c"}, + ] + undo_log = [ + 
UndoEntry( + sequence=0, + op_type="vectordb_delete", + params={"records_snapshot": snapshot}, + completed=True, + ), + ] + execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + + # All 3 should be attempted (best-effort per record) + assert call_count == 3 + + def test_vectordb_upsert_rollback_without_vector_store_is_noop(self): + """vectordb_upsert rollback without vector_store does nothing.""" + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_upsert", + params={"record_id": "r1"}, + completed=True, + ), + ] + # Should not raise + execute_rollback(undo_log, agfs, vector_store=None) + + def test_unknown_op_type_does_not_crash(self): + """Unknown op_type is logged but doesn't raise.""" + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, + op_type="some_future_op", + params={"foo": "bar"}, + completed=True, + ), + ] + execute_rollback(undo_log, agfs) diff --git a/tests/transaction/test_transaction_manager.py b/tests/transaction/test_transaction_manager.py new file mode 100644 index 00000000..ab9d5256 --- /dev/null +++ b/tests/transaction/test_transaction_manager.py @@ -0,0 +1,323 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for TransactionManager: CRUD, lifecycle, commit/rollback flows, timeout cleanup.""" + +import time +from unittest.mock import AsyncMock, MagicMock, patch + +from openviking.storage.transaction.transaction_manager import TransactionManager +from openviking.storage.transaction.transaction_record import TransactionRecord, TransactionStatus + + +def _make_manager(**kwargs): + """Create a TransactionManager with mocked AGFS and journal.""" + agfs = MagicMock() + defaults = {"agfs_client": agfs, "timeout": 3600, "lock_timeout": 0.0, "lock_expire": 300.0} + defaults.update(kwargs) + manager = TransactionManager(**defaults) + manager._journal = MagicMock() + manager._journal.list_all.return_value = [] + return manager, agfs + + +class TestCreateAndGet: + def test_create_transaction_returns_record(self): + manager, _ = _make_manager() + tx = manager.create_transaction(init_info={"operation": "rm"}) + assert isinstance(tx, TransactionRecord) + assert tx.status == TransactionStatus.INIT + assert tx.init_info == {"operation": "rm"} + + def test_create_assigns_unique_ids(self): + manager, _ = _make_manager() + tx1 = manager.create_transaction() + tx2 = manager.create_transaction() + assert tx1.id != tx2.id + + def test_get_transaction_found(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + assert manager.get_transaction(tx.id) is tx + + def test_get_transaction_not_found(self): + manager, _ = _make_manager() + assert manager.get_transaction("nonexistent") is None + + def test_get_transaction_count(self): + manager, _ = _make_manager() + assert manager.get_transaction_count() == 0 + manager.create_transaction() + assert manager.get_transaction_count() == 1 + manager.create_transaction() + assert manager.get_transaction_count() == 2 + + def test_get_active_transactions(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + active = manager.get_active_transactions() + assert tx.id in 
active + # Returned copy, not the internal dict + active.pop(tx.id) + assert manager.get_transaction(tx.id) is tx + + +class TestBegin: + async def test_begin_updates_status(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + ok = await manager.begin(tx.id) + assert ok is True + assert tx.status == TransactionStatus.AQUIRE + + async def test_begin_unknown_tx(self): + manager, _ = _make_manager() + ok = await manager.begin("unknown-tx") + assert ok is False + + +class TestCommitFlow: + async def test_commit_full_lifecycle(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + + # Simulate lock acquisition + tx.update_status(TransactionStatus.EXEC) + tx.add_lock("/test/.path.ovlock") + + ok = await manager.commit(tx.id) + assert ok is True + assert tx.status == TransactionStatus.RELEASED + # Removed from active transactions + assert manager.get_transaction(tx.id) is None + # Journal cleaned up + manager._journal.delete.assert_called_once_with(tx.id) + + async def test_commit_persists_journal_before_release(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + + call_order = [] + original_update = manager._journal.update + + def track_update(data): + call_order.append(("journal_update", data.get("status"))) + return original_update(data) + + manager._journal.update = track_update + manager._journal.delete = MagicMock( + side_effect=lambda _: call_order.append(("journal_delete",)) + ) + + await manager.commit(tx.id) + # Journal update (COMMIT) happens before delete + assert call_order[0] == ("journal_update", "COMMIT") + + async def test_commit_executes_post_actions(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + tx.post_actions.append({"type": "enqueue_semantic", "params": {"uri": "viking://x"}}) + + with patch.object(manager, "_execute_post_actions", new_callable=AsyncMock) as mock_post: + 
await manager.commit(tx.id) + mock_post.assert_called_once() + + async def test_commit_unknown_tx(self): + manager, _ = _make_manager() + ok = await manager.commit("nonexistent") + assert ok is False + + async def test_commit_releases_locks(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + tx.add_lock("/a/.path.ovlock") + tx.add_lock("/b/.path.ovlock") + + with patch.object(manager._path_lock, "release", new_callable=AsyncMock) as mock_release: + await manager.commit(tx.id) + mock_release.assert_called_once() + + +class TestRollbackFlow: + async def test_rollback_executes_undo_log(self): + manager, agfs = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + + from openviking.storage.transaction.undo import UndoEntry + + tx.undo_log.append( + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + ) + ) + + ok = await manager.rollback(tx.id) + assert ok is True + assert tx.status == TransactionStatus.RELEASED + agfs.mv.assert_called_once_with("/b", "/a") + + async def test_rollback_removes_from_active(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + + await manager.rollback(tx.id) + assert manager.get_transaction(tx.id) is None + + async def test_rollback_cleans_journal(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + + await manager.rollback(tx.id) + manager._journal.delete.assert_called_once_with(tx.id) + + async def test_rollback_unknown_tx(self): + manager, _ = _make_manager() + ok = await manager.rollback("nonexistent") + assert ok is False + + async def test_rollback_undo_failure_does_not_prevent_cleanup(self): + """Undo failure is best-effort; lock release and journal cleanup still happen.""" + manager, agfs = _make_manager() + tx = manager.create_transaction() + 
tx.update_status(TransactionStatus.EXEC) + + from openviking.storage.transaction.undo import UndoEntry + + tx.undo_log.append( + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + ) + ) + agfs.mv.side_effect = Exception("disk error") + + ok = await manager.rollback(tx.id) + assert ok is True + manager._journal.delete.assert_called_once() + + +class TestLockAcquisitionWrappers: + async def test_acquire_lock_point_success_transitions_to_exec(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_point", new_callable=AsyncMock, return_value=True + ): + ok = await manager.acquire_lock_point(tx.id, "/test") + assert ok is True + assert tx.status == TransactionStatus.EXEC + + async def test_acquire_lock_point_failure_transitions_to_fail(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_point", new_callable=AsyncMock, return_value=False + ): + ok = await manager.acquire_lock_point(tx.id, "/test") + assert ok is False + assert tx.status == TransactionStatus.FAIL + + async def test_acquire_lock_subtree_success(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True + ): + ok = await manager.acquire_lock_subtree(tx.id, "/test") + assert ok is True + assert tx.status == TransactionStatus.EXEC + + async def test_acquire_lock_subtree_uses_config_timeout(self): + manager, _ = _make_manager(lock_timeout=5.0) + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True + ) as mock_acquire: + await manager.acquire_lock_subtree(tx.id, "/test") + mock_acquire.assert_called_once_with("/test", tx, timeout=5.0) + + async def test_acquire_lock_subtree_override_timeout(self): + manager, _ = 
_make_manager(lock_timeout=5.0) + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True + ) as mock_acquire: + await manager.acquire_lock_subtree(tx.id, "/test", timeout=10.0) + mock_acquire.assert_called_once_with("/test", tx, timeout=10.0) + + async def test_acquire_lock_mv_success(self): + manager, _ = _make_manager() + tx = manager.create_transaction() + + with patch.object( + manager._path_lock, "acquire_mv", new_callable=AsyncMock, return_value=True + ): + ok = await manager.acquire_lock_mv(tx.id, "/src", "/dst") + assert ok is True + assert tx.status == TransactionStatus.EXEC + + async def test_acquire_lock_unknown_tx(self): + manager, _ = _make_manager() + ok = await manager.acquire_lock_point("nonexistent", "/test") + assert ok is False + + +class TestLifecycle: + async def test_start_sets_running(self): + manager, _ = _make_manager() + await manager.start() + assert manager._running is True + manager.stop() + + async def test_start_idempotent(self): + manager, _ = _make_manager() + await manager.start() + await manager.start() # Should not error + assert manager._running is True + manager.stop() + + async def test_stop_clears_state(self): + manager, _ = _make_manager() + await manager.start() + manager.create_transaction() + manager.stop() + assert manager._running is False + assert manager.get_transaction_count() == 0 + + async def test_stop_idempotent(self): + manager, _ = _make_manager() + manager.stop() + manager.stop() # Should not error + + +class TestTimeoutCleanup: + async def test_cleanup_timed_out_rolls_back(self): + manager, _ = _make_manager(timeout=1) + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + # Simulate old updated_at + tx.updated_at = time.time() - 10 + + with patch.object( + manager, "rollback", new_callable=AsyncMock, return_value=True + ) as mock_rb: + await manager._cleanup_timed_out() + 
mock_rb.assert_called_once_with(tx.id) + + async def test_cleanup_skips_fresh_transactions(self): + manager, _ = _make_manager(timeout=3600) + tx = manager.create_transaction() + tx.update_status(TransactionStatus.EXEC) + + with patch.object(manager, "rollback", new_callable=AsyncMock) as mock_rb: + await manager._cleanup_timed_out() + mock_rb.assert_not_called() diff --git a/tests/transaction/test_undo.py b/tests/transaction/test_undo.py new file mode 100644 index 00000000..d67063d1 --- /dev/null +++ b/tests/transaction/test_undo.py @@ -0,0 +1,163 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for undo log and rollback executor.""" + +from unittest.mock import AsyncMock, MagicMock + +from openviking.storage.transaction.undo import UndoEntry, execute_rollback + + +class TestUndoEntry: + def test_to_dict(self): + entry = UndoEntry(sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}) + d = entry.to_dict() + assert d["sequence"] == 0 + assert d["op_type"] == "fs_mv" + assert d["params"] == {"src": "/a", "dst": "/b"} + assert d["completed"] is False + + def test_from_dict(self): + data = {"sequence": 1, "op_type": "fs_rm", "params": {"uri": "/x"}, "completed": True} + entry = UndoEntry.from_dict(data) + assert entry.sequence == 1 + assert entry.op_type == "fs_rm" + assert entry.completed is True + + def test_roundtrip(self): + entry = UndoEntry( + sequence=5, op_type="vectordb_upsert", params={"record_id": "r1"}, completed=True + ) + restored = UndoEntry.from_dict(entry.to_dict()) + assert restored.sequence == entry.sequence + assert restored.op_type == entry.op_type + assert restored.params == entry.params + assert restored.completed == entry.completed + + +class TestExecuteRollback: + def test_rollback_fs_mv(self): + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + ), + ] + execute_rollback(undo_log, agfs) 
+ agfs.mv.assert_called_once_with("/b", "/a") + + def test_rollback_fs_rm_skipped(self): + agfs = MagicMock() + undo_log = [ + UndoEntry(sequence=0, op_type="fs_rm", params={"uri": "/a"}, completed=True), + ] + execute_rollback(undo_log, agfs) + agfs.mv.assert_not_called() + agfs.rm.assert_not_called() + + def test_rollback_fs_mkdir(self): + agfs = MagicMock() + undo_log = [ + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": "/a/b"}, completed=True), + ] + execute_rollback(undo_log, agfs) + agfs.rm.assert_called_once_with("/a/b") + + def test_rollback_fs_write_new(self): + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, op_type="fs_write_new", params={"uri": "/a/f.txt"}, completed=True + ), + ] + execute_rollback(undo_log, agfs) + agfs.rm.assert_called_once_with("/a/f.txt", recursive=True) + + def test_rollback_vectordb_upsert(self): + agfs = MagicMock() + vector_store = AsyncMock() + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_upsert", + params={"record_id": "r1"}, + completed=True, + ), + ] + execute_rollback(undo_log, agfs, vector_store=vector_store) + vector_store.delete.assert_called_once_with(["r1"]) + + def test_rollback_vectordb_update_uri(self): + agfs = MagicMock() + ctx = MagicMock() + vector_store = AsyncMock() + undo_log = [ + UndoEntry( + sequence=0, + op_type="vectordb_update_uri", + params={ + "old_uri": "viking://a", + "new_uri": "viking://b", + "old_parent_uri": "viking://", + }, + completed=True, + ), + ] + execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + vector_store.update_uri_mapping.assert_called_once_with( + ctx=ctx, uri="viking://b", new_uri="viking://a", new_parent_uri="viking://" + ) + + def test_rollback_reverse_order(self): + """Rollback should process entries in reverse sequence order.""" + agfs = MagicMock() + call_order = [] + original_mv = agfs.mv + original_rm = agfs.rm + + def track_mv(*args): + call_order.append(("mv", args)) + return original_mv(*args) + + def 
track_rm(*args, **kwargs): + call_order.append(("rm", args)) + return original_rm(*args, **kwargs) + + agfs.mv = track_mv + agfs.rm = track_rm + + undo_log = [ + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + ), + UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": "/c"}, completed=True), + ] + execute_rollback(undo_log, agfs) + # seq=1 should be rolled back first (mkdir→rm), then seq=0 (mv→reverse mv) + assert call_order[0][0] == "rm" + assert call_order[1][0] == "mv" + + def test_rollback_skips_incomplete(self): + agfs = MagicMock() + undo_log = [ + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=False + ), + ] + execute_rollback(undo_log, agfs) + agfs.mv.assert_not_called() + + def test_rollback_best_effort(self): + """A failing rollback entry should not prevent others from running.""" + agfs = MagicMock() + agfs.rm.side_effect = Exception("boom") + agfs.mv = MagicMock() + + undo_log = [ + UndoEntry( + sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + ), + UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": "/c"}, completed=True), + ] + execute_rollback(undo_log, agfs) + # fs_mkdir rollback failed (rm raises), but fs_mv rollback should still run + agfs.mv.assert_called_once_with("/b", "/a") diff --git a/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go index f2ccde99..c20fdc66 100644 --- a/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/backend.go @@ -24,9 +24,18 @@ type QueueBackend interface { // Enqueue adds a message to a queue Enqueue(queueName string, msg QueueMessage) error - // Dequeue removes and returns the first message from a queue + // Dequeue marks the first pending message as 'processing' and returns it. + // Call Ack after successful processing to permanently delete the message. 
Dequeue(queueName string) (QueueMessage, bool, error) + // Ack permanently deletes a message that has been successfully processed. + Ack(queueName string, messageID string) error + + // RecoverStale resets messages stuck in 'processing' state back to 'pending'. + // staleSec: minimum age in seconds; pass 0 to reset all processing messages. + // Returns the number of messages recovered. + RecoverStale(staleSec int64) (int, error) + // Peek returns the first message without removing it Peek(queueName string) (QueueMessage, bool, error) @@ -124,6 +133,16 @@ func (b *MemoryBackend) Dequeue(queueName string) (QueueMessage, bool, error) { return msg, true, nil } +// Ack is a no-op for the memory backend (messages are already removed on Dequeue). +func (b *MemoryBackend) Ack(queueName string, messageID string) error { + return nil +} + +// RecoverStale is a no-op for the memory backend (no persistence across restarts). +func (b *MemoryBackend) RecoverStale(staleSec int64) (int, error) { + return 0, nil +} + func (b *MemoryBackend) Peek(queueName string) (QueueMessage, bool, error) { queue, exists := b.queues[queueName] if !exists { @@ -345,6 +364,16 @@ func (b *TiDBBackend) Enqueue(queueName string, msg QueueMessage) error { return nil } +// Ack is not yet implemented for TiDB backend (messages are already soft-deleted on Dequeue). +func (b *TiDBBackend) Ack(queueName string, messageID string) error { + return nil +} + +// RecoverStale is not yet implemented for TiDB backend. 
+func (b *TiDBBackend) RecoverStale(staleSec int64) (int, error) { + return 0, nil +} + func (b *TiDBBackend) Dequeue(queueName string) (QueueMessage, bool, error) { // Get table name from cache (lazy loading) tableName, err := b.getTableName(queueName, false) diff --git a/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go index 03b7342f..9639531c 100644 --- a/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/db_backend.go @@ -63,16 +63,22 @@ func (b *SQLiteDBBackend) GetInitSQL() []string { last_updated INTEGER DEFAULT (strftime('%s', 'now')) )`, // Queue messages table + // status: 'pending' (waiting) | 'processing' (dequeued, not yet acked) + // processing_started_at: Unix timestamp when dequeued; NULL if pending `CREATE TABLE IF NOT EXISTS queue_messages ( id INTEGER PRIMARY KEY AUTOINCREMENT, queue_name TEXT NOT NULL, message_id TEXT NOT NULL, data TEXT NOT NULL, timestamp INTEGER NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + processing_started_at INTEGER, created_at INTEGER DEFAULT (strftime('%s', 'now')) )`, `CREATE INDEX IF NOT EXISTS idx_queue_name ON queue_messages(queue_name)`, `CREATE INDEX IF NOT EXISTS idx_queue_order ON queue_messages(queue_name, id)`, + `CREATE INDEX IF NOT EXISTS idx_queue_status ON queue_messages(queue_name, status, id)`, + `CREATE INDEX IF NOT EXISTS idx_queue_message_id ON queue_messages(queue_name, message_id)`, } } diff --git a/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go index d8d481b0..052a8f19 100644 --- a/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/queuefs.go @@ -137,7 +137,9 @@ func (q *QueueFSPlugin) Initialize(cfg map[string]interface{}) error { switch backendType { case "memory": backend = NewMemoryBackend() - case "tidb", 
"mysql", "sqlite", "sqlite3": + case "sqlite", "sqlite3": + backend = NewSQLiteQueueBackend() + case "tidb", "mysql": backend = NewTiDBBackend() default: return fmt.Errorf("unsupported backend: %s", backendType) @@ -384,6 +386,7 @@ var queueOperations = map[string]bool{ "peek": true, "size": true, "clear": true, + "ack": true, // write message_id to confirm processing complete (at-least-once delivery) } // parseQueuePath parses a path like "/queue_name/operation" or "/dir/queue_name/operation" @@ -529,7 +532,7 @@ func (qfs *queueFS) Read(path string, offset int64, size int64) ([]byte, error) data, err = qfs.peek(queueName) case "size": data, err = qfs.size(queueName) - case "enqueue", "clear": + case "enqueue", "clear", "ack": // Write-only files return []byte(""), fmt.Errorf("permission denied: %s is write-only", path) default: @@ -573,6 +576,12 @@ func (qfs *queueFS) Write(path string, data []byte, offset int64, flags filesyst return 0, err } return 0, nil + case "ack": + msgID := strings.TrimSpace(string(data)) + if err := qfs.ackMessage(queueName, msgID); err != nil { + return 0, err + } + return int64(len(data)), nil default: return 0, fmt.Errorf("cannot write to: %s", path) } @@ -844,7 +853,7 @@ func (qfs *queueFS) Stat(p string) (*filesystem.FileInfo, error) { } mode := uint32(0644) - if operation == "enqueue" || operation == "clear" { + if operation == "enqueue" || operation == "clear" || operation == "ack" { mode = 0222 } else { mode = 0444 @@ -992,6 +1001,13 @@ func (qfs *queueFS) clear(queueName string) error { return qfs.plugin.backend.Clear(queueName) } +func (qfs *queueFS) ackMessage(queueName string, msgID string) error { + qfs.plugin.mu.Lock() + defer qfs.plugin.mu.Unlock() + + return qfs.plugin.backend.Ack(queueName, msgID) +} + // Ensure QueueFSPlugin implements ServicePlugin var _ plugin.ServicePlugin = (*QueueFSPlugin)(nil) var _ filesystem.FileSystem = (*queueFS)(nil) diff --git 
a/third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go b/third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go new file mode 100644 index 00000000..2a0c4dbe --- /dev/null +++ b/third_party/agfs/agfs-server/pkg/plugins/queuefs/sqlite_backend.go @@ -0,0 +1,321 @@ +package queuefs + +import ( + "database/sql" + "encoding/json" + "fmt" + "strings" + "time" + + log "github.com/sirupsen/logrus" +) + +// SQLiteQueueBackend implements QueueBackend using SQLite with a single-table schema. +// +// Schema: +// - queue_metadata: tracks all queues (including empty ones created via mkdir) +// - queue_messages: stores all messages, filtered by queue_name column +// - status: 'pending' (waiting to be processed) | 'processing' (dequeued, awaiting ack) +// - processing_started_at: Unix timestamp when dequeued; NULL while pending +// +// Delivery semantics: at-least-once +// - Dequeue marks message as 'processing' (does NOT delete) +// - Ack deletes the message after successful processing +// - On startup, RecoverStale resets all 'processing' messages back to 'pending' +// so that messages from a previous crashed run are automatically retried +type SQLiteQueueBackend struct { + db *sql.DB +} + +func NewSQLiteQueueBackend() *SQLiteQueueBackend { + return &SQLiteQueueBackend{} +} + +func (b *SQLiteQueueBackend) Initialize(config map[string]interface{}) error { + dbBackend := NewSQLiteDBBackend() + + db, err := dbBackend.Open(config) + if err != nil { + return fmt.Errorf("failed to open SQLite database: %w", err) + } + b.db = db + + for _, sqlStmt := range dbBackend.GetInitSQL() { + if _, err := db.Exec(sqlStmt); err != nil { + db.Close() + return fmt.Errorf("failed to initialize schema: %w", err) + } + } + + // Migrate existing databases: add new columns if they don't exist yet. + b.runMigrations() + + // Reset any messages left in 'processing' state by a previous crashed process. 
+ // staleSec=0 resets ALL processing messages — safe at startup because no workers + // are running yet. + if n, err := b.RecoverStale(0); err != nil { + log.Warnf("[queuefs] Failed to recover stale messages on startup: %v", err) + } else if n > 0 { + log.Infof("[queuefs] Recovered %d in-flight message(s) from previous run", n) + } + + log.Info("[queuefs] SQLite backend initialized") + return nil +} + +// runMigrations applies schema changes needed to upgrade an existing database. +// Each ALTER TABLE is executed and "duplicate column name" errors are silently ignored. +func (b *SQLiteQueueBackend) runMigrations() { + migrations := []string{ + `ALTER TABLE queue_messages ADD COLUMN status TEXT NOT NULL DEFAULT 'pending'`, + `ALTER TABLE queue_messages ADD COLUMN processing_started_at INTEGER`, + `CREATE INDEX IF NOT EXISTS idx_queue_status ON queue_messages(queue_name, status, id)`, + `CREATE INDEX IF NOT EXISTS idx_queue_message_id ON queue_messages(queue_name, message_id)`, + } + for _, stmt := range migrations { + if _, err := b.db.Exec(stmt); err != nil { + // "duplicate column name" means the column already exists — that's fine. 
+ if !strings.Contains(err.Error(), "duplicate column name") && + !strings.Contains(err.Error(), "already exists") { + log.Warnf("[queuefs] Migration warning: %v", err) + } + } + } +} + +func (b *SQLiteQueueBackend) Close() error { + if b.db != nil { + return b.db.Close() + } + return nil +} + +func (b *SQLiteQueueBackend) GetType() string { + return "sqlite" +} + +func (b *SQLiteQueueBackend) Enqueue(queueName string, msg QueueMessage) error { + msgData, err := json.Marshal(msg) + if err != nil { + return fmt.Errorf("failed to marshal message: %w", err) + } + + _, err = b.db.Exec( + "INSERT INTO queue_messages (queue_name, message_id, data, timestamp, status) VALUES (?, ?, ?, ?, 'pending')", + queueName, msg.ID, string(msgData), msg.Timestamp.Unix(), + ) + if err != nil { + return fmt.Errorf("failed to enqueue message: %w", err) + } + return nil +} + +// Dequeue marks the first pending message as 'processing' and returns it. +// The message remains in the database until Ack is called. +// If the process crashes before Ack, RecoverStale on the next startup will +// reset the message back to 'pending' so it is retried. +func (b *SQLiteQueueBackend) Dequeue(queueName string) (QueueMessage, bool, error) { + tx, err := b.db.Begin() + if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to start transaction: %w", err) + } + defer tx.Rollback() + + var id int64 + var data string + err = tx.QueryRow( + "SELECT id, data FROM queue_messages WHERE queue_name = ? AND status = 'pending' ORDER BY id LIMIT 1", + queueName, + ).Scan(&id, &data) + + if err == sql.ErrNoRows { + return QueueMessage{}, false, nil + } else if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to query message: %w", err) + } + + // Mark as processing instead of deleting. + _, err = tx.Exec( + "UPDATE queue_messages SET status = 'processing', processing_started_at = ? 
WHERE id = ?", + time.Now().Unix(), id, + ) + if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to mark message as processing: %w", err) + } + + if err := tx.Commit(); err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to commit transaction: %w", err) + } + + var msg QueueMessage + if err := json.Unmarshal([]byte(data), &msg); err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to unmarshal message: %w", err) + } + + return msg, true, nil +} + +// Ack deletes a message that has been successfully processed. +// Should be called after the consumer has finished processing the message. +func (b *SQLiteQueueBackend) Ack(queueName string, messageID string) error { + result, err := b.db.Exec( + "DELETE FROM queue_messages WHERE queue_name = ? AND message_id = ? AND status = 'processing'", + queueName, messageID, + ) + if err != nil { + return fmt.Errorf("failed to ack message: %w", err) + } + rows, _ := result.RowsAffected() + if rows == 0 { + log.Warnf("[queuefs] Ack found no matching processing message: queue=%s msg=%s", queueName, messageID) + } + return nil +} + +// RecoverStale resets messages stuck in 'processing' state back to 'pending'. +// staleSec is the minimum age (in seconds) of a processing message before it +// is considered stale. Pass 0 to reset ALL processing messages immediately +// (appropriate at startup before any workers have started). +// Returns the number of messages recovered. 
+func (b *SQLiteQueueBackend) RecoverStale(staleSec int64) (int, error) { + cutoff := time.Now().Unix() - staleSec + result, err := b.db.Exec( + "UPDATE queue_messages SET status = 'pending', processing_started_at = NULL WHERE status = 'processing' AND processing_started_at <= ?", + cutoff, + ) + if err != nil { + return 0, fmt.Errorf("failed to recover stale messages: %w", err) + } + n, _ := result.RowsAffected() + return int(n), nil +} + +func (b *SQLiteQueueBackend) Peek(queueName string) (QueueMessage, bool, error) { + var data string + err := b.db.QueryRow( + "SELECT data FROM queue_messages WHERE queue_name = ? AND status = 'pending' ORDER BY id LIMIT 1", + queueName, + ).Scan(&data) + + if err == sql.ErrNoRows { + return QueueMessage{}, false, nil + } else if err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to peek message: %w", err) + } + + var msg QueueMessage + if err := json.Unmarshal([]byte(data), &msg); err != nil { + return QueueMessage{}, false, fmt.Errorf("failed to unmarshal message: %w", err) + } + + return msg, true, nil +} + +// Size returns the number of pending (not yet dequeued) messages. +func (b *SQLiteQueueBackend) Size(queueName string) (int, error) { + var count int + err := b.db.QueryRow( + "SELECT COUNT(*) FROM queue_messages WHERE queue_name = ? AND status = 'pending'", + queueName, + ).Scan(&count) + if err != nil { + return 0, fmt.Errorf("failed to get queue size: %w", err) + } + return count, nil +} + +func (b *SQLiteQueueBackend) Clear(queueName string) error { + _, err := b.db.Exec("DELETE FROM queue_messages WHERE queue_name = ?", queueName) + if err != nil { + return fmt.Errorf("failed to clear queue: %w", err) + } + return nil +} + +func (b *SQLiteQueueBackend) ListQueues(prefix string) ([]string, error) { + var query string + var args []interface{} + + if prefix == "" { + query = "SELECT queue_name FROM queue_metadata" + } else { + query = "SELECT queue_name FROM queue_metadata WHERE queue_name = ? 
OR queue_name LIKE ?"
+		args = []interface{}{prefix, prefix + "/%"}
+	}
+
+	rows, err := b.db.Query(query, args...)
+	if err != nil {
+		return nil, fmt.Errorf("failed to list queues: %w", err)
+	}
+	defer rows.Close()
+
+	var queues []string
+	for rows.Next() {
+		var qName string
+		if err := rows.Scan(&qName); err != nil {
+			return nil, fmt.Errorf("failed to scan queue name: %w", err)
+		}
+		queues = append(queues, qName)
+	}
+	return queues, nil
+}
+
+func (b *SQLiteQueueBackend) GetLastEnqueueTime(queueName string) (time.Time, error) {
+	var timestamp sql.NullInt64
+	err := b.db.QueryRow(
+		"SELECT MAX(timestamp) FROM queue_messages WHERE queue_name = ? AND status = 'pending'",
+		queueName,
+	).Scan(&timestamp)
+
+	if err != nil || !timestamp.Valid {
+		return time.Time{}, nil
+	}
+	return time.Unix(timestamp.Int64, 0), nil
+}
+
+func (b *SQLiteQueueBackend) RemoveQueue(queueName string) error {
+	if queueName == "" {
+		if _, err := b.db.Exec("DELETE FROM queue_messages"); err != nil {
+			return err
+		}
+		_, err := b.db.Exec("DELETE FROM queue_metadata")
+		return err
+	}
+
+	if _, err := b.db.Exec(
+		"DELETE FROM queue_messages WHERE queue_name = ? OR queue_name LIKE ?",
+		queueName, queueName+"/%",
+	); err != nil {
+		return fmt.Errorf("failed to remove queue messages: %w", err)
+	}
+
+	_, err := b.db.Exec(
+		"DELETE FROM queue_metadata WHERE queue_name = ? 
OR queue_name LIKE ?", + queueName, queueName+"/%", + ) + return err +} + +func (b *SQLiteQueueBackend) CreateQueue(queueName string) error { + _, err := b.db.Exec( + "INSERT OR IGNORE INTO queue_metadata (queue_name) VALUES (?)", + queueName, + ) + if err != nil { + return fmt.Errorf("failed to create queue: %w", err) + } + log.Infof("[queuefs] Created queue '%s' (SQLite)", queueName) + return nil +} + +func (b *SQLiteQueueBackend) QueueExists(queueName string) (bool, error) { + var count int + err := b.db.QueryRow( + "SELECT COUNT(*) FROM queue_metadata WHERE queue_name = ?", + queueName, + ).Scan(&count) + if err != nil { + return false, fmt.Errorf("failed to check queue existence: %w", err) + } + return count > 0, nil +} From b9a51c2da9e5ed96bec56cae545c24b964c0bcb5 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Thu, 5 Mar 2026 15:54:49 +0800 Subject: [PATCH 02/18] test(transaction): add e2e rollback tests for mv and multi-step operations Add end-to-end tests covering rollback scenarios that were missing: - mv rollback: file moved back to original location on failure - mv commit: file persists at new location - Multi-step rollback: mkdir + write + mkdir all reversed in order - Partial step rollback: only completed entries are reversed - Nested directory rollback: child removed before parent - Best-effort rollback: single step failure does not block others Co-Authored-By: Claude Opus 4.6 --- tests/transaction/test_e2e.py | 199 ++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py index 88b6b5d6..8f67ea96 100644 --- a/tests/transaction/test_e2e.py +++ b/tests/transaction/test_e2e.py @@ -218,6 +218,205 @@ async def test_journal_cleaned_after_rollback(self, agfs_client, tx_manager, tes assert tx_id not in all_ids +class TestE2EMvRollback: + async def test_mv_rollback_moves_file_back(self, agfs_client, tx_manager, test_dir): + """mv commit 前失败 → 文件被移回原位。""" + src = 
f"{test_dir}/mv-rb-src-{uuid.uuid4().hex}" + dst_parent = f"{test_dir}/mv-rb-dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst_parent) + + # Write a file inside src + agfs_client.write(f"{src}/data.txt", b"important") + + dst = f"{dst_parent}/moved" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst_parent + ) as tx: + seq = tx.record_undo("fs_mv", {"src": src, "dst": dst}) + agfs_client.mv(src, dst) + tx.mark_completed(seq) + + raise RuntimeError("abort after mv") + + # src should be restored (mv reversed: dst → src) + content = agfs_client.cat(f"{src}/data.txt") + assert content == b"important" + + # dst should no longer exist + try: + agfs_client.stat(dst) + raise AssertionError("dst should not exist after rollback") + except Exception: + pass + + async def test_mv_commit_persists(self, agfs_client, tx_manager, test_dir): + """mv commit 成功 → 文件在新位置,旧位置不存在。""" + src = f"{test_dir}/mv-ok-src-{uuid.uuid4().hex}" + dst_parent = f"{test_dir}/mv-ok-dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst_parent) + agfs_client.write(f"{src}/data.txt", b"moved-data") + + dst = f"{dst_parent}/moved" + + async with TransactionContext( + tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst_parent + ) as tx: + seq = tx.record_undo("fs_mv", {"src": src, "dst": dst}) + agfs_client.mv(src, dst) + tx.mark_completed(seq) + await tx.commit() + + # File at new location + content = agfs_client.cat(f"{dst}/data.txt") + assert content == b"moved-data" + + # Old location gone + try: + agfs_client.stat(src) + raise AssertionError("src should not exist after committed mv") + except Exception: + pass + + +class TestE2EMultiStepRollback: + async def test_multi_step_rollback_reverses_all(self, agfs_client, tx_manager, test_dir): + """多步操作(mkdir + write + mkdir),中间失败 → 全部反序回滚。 + + 执行顺序:seq0 mkdir /a → seq1 write /a/f.txt → seq2 mkdir /a/sub + 在 seq2 完成后抛异常。 + 
回滚顺序:seq2 rm /a/sub → seq1 rm /a/f.txt → seq0 rm /a + """ + dir_a = f"{test_dir}/multi-a-{uuid.uuid4().hex}" + file_f = f"{dir_a}/f.txt" + dir_sub = f"{dir_a}/sub" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "multi_step", [test_dir], lock_mode="point" + ) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) + agfs_client.mkdir(dir_a) + tx.mark_completed(s0) + + s1 = tx.record_undo("fs_write_new", {"uri": file_f}) + agfs_client.write(file_f, b"content") + tx.mark_completed(s1) + + s2 = tx.record_undo("fs_mkdir", {"uri": dir_sub}) + agfs_client.mkdir(dir_sub) + tx.mark_completed(s2) + + raise RuntimeError("abort after all steps") + + # Everything should be cleaned up in reverse order + for path in [dir_sub, file_f, dir_a]: + try: + agfs_client.stat(path) + raise AssertionError(f"{path} should not exist after rollback") + except Exception: + pass + + async def test_partial_step_rollback(self, agfs_client, tx_manager, test_dir): + """两步操作,第二步执行到一半崩溃(未 mark_completed)→ 只回滚第一步。 + + seq0 mkdir (completed=True) → seq1 write (completed=False,异常在 mark 前抛出) + 回滚只处理 seq0。 + """ + dir_a = f"{test_dir}/partial-{uuid.uuid4().hex}" + file_f = f"{dir_a}/f.txt" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "partial", [test_dir], lock_mode="point" + ) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) + agfs_client.mkdir(dir_a) + tx.mark_completed(s0) + + _s1 = tx.record_undo("fs_write_new", {"uri": file_f}) + agfs_client.write(file_f, b"half-done") + # NOT calling tx.mark_completed(s1) — simulates crash mid-operation + raise RuntimeError("crash before marking s1 completed") + + # dir_a (seq0, completed) should be rolled back + try: + agfs_client.stat(dir_a) + raise AssertionError("dir_a should be rolled back") + except Exception: + pass + + # file_f was written but undo entry not marked completed → not rolled back by normal mode + # However, file_f is inside dir_a which was removed, so it's 
gone too + + async def test_rollback_order_matters_nested_dirs(self, agfs_client, tx_manager, test_dir): + """嵌套目录回滚顺序:必须先删子目录再删父目录。 + + seq0 mkdir /parent → seq1 mkdir /parent/child + 回滚必须 seq1 (rm child) → seq0 (rm parent),否则 parent 非空删除失败。 + """ + parent = f"{test_dir}/nested-parent-{uuid.uuid4().hex}" + child = f"{parent}/child" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "nested", [test_dir], lock_mode="point" + ) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": parent}) + agfs_client.mkdir(parent) + tx.mark_completed(s0) + + s1 = tx.record_undo("fs_mkdir", {"uri": child}) + agfs_client.mkdir(child) + tx.mark_completed(s1) + + raise RuntimeError("abort nested") + + # Both gone (child first, then parent) + for path in [child, parent]: + try: + agfs_client.stat(path) + raise AssertionError(f"{path} should not exist") + except Exception: + pass + + async def test_rollback_failure_best_effort_continues(self, agfs_client, tx_manager, test_dir): + """回滚中某步失败,后续步骤仍然执行(best-effort)。 + + seq0 mkdir /a → seq1 mkdir /b + 手动删除 /b(模拟回滚 seq1 时目标已不存在),seq0 的回滚仍应执行。 + """ + dir_a = f"{test_dir}/be-a-{uuid.uuid4().hex}" + dir_b = f"{test_dir}/be-b-{uuid.uuid4().hex}" + + with pytest.raises(RuntimeError): + async with TransactionContext( + tx_manager, "best_effort", [test_dir], lock_mode="point" + ) as tx: + s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) + agfs_client.mkdir(dir_a) + tx.mark_completed(s0) + + s1 = tx.record_undo("fs_mkdir", {"uri": dir_b}) + agfs_client.mkdir(dir_b) + tx.mark_completed(s1) + + # Manually remove dir_b before rollback — simulates external interference + agfs_client.rm(dir_b) + + raise RuntimeError("abort") + + # dir_b removal during rollback "fails" (already gone), but dir_a should still be rolled back + try: + agfs_client.stat(dir_a) + raise AssertionError("dir_a should be rolled back despite dir_b failure") + except Exception: + pass + + class TestE2ESequentialTransactions: async def 
test_sequential_transactions_on_same_path(self, agfs_client, tx_manager, test_dir): """Two sequential transactions on the same path both succeed.""" From b52ccbfeff92a328bb551769e22cc4f3a3865239 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Fri, 13 Mar 2026 19:52:19 +0800 Subject: [PATCH 03/18] feat(storage): add transaction support with path locking and journal Implement transaction system for VikingFS with ACID-like guarantees: - TransactionManager with configurable lock timeout and journal-based recovery - PathLock supporting point, subtree, and mv lock modes - Refactor VikingFS mv to use cp+rm to prevent lock files from being carried - Fix stale lock detection returning false for missing lock files - Update ragas eval to use LangchainLLMWrapper Co-Authored-By: Claude Opus 4.6 --- openviking/async_client.py | 5 + openviking/eval/ragas/__init__.py | 17 +- openviking/service/core.py | 2 +- openviking/storage/local_fs.py | 3 +- openviking/storage/queuefs/queue_manager.py | 22 +- openviking/storage/queuefs/semantic_dag.py | 55 ++-- openviking/storage/transaction/__init__.py | 2 + .../storage/transaction/context_manager.py | 7 +- openviking/storage/transaction/journal.py | 3 +- openviking/storage/transaction/path_lock.py | 282 ++++-------------- .../transaction/transaction_manager.py | 30 +- openviking/storage/viking_fs.py | 53 +++- tests/agfs/test_fs_s3.py | 3 +- tests/client/test_resource_management.py | 2 +- tests/integration/test_add_resource_index.py | 11 +- tests/integration/test_full_workflow.py | 16 +- tests/server/test_api_filesystem.py | 45 ++- tests/server/test_api_resources.py | 4 +- tests/storage/test_semantic_dag_stats.py | 21 ++ tests/transaction/test_e2e.py | 4 +- tests/transaction/test_transaction_manager.py | 8 +- 21 files changed, 256 insertions(+), 339 deletions(-) diff --git a/openviking/async_client.py b/openviking/async_client.py index 67dfa696..3f410a13 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -96,6 +96,11 @@ 
async def reset(cls) -> None: await cls._instance.close() cls._instance = None + # Also reset transaction manager singleton + from openviking.storage.transaction import reset_transaction_manager + + reset_transaction_manager() + # ============= Session methods ============= def session(self, session_id: Optional[str] = None, must_exist: bool = False) -> Session: diff --git a/openviking/eval/ragas/__init__.py b/openviking/eval/ragas/__init__.py index 03336bc7..df295210 100644 --- a/openviking/eval/ragas/__init__.py +++ b/openviking/eval/ragas/__init__.py @@ -111,8 +111,8 @@ def _create_ragas_llm_from_config() -> Optional[Any]: RAGAS LLM instance or None if VLM is not configured. """ try: - from openai import OpenAI - from ragas.llms import llm_factory + from langchain_openai import ChatOpenAI + from ragas.llms import LangchainLLMWrapper except ImportError: return None @@ -124,11 +124,12 @@ def _create_ragas_llm_from_config() -> Optional[Any]: logger.info(f"Using RAGAS LLM from environment: model={model_name}, base_url={api_base}") - client = OpenAI( + openai_model = ChatOpenAI( + model=model_name, api_key=api_key, base_url=api_base, ) - return llm_factory(model_name, client=client) + return LangchainLLMWrapper(openai_model) try: from openviking_cli.utils.config import get_openviking_config @@ -151,13 +152,13 @@ def _create_ragas_llm_from_config() -> Optional[Any]: ) return None - client = OpenAI( + model_name = vlm_config.model or "gpt-4o-mini" + openai_model = ChatOpenAI( + model=model_name, api_key=vlm_config.api_key, base_url=vlm_config.api_base, ) - - model_name = vlm_config.model or "gpt-4o-mini" - return llm_factory(model_name, client=client) + return LangchainLLMWrapper(openai_model) class RagasEvaluator(BaseEvaluator): diff --git a/openviking/service/core.py b/openviking/service/core.py index 093db8f9..912f421a 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -307,7 +307,7 @@ async def initialize(self) -> None: async def 
close(self) -> None: """Close OpenViking and release resources.""" if self._transaction_manager: - self._transaction_manager.stop() + await self._transaction_manager.stop() self._transaction_manager = None if self._vikingdb_manager: diff --git a/openviking/storage/local_fs.py b/openviking/storage/local_fs.py index 0181c873..88a20720 100644 --- a/openviking/storage/local_fs.py +++ b/openviking/storage/local_fs.py @@ -11,6 +11,7 @@ from openviking.server.identity import RequestContext from openviking.storage.queuefs import EmbeddingQueue, get_queue_manager from openviking.storage.queuefs.embedding_msg_converter import EmbeddingMsgConverter +from openviking_cli.exceptions import NotFoundError from openviking_cli.utils.logger import get_logger from openviking_cli.utils.uri import VikingURI @@ -176,7 +177,7 @@ async def import_ovpack( f"Resource already exists at {root_uri}. Use force=True to overwrite." ) logger.info(f"[local_fs] Overwriting existing resource at {root_uri}") - except FileNotFoundError: + except NotFoundError: # Path does not exist, safe to import pass diff --git a/openviking/storage/queuefs/queue_manager.py b/openviking/storage/queuefs/queue_manager.py index b5a68af4..52b42476 100644 --- a/openviking/storage/queuefs/queue_manager.py +++ b/openviking/storage/queuefs/queue_manager.py @@ -245,9 +245,21 @@ async def process_one(data: Dict[str, Any]) -> None: await asyncio.sleep(self._poll_interval) - # Drain remaining in-flight tasks on shutdown + # Drain remaining in-flight tasks on shutdown (with timeout) if active_tasks: - await asyncio.gather(*active_tasks, return_exceptions=True) + try: + await asyncio.wait_for( + asyncio.gather(*active_tasks, return_exceptions=True), + timeout=5.0, + ) + except asyncio.TimeoutError: + logger.warning( + f"[QueueManager] Drain timeout for {queue.name}, " + f"cancelling {len(active_tasks)} in-flight task(s)" + ) + for t in active_tasks: + t.cancel() + await asyncio.gather(*active_tasks, return_exceptions=True) def 
stop(self) -> None: """Stop QueueManager and release resources.""" @@ -258,8 +270,10 @@ def stop(self) -> None: # Stop queue workers for stop_event in self._queue_stop_events.values(): stop_event.set() - for thread in self._queue_threads.values(): - thread.join() + for name, thread in self._queue_threads.items(): + thread.join(timeout=10.0) + if thread.is_alive(): + logger.warning(f"[QueueManager] Worker thread {name} did not exit in time") self._queue_threads.clear() self._queue_stop_events.clear() diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index 0e894474..3a05487e 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -237,6 +237,28 @@ def _finalize_children_abstracts(self, node: DirNode) -> List[Dict[str, str]]: results.append(item) return results + async def _execute_overview( + self, + dir_uri: str, + file_summaries: List[Dict[str, str]], + children_abstracts: List[Dict[str, str]], + ) -> str: + """Generate overview/abstract, write files, and vectorize a directory.""" + async with self._llm_sem: + overview = await self._processor._generate_overview( + dir_uri, file_summaries, children_abstracts + ) + abstract = self._processor._extract_abstract_from_overview(overview) + await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx) + await self._viking_fs.write_file(f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx) + try: + await self._processor._vectorize_directory_simple( + dir_uri, self._context_type, abstract, overview, ctx=self._ctx + ) + except Exception as e: + logger.error(f"Failed to vectorize directory {dir_uri}: {e}", exc_info=True) + return abstract + async def _overview_task(self, dir_uri: str) -> None: from openviking.storage.errors import LockAcquisitionError from openviking.storage.transaction import TransactionContext, get_transaction_manager @@ -250,43 +272,26 @@ async def _overview_task(self, dir_uri: str) -> 
None: children_abstracts = self._finalize_children_abstracts(node) abstract = "" - dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) - try: + dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) async with TransactionContext( get_transaction_manager(), "semantic_dag", [dir_path], lock_mode="point" ) as tx: - async with self._llm_sem: - overview = await self._processor._generate_overview( - dir_uri, file_summaries, children_abstracts - ) - abstract = self._processor._extract_abstract_from_overview(overview) - await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx) - await self._viking_fs.write_file(f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx) - try: - await self._processor._vectorize_directory_simple( - dir_uri, self._context_type, abstract, overview, ctx=self._ctx - ) - except Exception as e: - logger.error(f"Failed to vectorize directory {dir_uri}: {e}", exc_info=True) + abstract = await self._execute_overview(dir_uri, file_summaries, children_abstracts) await tx.commit() except LockAcquisitionError: logger.info(f"[SemanticDag] {dir_uri} does not exist or is locked, skipping") - abstract = "" except Exception as e: logger.error(f"Failed to generate overview for {dir_uri}: {e}", exc_info=True) - abstract = "" finally: self._stats.done_nodes += 1 self._stats.in_progress_nodes = max(0, self._stats.in_progress_nodes - 1) - - parent_uri = self._parent.get(dir_uri) - if parent_uri is None: - if self._root_done: - self._root_done.set() - return - - await self._on_child_done(parent_uri, dir_uri, abstract) + parent_uri = self._parent.get(dir_uri) + if parent_uri is None: + if self._root_done: + self._root_done.set() + else: + await self._on_child_done(parent_uri, dir_uri, abstract) def get_stats(self) -> DagStats: return DagStats( diff --git a/openviking/storage/transaction/__init__.py b/openviking/storage/transaction/__init__.py index 2730cd2e..afbc3e1e 100644 --- a/openviking/storage/transaction/__init__.py +++ 
b/openviking/storage/transaction/__init__.py @@ -13,6 +13,7 @@ TransactionManager, get_transaction_manager, init_transaction_manager, + reset_transaction_manager, ) from openviking.storage.transaction.transaction_record import ( TransactionRecord, @@ -31,4 +32,5 @@ "execute_rollback", "get_transaction_manager", "init_transaction_manager", + "reset_transaction_manager", ] diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py index 10107dde..68ad9784 100644 --- a/openviking/storage/transaction/context_manager.py +++ b/openviking/storage/transaction/context_manager.py @@ -36,12 +36,14 @@ def __init__( lock_paths: List[str], lock_mode: str = "point", mv_dst_path: Optional[str] = None, + src_is_dir: bool = True, ): self._tx_manager = tx_manager self._operation = operation self._lock_paths = lock_paths self._lock_mode = lock_mode self._mv_dst_path = mv_dst_path + self._src_is_dir = src_is_dir self._record: Optional[TransactionRecord] = None self._committed = False self._sequence = 0 @@ -81,7 +83,10 @@ async def __aenter__(self) -> "TransactionContext": if len(self._lock_paths) < 1 or not self._mv_dst_path: raise TransactionError("mv lock mode requires lock_paths[0] and mv_dst_path") success = await self._tx_manager.acquire_lock_mv( - tx_id, self._lock_paths[0], self._mv_dst_path + tx_id, + self._lock_paths[0], + self._mv_dst_path, + src_is_dir=self._src_is_dir, ) else: # "point" mode (default) diff --git a/openviking/storage/transaction/journal.py b/openviking/storage/transaction/journal.py index d641e905..6cb14474 100644 --- a/openviking/storage/transaction/journal.py +++ b/openviking/storage/transaction/journal.py @@ -10,8 +10,7 @@ import json from typing import Any, Dict, List -from pyagfs import AGFSClient - +from openviking.pyagfs import AGFSClient from openviking_cli.utils.logger import get_logger logger = get_logger(__name__) diff --git a/openviking/storage/transaction/path_lock.py 
b/openviking/storage/transaction/path_lock.py index 7cae0d9c..e2879694 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -1,34 +1,3 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Path lock implementation for transaction management. - -Provides path-based locking mechanism to prevent concurrent directory operations. -Lock protocol: viking://resources/.../.path.ovlock file exists = locked - -Lock files contain a fencing token in the format ``{tx_id}:{time_ns}:{lock_type}`` so that -stale locks (left by crashed processes) can be detected and removed. - -Two lock types: - POINT (P): Locks a specific directory for write/semantic operations. - Blocks if any ancestor holds a SUBTREE lock. - SUBTREE (S): Locks an entire directory subtree for rm/mv-source operations. - Blocks if any descendant holds any lock. - -Livelock prevention: after both parties write their lock files and detect a conflict, -the "later" one (larger (timestamp, tx_id)) backs off and retries. - -# TODO(multi-node): File-based locks only work correctly when all nodes share the -# same AGFS backend with strong read-write consistency. For multi-node deployments -# with replicated or partitioned storage, replace this implementation with a -# distributed lock backend (e.g. etcd txn+lease, ZooKeeper ephemeral nodes). -# The PathLock interface should be extracted to allow swappable backends. -# Key requirements for a distributed backend: -# - Atomic compare-and-set (to avoid write-write races on lock acquisition) -# - Session-bound leases (so crashed nodes auto-release without TTL polling) -# - Monotonically increasing fencing tokens (etcd revision works well) -""" - import asyncio import time from typing import Optional, Tuple @@ -51,40 +20,13 @@ def _make_fencing_token(tx_id: str, lock_type: str = LOCK_TYPE_POINT) -> str: - """Create a fencing token for a transaction. 
- - Format: ``{tx_id}:{time_ns}:{lock_type}`` where time_ns is the current - wall-clock time in nanoseconds and lock_type is P or S. - - Args: - tx_id: Transaction ID - lock_type: Lock type, either LOCK_TYPE_POINT ("P") or LOCK_TYPE_SUBTREE ("S") - - Returns: - Fencing token string - """ return f"{tx_id}:{time.time_ns()}:{lock_type}" def _parse_fencing_token(token: str) -> Tuple[str, int, str]: - """Parse a fencing token into (tx_id, timestamp_ns, lock_type). - - Supports: - - New format: ``{tx_id}:{time_ns}:P`` or ``{tx_id}:{time_ns}:S`` - - Legacy format: ``{tx_id}:{time_ns}`` (defaults to POINT) - - Very legacy: plain tx_id (ts=0, defaults to POINT) - - Args: - token: Fencing token string - - Returns: - (tx_id, timestamp_ns, lock_type) — timestamp_ns is 0 for legacy tokens, - lock_type defaults to LOCK_TYPE_POINT for legacy tokens. - """ - # New format ends with ":P" or ":S" if token.endswith(f":{LOCK_TYPE_POINT}") or token.endswith(f":{LOCK_TYPE_SUBTREE}"): lock_type = token[-1] - rest = token[:-2] # strip ":{lock_type}" + rest = token[:-2] idx = rest.rfind(":") if idx >= 0: tx_id_part = rest[:idx] @@ -95,7 +37,6 @@ def _parse_fencing_token(token: str) -> Tuple[str, int, str]: pass return rest, 0, lock_type - # Legacy format: {tx_id}:{time_ns} if ":" in token: idx = token.rfind(":") tx_id_part = token[:idx] @@ -109,34 +50,15 @@ def _parse_fencing_token(token: str) -> Tuple[str, int, str]: class PathLock: - """Path lock manager for transaction-based directory locking. - - Implements path-based locking using lock files (.path.ovlock) to prevent - concurrent operations on the same directory tree. - - Two lock types: - POINT (P): Used for write and semantic processing operations. - SUBTREE (S): Used for rm and mv-source operations. - """ - def __init__(self, agfs_client: AGFSClient, lock_expire: float = 300.0): - """Initialize path lock manager. 
- - Args: - agfs_client: AGFS client for file system operations - lock_expire: Stale lock expiry threshold in seconds (default: 300s). - Locks held longer than this by a crashed process are force-released. - """ self._agfs = agfs_client self._lock_expire = lock_expire def _get_lock_path(self, path: str) -> str: - """Get lock file path for a directory.""" path = path.rstrip("/") return f"{path}/{LOCK_FILE_NAME}" def _get_parent_path(self, path: str) -> Optional[str]: - """Get parent directory path.""" path = path.rstrip("/") if "/" not in path: return None @@ -144,17 +66,17 @@ def _get_parent_path(self, path: str) -> Optional[str]: return parent if parent else None def _read_token(self, lock_path: str) -> Optional[str]: - """Read fencing token from lock file, returning None if absent.""" try: content = self._agfs.cat(lock_path) if isinstance(content, bytes): - return content.decode("utf-8").strip() - return str(content).strip() + token = content.decode("utf-8").strip() + else: + token = str(content).strip() + return token if token else None except Exception: return None async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool: - """Check if path is locked by another transaction (any lock type).""" token = self._read_token(lock_path) if token is None: return False @@ -164,59 +86,36 @@ async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool async def _create_lock_file( self, lock_path: str, transaction_id: str, lock_type: str = LOCK_TYPE_POINT ) -> None: - """Create lock file with fencing token.""" token = _make_fencing_token(transaction_id, lock_type) self._agfs.write(lock_path, token.encode("utf-8")) async def _verify_lock_ownership(self, lock_path: str, transaction_id: str) -> bool: - """Verify lock file is owned by current transaction.""" token = self._read_token(lock_path) if token is None: return False lock_owner, _, _ = _parse_fencing_token(token) return lock_owner == transaction_id - async def 
_remove_lock_file(self, lock_path: str) -> None: - """Remove lock file.""" + async def _remove_lock_file(self, lock_path: str) -> bool: try: self._agfs.rm(lock_path) - except Exception: - pass + return True + except Exception as e: + if "not found" in str(e).lower(): + return True + return False def is_lock_stale(self, lock_path: str, expire_seconds: float = 300.0) -> bool: - """Check if a lock file is stale (left by a crashed process). - - A lock is considered stale if: - - The lock file does not exist (already cleaned up) - - The lock file contains a legacy token (no timestamp) - - The lock has been held longer than ``expire_seconds`` - - Args: - lock_path: Lock file path - expire_seconds: Lock expiry threshold in seconds (default: 5 minutes) - - Returns: - True if the lock is stale, False if it is still fresh - """ token = self._read_token(lock_path) if token is None: - return True # No file = stale + return True _, ts, _ = _parse_fencing_token(token) if ts == 0: - return True # Legacy format = consider stale + return True age = (time.time_ns() - ts) / 1e9 return age > expire_seconds async def _check_ancestors_for_subtree(self, path: str, exclude_tx_id: str) -> Optional[str]: - """Walk all ancestor directories and return the first SUBTREE lock held by another tx. - - Args: - path: Starting directory path (its ancestors are checked, not itself) - exclude_tx_id: Transaction ID to exclude from conflict detection - - Returns: - Lock file path of the conflicting SUBTREE lock, or None if no conflict - """ parent = self._get_parent_path(path) while parent: lock_path = self._get_lock_path(parent) @@ -229,15 +128,6 @@ async def _check_ancestors_for_subtree(self, path: str, exclude_tx_id: str) -> O return None async def _scan_descendants_for_locks(self, path: str, exclude_tx_id: str) -> Optional[str]: - """Recursively scan all descendant directories for locks held by another tx. 
- - Args: - path: Root directory path to scan (its own lock is NOT checked here) - exclude_tx_id: Transaction ID to exclude from conflict detection - - Returns: - Lock file path of the first conflicting lock found, or None if no conflict - """ try: entries = self._agfs.ls(path) if not isinstance(entries, list): @@ -257,7 +147,6 @@ async def _scan_descendants_for_locks(self, path: str, exclude_tx_id: str) -> Op owner_id, _, _ = _parse_fencing_token(token) if owner_id != exclude_tx_id: return subdir_lock - # Recurse into subdir result = await self._scan_descendants_for_locks(subdir, exclude_tx_id) if result: return result @@ -268,37 +157,10 @@ async def _scan_descendants_for_locks(self, path: str, exclude_tx_id: str) -> Op async def acquire_point( self, path: str, transaction: TransactionRecord, timeout: float = 0.0 ) -> bool: - """Acquire POINT lock for write/semantic-processing operations. - - A POINT lock is placed on a single directory. It conflicts with: - - Any lock (P or S) on the same directory by another transaction - - Any SUBTREE (S) lock on any ancestor directory - - Lock acquisition flow: - 1. Check target directory exists - 2. Check if target directory is locked by another transaction → wait/stale-remove - 3. Check if any ancestor holds a SUBTREE lock → wait/stale-remove - 4. Write POINT(P) lock file - 5. TOCTOU double-check: re-scan ancestors for SUBTREE locks - - Conflict found: compare (ts, tx_id); later one backs off and retries - 6. Verify lock ownership - 7. Return success - - Args: - path: Directory path to lock - transaction: Transaction record - timeout: Maximum time to wait for the lock in seconds. - 0 (default) = fail immediately if locked. - > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. 
- - Returns: - True if lock acquired successfully, False if timeout exceeded - """ transaction_id = transaction.id lock_path = self._get_lock_path(path) deadline = asyncio.get_event_loop().time() + timeout - # Step 1: Check target directory exists (once, before polling) try: self._agfs.stat(path) except Exception: @@ -306,11 +168,14 @@ async def acquire_point( return False while True: - # Step 2: Check if target directory is locked by another transaction if await self._is_locked_by_other(lock_path, transaction_id): if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[POINT] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) + if asyncio.get_event_loop().time() >= deadline: + logger.warning(f"[POINT] Timeout waiting for lock on: {path}") + return False + await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_event_loop().time() >= deadline: logger.warning(f"[POINT] Timeout waiting for lock on: {path}") @@ -318,7 +183,6 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 3: Check all ancestors for SUBTREE locks ancestor_conflict = await self._check_ancestors_for_subtree(path, transaction_id) if ancestor_conflict: if self.is_lock_stale(ancestor_conflict, self._lock_expire): @@ -326,6 +190,12 @@ async def acquire_point( f"[POINT] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" ) await self._remove_lock_file(ancestor_conflict) + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_event_loop().time() >= deadline: logger.warning( @@ -335,14 +205,12 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 4: Write POINT lock file try: await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_POINT) except Exception as e: logger.error(f"[POINT] Failed to create lock file: {e}") return 
False - # Step 5: TOCTOU double-check ancestors for SUBTREE locks backed_off = False conflict_after = await self._check_ancestors_for_subtree(path, transaction_id) if conflict_after: @@ -353,13 +221,10 @@ async def acquire_point( _, my_ts, _ = ( _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_POINT) ) - # Later one (larger (ts, tx_id)) backs off if (my_ts, transaction_id) > (their_ts, their_tx_id): logger.debug(f"[POINT] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True - # Either: I backed off, or they will back off. - # In both cases restart the outer loop after a brief wait. if asyncio.get_event_loop().time() >= deadline: if not backed_off: await self._remove_lock_file(lock_path) @@ -367,7 +232,6 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 6: Verify lock ownership if not await self._verify_lock_ownership(lock_path, transaction_id): logger.debug(f"[POINT] Lock ownership verification failed: {path}") if asyncio.get_event_loop().time() >= deadline: @@ -375,7 +239,6 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - # Success transaction.add_lock(lock_path) logger.debug(f"[POINT] Lock acquired: {lock_path}") return True @@ -383,38 +246,10 @@ async def acquire_point( async def acquire_subtree( self, path: str, transaction: TransactionRecord, timeout: float = 0.0 ) -> bool: - """Acquire SUBTREE lock for rm/mv-source operations. - - A SUBTREE lock is placed on a single directory (the root of the subtree). - It conflicts with: - - Any lock (P or S) on the same directory by another transaction - - Any lock (P or S) on any descendant directory by another transaction - - Lock acquisition flow: - 1. Check target directory exists - 2. Check if target directory is locked by another transaction → wait/stale-remove - 3. Scan all descendants for any locks → wait/stale-remove - 4. Write SUBTREE(S) lock file (only one file, at the root path) - 5. 
TOCTOU double-check: re-scan descendants for any new locks - - Conflict found: compare (ts, tx_id); later one backs off and retries - 6. Verify lock ownership - 7. Return success - - Args: - path: Directory path to lock (root of the subtree) - transaction: Transaction record - timeout: Maximum time to wait for the lock in seconds. - 0 (default) = fail immediately if locked. - > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. - - Returns: - True if lock acquired successfully, False if timeout exceeded - """ transaction_id = transaction.id lock_path = self._get_lock_path(path) deadline = asyncio.get_event_loop().time() + timeout - # Step 1: Check target directory exists try: self._agfs.stat(path) except Exception: @@ -422,11 +257,14 @@ async def acquire_subtree( return False while True: - # Step 2: Check if target directory is locked by another transaction if await self._is_locked_by_other(lock_path, transaction_id): if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) + if asyncio.get_event_loop().time() >= deadline: + logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") + return False + await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_event_loop().time() >= deadline: logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") @@ -434,12 +272,17 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 3: Scan all descendants for any locks by other transactions desc_conflict = await self._scan_descendants_for_locks(path, transaction_id) if desc_conflict: if self.is_lock_stale(desc_conflict, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") await self._remove_lock_file(desc_conflict) + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[SUBTREE] Timeout waiting for descendant lock: {desc_conflict}" + ) + return False + await 
asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_event_loop().time() >= deadline: logger.warning( @@ -449,14 +292,12 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 4: Write SUBTREE lock file (only one file) try: await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_SUBTREE) except Exception as e: logger.error(f"[SUBTREE] Failed to create lock file: {e}") return False - # Step 5: TOCTOU double-check descendants backed_off = False conflict_after = await self._scan_descendants_for_locks(path, transaction_id) if conflict_after: @@ -467,13 +308,10 @@ async def acquire_subtree( _, my_ts, _ = ( _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_SUBTREE) ) - # Later one (larger (ts, tx_id)) backs off if (my_ts, transaction_id) > (their_ts, their_tx_id): logger.debug(f"[SUBTREE] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True - # Either: I backed off, or they will back off. - # In both cases restart the outer loop after a brief wait. if asyncio.get_event_loop().time() >= deadline: if not backed_off: await self._remove_lock_file(lock_path) @@ -481,7 +319,6 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - # Step 6: Verify lock ownership if not await self._verify_lock_ownership(lock_path, transaction_id): logger.debug(f"[SUBTREE] Lock ownership verification failed: {path}") if asyncio.get_event_loop().time() >= deadline: @@ -489,7 +326,6 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - # Success transaction.add_lock(lock_path) logger.debug(f"[SUBTREE] Lock acquired: {lock_path}") return True @@ -500,46 +336,30 @@ async def acquire_mv( dst_path: str, transaction: TransactionRecord, timeout: float = 0.0, + src_is_dir: bool = True, ) -> bool: - """Acquire path lock for mv operation. - - Lock acquisition flow for mv operations: - 1. Acquire SUBTREE lock on source directory - 2. 
Acquire POINT lock on destination parent directory - - Args: - src_path: Source directory path - dst_path: Destination parent directory path - transaction: Transaction record - timeout: Maximum time to wait for each lock in seconds. - 0 (default) = fail immediately if locked. - > 0 = poll every _POLL_INTERVAL seconds until acquired or timeout. - - Returns: - True if all locks acquired successfully, False otherwise - """ - # Step 1: Lock source directory with SUBTREE lock - if not await self.acquire_subtree(src_path, transaction, timeout=timeout): - logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") - return False - - # Step 2: Lock destination parent directory with POINT lock - if not await self.acquire_point(dst_path, transaction, timeout=timeout): - logger.warning(f"[MV] Failed to acquire POINT lock on destination: {dst_path}") - # Release source lock - await self.release(transaction) - return False + if src_is_dir: + if not await self.acquire_subtree(src_path, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") + return False + if not await self.acquire_subtree(dst_path, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire SUBTREE lock on destination: {dst_path}") + await self.release(transaction) + return False + else: + src_parent = src_path.rsplit("/", 1)[0] if "/" in src_path else src_path + if not await self.acquire_point(src_parent, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire POINT lock on source parent: {src_parent}") + return False + if not await self.acquire_point(dst_path, transaction, timeout=timeout): + logger.warning(f"[MV] Failed to acquire POINT lock on destination: {dst_path}") + await self.release(transaction) + return False logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_path}") return True async def release(self, transaction: TransactionRecord) -> None: - """Release all locks held by the transaction. 
- - Args: - transaction: Transaction record - """ - # Release locks in reverse order (LIFO) for lock_path in reversed(transaction.locks): await self._remove_lock_file(lock_path) transaction.remove_lock(lock_path) diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index d83d8464..28dfe64d 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -100,7 +100,7 @@ async def start(self) -> None: logger.info("TransactionManager started") - def stop(self) -> None: + async def stop(self) -> None: """Stop transaction manager. Stops the background cleanup task and releases all resources. @@ -114,11 +114,17 @@ def stop(self) -> None: # Cancel cleanup task if self._cleanup_task: self._cleanup_task.cancel() + try: + await self._cleanup_task + except asyncio.CancelledError: + pass self._cleanup_task = None - # Release all active transactions + # Release all active transactions' locks for tx_id in list(self._transactions.keys()): - self._transactions.pop(tx_id, None) + tx = self._transactions.pop(tx_id, None) + if tx: + await self._path_lock.release(tx) logger.info("TransactionManager stopped") @@ -508,14 +514,16 @@ async def acquire_lock_mv( src_path: str, dst_path: str, timeout: Optional[float] = None, + src_is_dir: bool = True, ) -> bool: """Acquire path lock for mv operation. 
Args: transaction_id: Transaction ID - src_path: Source directory path + src_path: Source path dst_path: Destination parent directory path timeout: Maximum time to wait for each lock in seconds (default: from config) + src_is_dir: Whether the source is a directory Returns: True if lock acquired successfully, False otherwise @@ -528,7 +536,7 @@ async def acquire_lock_mv( tx.update_status(TransactionStatus.AQUIRE) effective_timeout = timeout if timeout is not None else self._lock_timeout success = await self._path_lock.acquire_mv( - src_path, dst_path, tx, timeout=effective_timeout + src_path, dst_path, tx, timeout=effective_timeout, src_is_dir=src_is_dir ) if success: @@ -603,3 +611,15 @@ def get_transaction_manager() -> Optional[TransactionManager]: TransactionManager instance or None if not initialized """ return _transaction_manager + + +def reset_transaction_manager() -> None: + """Reset the transaction manager singleton (for testing). + + This function should ONLY be used in tests to clean up state between tests. + It clears the global singleton instance without performing cleanup - make sure + to call stop() first if the manager is still running. + """ + global _transaction_manager + with _lock: + _transaction_manager = None diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index f7395238..0e954ca9 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -324,9 +324,10 @@ async def mv( ) -> Dict[str, Any]: """Move file/directory + recursively update vector index. - Wrapped in a transaction: performs FS mv first, then VectorDB URI update. - On rollback, the file is moved back and VectorDB mappings are restored. + Implemented as cp + rm to avoid lock files being carried by FS mv. + On rollback, the copy is deleted and the source remains intact. 
""" + from openviking.pyagfs.helpers import cp as agfs_cp from openviking.storage.transaction import TransactionContext, get_transaction_manager self._ensure_access(old_uri, ctx) @@ -350,30 +351,43 @@ async def mv( logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") raise - # Verify source exists before locking + # Verify source exists and determine type before locking try: - self.agfs.stat(old_path) + stat = self.agfs.stat(old_path) + is_dir = stat.get("isDir", False) if isinstance(stat, dict) else False except Exception: raise FileNotFoundError(f"mv source not found: {old_uri}") - # Lock source and destination's parent (dst doesn't exist yet) dst_parent = new_path.rsplit("/", 1)[0] if "/" in new_path else new_path async with TransactionContext( - tx_manager, "mv", [old_path], lock_mode="mv", mv_dst_path=dst_parent + tx_manager, + "mv", + [old_path], + lock_mode="mv", + mv_dst_path=dst_parent, + src_is_dir=is_dir, ) as tx: - # Step 1: FS move - seq_mv = tx.record_undo("fs_mv", {"src": old_path, "dst": new_path}) + # Step 1: Copy source to destination + seq_cp = tx.record_undo("fs_write_new", {"uri": new_path}) try: - result = self.agfs.mv(old_path, new_path) - except AGFSHTTPError as e: - if e.status_code == 404: + agfs_cp(self.agfs, old_path, new_path, recursive=is_dir) + except Exception as e: + if "not found" in str(e).lower(): await self._delete_from_vector_store(uris_to_move, ctx=ctx) logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") raise - tx.mark_completed(seq_mv) + tx.mark_completed(seq_cp) + + # Step 2: Remove carried lock file from the copy (directory only) + if is_dir: + carried_lock = new_path.rstrip("/") + "/.path.ovlock" + try: + self.agfs.rm(carried_lock) + except Exception: + pass - # Step 2: Update VectorDB URIs + # Step 3: Update VectorDB URIs old_uri_stripped = old_uri.rstrip("/") old_parent_uri = ( old_uri_stripped.rsplit("/", 1)[0] + "/" if "/" in old_uri_stripped else "" @@ -390,8 
+404,13 @@ async def mv( await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) tx.mark_completed(seq_vdb) + # Step 4: Remove source (lock file gets deleted along with it) + seq_rm = tx.record_undo("fs_rm", {"uri": old_path, "recursive": is_dir}) + self.agfs.rm(old_path, recursive=is_dir) + tx.mark_completed(seq_rm) + await tx.commit() - return result + return {} async def grep( self, @@ -1380,6 +1399,12 @@ async def read_file( """ self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) + # Verify the file exists before reading, because AGFS read returns + # empty bytes for non-existent files instead of raising an error. + try: + self.agfs.stat(path) + except Exception: + raise NotFoundError(uri, "file") try: content = self.agfs.read(path) except Exception: diff --git a/tests/agfs/test_fs_s3.py b/tests/agfs/test_fs_s3.py index 330c7089..ff9647e4 100644 --- a/tests/agfs/test_fs_s3.py +++ b/tests/agfs/test_fs_s3.py @@ -46,7 +46,8 @@ def load_agfs_config() -> AGFSConfig: AGFS_CONF = load_agfs_config() -AGFS_CONF.mode = "http-client" +if AGFS_CONF is not None: + AGFS_CONF.mode = "http-client" # 2. 
Skip tests if no S3 config found or backend is not S3 pytestmark = pytest.mark.skipif( diff --git a/tests/client/test_resource_management.py b/tests/client/test_resource_management.py index 294ce8d4..8a24c8e0 100644 --- a/tests/client/test_resource_management.py +++ b/tests/client/test_resource_management.py @@ -51,7 +51,7 @@ async def test_add_resource_with_target( """Test adding resource to specified target""" result = await client.add_resource( path=str(sample_markdown_file), - target="viking://resources/custom/", + to="viking://resources/custom/sample", reason="Test resource", ) diff --git a/tests/integration/test_add_resource_index.py b/tests/integration/test_add_resource_index.py index 32421e69..27d6e234 100644 --- a/tests/integration/test_add_resource_index.py +++ b/tests/integration/test_add_resource_index.py @@ -1,10 +1,8 @@ -import pytest -import asyncio -import os import json -import shutil -from pathlib import Path -from unittest.mock import MagicMock, AsyncMock, patch +import os +from unittest.mock import AsyncMock, patch + +import pytest from openviking.async_client import AsyncOpenViking from openviking_cli.utils.config.open_viking_config import OpenVikingConfigSingleton @@ -96,6 +94,7 @@ async def test_add_resource_indexing_logic(test_config, tmp_path): patch("openviking.utils.agfs_utils.create_agfs_client", return_value=mock_agfs), patch("openviking.agfs_manager.AGFSManager.start"), patch("openviking.agfs_manager.AGFSManager.stop"), + patch("openviking.storage.transaction.get_transaction_manager", return_value=None), ): mock_summarize.return_value = {"status": "success"} diff --git a/tests/integration/test_full_workflow.py b/tests/integration/test_full_workflow.py index 3f86b559..823cefd7 100644 --- a/tests/integration/test_full_workflow.py +++ b/tests/integration/test_full_workflow.py @@ -67,11 +67,17 @@ async def test_add_search_read_workflow( # 3. 
Read searched resource if search_result.resources: - res = await client.tree(search_result.resources[0].uri) - for data in res: - if not data["isDir"]: - content = await client.read(data["uri"]) - assert len(content) > 0 + uri = search_result.resources[0].uri + info = await client.stat(uri) + if info.get("isDir"): + res = await client.tree(uri) + for data in res: + if not data["isDir"]: + content = await client.read(data["uri"]) + assert len(content) > 0 + else: + content = await client.read(uri) + assert len(content) > 0 class TestSessionWorkflow: diff --git a/tests/server/test_api_filesystem.py b/tests/server/test_api_filesystem.py index 79058d37..3a0da611 100644 --- a/tests/server/test_api_filesystem.py +++ b/tests/server/test_api_filesystem.py @@ -66,14 +66,6 @@ async def test_tree(client: httpx.AsyncClient): assert body["status"] == "ok" -async def test_stat_after_add_resource(client_with_resource): - client, uri = client_with_resource - resp = await client.get("/api/v1/fs/stat", params={"uri": uri}) - assert resp.status_code == 200 - body = resp.json() - assert body["status"] == "ok" - - async def test_stat_not_found(client: httpx.AsyncClient): resp = await client.get( "/api/v1/fs/stat", @@ -84,18 +76,28 @@ async def test_stat_not_found(client: httpx.AsyncClient): assert body["status"] == "error" -async def test_rm_resource(client_with_resource): +async def test_resource_ops(client_with_resource): + """Test stat, ls_recursive, mv, rm on a single shared resource.""" + import uuid + client, uri = client_with_resource - resp = await client.request("DELETE", "/api/v1/fs", params={"uri": uri, "recursive": True}) + + # stat + resp = await client.get("/api/v1/fs/stat", params={"uri": uri}) assert resp.status_code == 200 assert resp.json()["status"] == "ok" + # ls recursive + resp = await client.get( + "/api/v1/fs/ls", + params={"uri": "viking://", "recursive": True}, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["status"] == "ok" + assert 
isinstance(body["result"], list) -async def test_mv_resource(client_with_resource): - import uuid - - client, uri = client_with_resource - # Use a unique name to avoid conflicts with leftover data + # mv unique = uuid.uuid4().hex[:8] new_uri = uri.rstrip("/") + f"_mv_{unique}/" resp = await client.post( @@ -105,14 +107,7 @@ async def test_mv_resource(client_with_resource): assert resp.status_code == 200 assert resp.json()["status"] == "ok" - -async def test_ls_recursive(client_with_resource): - client, _ = client_with_resource - resp = await client.get( - "/api/v1/fs/ls", - params={"uri": "viking://", "recursive": True}, - ) + # rm (on the moved uri) + resp = await client.request("DELETE", "/api/v1/fs", params={"uri": new_uri, "recursive": True}) assert resp.status_code == 200 - body = resp.json() - assert body["status"] == "ok" - assert isinstance(body["result"], list) + assert resp.json()["status"] == "ok" diff --git a/tests/server/test_api_resources.py b/tests/server/test_api_resources.py index 013c6baa..16ed1a71 100644 --- a/tests/server/test_api_resources.py +++ b/tests/server/test_api_resources.py @@ -5,8 +5,6 @@ import httpx -from tests.server.conftest import SAMPLE_MD_CONTENT - async def test_add_resource_success(client: httpx.AsyncClient, sample_markdown_file): resp = await client.post( @@ -55,7 +53,7 @@ async def test_add_resource_with_target(client: httpx.AsyncClient, sample_markdo "/api/v1/resources", json={ "path": str(sample_markdown_file), - "target": "viking://resources/custom/", + "to": "viking://resources/custom/sample", "reason": "test resource", }, ) diff --git a/tests/storage/test_semantic_dag_stats.py b/tests/storage/test_semantic_dag_stats.py index 10f06c22..202db790 100644 --- a/tests/storage/test_semantic_dag_stats.py +++ b/tests/storage/test_semantic_dag_stats.py @@ -1,6 +1,8 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import AsyncMock, MagicMock + import pytest from openviking.server.identity import RequestContext, Role @@ -19,6 +21,9 @@ async def ls(self, uri, ctx=None): async def write_file(self, path, content, ctx=None): self.writes.append((path, content)) + def _uri_to_path(self, uri, ctx=None): + return uri.replace("viking://", "/local/acc1/") + class _FakeProcessor: def __init__(self): @@ -59,6 +64,22 @@ async def test_semantic_dag_stats_collects_nodes(monkeypatch): fake_fs = _FakeVikingFS(tree) monkeypatch.setattr("openviking.storage.queuefs.semantic_dag.get_viking_fs", lambda: fake_fs) + # Mock transaction layer: TransactionContext as no-op passthrough + mock_tx = MagicMock() + mock_tx.commit = AsyncMock() + monkeypatch.setattr( + "openviking.storage.transaction.context_manager.TransactionContext.__aenter__", + AsyncMock(return_value=mock_tx), + ) + monkeypatch.setattr( + "openviking.storage.transaction.context_manager.TransactionContext.__aexit__", + AsyncMock(return_value=False), + ) + monkeypatch.setattr( + "openviking.storage.transaction.transaction_manager.get_transaction_manager", + lambda: MagicMock(), + ) + processor = _FakeProcessor() ctx = RequestContext(user=UserIdentifier("acc1", "user1", "agent1"), role=Role.USER) executor = SemanticDagExecutor( diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py index 8f67ea96..d498fb59 100644 --- a/tests/transaction/test_e2e.py +++ b/tests/transaction/test_e2e.py @@ -24,8 +24,8 @@ def tx_manager(agfs_client): agfs_client=agfs_client, timeout=3600, max_parallel_locks=8, - lock_timeout=5.0, - lock_expire=300.0, + lock_timeout=1.0, + lock_expire=1.0, ) return manager diff --git a/tests/transaction/test_transaction_manager.py b/tests/transaction/test_transaction_manager.py index ab9d5256..3d6fd198 100644 --- a/tests/transaction/test_transaction_manager.py +++ b/tests/transaction/test_transaction_manager.py @@ -283,20 +283,20 @@ async def 
test_start_idempotent(self): await manager.start() await manager.start() # Should not error assert manager._running is True - manager.stop() + await manager.stop() async def test_stop_clears_state(self): manager, _ = _make_manager() await manager.start() manager.create_transaction() - manager.stop() + await manager.stop() assert manager._running is False assert manager.get_transaction_count() == 0 async def test_stop_idempotent(self): manager, _ = _make_manager() - manager.stop() - manager.stop() # Should not error + await manager.stop() + await manager.stop() # Should not error class TestTimeoutCleanup: From 6cab58fb990f093cf337a1f7d07c7594003b9115 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 14:28:09 +0800 Subject: [PATCH 04/18] fix: tests --- docs/en/concepts/09-transaction.md | 8 ++-- docs/zh/concepts/09-transaction.md | 8 ++-- openviking/service/core.py | 1 + openviking/storage/local_fs.py | 5 ++- .../storage/observers/transaction_observer.py | 6 +-- openviking/storage/queuefs/semantic_dag.py | 2 + .../storage/queuefs/semantic_processor.py | 2 + .../storage/transaction/context_manager.py | 2 +- openviking/storage/transaction/path_lock.py | 26 +++++++++++- .../transaction/transaction_manager.py | 41 +++++++++++++++---- .../storage/transaction/transaction_record.py | 4 +- openviking/storage/viking_fs.py | 13 ------ .../test_hierarchical_retriever_rerank.py | 4 +- tests/server/conftest.py | 21 ++++++++++ tests/transaction/test_context_manager.py | 4 +- tests/transaction/test_crash_recovery.py | 6 +-- tests/transaction/test_e2e.py | 4 +- tests/transaction/test_path_lock.py | 6 +-- tests/transaction/test_transaction_manager.py | 2 +- 19 files changed, 115 insertions(+), 50 deletions(-) diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 65ec4c3b..3469ed2f 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -251,7 +251,7 @@ Contains: transaction ID, status, lock paths, 
init_info, undo_log, post_actions. ``` Create transaction -> write journal (INIT) -Acquire lock -> update journal (AQUIRE -> EXEC) +Acquire lock -> update journal (ACQUIRE -> EXEC) Execute changes -> update journal per step (mark undo entry completed) Commit -> update journal (COMMIT + post_actions) -> execute post_actions -> release locks -> delete journal @@ -267,7 +267,7 @@ Rollback -> execute undo log -> release locks -> delete journal | `COMMIT` + non-empty post_actions | Replay post_actions -> release locks -> delete journal | | `COMMIT` + empty post_actions / `RELEASED` | Release locks -> delete journal | | `EXEC` / `FAIL` / `RELEASING` | Execute undo log rollback (`recover_all=True`) -> release locks -> delete journal | -| `INIT` / `AQUIRE` | Clean up orphan locks (using init_info.lock_paths) -> delete journal (no changes were made) | +| `INIT` / `ACQUIRE` | Clean up orphan locks (using init_info.lock_paths) -> delete journal (no changes were made) | ### Defense Summary @@ -283,13 +283,13 @@ Rollback -> execute undo log -> release locks -> delete journal ## Transaction State Machine ``` -INIT -> AQUIRE -> EXEC -> COMMIT -> RELEASING -> RELEASED +INIT -> ACQUIRE -> EXEC -> COMMIT -> RELEASING -> RELEASED | FAIL -> RELEASING -> RELEASED ``` - `INIT`: Transaction created, waiting for lock -- `AQUIRE`: Acquiring lock +- `ACQUIRE`: Acquiring lock - `EXEC`: Transaction operations executing - `COMMIT`: Committed, post_actions may be pending - `FAIL`: Execution failed, entering rollback diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 99723042..6397cd2d 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -251,7 +251,7 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as ``` 创建事务 → 写 journal(INIT) -获取锁 → 更新 journal(AQUIRE → EXEC) +获取锁 → 更新 journal(ACQUIRE → EXEC) 执行变更 → 每步更新 journal(标记 undo entry completed) 提交 → 更新 journal(COMMIT + post_actions) → 执行 
post_actions → 删锁 → 删 journal @@ -267,7 +267,7 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | `COMMIT` + post_actions 非空 | 重放 post_actions → 删锁 → 删 journal | | `COMMIT` + post_actions 为空 / `RELEASED` | 删锁 → 删 journal | | `EXEC` / `FAIL` / `RELEASING` | 执行 undo log 回滚(`recover_all=True`) → 删锁 → 删 journal | -| `INIT` / `AQUIRE` | 通过 init_info.lock_paths 清理孤儿锁 → 删 journal(变更未执行) | +| `INIT` / `ACQUIRE` | 通过 init_info.lock_paths 清理孤儿锁 → 删 journal(变更未执行) | ### 防线总结 @@ -283,13 +283,13 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as ## 事务状态机 ``` -INIT → AQUIRE → EXEC → COMMIT → RELEASING → RELEASED +INIT → ACQUIRE → EXEC → COMMIT → RELEASING → RELEASED ↓ FAIL → RELEASING → RELEASED ``` - `INIT`:事务已创建,等待锁获取 -- `AQUIRE`:正在获取锁 +- `ACQUIRE`:正在获取锁 - `EXEC`:事务操作执行中 - `COMMIT`:已提交,可能有 post_actions 待执行 - `FAIL`:执行失败,进入回滚 diff --git a/openviking/service/core.py b/openviking/service/core.py index a43da906..b86f9697 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -149,6 +149,7 @@ def _init_storage( max_parallel_locks=tx_cfg.max_parallel_locks, lock_timeout=tx_cfg.lock_timeout, lock_expire=tx_cfg.lock_expire, + vector_store=self._vikingdb_manager, ) @property diff --git a/openviking/storage/local_fs.py b/openviking/storage/local_fs.py index 40497487..3d23566d 100644 --- a/openviking/storage/local_fs.py +++ b/openviking/storage/local_fs.py @@ -205,9 +205,10 @@ async def import_ovpack( if not zip_path: continue - # Normalize path separators to handle Windows-created ZIPs - zip_path = zip_path.replace("\\", "/") + # Validate before normalization so backslash paths are rejected safe_zip_path = _validate_ovpack_member_path(zip_path, base_name) + # Normalize path separators to handle Windows-created ZIPs + safe_zip_path = safe_zip_path.replace("\\", "/") # Handle directory entries if safe_zip_path.endswith("/"): diff --git a/openviking/storage/observers/transaction_observer.py 
b/openviking/storage/observers/transaction_observer.py index dce4555d..e29b7665 100644 --- a/openviking/storage/observers/transaction_observer.py +++ b/openviking/storage/observers/transaction_observer.py @@ -81,7 +81,7 @@ def _format_status_as_table(self, transactions: Dict[str, Any]) -> str: # Group transactions by status status_counts = { TransactionStatus.INIT: 0, - TransactionStatus.AQUIRE: 0, + TransactionStatus.ACQUIRE: 0, TransactionStatus.EXEC: 0, TransactionStatus.COMMIT: 0, TransactionStatus.FAIL: 0, @@ -107,7 +107,7 @@ def _format_status_as_table(self, transactions: Dict[str, Any]) -> str: status_priority = { TransactionStatus.EXEC: 0, - TransactionStatus.AQUIRE: 1, + TransactionStatus.ACQUIRE: 1, TransactionStatus.RELEASING: 2, TransactionStatus.INIT: 3, TransactionStatus.COMMIT: 4, @@ -206,7 +206,7 @@ def get_status_summary(self) -> Dict[str, int]: summary = { "INIT": 0, - "AQUIRE": 0, + "ACQUIRE": 0, "EXEC": 0, "COMMIT": 0, "FAIL": 0, diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index aa6ab5d3..b792833d 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -279,6 +279,8 @@ async def _overview_task(self, dir_uri: str) -> None: abstract = "" try: dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) + # No undo entries recorded: semantic files (.overview.md / .abstract.md) are + # regenerable, so residual writes after a crash are acceptable. 
async with TransactionContext( get_transaction_manager(), "semantic_dag", [dir_path], lock_mode="point" ) as tx: diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index b595c4e1..e7e108f2 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -335,6 +335,8 @@ async def _process_single_directory( dir_path = viking_fs._uri_to_path(uri, ctx=self._current_ctx) try: + # No undo entries recorded: semantic files (.overview.md / .abstract.md) are + # regenerable, so residual writes after a crash are acceptable. async with TransactionContext( get_transaction_manager(), "semantic", [dir_path], lock_mode="point" ) as tx: diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py index 68ad9784..5d63658e 100644 --- a/openviking/storage/transaction/context_manager.py +++ b/openviking/storage/transaction/context_manager.py @@ -145,7 +145,7 @@ def add_post_action(self, action_type: str, params: Dict[str, Any]) -> None: self.record.post_actions.append({"type": action_type, "params": params}) async def commit(self) -> None: - self._committed = True success = await self._tx_manager.commit(self._record.id) if not success: raise TransactionError(f"Failed to commit transaction {self._record.id}") + self._committed = True diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index e2879694..856a288c 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -272,6 +272,29 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue + # Check ancestor paths for SUBTREE locks held by other transactions + ancestor_conflict = await self._check_ancestors_for_subtree(path, transaction_id) + if ancestor_conflict: + if self.is_lock_stale(ancestor_conflict, self._lock_expire): + logger.warning( + 
f"[SUBTREE] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" + ) + await self._remove_lock_file(ancestor_conflict) + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + if asyncio.get_event_loop().time() >= deadline: + logger.warning( + f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" + ) + return False + await asyncio.sleep(_POLL_INTERVAL) + continue + desc_conflict = await self._scan_descendants_for_locks(path, transaction_id) if desc_conflict: if self.is_lock_stale(desc_conflict, self._lock_expire): @@ -360,8 +383,9 @@ async def acquire_mv( return True async def release(self, transaction: TransactionRecord) -> None: + lock_count = len(transaction.locks) for lock_path in reversed(transaction.locks): await self._remove_lock_file(lock_path) transaction.remove_lock(lock_path) - logger.debug(f"Released {len(transaction.locks)} locks for transaction {transaction.id}") + logger.debug(f"Released {lock_count} locks for transaction {transaction.id}") diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 28dfe64d..9cd0cd9f 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -44,6 +44,7 @@ def __init__( max_parallel_locks: int = 8, lock_timeout: float = 0.0, lock_expire: float = 300.0, + vector_store: Optional[Any] = None, ): """Initialize transaction manager. @@ -55,6 +56,7 @@ def __init__( 0 (default) = fail immediately if locked. > 0 = wait/retry up to this many seconds. lock_expire: Stale lock expiry threshold in seconds (default: 300s). + vector_store: Optional vector store for VectorDB rollback operations. 
""" from openviking.storage.transaction.journal import TransactionJournal @@ -62,6 +64,7 @@ def __init__( self._timeout = timeout self._max_parallel_locks = max_parallel_locks self._lock_timeout = lock_timeout + self._vector_store = vector_store self._path_lock = PathLock(agfs_client, lock_expire=lock_expire) self._journal = TransactionJournal(agfs_client) @@ -205,7 +208,7 @@ async def _recover_one(self, tx_id: str) -> None: await self._execute_post_actions(tx.post_actions) except Exception as e: logger.warning(f"Post-action replay failed for tx {tx_id}: {e}") - elif tx.status in (TransactionStatus.INIT, TransactionStatus.AQUIRE): + elif tx.status in (TransactionStatus.INIT, TransactionStatus.ACQUIRE): # Transaction never executed any operations — nothing to rollback. # However, locks may have been created before the journal was updated # with the actual locks list. Use init_info.lock_paths to find and @@ -218,7 +221,12 @@ async def _recover_one(self, tx_id: str) -> None: # Pass recover_all=True so partial (completed=False) ops are also reversed, # e.g. a directory mv that started but never finished still leaves residue. 
try: - execute_rollback(tx.undo_log, self._agfs, recover_all=True) + execute_rollback( + tx.undo_log, + self._agfs, + vector_store=self._vector_store, + recover_all=True, + ) except Exception as e: logger.warning(f"Rollback during recovery failed for tx {tx_id}: {e}") @@ -306,7 +314,7 @@ async def begin(self, transaction_id: str) -> bool: logger.error(f"Transaction not found: {transaction_id}") return False - tx.update_status(TransactionStatus.AQUIRE) + tx.update_status(TransactionStatus.ACQUIRE) logger.debug(f"Transaction begun: {transaction_id}") return True @@ -389,7 +397,11 @@ async def rollback(self, transaction_id: str) -> bool: # Execute undo log (best-effort) if tx.undo_log: try: - execute_rollback(tx.undo_log, self._agfs) + execute_rollback( + tx.undo_log, + self._agfs, + vector_store=self._vector_store, + ) except Exception as e: logger.warning( f"Undo log execution failed during rollback of {transaction_id}: {e}" @@ -447,10 +459,20 @@ async def _post_enqueue_semantic(self, params: Dict[str, Any]) -> None: uri = params.get("uri") context_type = params.get("context_type", "resource") account_id = params.get("account_id", "default") + user_id = params.get("user_id", "default") + agent_id = params.get("agent_id", "default") + role = params.get("role", "root") if not uri: return - msg = SemanticMsg(uri=uri, context_type=context_type, account_id=account_id) + msg = SemanticMsg( + uri=uri, + context_type=context_type, + account_id=account_id, + user_id=user_id, + agent_id=agent_id, + role=role, + ) semantic_queue = queue_manager.get_queue(queue_manager.SEMANTIC) await semantic_queue.enqueue(msg) @@ -469,7 +491,7 @@ async def acquire_lock_point(self, transaction_id: str, path: str) -> bool: logger.error(f"Transaction not found: {transaction_id}") return False - tx.update_status(TransactionStatus.AQUIRE) + tx.update_status(TransactionStatus.ACQUIRE) success = await self._path_lock.acquire_point(path, tx, timeout=self._lock_timeout) if success: @@ -497,7 +519,7 @@ 
async def acquire_lock_subtree( logger.error(f"Transaction not found: {transaction_id}") return False - tx.update_status(TransactionStatus.AQUIRE) + tx.update_status(TransactionStatus.ACQUIRE) effective_timeout = timeout if timeout is not None else self._lock_timeout success = await self._path_lock.acquire_subtree(path, tx, timeout=effective_timeout) @@ -533,7 +555,7 @@ async def acquire_lock_mv( logger.error(f"Transaction not found: {transaction_id}") return False - tx.update_status(TransactionStatus.AQUIRE) + tx.update_status(TransactionStatus.ACQUIRE) effective_timeout = timeout if timeout is not None else self._lock_timeout success = await self._path_lock.acquire_mv( src_path, dst_path, tx, timeout=effective_timeout, src_is_dir=src_is_dir @@ -569,6 +591,7 @@ def init_transaction_manager( max_parallel_locks: int = 8, lock_timeout: float = 0.0, lock_expire: float = 300.0, + vector_store: Optional[Any] = None, ) -> TransactionManager: """Initialize transaction manager singleton. @@ -580,6 +603,7 @@ def init_transaction_manager( 0 (default) = fail immediately if locked. > 0 = wait/retry up to this many seconds. lock_expire: Stale lock expiry threshold in seconds (default: 300s). + vector_store: Optional vector store for VectorDB rollback operations. Returns: TransactionManager instance @@ -598,6 +622,7 @@ def init_transaction_manager( max_parallel_locks=max_parallel_locks, lock_timeout=lock_timeout, lock_expire=lock_expire, + vector_store=vector_store, ) logger.info("TransactionManager initialized as singleton") diff --git a/openviking/storage/transaction/transaction_record.py b/openviking/storage/transaction/transaction_record.py index c73775de..b9eb0656 100644 --- a/openviking/storage/transaction/transaction_record.py +++ b/openviking/storage/transaction/transaction_record.py @@ -16,11 +16,11 @@ class TransactionStatus(str, Enum): """Transaction status enumeration. 
- Status machine: INIT -> AQUIRE -> EXEC -> COMMIT/FAIL -> RELEASING -> RELEASED + Status machine: INIT -> ACQUIRE -> EXEC -> COMMIT/FAIL -> RELEASING -> RELEASED """ INIT = "INIT" # Transaction initialized, waiting for lock acquisition - AQUIRE = "AQUIRE" # Acquiring lock resources + ACQUIRE = "ACQUIRE" # Acquiring lock resources EXEC = "EXEC" # Transaction operation in progress COMMIT = "COMMIT" # Transaction completed successfully FAIL = "FAIL" # Transaction failed diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 47368f8e..56621f60 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -1172,19 +1172,6 @@ def _handle_agfs_content(self, result: Union[bytes, Any, None]) -> str: return str(result) except Exception: return "" - """Handle AGFSClient content return types consistently.""" - if isinstance(result, bytes): - return result.decode("utf-8") - elif hasattr(result, "content"): - return result.content.decode("utf-8") - elif result is None: - return "" - else: - # Try to convert to string - try: - return str(result) - except Exception: - return "" def _infer_context_type(self, uri: str): """Infer context_type from URI. 
Returns None when ambiguous.""" diff --git a/tests/retrieve/test_hierarchical_retriever_rerank.py b/tests/retrieve/test_hierarchical_retriever_rerank.py index ffaea6a8..f72682b3 100644 --- a/tests/retrieve/test_hierarchical_retriever_rerank.py +++ b/tests/retrieve/test_hierarchical_retriever_rerank.py @@ -180,8 +180,8 @@ def test_merge_starting_points_prefers_rerank_scores_in_thinking_mode(monkeypatc "hello", ["viking://resources"], [ - {"uri": "viking://resources/root-a", "abstract": "root A", "_score": 0.2}, - {"uri": "viking://resources/root-b", "abstract": "root B", "_score": 0.8}, + {"uri": "viking://resources/root-a", "abstract": "root A", "_score": 0.2, "level": 1}, + {"uri": "viking://resources/root-b", "abstract": "root B", "_score": 0.8, "level": 1}, ], mode=RetrieverMode.THINKING, ) diff --git a/tests/server/conftest.py b/tests/server/conftest.py index 627798b4..78dbb63e 100644 --- a/tests/server/conftest.py +++ b/tests/server/conftest.py @@ -20,8 +20,10 @@ from openviking.server.config import ServerConfig from openviking.server.identity import RequestContext, Role from openviking.service.core import OpenVikingService +from openviking.storage.transaction import reset_transaction_manager from openviking_cli.session.user_id import UserIdentifier from openviking_cli.utils.config.embedding_config import EmbeddingConfig +from openviking_cli.utils.config.vlm_config import VLMConfig # --------------------------------------------------------------------------- # Paths @@ -67,6 +69,20 @@ def get_dimension(self) -> int: return FakeEmbedder +def _install_fake_vlm(monkeypatch): + """Use a fake VLM so server tests never hit external LLM APIs.""" + + async def _fake_get_completion(self, prompt, thinking=False, max_retries=0): + return "# Test Summary\n\nFake summary for testing.\n\n## Details\nTest content." + + async def _fake_get_vision_completion(self, prompt, images, thinking=False): + return "Fake image description for testing." 
+ + monkeypatch.setattr(VLMConfig, "is_available", lambda self: True) + monkeypatch.setattr(VLMConfig, "get_completion_async", _fake_get_completion) + monkeypatch.setattr(VLMConfig, "get_vision_completion_async", _fake_get_vision_completion) + + # --------------------------------------------------------------------------- # Core fixtures: service + app + async client (HTTP API tests, in-process) # --------------------------------------------------------------------------- @@ -94,7 +110,9 @@ def sample_markdown_file(temp_dir: Path) -> Path: @pytest_asyncio.fixture(scope="function") async def service(temp_dir: Path, monkeypatch): """Create and initialize an OpenVikingService in embedded mode.""" + reset_transaction_manager() fake_embedder_cls = _install_fake_embedder(monkeypatch) + _install_fake_vlm(monkeypatch) svc = OpenVikingService( path=str(temp_dir / "data"), user=UserIdentifier.the_default_user("test_user") ) @@ -102,6 +120,7 @@ async def service(temp_dir: Path, monkeypatch): svc.viking_fs.query_embedder = fake_embedder_cls() yield svc await svc.close() + reset_transaction_manager() @pytest_asyncio.fixture(scope="function") @@ -146,7 +165,9 @@ async def client_with_resource(client, service, sample_markdown_file): async def running_server(temp_dir: Path, monkeypatch): """Start a real uvicorn server in a background thread.""" await AsyncOpenViking.reset() + reset_transaction_manager() fake_embedder_cls = _install_fake_embedder(monkeypatch) + _install_fake_vlm(monkeypatch) svc = OpenVikingService( path=str(temp_dir / "sdk_data"), user=UserIdentifier.the_default_user("sdk_test_user") diff --git a/tests/transaction/test_context_manager.py b/tests/transaction/test_context_manager.py index f45a55cc..bf077bf9 100644 --- a/tests/transaction/test_context_manager.py +++ b/tests/transaction/test_context_manager.py @@ -87,7 +87,9 @@ async def test_mv_lock_mode(self): ) as tx: await tx.commit() - tx_manager.acquire_lock_mv.assert_called_once_with("tx-test", "/src", "/dst") 
+ tx_manager.acquire_lock_mv.assert_called_once_with( + "tx-test", "/src", "/dst", src_is_dir=True + ) async def test_point_lock_mode(self): tx_manager, record = _make_tx_manager() diff --git a/tests/transaction/test_crash_recovery.py b/tests/transaction/test_crash_recovery.py index 85384574..a8e3d993 100644 --- a/tests/transaction/test_crash_recovery.py +++ b/tests/transaction/test_crash_recovery.py @@ -276,12 +276,12 @@ async def test_recover_init_orphan_lock_owned_by_other_tx_not_removed(self): assert not any(".path.ovlock" in p for p in rm_calls) manager._journal.delete.assert_called_once_with("tx-innocent") - async def test_recover_aquire_status(self): - """AQUIRE status → same as INIT, clean up only.""" + async def test_recover_acquire_status(self): + """ACQUIRE status → same as INIT, clean up only.""" entries = { "tx-acq": { "id": "tx-acq", - "status": "AQUIRE", + "status": "ACQUIRE", "locks": ["/local/z/.path.ovlock"], "created_at": time.time(), "updated_at": time.time(), diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py index d498fb59..d7b850c4 100644 --- a/tests/transaction/test_e2e.py +++ b/tests/transaction/test_e2e.py @@ -128,7 +128,7 @@ async def test_no_commit_triggers_rollback(self, agfs_client, tx_manager, test_d class TestE2EMvLock: async def test_mv_lock_acquires_both_paths(self, agfs_client, tx_manager, test_dir): - """mv lock mode acquires SUBTREE on source and POINT on destination.""" + """mv lock mode acquires SUBTREE on both source and destination.""" src = f"{test_dir}/mv-src-{uuid.uuid4().hex}" dst = f"{test_dir}/mv-dst-{uuid.uuid4().hex}" agfs_client.mkdir(src) @@ -144,7 +144,7 @@ async def test_mv_lock_acquires_both_paths(self, agfs_client, tx_manager, test_d dst_token_str = dst_token.decode("utf-8") if isinstance(dst_token, bytes) else dst_token assert ":S" in src_token_str # SUBTREE on source - assert ":P" in dst_token_str # POINT on destination + assert ":S" in dst_token_str # SUBTREE on destination await 
tx.commit() diff --git a/tests/transaction/test_path_lock.py b/tests/transaction/test_path_lock.py index e9af3fdc..2f3b6afc 100644 --- a/tests/transaction/test_path_lock.py +++ b/tests/transaction/test_path_lock.py @@ -181,8 +181,8 @@ async def test_subtree_blocked_by_descendant_point(self, agfs_client, test_dir): await lock.release(tx_child) - async def test_acquire_mv_creates_subtree_and_point(self, agfs_client, test_dir): - """acquire_mv puts SUBTREE on src and POINT on dst.""" + async def test_acquire_mv_creates_subtree_locks(self, agfs_client, test_dir): + """acquire_mv puts SUBTREE on both src and dst.""" import uuid as _uuid src = f"{test_dir}/src-{_uuid.uuid4().hex}" @@ -209,7 +209,7 @@ async def test_acquire_mv_creates_subtree_and_point(self, agfs_client, test_dir) if isinstance(dst_token_bytes, bytes) else dst_token_bytes ) - assert ":P" in dst_token + assert ":S" in dst_token await lock.release(tx) diff --git a/tests/transaction/test_transaction_manager.py b/tests/transaction/test_transaction_manager.py index 3d6fd198..ef0f0b3e 100644 --- a/tests/transaction/test_transaction_manager.py +++ b/tests/transaction/test_transaction_manager.py @@ -67,7 +67,7 @@ async def test_begin_updates_status(self): tx = manager.create_transaction() ok = await manager.begin(tx.id) assert ok is True - assert tx.status == TransactionStatus.AQUIRE + assert tx.status == TransactionStatus.ACQUIRE async def test_begin_unknown_tx(self): manager, _ = _make_manager() From 5a9ffb52dcc46f2cd5142b23e75c2122c962a834 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 15:08:07 +0800 Subject: [PATCH 05/18] fix(transaction): fix rollback and race condition bugs - Reconstruct RequestContext from undo params for vectordb_delete/update_uri rollback (previously skipped silently due to missing ctx) - Serialize ctx fields into undo params in rm/mv operations - Fix Phase 1 undo path to target archive dir instead of session root - Remove Phase 2 fs_write_new undo (overwrites are 
idempotent, checkpoint handles recovery) - Add ancestor SUBTREE recheck after lock creation in acquire_subtree - Move _collect_uris inside TransactionContext in rm/mv to close race window - Log journal persistence failures instead of silently swallowing Co-Authored-By: Claude Opus 4.6 --- openviking/session/session.py | 6 +- .../storage/transaction/context_manager.py | 8 +-- openviking/storage/transaction/path_lock.py | 2 + openviking/storage/transaction/undo.py | 61 ++++++++++++++----- openviking/storage/viking_fs.py | 34 +++++++++-- 5 files changed, 84 insertions(+), 27 deletions(-) diff --git a/openviking/session/session.py b/openviking/session/session.py index 067d0e1b..005b2f57 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -336,7 +336,9 @@ async def _phase1_archive_async( async with TransactionContext( tx_manager, "session_archive", [session_path], lock_mode="point" ) as tx: - seq = tx.record_undo("fs_write_new", {"uri": session_path}) + archive_uri = f"{self._session_uri}/history/archive_{compression_index:03d}" + archive_path = self._viking_fs._uri_to_path(archive_uri, ctx=self.ctx) + seq = tx.record_undo("fs_write_new", {"uri": archive_path}) self._write_archive( index=compression_index, messages=messages_to_archive, @@ -355,11 +357,9 @@ async def _phase2_memory_async(self, tx_manager: Any, session_path: str) -> None async with TransactionContext( tx_manager, "session_memory", [session_path], lock_mode="point" ) as tx: - seq = tx.record_undo("fs_write_new", {"uri": session_path}) self._write_to_agfs(self._messages) self._write_relations() self._write_checkpoint({"status": "completed"}) - tx.mark_completed(seq) tx.add_post_action( "enqueue_semantic", { diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py index 5d63658e..8272b91c 100644 --- a/openviking/storage/transaction/context_manager.py +++ b/openviking/storage/transaction/context_manager.py @@ -125,8 +125,8 
@@ def record_undo(self, op_type: str, params: Dict[str, Any]) -> int: try: self._tx_manager.journal.update(self.record.to_journal()) - except Exception: - pass + except Exception as e: + logger.debug(f"[Transaction] Failed to persist journal: {e}") return seq @@ -138,8 +138,8 @@ def mark_completed(self, sequence: int) -> None: try: self._tx_manager.journal.update(self.record.to_journal()) - except Exception: - pass + except Exception as e: + logger.debug(f"[Transaction] Failed to persist journal: {e}") def add_post_action(self, action_type: str, params: Dict[str, Any]) -> None: self.record.post_actions.append({"type": action_type, "params": params}) diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index 856a288c..a67cb6bc 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -323,6 +323,8 @@ async def acquire_subtree( backed_off = False conflict_after = await self._scan_descendants_for_locks(path, transaction_id) + if not conflict_after: + conflict_after = await self._check_ancestors_for_subtree(path, transaction_id) if conflict_after: their_token = self._read_token(conflict_after) if their_token: diff --git a/openviking/storage/transaction/undo.py b/openviking/storage/transaction/undo.py index d64d1619..e11575ad 100644 --- a/openviking/storage/transaction/undo.py +++ b/openviking/storage/transaction/undo.py @@ -15,6 +15,29 @@ logger = get_logger(__name__) +def _reconstruct_ctx(params: Dict[str, Any]) -> Optional[Any]: + """Reconstruct a RequestContext from serialized _ctx_* fields in undo params. + + Returns None if the required fields are missing. 
+ """ + account_id = params.get("_ctx_account_id") + user_id = params.get("_ctx_user_id") + agent_id = params.get("_ctx_agent_id") + role_value = params.get("_ctx_role") + if account_id is None or user_id is None: + return None + try: + from openviking.server.identity import RequestContext, Role + from openviking_cli.session.user_id import UserIdentifier + + role = Role(role_value) if role_value in {r.value for r in Role} else Role.ROOT + user = UserIdentifier(account_id, user_id, agent_id or "") + return RequestContext(user=user, role=role) + except Exception as e: + logger.warning(f"[Rollback] Failed to reconstruct ctx: {e}") + return None + + @dataclass class UndoEntry: """A single undo log entry representing one reversible sub-operation. @@ -124,24 +147,32 @@ def _rollback_entry( run_async(vector_store.delete([record_id])) elif op == "vectordb_delete": - if vector_store and ctx: - records_snapshot = params.get("records_snapshot", []) - for record in records_snapshot: - try: - run_async(vector_store.upsert(record)) - except Exception as e: - logger.warning(f"[Rollback] Failed to restore vector record: {e}") + if vector_store: + restored_ctx = _reconstruct_ctx(params) + if restored_ctx is None: + logger.warning("[Rollback] vectordb_delete: cannot reconstruct ctx, skipping") + else: + records_snapshot = params.get("records_snapshot", []) + for record in records_snapshot: + try: + run_async(vector_store.upsert(record, ctx=restored_ctx)) + except Exception as e: + logger.warning(f"[Rollback] Failed to restore vector record: {e}") elif op == "vectordb_update_uri": - if vector_store and ctx: - run_async( - vector_store.update_uri_mapping( - ctx=ctx, - uri=params["new_uri"], - new_uri=params["old_uri"], - new_parent_uri=params.get("old_parent_uri", ""), + if vector_store: + restored_ctx = _reconstruct_ctx(params) + if restored_ctx is None: + logger.warning("[Rollback] vectordb_update_uri: cannot reconstruct ctx, skipping") + else: + run_async( + 
vector_store.update_uri_mapping( + ctx=restored_ctx, + uri=params["new_uri"], + new_uri=params["old_uri"], + new_parent_uri=params.get("old_parent_uri", ""), + ) ) - ) else: logger.warning(f"[Rollback] Unknown op_type: {op}") diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 56621f60..71ca1b74 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -298,12 +298,12 @@ async def rm( self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) target_uri = self._path_to_uri(path, ctx=ctx) - uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) - uris_to_delete.append(target_uri) tx_manager = get_transaction_manager() if not tx_manager: # Fallback: no transaction support + uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) + uris_to_delete.append(target_uri) result = self.agfs.rm(path, recursive=recursive) await self._delete_from_vector_store(uris_to_delete, ctx=ctx) return result @@ -314,6 +314,8 @@ async def rm( is_dir = stat.get("isDir", False) if isinstance(stat, dict) else False except Exception: # Path does not exist: clean up any orphan index records and return + uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) + uris_to_delete.append(target_uri) await self._delete_from_vector_store(uris_to_delete, ctx=ctx) logger.info(f"[VikingFS] rm target not found, cleaned orphan index: {uri}") return {} @@ -327,12 +329,25 @@ async def rm( lock_mode = "point" async with TransactionContext(tx_manager, "rm", lock_paths, lock_mode=lock_mode) as tx: + # Collect URIs inside the lock to avoid race conditions + uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) + uris_to_delete.append(target_uri) + # Snapshot vector records for rollback records_snapshot = await self._snapshot_vector_records(uris_to_delete, ctx=ctx) # Step 1: Delete from VectorDB first + real_ctx = self._ctx_or_default(ctx) seq_vdb = tx.record_undo( - "vectordb_delete", {"uris": 
uris_to_delete, "records_snapshot": records_snapshot} + "vectordb_delete", + { + "uris": uris_to_delete, + "records_snapshot": records_snapshot, + "_ctx_account_id": real_ctx.account_id, + "_ctx_user_id": real_ctx.user.user_id, + "_ctx_agent_id": real_ctx.user.agent_id, + "_ctx_role": real_ctx.role.value, + }, ) await self._delete_from_vector_store(uris_to_delete, ctx=ctx) tx.mark_completed(seq_vdb) @@ -364,12 +379,12 @@ async def mv( old_path = self._uri_to_path(old_uri, ctx=ctx) new_path = self._uri_to_path(new_uri, ctx=ctx) target_uri = self._path_to_uri(old_path, ctx=ctx) - uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) - uris_to_move.append(target_uri) tx_manager = get_transaction_manager() if not tx_manager: # Fallback: no transaction support + uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) + uris_to_move.append(target_uri) try: result = self.agfs.mv(old_path, new_path) await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) @@ -397,6 +412,10 @@ async def mv( mv_dst_path=dst_parent, src_is_dir=is_dir, ) as tx: + # Collect URIs inside the lock to avoid race conditions + uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) + uris_to_move.append(target_uri) + # Step 1: Copy source to destination seq_cp = tx.record_undo("fs_write_new", {"uri": new_path}) try: @@ -421,6 +440,7 @@ async def mv( old_parent_uri = ( old_uri_stripped.rsplit("/", 1)[0] + "/" if "/" in old_uri_stripped else "" ) + real_ctx = self._ctx_or_default(ctx) seq_vdb = tx.record_undo( "vectordb_update_uri", { @@ -428,6 +448,10 @@ async def mv( "new_uri": new_uri, "old_parent_uri": old_parent_uri, "uris": uris_to_move, + "_ctx_account_id": real_ctx.account_id, + "_ctx_user_id": real_ctx.user.user_id, + "_ctx_agent_id": real_ctx.user.agent_id, + "_ctx_role": real_ctx.role.value, }, ) await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) From 
273efbc408f2ef0d3fb902c4a0334ff74ff17f59 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 15:38:50 +0800 Subject: [PATCH 06/18] refactor(transaction): make TransactionManager required and rewrite tests with real backends Remove all optional/fallback code paths where tx_manager could be None. get_transaction_manager() now raises RuntimeError if not initialized. Fix undo rollback to reconstruct ctx for vectordb_upsert and use correct agent_id default. Replace mock-based transaction tests with integration tests using real AGFS and VectorDB backends. --- openviking/parse/tree_builder.py | 59 +- openviking/service/debug_service.py | 7 +- openviking/session/session.py | 33 +- .../transaction/transaction_manager.py | 12 +- openviking/storage/transaction/undo.py | 8 +- openviking/storage/viking_fs.py | 23 +- tests/integration/test_add_resource_index.py | 1 - tests/transaction/conftest.py | 86 +- tests/transaction/test_crash_recovery.py | 854 +++++++++++------- tests/transaction/test_rm_rollback.py | 363 ++++---- tests/transaction/test_undo.py | 260 ++++-- 11 files changed, 1035 insertions(+), 671 deletions(-) diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index 97935bdb..721c12bc 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -176,45 +176,32 @@ async def finalize_from_temp( # Lock parent directory (final_path doesn't exist yet) parent_path = final_path.rsplit("/", 1)[0] if "/" in final_path else final_path - if tx_manager: - # Ensure parent directories exist before locking - await self._ensure_parent_dirs(final_uri, ctx=ctx) - - async with TransactionContext( - tx_manager, "finalize_from_temp", [parent_path], lock_mode="point" - ) as tx: - # Move temp to final - seq = tx.record_undo("fs_write_new", {"uri": final_path}) - await self._move_temp_to_dest(viking_fs, temp_doc_uri, final_uri, ctx=ctx) - tx.mark_completed(seq) - logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> 
{final_uri}") - - # Register semantic enqueue as post_action - tx.add_post_action( - "enqueue_semantic", - { - "uri": final_uri, - "context_type": "resource", - "account_id": ctx.account_id, - "user_id": ctx.user.user_id, - "agent_id": ctx.user.agent_id, - "role": ctx.role.value, - }, - ) - - await tx.commit() - else: - # Fallback: no transaction support + # Ensure parent directories exist before locking + await self._ensure_parent_dirs(final_uri, ctx=ctx) + + async with TransactionContext( + tx_manager, "finalize_from_temp", [parent_path], lock_mode="point" + ) as tx: + # Move temp to final + seq = tx.record_undo("fs_write_new", {"uri": final_path}) await self._move_temp_to_dest(viking_fs, temp_doc_uri, final_uri, ctx=ctx) + tx.mark_completed(seq) logger.info(f"[TreeBuilder] Moved temp tree: {temp_doc_uri} -> {final_uri}") - try: - await self._enqueue_semantic_generation(final_uri, "resource", ctx=ctx) - logger.info(f"[TreeBuilder] Enqueued semantic generation for: {final_uri}") - except Exception as e: - logger.error( - f"[TreeBuilder] Failed to enqueue semantic generation: {e}", exc_info=True - ) + # Register semantic enqueue as post_action + tx.add_post_action( + "enqueue_semantic", + { + "uri": final_uri, + "context_type": "resource", + "account_id": ctx.account_id, + "user_id": ctx.user.user_id, + "agent_id": ctx.user.agent_id, + "role": ctx.role.value, + }, + ) + + await tx.commit() # 5. 
Cleanup temporary root directory try: diff --git a/openviking/service/debug_service.py b/openviking/service/debug_service.py index 9c3cf39b..7dffff65 100644 --- a/openviking/service/debug_service.py +++ b/openviking/service/debug_service.py @@ -138,13 +138,14 @@ def vlm(self) -> ComponentStatus: @property def transaction(self) -> ComponentStatus: """Get transaction status.""" - transaction_manager = get_transaction_manager() - if transaction_manager is None: + try: + transaction_manager = get_transaction_manager() + except Exception: return ComponentStatus( name="transaction", is_healthy=False, has_errors=True, - status="Transaction manager not initialized.", + status="Not initialized", ) observer = TransactionObserver(transaction_manager) return ComponentStatus( diff --git a/openviking/session/session.py b/openviking/session/session.py index 005b2f57..88444adb 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -251,25 +251,16 @@ def commit(self) -> Dict[str, Any]: archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary - if tx_manager: - run_async( - self._phase1_archive_async( - tx_manager, - session_path, - self._compression.compression_index, - messages_to_archive, - archive_abstract, - archive_overview, - ) - ) - else: - self._write_archive( - index=self._compression.compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, + run_async( + self._phase1_archive_async( + tx_manager, + session_path, + self._compression.compression_index, + messages_to_archive, + archive_abstract, + archive_overview, ) - self._write_to_agfs(messages=[]) + ) self._compression.original_count += len(messages_to_archive) result["archived"] = True @@ -298,11 +289,7 @@ def commit(self) -> Dict[str, Any]: get_current_telemetry().set("memory.extracted", len(memories)) # ===== Phase 2: Memory write ===== - if tx_manager: - run_async(self._phase2_memory_async(tx_manager, 
session_path)) - else: - self._write_to_agfs(self._messages) - self._write_relations() + run_async(self._phase2_memory_async(tx_manager, session_path)) # Update active_count active_count_updated = self._update_active_counts() diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 9cd0cd9f..7b40c6be 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -629,12 +629,12 @@ def init_transaction_manager( return _transaction_manager -def get_transaction_manager() -> Optional[TransactionManager]: - """Get transaction manager singleton. - - Returns: - TransactionManager instance or None if not initialized - """ +def get_transaction_manager() -> TransactionManager: + """Get transaction manager singleton.""" + if _transaction_manager is None: + raise RuntimeError( + "TransactionManager not initialized. Call init_transaction_manager() first." + ) return _transaction_manager diff --git a/openviking/storage/transaction/undo.py b/openviking/storage/transaction/undo.py index e11575ad..a77aa5aa 100644 --- a/openviking/storage/transaction/undo.py +++ b/openviking/storage/transaction/undo.py @@ -31,7 +31,7 @@ def _reconstruct_ctx(params: Dict[str, Any]) -> Optional[Any]: from openviking_cli.session.user_id import UserIdentifier role = Role(role_value) if role_value in {r.value for r in Role} else Role.ROOT - user = UserIdentifier(account_id, user_id, agent_id or "") + user = UserIdentifier(account_id, user_id, agent_id or "default") return RequestContext(user=user, role=role) except Exception as e: logger.warning(f"[Rollback] Failed to reconstruct ctx: {e}") @@ -144,7 +144,11 @@ def _rollback_entry( if vector_store: record_id = params.get("record_id") if record_id: - run_async(vector_store.delete([record_id])) + restored_ctx = _reconstruct_ctx(params) + if restored_ctx: + run_async(vector_store.delete([record_id], ctx=restored_ctx)) + 
else: + logger.warning("[Rollback] vectordb_upsert: cannot reconstruct ctx, skipping") elif op == "vectordb_delete": if vector_store: diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 71ca1b74..4db5afd7 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -22,7 +22,6 @@ from pathlib import PurePath from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -from openviking.pyagfs.exceptions import AGFSHTTPError from openviking.server.identity import RequestContext, Role from openviking.telemetry import get_current_telemetry from openviking.utils.time_utils import format_simplified, get_current_timestamp, parse_iso_datetime @@ -300,13 +299,6 @@ async def rm( target_uri = self._path_to_uri(path, ctx=ctx) tx_manager = get_transaction_manager() - if not tx_manager: - # Fallback: no transaction support - uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) - uris_to_delete.append(target_uri) - result = self.agfs.rm(path, recursive=recursive) - await self._delete_from_vector_store(uris_to_delete, ctx=ctx) - return result # Check existence and determine lock strategy try: @@ -381,19 +373,6 @@ async def mv( target_uri = self._path_to_uri(old_path, ctx=ctx) tx_manager = get_transaction_manager() - if not tx_manager: - # Fallback: no transaction support - uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) - uris_to_move.append(target_uri) - try: - result = self.agfs.mv(old_path, new_path) - await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) - return result - except AGFSHTTPError as e: - if e.status_code == 404: - await self._delete_from_vector_store(uris_to_move, ctx=ctx) - logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") - raise # Verify source exists and determine type before locking try: @@ -1228,9 +1207,9 @@ async def _snapshot_vector_records( for uri in uris: try: records = await 
vector_store.get_context_by_uri( - account_id=real_ctx.account_id, uri=uri, limit=10, + ctx=real_ctx, ) if records: snapshots.extend(records) diff --git a/tests/integration/test_add_resource_index.py b/tests/integration/test_add_resource_index.py index 27d6e234..2a35462a 100644 --- a/tests/integration/test_add_resource_index.py +++ b/tests/integration/test_add_resource_index.py @@ -94,7 +94,6 @@ async def test_add_resource_indexing_logic(test_config, tmp_path): patch("openviking.utils.agfs_utils.create_agfs_client", return_value=mock_agfs), patch("openviking.agfs_manager.AGFSManager.start"), patch("openviking.agfs_manager.AGFSManager.stop"), - patch("openviking.storage.transaction.get_transaction_manager", return_value=None), ): mock_summarize.return_value = {"status": "success"} diff --git a/tests/transaction/conftest.py b/tests/transaction/conftest.py index db77bbdd..05fac402 100644 --- a/tests/transaction/conftest.py +++ b/tests/transaction/conftest.py @@ -1,6 +1,6 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
# SPDX-License-Identifier: Apache-2.0 -"""Shared fixtures for transaction tests using real AGFS backend.""" +"""Shared fixtures for transaction tests using real AGFS and VectorDB backends.""" import os import shutil @@ -9,13 +9,24 @@ import pytest from openviking.agfs_manager import AGFSManager +from openviking.server.identity import RequestContext, Role +from openviking.storage.collection_schemas import CollectionSchemas +from openviking.storage.transaction.journal import TransactionJournal +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME, _make_fencing_token +from openviking.storage.transaction.transaction_manager import TransactionManager +from openviking.storage.viking_vector_index_backend import VikingVectorIndexBackend from openviking.utils.agfs_utils import create_agfs_client +from openviking_cli.session.user_id import UserIdentifier from openviking_cli.utils.config.agfs_config import AGFSConfig +from openviking_cli.utils.config.vectordb_config import VectorDBBackendConfig AGFS_CONF = AGFSConfig( path="/tmp/ov-tx-test", backend="local", port=1834, url="http://localhost:1834", timeout=10 ) +VECTOR_DIM = 4 +COLLECTION_NAME = "tx_test_ctx" + # Clean slate before session starts if os.path.exists(AGFS_CONF.path): shutil.rmtree(AGFS_CONF.path) @@ -54,3 +65,76 @@ def test_dir(agfs_client): agfs_client.rm(path, recursive=True) except Exception: pass + + +# --------------------------------------------------------------------------- +# VectorDB fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def vector_store(tmp_path_factory): + """Session-scoped real local VectorDB backend.""" + db_path = str(tmp_path_factory.mktemp("vectordb")) + config = VectorDBBackendConfig( + backend="local", + name=COLLECTION_NAME, + path=db_path, + dimension=VECTOR_DIM, + ) + store = VikingVectorIndexBackend(config=config) + + import asyncio + + schema = 
CollectionSchemas.context_collection(COLLECTION_NAME, VECTOR_DIM) + asyncio.get_event_loop().run_until_complete(store.create_collection(COLLECTION_NAME, schema)) + + yield store + + asyncio.get_event_loop().run_until_complete(store.close()) + + +@pytest.fixture(scope="session") +def request_ctx(): + """Session-scoped RequestContext for VectorDB operations.""" + user = UserIdentifier("default", "test_user", "default") + return RequestContext(user=user, role=Role.ROOT) + + +# --------------------------------------------------------------------------- +# Transaction fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def tx_manager(agfs_client, vector_store): + """Function-scoped TransactionManager with real backends.""" + return TransactionManager(agfs_client=agfs_client, vector_store=vector_store) + + +@pytest.fixture +def journal(agfs_client): + """Function-scoped TransactionJournal with real AGFS backend.""" + return TransactionJournal(agfs_client) + + +# --------------------------------------------------------------------------- +# Utility helpers +# --------------------------------------------------------------------------- + + +def file_exists(agfs_client, path) -> bool: + """Check if a file/dir exists in AGFS.""" + try: + agfs_client.stat(path) + return True + except Exception: + return False + + +def make_lock_file(agfs_client, dir_path, tx_id, lock_type="P") -> str: + """Create a real lock file in AGFS and return its path.""" + lock_path = f"{dir_path.rstrip('/')}/{LOCK_FILE_NAME}" + token = _make_fencing_token(tx_id, lock_type) + agfs_client.write(lock_path, token.encode("utf-8")) + return lock_path diff --git a/tests/transaction/test_crash_recovery.py b/tests/transaction/test_crash_recovery.py index a8e3d993..21569edd 100644 --- a/tests/transaction/test_crash_recovery.py +++ b/tests/transaction/test_crash_recovery.py @@ -1,385 +1,561 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., 
Ltd. # SPDX-License-Identifier: Apache-2.0 -"""Integration test: crash recovery from journal.""" +"""Integration test: crash recovery from journal using real AGFS and VectorDB backends.""" -import time -from unittest.mock import AsyncMock, MagicMock, patch +import uuid +from unittest.mock import AsyncMock, patch +from openviking.storage.transaction.journal import TransactionJournal from openviking.storage.transaction.transaction_manager import TransactionManager +from openviking.storage.transaction.transaction_record import ( + TransactionRecord, + TransactionStatus, +) +from openviking.storage.transaction.undo import UndoEntry + +from .conftest import VECTOR_DIM, _mkdir_ok, file_exists, make_lock_file + + +def _write_journal(journal, record): + """Write a TransactionRecord to real journal storage.""" + journal.write(record.to_journal()) class TestCrashRecovery: - def _make_manager(self, journal_entries=None): - """Create a TransactionManager with mocked AGFS and journal data.""" - agfs = MagicMock() - manager = TransactionManager(agfs_client=agfs, timeout=3600) - - if journal_entries: - manager._journal = MagicMock() - manager._journal.list_all.return_value = list(journal_entries.keys()) - manager._journal.read.side_effect = lambda tx_id: journal_entries[tx_id] - manager._journal.delete = MagicMock() - else: - manager._journal = MagicMock() - manager._journal.list_all.return_value = [] - - return manager, agfs - - async def test_recover_committed_with_post_actions(self): - """COMMIT + post_actions → replay post_actions, clean up.""" - entries = { - "tx-1": { - "id": "tx-1", - "status": "COMMIT", - "locks": ["/local/test/.path.ovlock"], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [ - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://test", - "context_type": "resource", - "account_id": "acc", - }, - } - ], - } - } - manager, agfs = self._make_manager(entries) + """ + Core technique: simulate crash 
recovery. + + 1. Create real FS state via agfs_client + 2. Build TransactionRecord, write to real journal + 3. Create fresh TransactionManager (simulates process restart) + 4. Call manager._recover_pending_transactions() + 5. Verify final state via agfs_client.stat()/cat() and vector_store.get() + """ + + async def test_recover_commit_no_rollback(self, agfs_client, vector_store, test_dir): + """COMMIT status → committed files NOT rolled back, journal cleaned up.""" + # Create a file that was part of a committed transaction + committed_file = f"{test_dir}/committed.txt" + agfs_client.write(committed_file, b"committed data") + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-commit-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.COMMIT, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_write_new", + params={"uri": committed_file}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + # New manager (simulates restart) + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) + await manager._recover_pending_transactions() - with patch( - "openviking.storage.transaction.transaction_manager.TransactionManager._execute_post_actions", - new_callable=AsyncMock, - ) as mock_post: + # File should still exist (no rollback for committed tx) + assert file_exists(agfs_client, committed_file) + # Journal should be cleaned up + assert tx_id not in journal.list_all() + + async def test_recover_commit_replays_post_actions(self, agfs_client, vector_store, test_dir): + """COMMIT + post_actions → replay post_actions.""" + journal = TransactionJournal(agfs_client) + tx_id = f"tx-post-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.COMMIT, + locks=[], + undo_log=[], + post_actions=[ + { + "type": "enqueue_semantic", + "params": { + "uri": "viking://test-post", + "context_type": "resource", + "account_id": "acc", + 
}, + } + ], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) + + with patch.object(manager, "_execute_post_actions", new_callable=AsyncMock) as mock_post: await manager._recover_pending_transactions() mock_post.assert_called_once() - agfs.rm.assert_called_once_with("/local/test/.path.ovlock") - manager._journal.delete.assert_called_once_with("tx-1") - - async def test_recover_committed_no_post_actions(self): - """COMMIT + no post_actions → just clean up, no rollback.""" - entries = { - "tx-2": { - "id": "tx-2", - "status": "COMMIT", - "locks": [], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [ - # Even if undo_log has entries, COMMIT should NOT rollback - { - "sequence": 0, - "op_type": "fs_mv", - "params": {"src": "/a", "dst": "/b"}, - "completed": True, - } - ], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert tx_id not in journal.list_all() + + async def test_recover_exec_rollback_fs_mv(self, agfs_client, vector_store, test_dir): + """EXEC status with fs_mv → recovery rolls back → file moved back.""" + src = f"{test_dir}/exec-mv-src" + dst = f"{test_dir}/exec-mv-dst" + _mkdir_ok(agfs_client, src) + agfs_client.write(f"{src}/data.txt", b"mv-data") + + # Simulate: forward mv happened, then crash + agfs_client.mv(src, dst) + assert not file_exists(agfs_client, src) + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-exec-mv-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mv", + params={"src": src, "dst": dst}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.mv.assert_not_called() # No rollback for committed transactions - 
manager._journal.delete.assert_called_once_with("tx-2") - - async def test_recover_exec_triggers_rollback(self): - """EXEC status → execute rollback regardless of transaction age.""" - entries = { - "tx-3": { - "id": "tx-3", - "status": "EXEC", - "locks": ["/local/x/.path.ovlock"], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [ - { - "sequence": 0, - "op_type": "fs_mv", - "params": {"src": "/local/a", "dst": "/local/b"}, - "completed": True, - } - ], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert file_exists(agfs_client, src) + assert not file_exists(agfs_client, dst) + assert tx_id not in journal.list_all() + + async def test_recover_exec_rollback_fs_mkdir(self, agfs_client, vector_store, test_dir): + """EXEC with fs_mkdir → recovery → directory removed.""" + new_dir = f"{test_dir}/exec-mkdir" + _mkdir_ok(agfs_client, new_dir) + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-exec-mkdir-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": new_dir}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.mv.assert_called_once_with("/local/b", "/local/a") - manager._journal.delete.assert_called_once_with("tx-3") - - async def test_recover_fail_triggers_rollback(self): - """FAIL status → execute rollback.""" - entries = { - "tx-fail": { - "id": "tx-fail", - "status": "FAIL", - "locks": [], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [ - { - "sequence": 0, - "op_type": "fs_mkdir", - "params": {"uri": "/local/newdir"}, - "completed": True, - } - ], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert not file_exists(agfs_client, new_dir) 
+ assert tx_id not in journal.list_all() + + async def test_recover_exec_rollback_fs_write_new(self, agfs_client, vector_store, test_dir): + """EXEC with fs_write_new → recovery → file removed.""" + file_path = f"{test_dir}/exec-write.txt" + agfs_client.write(file_path, b"to-be-rolled-back") + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-exec-write-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_write_new", + params={"uri": file_path}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.rm.assert_called_once_with("/local/newdir") - manager._journal.delete.assert_called_once_with("tx-fail") - - async def test_recover_exec_recover_all_includes_incomplete(self): - """EXEC recovery uses recover_all=True: also reverses incomplete entries.""" - entries = { - "tx-partial": { - "id": "tx-partial", - "status": "EXEC", - "locks": [], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [ - { - "sequence": 0, - "op_type": "fs_mv", - "params": {"src": "/local/a", "dst": "/local/b"}, - "completed": False, # not completed, but recover_all=True should still reverse it - } - ], - "post_actions": [], - } + assert not file_exists(agfs_client, file_path) + assert tx_id not in journal.list_all() + + async def test_recover_exec_rollback_vectordb_upsert( + self, agfs_client, vector_store, request_ctx, test_dir + ): + """EXEC with vectordb_upsert → recovery → record deleted from VectorDB.""" + record_id = str(uuid.uuid4()) + record = { + "id": record_id, + "uri": f"viking://resources/crash-upsert-{record_id}.md", + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.5] * VECTOR_DIM, + "name": 
"crash-upsert", + "description": "test", + "abstract": "test", } - manager, agfs = self._make_manager(entries) + await vector_store.upsert(record, ctx=request_ctx) + assert len(await vector_store.get([record_id], ctx=request_ctx)) == 1 + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-exec-vdb-{uuid.uuid4().hex[:8]}" + tx_record = TransactionRecord( + id=tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="vectordb_upsert", + params={ + "record_id": record_id, + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, tx_record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.mv.assert_called_once_with("/local/b", "/local/a") - manager._journal.delete.assert_called_once_with("tx-partial") - - async def test_recover_init_just_cleans_up(self): - """INIT status → no rollback (nothing executed), just release locks and clean journal.""" - entries = { - "tx-4": { - "id": "tx-4", - "status": "INIT", - "locks": ["/local/y/.path.ovlock"], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + results = await vector_store.get([record_id], ctx=request_ctx) + assert len(results) == 0 + assert tx_id not in journal.list_all() + + async def test_recover_fail_triggers_rollback(self, agfs_client, vector_store, test_dir): + """FAIL status → also triggers rollback.""" + new_dir = f"{test_dir}/fail-dir" + _mkdir_ok(agfs_client, new_dir) + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-fail-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.FAIL, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": new_dir}, + completed=True, + ) + ], + 
post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.rm.assert_called_once_with("/local/y/.path.ovlock") - manager._journal.delete.assert_called_once_with("tx-4") - - async def test_recover_multiple_transactions(self): - """Multiple journals are all recovered.""" - entries = { - "tx-a": { - "id": "tx-a", - "status": "INIT", - "locks": [], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - }, - "tx-b": { - "id": "tx-b", - "status": "COMMIT", - "locks": [], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - }, - } - manager, agfs = self._make_manager(entries) + assert not file_exists(agfs_client, new_dir) + assert tx_id not in journal.list_all() + + async def test_recover_releasing_triggers_rollback(self, agfs_client, vector_store, test_dir): + """RELEASING status → rollback + lock cleanup.""" + new_dir = f"{test_dir}/releasing-dir" + _mkdir_ok(agfs_client, new_dir) + + lock_path = make_lock_file(agfs_client, test_dir, "tx-releasing-placeholder", "S") + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-releasing-{uuid.uuid4().hex[:8]}" + # Rewrite lock with correct tx_id + lock_path = make_lock_file(agfs_client, test_dir, tx_id, "S") + + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.RELEASING, + locks=[lock_path], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": new_dir}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - assert manager._journal.delete.call_count == 2 - - async def test_recover_init_empty_locks_cleans_orphan_via_init_info(self): - """INIT with empty locks but init_info.lock_paths → clean up 
orphan lock files.""" - entries = { - "tx-orphan": { - "id": "tx-orphan", - "status": "INIT", - "locks": [], # Empty: crash happened before journal recorded locks - "init_info": { - "operation": "rm", - "lock_paths": ["/local/orphan-dir"], - "lock_mode": "subtree", - }, - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) - - # Simulate: the lock file exists and is owned by this transaction - from openviking.storage.transaction.path_lock import _make_fencing_token - - token = _make_fencing_token("tx-orphan", "S") - agfs.cat.return_value = token.encode("utf-8") + assert not file_exists(agfs_client, new_dir) + assert not file_exists(agfs_client, lock_path) + assert tx_id not in journal.list_all() + + async def test_recover_exec_includes_incomplete(self, agfs_client, vector_store, test_dir): + """EXEC recovery uses recover_all=True → also reverses incomplete entries.""" + new_dir = f"{test_dir}/exec-incomplete" + _mkdir_ok(agfs_client, new_dir) + + journal = TransactionJournal(agfs_client) + tx_id = f"tx-exec-inc-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": new_dir}, + completed=False, # incomplete, but recover_all=True reverses it + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - # Should have removed the orphan lock file - agfs.rm.assert_called() - rm_paths = [call[0][0] for call in agfs.rm.call_args_list] - assert any(".path.ovlock" in p for p in rm_paths) - manager._journal.delete.assert_called_once_with("tx-orphan") - - async def test_recover_init_orphan_lock_owned_by_other_tx_not_removed(self): - """INIT with orphan lock path, but lock file owned by a different tx → not 
removed.""" - entries = { - "tx-innocent": { - "id": "tx-innocent", - "status": "INIT", - "locks": [], - "init_info": { - "operation": "rm", - "lock_paths": ["/local/shared-dir"], - "lock_mode": "subtree", - }, - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert not file_exists(agfs_client, new_dir) + assert tx_id not in journal.list_all() + + async def test_recover_init_cleans_locks(self, agfs_client, vector_store, test_dir): + """INIT status → no rollback, just lock cleanup + journal delete.""" + lock_dir = f"{test_dir}/init-lock-dir" + _mkdir_ok(agfs_client, lock_dir) - # Lock file owned by a different transaction - from openviking.storage.transaction.path_lock import _make_fencing_token + tx_id = f"tx-init-{uuid.uuid4().hex[:8]}" + lock_path = make_lock_file(agfs_client, lock_dir, tx_id, "P") - token = _make_fencing_token("tx-OTHER-owner", "S") - agfs.cat.return_value = token.encode("utf-8") + journal = TransactionJournal(agfs_client) + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.INIT, + locks=[lock_path], + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record) + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - # rm should NOT be called for the lock file (only journal delete) - rm_calls = [call[0][0] for call in agfs.rm.call_args_list] if agfs.rm.called else [] - assert not any(".path.ovlock" in p for p in rm_calls) - manager._journal.delete.assert_called_once_with("tx-innocent") + assert not file_exists(agfs_client, lock_path) + assert tx_id not in journal.list_all() - async def test_recover_acquire_status(self): + async def test_recover_acquire_cleans_locks(self, agfs_client, vector_store, test_dir): """ACQUIRE status → same as INIT, clean up only.""" - entries = { - "tx-acq": { - "id": "tx-acq", - "status": "ACQUIRE", - "locks": 
["/local/z/.path.ovlock"], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + lock_dir = f"{test_dir}/acquire-lock-dir" + _mkdir_ok(agfs_client, lock_dir) + + tx_id = f"tx-acq-{uuid.uuid4().hex[:8]}" + lock_path = make_lock_file(agfs_client, lock_dir, tx_id, "P") + + journal = TransactionJournal(agfs_client) + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.ACQUIRE, + locks=[lock_path], + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - agfs.rm.assert_called_once_with("/local/z/.path.ovlock") - manager._journal.delete.assert_called_once_with("tx-acq") - - async def test_recover_releasing_status_triggers_rollback(self): - """RELEASING status → process crashed while releasing, rollback undo log.""" - entries = { - "tx-rel": { - "id": "tx-rel", - "status": "RELEASING", - "locks": ["/local/r/.path.ovlock"], - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [ - { - "sequence": 0, - "op_type": "fs_mkdir", - "params": {"uri": "/local/tmpdir"}, - "completed": True, - } - ], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert not file_exists(agfs_client, lock_path) + assert tx_id not in journal.list_all() + + async def test_recover_init_orphan_lock_via_init_info( + self, agfs_client, vector_store, test_dir + ): + """INIT with empty locks but init_info.lock_paths → clean orphan lock owned by tx.""" + orphan_dir = f"{test_dir}/orphan-dir" + _mkdir_ok(agfs_client, orphan_dir) + + tx_id = f"tx-orphan-{uuid.uuid4().hex[:8]}" + lock_path = make_lock_file(agfs_client, orphan_dir, tx_id, "S") + + journal = TransactionJournal(agfs_client) + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.INIT, + locks=[], # Empty — crash happened before 
journal recorded locks + init_info={ + "operation": "rm", + "lock_paths": [orphan_dir], + "lock_mode": "subtree", + }, + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - # Should rollback the undo log - rm_paths = [call[0][0] for call in agfs.rm.call_args_list] - assert "/local/tmpdir" in rm_paths - manager._journal.delete.assert_called_once_with("tx-rel") - - async def test_recover_mv_orphan_locks_include_dst(self): - """INIT mv operation with init_info → check both lock_paths and mv_dst_path for orphan locks.""" - entries = { - "tx-mv-orphan": { - "id": "tx-mv-orphan", - "status": "INIT", - "locks": [], - "init_info": { - "operation": "mv", - "lock_paths": ["/local/src-dir"], - "lock_mode": "mv", - "mv_dst_path": "/local/dst-dir", - }, - "created_at": time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - } - manager, agfs = self._make_manager(entries) + assert not file_exists(agfs_client, lock_path) + assert tx_id not in journal.list_all() + + async def test_recover_init_orphan_lock_other_owner(self, agfs_client, vector_store, test_dir): + """INIT with orphan lock owned by different tx → not removed.""" + orphan_dir = f"{test_dir}/orphan-other" + _mkdir_ok(agfs_client, orphan_dir) + + other_tx_id = f"tx-OTHER-{uuid.uuid4().hex[:8]}" + lock_path = make_lock_file(agfs_client, orphan_dir, other_tx_id, "S") + + tx_id = f"tx-innocent-{uuid.uuid4().hex[:8]}" + journal = TransactionJournal(agfs_client) + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.INIT, + locks=[], + init_info={ + "operation": "rm", + "lock_paths": [orphan_dir], + "lock_mode": "subtree", + }, + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record) - from openviking.storage.transaction.path_lock import _make_fencing_token + manager = TransactionManager(agfs_client=agfs_client, 
vector_store=vector_store) + await manager._recover_pending_transactions() - token = _make_fencing_token("tx-mv-orphan", "P") - agfs.cat.return_value = token.encode("utf-8") + # Lock file should still exist — owned by different tx + assert file_exists(agfs_client, lock_path) + assert tx_id not in journal.list_all() + + async def test_recover_mv_orphan_both_paths(self, agfs_client, vector_store, test_dir): + """INIT mv operation → check both lock_paths and mv_dst_path for orphan locks.""" + src_dir = f"{test_dir}/mv-orphan-src" + dst_dir = f"{test_dir}/mv-orphan-dst" + _mkdir_ok(agfs_client, src_dir) + _mkdir_ok(agfs_client, dst_dir) + + tx_id = f"tx-mv-orphan-{uuid.uuid4().hex[:8]}" + src_lock = make_lock_file(agfs_client, src_dir, tx_id, "S") + dst_lock = make_lock_file(agfs_client, dst_dir, tx_id, "P") + + journal = TransactionJournal(agfs_client) + record = TransactionRecord( + id=tx_id, + status=TransactionStatus.INIT, + locks=[], + init_info={ + "operation": "mv", + "lock_paths": [src_dir], + "lock_mode": "mv", + "mv_dst_path": dst_dir, + }, + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record) + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - # Should check both src and dst paths for orphan locks - cat_paths = [call[0][0] for call in agfs.cat.call_args_list] - assert any("src-dir" in p for p in cat_paths) - assert any("dst-dir" in p for p in cat_paths) - - async def test_recover_journal_read_failure_skips_gracefully(self): - """If reading a journal entry fails, skip that tx and continue with others.""" - agfs = MagicMock() - manager = TransactionManager(agfs_client=agfs, timeout=3600) - manager._journal = MagicMock() - manager._journal.list_all.return_value = ["tx-bad", "tx-good"] - - def read_side_effect(tx_id): - if tx_id == "tx-bad": - raise Exception("corrupted journal") - return { - "id": "tx-good", - "status": "INIT", - "locks": [], - "created_at": 
time.time(), - "updated_at": time.time(), - "undo_log": [], - "post_actions": [], - } - - manager._journal.read.side_effect = read_side_effect - manager._journal.delete = MagicMock() + # Both orphan locks should be cleaned up + assert not file_exists(agfs_client, src_lock) + assert not file_exists(agfs_client, dst_lock) + assert tx_id not in journal.list_all() + + async def test_recover_multiple_transactions(self, agfs_client, vector_store, test_dir): + """Multiple journal entries are all recovered.""" + dir_a = f"{test_dir}/multi-tx-a" + _mkdir_ok(agfs_client, dir_a) + + journal = TransactionJournal(agfs_client) + + # tx-a: EXEC with mkdir → should rollback + tx_a = f"tx-multi-a-{uuid.uuid4().hex[:8]}" + record_a = TransactionRecord( + id=tx_a, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": dir_a}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record_a) + + # tx-b: COMMIT → no rollback, just cleanup + tx_b = f"tx-multi-b-{uuid.uuid4().hex[:8]}" + record_b = TransactionRecord( + id=tx_b, + status=TransactionStatus.COMMIT, + locks=[], + undo_log=[], + post_actions=[], + ) + _write_journal(journal, record_b) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) + await manager._recover_pending_transactions() + assert not file_exists(agfs_client, dir_a) # rolled back + assert tx_a not in journal.list_all() + assert tx_b not in journal.list_all() + + async def test_recover_corrupted_journal_skips(self, agfs_client, vector_store, test_dir): + """Corrupted journal entry → skipped, others still processed.""" + journal = TransactionJournal(agfs_client) + + # Write a corrupted journal entry (invalid JSON) + bad_tx_id = f"tx-bad-{uuid.uuid4().hex[:8]}" + _mkdir_ok(agfs_client, "/local/_system") + _mkdir_ok(agfs_client, "/local/_system/transactions") + bad_dir = f"/local/_system/transactions/{bad_tx_id}" + _mkdir_ok(agfs_client, bad_dir) + 
agfs_client.write(f"{bad_dir}/journal.json", b"NOT VALID JSON {{{{") + + # Write a good journal entry + good_dir = f"{test_dir}/good-recovery" + _mkdir_ok(agfs_client, good_dir) + + good_tx_id = f"tx-good-{uuid.uuid4().hex[:8]}" + record = TransactionRecord( + id=good_tx_id, + status=TransactionStatus.EXEC, + locks=[], + undo_log=[ + UndoEntry( + sequence=0, + op_type="fs_mkdir", + params={"uri": good_dir}, + completed=True, + ) + ], + post_actions=[], + ) + _write_journal(journal, record) + + manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) await manager._recover_pending_transactions() - # tx-good should still be cleaned up - manager._journal.delete.assert_called_once_with("tx-good") + # Good tx should still be recovered + assert not file_exists(agfs_client, good_dir) + assert good_tx_id not in journal.list_all() diff --git a/tests/transaction/test_rm_rollback.py b/tests/transaction/test_rm_rollback.py index ee28b7e7..68f5e8b4 100644 --- a/tests/transaction/test_rm_rollback.py +++ b/tests/transaction/test_rm_rollback.py @@ -2,232 +2,293 @@ # SPDX-License-Identifier: Apache-2.0 """Integration tests: multi-step rollback covering FS + VectorDB coordination.""" -from unittest.mock import AsyncMock, MagicMock +import uuid from openviking.storage.transaction.undo import UndoEntry, execute_rollback +from .conftest import VECTOR_DIM, _mkdir_ok, file_exists + class TestRmRollback: - def test_vectordb_records_restored_on_fs_failure(self): - """When FS rm fails (incomplete), VectorDB delete is rolled back via snapshot.""" - agfs = MagicMock() - vector_store = AsyncMock() - ctx = MagicMock() + def test_fs_rm_not_reversible(self, agfs_client, test_dir): + """fs_rm is intentionally irreversible: even completed=True is a no-op.""" + path = f"{test_dir}/rm-target" + _mkdir_ok(agfs_client, path) - snapshot = [{"id": "r1", "uri": "viking://a", "content": "data"}] undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_delete", - params={"uris": 
["viking://a"], "records_snapshot": snapshot}, - completed=True, # VectorDB delete succeeded - ), - UndoEntry( - sequence=1, - op_type="fs_rm", - params={"uri": "/local/test", "recursive": True}, - completed=False, # FS rm never ran - ), + UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), ] + execute_rollback(undo_log, agfs_client) - execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + # Directory still exists — fs_rm rollback does nothing + assert file_exists(agfs_client, path) - # Only vectordb_delete (completed=True) is reversed - vector_store.upsert.assert_called_once_with(snapshot[0]) - # fs_rm is incomplete, so it's skipped (also fs_rm is never reversible anyway) - agfs.rm.assert_not_called() - def test_fs_rm_not_reversible_even_when_completed(self): - """fs_rm is intentionally irreversible: even completed=True is skipped.""" - agfs = MagicMock() +class TestMvRollback: + def test_mv_reversed_on_rollback(self, agfs_client, test_dir): + """Real mv → rollback → content back at original location.""" + src = f"{test_dir}/mv-src" + dst = f"{test_dir}/mv-dst" + _mkdir_ok(agfs_client, src) + agfs_client.write(f"{src}/payload.txt", b"important data") + + # Forward mv + agfs_client.mv(src, dst) + assert not file_exists(agfs_client, src) + content = agfs_client.cat(f"{dst}/payload.txt") + assert content == b"important data" + undo_log = [ UndoEntry( sequence=0, - op_type="fs_rm", - params={"uri": "/local/test"}, + op_type="fs_mv", + params={"src": src, "dst": dst}, completed=True, ), ] - execute_rollback(undo_log, agfs) - agfs.rm.assert_not_called() - agfs.mv.assert_not_called() + execute_rollback(undo_log, agfs_client) + assert file_exists(agfs_client, src) + restored = agfs_client.cat(f"{src}/payload.txt") + assert restored == b"important data" -class TestMvRollback: - def test_file_moved_back_on_vectordb_failure(self): - """When VectorDB update fails (incomplete), FS mv is reversed.""" - agfs = MagicMock() + +class 
TestRecoverAll: + def test_recover_all_reverses_incomplete(self, agfs_client, test_dir): + """recover_all=True also reverses entries with completed=False.""" + new_dir = f"{test_dir}/recover-all-dir" + _mkdir_ok(agfs_client, new_dir) undo_log = [ - UndoEntry( - sequence=0, - op_type="fs_mv", - params={"src": "/local/a", "dst": "/local/b"}, - completed=True, # FS mv succeeded - ), - UndoEntry( - sequence=1, - op_type="vectordb_update_uri", - params={ - "old_uri": "viking://a", - "new_uri": "viking://b", - "old_parent_uri": "viking://", - }, - completed=False, # VectorDB update never ran - ), + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), ] + execute_rollback(undo_log, agfs_client, recover_all=True) - execute_rollback(undo_log, agfs) + assert not file_exists(agfs_client, new_dir) - # Only fs_mv (completed=True) is reversed - agfs.mv.assert_called_once_with("/local/b", "/local/a") + def test_recover_all_false_skips_incomplete(self, agfs_client, test_dir): + """recover_all=False skips entries with completed=False.""" + new_dir = f"{test_dir}/skip-incomplete" + _mkdir_ok(agfs_client, new_dir) + undo_log = [ + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), + ] + execute_rollback(undo_log, agfs_client, recover_all=False) -class TestRecoverAll: - def test_recover_all_reverses_incomplete_entries(self): - """recover_all=True (crash recovery mode) also reverses incomplete entries.""" - agfs = MagicMock() + assert file_exists(agfs_client, new_dir) + + +class TestMultiStepRollback: + def test_reverse_order_nested_dirs(self, agfs_client, test_dir): + """parent + child → rollback reverses in reverse sequence order.""" + parent = f"{test_dir}/multi-parent" + child = f"{test_dir}/multi-parent/child" + _mkdir_ok(agfs_client, parent) + _mkdir_ok(agfs_client, child) undo_log = [ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": "/local/newdir"}, - completed=True, - ), - UndoEntry( - 
sequence=1, - op_type="fs_mv", - params={"src": "/local/a", "dst": "/local/b"}, - completed=False, # Crash happened mid-operation - ), + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), + UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), ] + execute_rollback(undo_log, agfs_client) - execute_rollback(undo_log, agfs, recover_all=True) - - # Both entries should be reversed (in reverse sequence order) - assert agfs.mv.call_count == 1 - agfs.mv.assert_called_once_with("/local/b", "/local/a") - agfs.rm.assert_called_once_with("/local/newdir") + assert not file_exists(agfs_client, child) + assert not file_exists(agfs_client, parent) - def test_recover_all_false_skips_incomplete(self): - """recover_all=False (normal rollback) skips incomplete entries.""" - agfs = MagicMock() + def test_write_new_rollback(self, agfs_client, test_dir): + """New file → rollback → file deleted.""" + file_path = f"{test_dir}/new-file.txt" + agfs_client.write(file_path, b"new content") + assert file_exists(agfs_client, file_path) undo_log = [ UndoEntry( - sequence=0, - op_type="fs_mv", - params={"src": "/local/a", "dst": "/local/b"}, - completed=False, + sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True ), ] + execute_rollback(undo_log, agfs_client) - execute_rollback(undo_log, agfs, recover_all=False) - agfs.mv.assert_not_called() - + assert not file_exists(agfs_client, file_path) -class TestVectorDBRollbackEdgeCases: - def test_multi_record_vectordb_delete_rollback(self): - """Multiple VectorDB records in snapshot should all be restored.""" - agfs = MagicMock() - vector_store = AsyncMock() - ctx = MagicMock() + def test_best_effort_continues(self, agfs_client, test_dir): + """If one step fails, subsequent steps still execute.""" + real_dir = f"{test_dir}/best-effort-real" + _mkdir_ok(agfs_client, real_dir) - snapshot = [ - {"id": "r1", "uri": "viking://a", "content": "data1"}, - {"id": "r2", "uri": 
"viking://b", "content": "data2"}, - {"id": "r3", "uri": "viking://c", "content": "data3"}, - ] undo_log = [ + # seq=0: mkdir rollback on real dir → should succeed + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": real_dir}, completed=True), + # seq=1: mkdir rollback on nonexistent dir → fails silently UndoEntry( - sequence=0, - op_type="vectordb_delete", - params={ - "uris": ["viking://a", "viking://b", "viking://c"], - "records_snapshot": snapshot, - }, + sequence=1, + op_type="fs_mkdir", + params={"uri": f"{test_dir}/no-such-dir-{uuid.uuid4().hex}"}, completed=True, ), ] - execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + execute_rollback(undo_log, agfs_client) - assert vector_store.upsert.call_count == 3 + # seq=0 still executed despite seq=1 failure (reversed order: 1 runs first, then 0) + assert not file_exists(agfs_client, real_dir) - def test_empty_snapshot_vectordb_delete_rollback(self): - """Empty snapshot → nothing to restore, no error.""" - agfs = MagicMock() - vector_store = AsyncMock() - ctx = MagicMock() + def test_unknown_op_type_no_crash(self, agfs_client, test_dir): + """Unknown op_type is logged but doesn't raise.""" + undo_log = [ + UndoEntry( + sequence=0, + op_type="some_future_op", + params={"foo": "bar"}, + completed=True, + ), + ] + # Should not raise + execute_rollback(undo_log, agfs_client) + + +class TestVectorDBRollback: + async def test_vectordb_delete_rollback_restores(self, agfs_client, vector_store, request_ctx): + """upsert → delete → rollback(vectordb_delete) → record restored.""" + record_id = str(uuid.uuid4()) + record = { + "id": record_id, + "uri": f"viking://resources/del-restore-{record_id}.md", + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.3] * VECTOR_DIM, + "name": "del-restore", + "description": "test", + "abstract": "test", + } + await vector_store.upsert(record, ctx=request_ctx) + + # Snapshot before delete + 
snapshot = await vector_store.get([record_id], ctx=request_ctx) + assert len(snapshot) == 1 + + # Forward: delete + await vector_store.delete([record_id], ctx=request_ctx) + assert len(await vector_store.get([record_id], ctx=request_ctx)) == 0 undo_log = [ UndoEntry( sequence=0, op_type="vectordb_delete", - params={"uris": [], "records_snapshot": []}, + params={ + "uris": [record["uri"]], + "records_snapshot": snapshot, + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, completed=True, ), ] - execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) - vector_store.upsert.assert_not_called() - - def test_vectordb_delete_partial_restore_failure(self): - """If restoring one record fails, others should still be attempted.""" - agfs = MagicMock() - vector_store = AsyncMock() - ctx = MagicMock() - - call_count = 0 - - async def upsert_side_effect(record): - nonlocal call_count - call_count += 1 - if record["id"] == "r2": - raise Exception("upsert failed") - - vector_store.upsert = AsyncMock(side_effect=upsert_side_effect) + execute_rollback(undo_log, agfs_client, vector_store=vector_store) + + results = await vector_store.get([record_id], ctx=request_ctx) + assert len(results) == 1 + + async def test_vectordb_delete_multi_record(self, agfs_client, vector_store, request_ctx): + """3 records in snapshot → rollback → all restored.""" + records = [] + for i in range(3): + rid = str(uuid.uuid4()) + rec = { + "id": rid, + "uri": f"viking://resources/multi-{rid}.md", + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.1 * (i + 1)] * VECTOR_DIM, + "name": f"multi-{i}", + "description": "test", + "abstract": "test", + } + await vector_store.upsert(rec, ctx=request_ctx) + records.append(rec) + + ids = [r["id"] for r in records] + snapshot = await vector_store.get(ids, ctx=request_ctx) + assert len(snapshot) == 3 + + # Delete all + await 
vector_store.delete(ids, ctx=request_ctx) + assert len(await vector_store.get(ids, ctx=request_ctx)) == 0 - snapshot = [ - {"id": "r1", "uri": "viking://a"}, - {"id": "r2", "uri": "viking://b"}, # This one fails - {"id": "r3", "uri": "viking://c"}, - ] undo_log = [ UndoEntry( sequence=0, op_type="vectordb_delete", - params={"records_snapshot": snapshot}, + params={ + "uris": [r["uri"] for r in records], + "records_snapshot": snapshot, + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, completed=True, ), ] - execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) + execute_rollback(undo_log, agfs_client, vector_store=vector_store) - # All 3 should be attempted (best-effort per record) - assert call_count == 3 + results = await vector_store.get(ids, ctx=request_ctx) + assert len(results) == 3 - def test_vectordb_upsert_rollback_without_vector_store_is_noop(self): - """vectordb_upsert rollback without vector_store does nothing.""" - agfs = MagicMock() + async def test_vectordb_delete_empty_snapshot(self, agfs_client, vector_store, request_ctx): + """Empty snapshot → no-op, no error.""" undo_log = [ UndoEntry( sequence=0, - op_type="vectordb_upsert", - params={"record_id": "r1"}, + op_type="vectordb_delete", + params={ + "uris": [], + "records_snapshot": [], + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, completed=True, ), ] # Should not raise - execute_rollback(undo_log, agfs, vector_store=None) + execute_rollback(undo_log, agfs_client, vector_store=vector_store) + + async def test_vectordb_upsert_rollback_deletes(self, agfs_client, vector_store, request_ctx): + """upsert → rollback(vectordb_upsert) → record deleted.""" + record_id = str(uuid.uuid4()) + record = { + "id": record_id, + "uri": f"viking://resources/upsert-del-{record_id}.md", + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.4] * 
VECTOR_DIM, + "name": "upsert-del", + "description": "test", + "abstract": "test", + } + await vector_store.upsert(record, ctx=request_ctx) + assert len(await vector_store.get([record_id], ctx=request_ctx)) == 1 - def test_unknown_op_type_does_not_crash(self): - """Unknown op_type is logged but doesn't raise.""" - agfs = MagicMock() undo_log = [ UndoEntry( sequence=0, - op_type="some_future_op", - params={"foo": "bar"}, + op_type="vectordb_upsert", + params={ + "record_id": record_id, + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, completed=True, ), ] - execute_rollback(undo_log, agfs) + execute_rollback(undo_log, agfs_client, vector_store=vector_store) + + results = await vector_store.get([record_id], ctx=request_ctx) + assert len(results) == 0 diff --git a/tests/transaction/test_undo.py b/tests/transaction/test_undo.py index d67063d1..1a68fe6a 100644 --- a/tests/transaction/test_undo.py +++ b/tests/transaction/test_undo.py @@ -2,10 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for undo log and rollback executor.""" -from unittest.mock import AsyncMock, MagicMock +import uuid from openviking.storage.transaction.undo import UndoEntry, execute_rollback +from .conftest import VECTOR_DIM, _mkdir_ok, file_exists + class TestUndoEntry: def test_to_dict(self): @@ -35,129 +37,213 @@ def test_roundtrip(self): class TestExecuteRollback: - def test_rollback_fs_mv(self): - agfs = MagicMock() + """Integration tests for execute_rollback using real AGFS and VectorDB backends.""" + + def test_rollback_fs_mv(self, agfs_client, test_dir): + src = f"{test_dir}/src" + dst = f"{test_dir}/dst" + _mkdir_ok(agfs_client, src) + agfs_client.write(f"{src}/data.txt", b"hello") + + # Forward: mv src → dst + agfs_client.mv(src, dst) + assert not file_exists(agfs_client, src) + assert file_exists(agfs_client, dst) + undo_log = [ UndoEntry( - sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + sequence=0, + 
op_type="fs_mv", + params={"src": src, "dst": dst}, + completed=True, ), ] - execute_rollback(undo_log, agfs) - agfs.mv.assert_called_once_with("/b", "/a") + execute_rollback(undo_log, agfs_client) + + # src restored, dst gone + assert file_exists(agfs_client, src) + assert not file_exists(agfs_client, dst) + + def test_rollback_fs_rm_skipped(self, agfs_client, test_dir): + path = f"{test_dir}/will-not-delete" + _mkdir_ok(agfs_client, path) - def test_rollback_fs_rm_skipped(self): - agfs = MagicMock() undo_log = [ - UndoEntry(sequence=0, op_type="fs_rm", params={"uri": "/a"}, completed=True), + UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), ] - execute_rollback(undo_log, agfs) - agfs.mv.assert_not_called() - agfs.rm.assert_not_called() + execute_rollback(undo_log, agfs_client) + + # fs_rm rollback is a no-op; directory still exists + assert file_exists(agfs_client, path) + + def test_rollback_fs_mkdir(self, agfs_client, test_dir): + new_dir = f"{test_dir}/created" + _mkdir_ok(agfs_client, new_dir) + assert file_exists(agfs_client, new_dir) - def test_rollback_fs_mkdir(self): - agfs = MagicMock() undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": "/a/b"}, completed=True), + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=True), ] - execute_rollback(undo_log, agfs) - agfs.rm.assert_called_once_with("/a/b") + execute_rollback(undo_log, agfs_client) + + assert not file_exists(agfs_client, new_dir) + + def test_rollback_fs_write_new(self, agfs_client, test_dir): + file_path = f"{test_dir}/new-file.txt" + agfs_client.write(file_path, b"content") + assert file_exists(agfs_client, file_path) - def test_rollback_fs_write_new(self): - agfs = MagicMock() undo_log = [ UndoEntry( - sequence=0, op_type="fs_write_new", params={"uri": "/a/f.txt"}, completed=True + sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True ), ] - execute_rollback(undo_log, agfs) - 
agfs.rm.assert_called_once_with("/a/f.txt", recursive=True) + execute_rollback(undo_log, agfs_client) + + assert not file_exists(agfs_client, file_path) + + def test_rollback_reverse_order(self, agfs_client, test_dir): + """mkdir parent + child → rollback → both removed in reverse order.""" + parent = f"{test_dir}/parent" + child = f"{test_dir}/parent/child" + _mkdir_ok(agfs_client, parent) + _mkdir_ok(agfs_client, child) - def test_rollback_vectordb_upsert(self): - agfs = MagicMock() - vector_store = AsyncMock() undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_upsert", - params={"record_id": "r1"}, - completed=True, - ), + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), + UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), ] - execute_rollback(undo_log, agfs, vector_store=vector_store) - vector_store.delete.assert_called_once_with(["r1"]) + execute_rollback(undo_log, agfs_client) + + # child removed first (seq=1), then parent (seq=0) + assert not file_exists(agfs_client, child) + assert not file_exists(agfs_client, parent) + + def test_rollback_skips_incomplete(self, agfs_client, test_dir): + new_dir = f"{test_dir}/incomplete" + _mkdir_ok(agfs_client, new_dir) - def test_rollback_vectordb_update_uri(self): - agfs = MagicMock() - ctx = MagicMock() - vector_store = AsyncMock() undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_update_uri", - params={ - "old_uri": "viking://a", - "new_uri": "viking://b", - "old_parent_uri": "viking://", - }, - completed=True, - ), + UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), ] - execute_rollback(undo_log, agfs, vector_store=vector_store, ctx=ctx) - vector_store.update_uri_mapping.assert_called_once_with( - ctx=ctx, uri="viking://b", new_uri="viking://a", new_parent_uri="viking://" - ) + execute_rollback(undo_log, agfs_client) - def test_rollback_reverse_order(self): - """Rollback should process entries in 
reverse sequence order.""" - agfs = MagicMock() - call_order = [] - original_mv = agfs.mv - original_rm = agfs.rm + # completed=False → not rolled back + assert file_exists(agfs_client, new_dir) - def track_mv(*args): - call_order.append(("mv", args)) - return original_mv(*args) - - def track_rm(*args, **kwargs): - call_order.append(("rm", args)) - return original_rm(*args, **kwargs) + def test_rollback_best_effort(self, agfs_client, test_dir): + """A failing rollback entry should not prevent others from running.""" + real_dir = f"{test_dir}/real-dir" + _mkdir_ok(agfs_client, real_dir) - agfs.mv = track_mv - agfs.rm = track_rm + src = f"{test_dir}/be-src" + dst = f"{test_dir}/be-dst" + _mkdir_ok(agfs_client, dst) undo_log = [ + # seq=0: fs_mv rollback will succeed + UndoEntry(sequence=0, op_type="fs_mv", params={"src": src, "dst": dst}, completed=True), + # seq=1: fs_mkdir rollback will fail (rm on non-empty or non-existent path) UndoEntry( - sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True + sequence=1, + op_type="fs_mkdir", + params={"uri": f"{test_dir}/nonexistent-dir-xyz"}, + completed=True, ), - UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": "/c"}, completed=True), ] - execute_rollback(undo_log, agfs) - # seq=1 should be rolled back first (mkdir→rm), then seq=0 (mv→reverse mv) - assert call_order[0][0] == "rm" - assert call_order[1][0] == "mv" + # Should not raise + execute_rollback(undo_log, agfs_client) + + # seq=0 mv rollback should have executed (dst → src) + assert file_exists(agfs_client, src) + + async def test_rollback_vectordb_upsert(self, agfs_client, vector_store, request_ctx): + """Real upsert → rollback → record deleted.""" + record_id = str(uuid.uuid4()) + record = { + "id": record_id, + "uri": f"viking://resources/test-upsert-{record_id}.md", + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.1] * VECTOR_DIM, + "name": "test", + 
"description": "test record", + "abstract": "test", + } + await vector_store.upsert(record, ctx=request_ctx) + + # Confirm it exists + results = await vector_store.get([record_id], ctx=request_ctx) + assert len(results) == 1 - def test_rollback_skips_incomplete(self): - agfs = MagicMock() undo_log = [ UndoEntry( - sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=False + sequence=0, + op_type="vectordb_upsert", + params={ + "record_id": record_id, + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, + completed=True, ), ] - execute_rollback(undo_log, agfs) - agfs.mv.assert_not_called() + execute_rollback(undo_log, agfs_client, vector_store=vector_store) + + results = await vector_store.get([record_id], ctx=request_ctx) + assert len(results) == 0 + + async def test_rollback_vectordb_update_uri(self, agfs_client, vector_store, request_ctx): + """Real upsert → update_uri_mapping → rollback → URI restored.""" + record_id = str(uuid.uuid4()) + old_uri = f"viking://resources/old-{record_id}.md" + new_uri = f"viking://resources/new-{record_id}.md" + record = { + "id": record_id, + "uri": old_uri, + "parent_uri": "viking://resources/", + "account_id": "default", + "context_type": "resource", + "level": 2, + "vector": [0.2] * VECTOR_DIM, + "name": "test", + "description": "test", + "abstract": "test", + } + await vector_store.upsert(record, ctx=request_ctx) + + # Forward: update URI mapping + await vector_store.update_uri_mapping( + ctx=request_ctx, + uri=old_uri, + new_uri=new_uri, + new_parent_uri="viking://resources/", + ) - def test_rollback_best_effort(self): - """A failing rollback entry should not prevent others from running.""" - agfs = MagicMock() - agfs.rm.side_effect = Exception("boom") - agfs.mv = MagicMock() + # Verify forward operation + result = await vector_store.fetch_by_uri(new_uri, ctx=request_ctx) + assert result is not None undo_log = [ UndoEntry( - sequence=0, op_type="fs_mv", params={"src": 
"/a", "dst": "/b"}, completed=True + sequence=0, + op_type="vectordb_update_uri", + params={ + "old_uri": old_uri, + "new_uri": new_uri, + "old_parent_uri": "viking://resources/", + "_ctx_account_id": "default", + "_ctx_user_id": "test_user", + "_ctx_role": "root", + }, + completed=True, ), - UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": "/c"}, completed=True), ] - execute_rollback(undo_log, agfs) - # fs_mkdir rollback failed (rm raises), but fs_mv rollback should still run - agfs.mv.assert_called_once_with("/b", "/a") + execute_rollback(undo_log, agfs_client, vector_store=vector_store) + + # URI should be restored to old_uri + result = await vector_store.fetch_by_uri(old_uri, ctx=request_ctx) + assert result is not None From 0122a3bc943dbc63893deb1240cd8d8e56b35c33 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 17:38:16 +0800 Subject: [PATCH 07/18] refactor(transaction): make rollback fully async and unify session commit path - Convert execute_rollback/rollback_entry to async, removing sync run_async wrappers - Unify Session.commit() to delegate to commit_async(), removing duplicate phase methods - Fix SUBTREE lock to conflict with ancestor SUBTREE locks (was previously missing) - Fix mv lock mode: directory moves now use SUBTREE on both source and destination - Replace deprecated asyncio.get_event_loop() with get_running_loop() - Remove max_parallel_locks config option - Update docs (en/zh) and tests to match new async rollback signatures --- docs/en/concepts/09-transaction.md | 48 +++-- docs/zh/concepts/09-transaction.md | 48 +++-- openviking/session/session.py | 188 +++++------------- openviking/storage/transaction/path_lock.py | 32 +-- .../transaction/transaction_manager.py | 4 +- openviking/storage/transaction/undo.py | 24 +-- tests/agfs/test_fs_binding.py | 9 +- tests/agfs/test_fs_binding_s3.py | 6 +- tests/agfs/test_fs_local.py | 5 +- tests/agfs/test_fs_s3.py | 5 +- tests/integration/test_add_resource_index.py | 25 ++- 
tests/session/test_memory_dedup_actions.py | 1 - tests/session/test_session_commit.py | 9 +- tests/storage/test_semantic_dag_skip_files.py | 29 ++- tests/storage/test_semantic_dag_stats.py | 2 +- tests/test_session_task_tracking.py | 2 +- tests/transaction/test_rm_rollback.py | 40 ++-- tests/transaction/test_undo.py | 32 +-- 18 files changed, 260 insertions(+), 249 deletions(-) diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 3469ed2f..48e6d355 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -68,7 +68,7 @@ Rollback: Step 4 fails -> restore VectorDB records from snapshot. Transaction flow: ``` -1. Begin transaction, acquire lock (lock_mode="mv", SUBTREE on source + POINT on destination) +1. Begin transaction, acquire lock (lock_mode="mv", SUBTREE on both source and destination for directories) 2. Move FS file 3. Update VectorDB URIs 4. Commit -> release lock -> delete journal @@ -151,8 +151,8 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | lock_mode | Use case | Behavior | |-----------|----------|----------| | `point` | Write operations | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | -| `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path and any lock on descendants | -| `mv` | Move operations | Acquire SUBTREE lock on source path, then POINT lock on destination path | +| `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path, any lock on descendants, and any SUBTREE lock on ancestors | +| `mv` | Move operations | Directory move: SUBTREE lock on both source and destination; File move: POINT lock on source parent and destination (controlled by `src_is_dir`) | ## Lock Types (POINT vs SUBTREE) @@ -161,10 +161,10 @@ The lock mechanism uses two lock types to handle different conflict patterns: | | POINT on same 
path | SUBTREE on same path | POINT on descendant | SUBTREE on ancestor | |---|---|---|---|---| | **POINT** | Conflict | Conflict | — | Conflict | -| **SUBTREE** | Conflict | Conflict | Conflict | — | +| **SUBTREE** | Conflict | Conflict | Conflict | Conflict | - **POINT (P)**: Used for write and semantic-processing operations. Only locks a single directory. Blocks if any ancestor holds a SUBTREE lock. -- **SUBTREE (S)**: Used for rm and mv-source operations. Logically covers the entire subtree but only writes **one lock file** at the root. Before acquiring, scans all descendants for conflicting locks. +- **SUBTREE (S)**: Used for rm and mv operations. Logically covers the entire subtree but only writes **one lock file** at the root. Before acquiring, scans all descendants and ancestor directories for conflicting locks. ## Undo Log @@ -182,6 +182,17 @@ Each transaction maintains an Undo Log recording the reverse action for each ste Rollback rules: Only entries with `completed=True` are rolled back, in **reverse order**. Each step has independent try-catch (best-effort). During crash recovery, `recover_all=True` also reverses uncompleted entries to clean up partial operations. +### Context Reconstruction + +VectorDB rollback operations require a `RequestContext` (containing account_id, user_id, agent_id, role). Since the original context is unavailable during crash recovery, `_ctx_*` fields are serialized into undo params when calling record_undo: + +- `_ctx_account_id`: Account ID +- `_ctx_user_id`: User ID +- `_ctx_agent_id`: Agent ID +- `_ctx_role`: Role + +During rollback, `_reconstruct_ctx()` rebuilds the context from these fields. If reconstruction fails (missing fields), the VectorDB rollback step is skipped with a warning. + ## Lock Mechanism ### Lock Protocol @@ -223,12 +234,23 @@ Timeout (default 0 = no-wait) raises LockAcquisitionError loop until timeout (poll interval: 200ms): 1. Check target directory exists 2. 
Check if target directory is locked by another transaction - 3. Scan all descendant directories for any locks by other transactions - 4. Write SUBTREE (S) lock file (only one file, at the root path) - 5. TOCTOU double-check: re-scan descendants for new locks - - Conflict found: later one backs off (livelock prevention) - 6. Verify lock file ownership - 7. Success + - Stale lock? -> remove and retry + - Active lock? -> wait + 3. Check all ancestor directories for SUBTREE locks + - Stale lock? -> remove and retry + - Active lock? -> wait + 4. Scan all descendant directories for any locks by other transactions + - Stale lock? -> remove and retry + - Active lock? -> wait + 5. Write SUBTREE (S) lock file (only one file, at the root path) + 6. TOCTOU double-check: re-scan descendants and ancestors + - Conflict found: compare (timestamp, tx_id) + - Later one (larger timestamp/tx_id) backs off (removes own lock) to prevent livelock + - Wait and retry + 7. Verify lock file ownership (fencing token matches) + 8. Success + +Timeout (default 0 = no-wait) raises LockAcquisitionError ``` ### Lock Expiry Cleanup @@ -305,8 +327,7 @@ The transaction mechanism is enabled by default with no extra configuration need "storage": { "transaction": { "lock_timeout": 5.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 } } } @@ -316,7 +337,6 @@ The transaction mechanism is enabled by default with no extra configuration need |-----------|------|-------------|---------| | `lock_timeout` | float | Lock acquisition timeout (seconds). `0` = fail immediately if locked (default). `> 0` = wait/retry up to this many seconds. | `0.0` | | `lock_expire` | float | Stale lock expiry threshold (seconds). Locks held longer than this by a crashed process are force-released. 
| `300.0` | -| `max_parallel_locks` | int | Max parallel locks for rm/mv operations | `8` | ### QueueFS Persistence diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 6397cd2d..e4f1b8d0 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -68,7 +68,7 @@ Storage Layer (VikingFS, VectorDB, QueueManager) 事务流程: ``` -1. 开始事务,加锁(lock_mode="mv",源路径 SUBTREE + 目标路径 POINT) +1. 开始事务,加锁(lock_mode="mv",目录移动时源和目标均 SUBTREE) 2. 移动 FS 文件 3. 更新 VectorDB 中的 URI 4. 提交 → 删锁 → 删 journal @@ -151,8 +151,8 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | lock_mode | 用途 | 行为 | |-----------|------|------| | `point` | 写操作 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | -| `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁和后代目录的任何锁冲突 | -| `mv` | 移动操作 | 源路径加 SUBTREE 锁,目标路径加 POINT 锁 | +| `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁、后代目录的任何锁和祖先目录的 SUBTREE 锁冲突 | +| `mv` | 移动操作 | 目录移动:源和目标均加 SUBTREE 锁;文件移动:源父目录和目标均加 POINT 锁(通过 `src_is_dir` 控制) | ## 锁类型(POINT vs SUBTREE) @@ -161,10 +161,10 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | | 同路径 POINT | 同路径 SUBTREE | 后代 POINT | 祖先 SUBTREE | |---|---|---|---|---| | **POINT** | 冲突 | 冲突 | — | 冲突 | -| **SUBTREE** | 冲突 | 冲突 | 冲突 | — | +| **SUBTREE** | 冲突 | 冲突 | 冲突 | 冲突 | - **POINT (P)**:用于写操作和语义处理。只锁单个目录。若祖先目录持有 SUBTREE 锁则阻塞。 -- **SUBTREE (S)**:用于删除和移动源操作。逻辑上覆盖整个子树,但只在根目录写**一个锁文件**。获取前扫描所有后代确认无冲突锁。 +- **SUBTREE (S)**:用于删除和移动操作。逻辑上覆盖整个子树,但只在根目录写**一个锁文件**。获取前扫描所有后代和祖先目录确认无冲突锁。 ## Undo Log @@ -182,6 +182,17 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as 回滚规则:只回滚 `completed=True` 的条目,**反序执行**。每步独立 try-catch(best-effort)。崩溃恢复时使用 `recover_all=True`,也会回滚未完成的条目以清理部分操作残留。 +### 上下文重建 + +VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、agent_id、role)。由于崩溃恢复时原始上下文不可用,record_undo 时在 undo params 中序列化 `_ctx_*` 字段: + +- `_ctx_account_id`:账户 ID +- `_ctx_user_id`:用户 ID +- `_ctx_agent_id`:代理 ID +- `_ctx_role`:角色 + 
+回滚时通过 `_reconstruct_ctx()` 从这些字段重建上下文。若重建失败(字段缺失),该 VectorDB 回滚步骤将被跳过并记录警告。 + ## 锁机制 ### 锁协议 @@ -223,12 +234,23 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as 循环直到超时(轮询间隔:200ms): 1. 检查目标目录存在 2. 检查目标路径是否被其他事务锁定 - 3. 扫描所有后代目录,检查是否有其他事务持有的锁 - 4. 写入 SUBTREE (S) 锁文件(只写一个文件,在根路径) - 5. TOCTOU 双重检查:重新扫描后代目录 - - 发现冲突:后到者主动让步(活锁防止) - 6. 验证锁文件归属 - 7. 成功 + - 陈旧锁? → 移除后重试 + - 活跃锁? → 等待 + 3. 检查所有祖先目录是否有 SUBTREE 锁 + - 陈旧锁? → 移除后重试 + - 活跃锁? → 等待 + 4. 扫描所有后代目录,检查是否有其他事务持有的锁 + - 陈旧锁? → 移除后重试 + - 活跃锁? → 等待 + 5. 写入 SUBTREE (S) 锁文件(只写一个文件,在根路径) + 6. TOCTOU 双重检查:重新扫描后代目录和祖先目录 + - 发现冲突:比较 (timestamp, tx_id) + - 后到者(更大的 timestamp/tx_id)主动让步(删除自己的锁),防止活锁 + - 等待后重试 + 7. 验证锁文件归属(fencing token 匹配) + 8. 成功 + +超时(默认 0 = 不等待)抛出 LockAcquisitionError ``` ### 锁过期清理 @@ -305,8 +327,7 @@ INIT → ACQUIRE → EXEC → COMMIT → RELEASING → RELEASED "storage": { "transaction": { "lock_timeout": 5.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 } } } @@ -316,7 +337,6 @@ INIT → ACQUIRE → EXEC → COMMIT → RELEASING → RELEASED |------|------|------|--------| | `lock_timeout` | float | 获取锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间 | `0.0` | | `lock_expire` | float | 锁过期时间(秒),超过此时间的事务锁将被视为陈旧锁并强制释放 | `300.0` | -| `max_parallel_locks` | int | rm/mv 操作的最大并行加锁数 | `8` | ### QueueFS 持久化 diff --git a/openviking/session/session.py b/openviking/session/session.py index 88444adb..b861deea 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -220,13 +220,17 @@ def update_tool_part( self._update_message_in_jsonl() def commit(self) -> Dict[str, Any]: - """Commit session: two-phase transaction with checkpoint. + """Sync wrapper for commit_async().""" + return run_async(self.commit_async()) + + async def commit_async(self) -> Dict[str, Any]: + """Async commit session: two-phase transaction with checkpoint. Phase 1 (Archive): Lock session, write archive, clear messages, write checkpoint. LLM call (no transaction): Extract long-term memories. 
Phase 2 (Memory): Lock session, write memories + relations, update checkpoint. """ - from openviking.storage.transaction import get_transaction_manager + from openviking.storage.transaction import TransactionContext, get_transaction_manager result = { "session_id": self.session_id, @@ -247,20 +251,30 @@ def commit(self) -> Dict[str, Any]: self._compression.compression_index += 1 messages_to_archive = self._messages.copy() - summary = self._generate_archive_summary(messages_to_archive) + summary = await self._generate_archive_summary_async(messages_to_archive) archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary - run_async( - self._phase1_archive_async( - tx_manager, - session_path, - self._compression.compression_index, - messages_to_archive, - archive_abstract, - archive_overview, + async with TransactionContext( + tx_manager, "session_archive", [session_path], lock_mode="point" + ) as tx: + archive_uri = ( + f"{self._session_uri}/history/archive_{self._compression.compression_index:03d}" ) - ) + archive_path = self._viking_fs._uri_to_path(archive_uri, ctx=self.ctx) + seq = tx.record_undo("fs_write_new", {"uri": archive_path}) + await self._write_archive_async( + index=self._compression.compression_index, + messages=messages_to_archive, + abstract=archive_abstract, + overview=archive_overview, + ) + await self._write_to_agfs_async(messages=[]) + await self._write_checkpoint_async( + {"status": "archived", "archive_index": self._compression.compression_index} + ) + tx.mark_completed(seq) + await tx.commit() self._compression.original_count += len(messages_to_archive) result["archived"] = True @@ -275,13 +289,11 @@ def commit(self) -> Dict[str, Any]: logger.info( f"Starting memory extraction from {len(messages_to_archive)} archived messages" ) - memories = run_async( - self._session_compressor.extract_long_term_memories( - messages=messages_to_archive, - user=self.user, - session_id=self.session_id, - ctx=self.ctx, - ) + 
memories = await self._session_compressor.extract_long_term_memories( + messages=messages_to_archive, + user=self.user, + session_id=self.session_id, + ctx=self.ctx, ) logger.info(f"Extracted {len(memories)} memories") result["memories_extracted"] = len(memories) @@ -289,64 +301,12 @@ def commit(self) -> Dict[str, Any]: get_current_telemetry().set("memory.extracted", len(memories)) # ===== Phase 2: Memory write ===== - run_async(self._phase2_memory_async(tx_manager, session_path)) - - # Update active_count - active_count_updated = self._update_active_counts() - result["active_count_updated"] = active_count_updated - - # Update statistics - self._stats.compression_count = self._compression.compression_index - result["stats"] = { - "total_turns": self._stats.total_turns, - "contexts_used": self._stats.contexts_used, - "skills_used": self._stats.skills_used, - "memories_extracted": self._stats.memories_extracted, - } - - self._stats.total_tokens = 0 - logger.info(f"Session {self.session_id} committed") - return result - - async def _phase1_archive_async( - self, - tx_manager: Any, - session_path: str, - compression_index: int, - messages_to_archive: list, - archive_abstract: str, - archive_overview: str, - ) -> None: - """Phase 1 of commit: archive messages inside a transaction.""" - from openviking.storage.transaction import TransactionContext - - async with TransactionContext( - tx_manager, "session_archive", [session_path], lock_mode="point" - ) as tx: - archive_uri = f"{self._session_uri}/history/archive_{compression_index:03d}" - archive_path = self._viking_fs._uri_to_path(archive_uri, ctx=self.ctx) - seq = tx.record_undo("fs_write_new", {"uri": archive_path}) - self._write_archive( - index=compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, - ) - self._write_to_agfs(messages=[]) - self._write_checkpoint({"status": "archived", "archive_index": compression_index}) - tx.mark_completed(seq) - await tx.commit() 
- - async def _phase2_memory_async(self, tx_manager: Any, session_path: str) -> None: - """Phase 2 of commit: write memories inside a transaction.""" - from openviking.storage.transaction import TransactionContext - async with TransactionContext( tx_manager, "session_memory", [session_path], lock_mode="point" ) as tx: - self._write_to_agfs(self._messages) - self._write_relations() - self._write_checkpoint({"status": "completed"}) + await self._write_to_agfs_async(self._messages) + await self._write_relations_async() + await self._write_checkpoint_async({"status": "completed"}) tx.add_post_action( "enqueue_semantic", { @@ -360,70 +320,11 @@ async def _phase2_memory_async(self, tx_manager: Any, session_path: str) -> None ) await tx.commit() - async def commit_async(self) -> Dict[str, Any]: - """Async commit session: create archive, extract memories, persist.""" - result = { - "session_id": self.session_id, - "status": "committed", - "memories_extracted": 0, - "active_count_updated": 0, - "archived": False, - "stats": None, - } - if not self._messages: - get_current_telemetry().set("memory.extracted", 0) - return result - - # 1. Archive current messages - self._compression.compression_index += 1 - messages_to_archive = self._messages.copy() - - summary = await self._generate_archive_summary_async(messages_to_archive) - archive_abstract = self._extract_abstract_from_summary(summary) - archive_overview = summary - - await self._write_archive_async( - index=self._compression.compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, - ) - - self._compression.original_count += len(messages_to_archive) - result["archived"] = True - - self._messages.clear() - logger.info( - f"Archived: {len(messages_to_archive)} messages → history/archive_{self._compression.compression_index:03d}/" - ) - - # 2. 
Extract long-term memories - if self._session_compressor: - logger.info( - f"Starting memory extraction from {len(messages_to_archive)} archived messages" - ) - memories = await self._session_compressor.extract_long_term_memories( - messages=messages_to_archive, - user=self.user, - session_id=self.session_id, - ctx=self.ctx, - ) - logger.info(f"Extracted {len(memories)} memories") - result["memories_extracted"] = len(memories) - self._stats.memories_extracted += len(memories) - get_current_telemetry().set("memory.extracted", len(memories)) - - # 3. Write current messages to AGFS - await self._write_to_agfs_async(self._messages) - - # 4. Create relations - await self._write_relations_async() - - # 5. Update active_count + # Update active_count active_count_updated = await self._update_active_counts_async() result["active_count_updated"] = active_count_updated - # 6. Update statistics + # Update statistics self._stats.compression_count = self._compression.compression_index result["stats"] = { "total_turns": self._stats.total_turns, @@ -844,6 +745,23 @@ def _write_checkpoint(self, data: Dict[str, Any]) -> None: ) ) + async def _write_checkpoint_async(self, data: Dict[str, Any]) -> None: + """Write a commit checkpoint file for crash recovery (async).""" + if not self._viking_fs: + return + + checkpoint = { + **data, + "session_id": self.session_id, + "compression_index": self._compression.compression_index, + "timestamp": get_current_timestamp(), + } + await self._viking_fs.write_file( + f"{self._session_uri}/.commit_checkpoint.json", + json.dumps(checkpoint, ensure_ascii=False), + ctx=self.ctx, + ) + def _read_checkpoint(self) -> Optional[Dict[str, Any]]: """Read commit checkpoint file if it exists.""" if not self._viking_fs: diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index a67cb6bc..8b412a67 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -159,7 
+159,7 @@ async def acquire_point( ) -> bool: transaction_id = transaction.id lock_path = self._get_lock_path(path) - deadline = asyncio.get_event_loop().time() + timeout + deadline = asyncio.get_running_loop().time() + timeout try: self._agfs.stat(path) @@ -172,12 +172,12 @@ async def acquire_point( if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[POINT] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[POINT] Timeout waiting for lock on: {path}") return False await asyncio.sleep(_POLL_INTERVAL) continue - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[POINT] Timeout waiting for lock on: {path}") return False await asyncio.sleep(_POLL_INTERVAL) @@ -190,14 +190,14 @@ async def acquire_point( f"[POINT] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" ) await self._remove_lock_file(ancestor_conflict) - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" ) return False await asyncio.sleep(_POLL_INTERVAL) continue - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" ) @@ -225,7 +225,7 @@ async def acquire_point( logger.debug(f"[POINT] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: if not backed_off: await self._remove_lock_file(lock_path) return False @@ -234,7 +234,7 @@ async def acquire_point( if not await self._verify_lock_ownership(lock_path, transaction_id): logger.debug(f"[POINT] Lock ownership 
verification failed: {path}") - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: return False await asyncio.sleep(_POLL_INTERVAL) continue @@ -248,7 +248,7 @@ async def acquire_subtree( ) -> bool: transaction_id = transaction.id lock_path = self._get_lock_path(path) - deadline = asyncio.get_event_loop().time() + timeout + deadline = asyncio.get_running_loop().time() + timeout try: self._agfs.stat(path) @@ -261,12 +261,12 @@ async def acquire_subtree( if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") return False await asyncio.sleep(_POLL_INTERVAL) continue - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") return False await asyncio.sleep(_POLL_INTERVAL) @@ -280,14 +280,14 @@ async def acquire_subtree( f"[SUBTREE] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" ) await self._remove_lock_file(ancestor_conflict) - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" ) return False await asyncio.sleep(_POLL_INTERVAL) continue - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" ) @@ -300,14 +300,14 @@ async def acquire_subtree( if self.is_lock_stale(desc_conflict, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") await self._remove_lock_file(desc_conflict) - if asyncio.get_event_loop().time() >= deadline: 
+ if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[SUBTREE] Timeout waiting for descendant lock: {desc_conflict}" ) return False await asyncio.sleep(_POLL_INTERVAL) continue - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: logger.warning( f"[SUBTREE] Timeout waiting for descendant lock: {desc_conflict}" ) @@ -337,7 +337,7 @@ async def acquire_subtree( logger.debug(f"[SUBTREE] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: if not backed_off: await self._remove_lock_file(lock_path) return False @@ -346,7 +346,7 @@ async def acquire_subtree( if not await self._verify_lock_ownership(lock_path, transaction_id): logger.debug(f"[SUBTREE] Lock ownership verification failed: {path}") - if asyncio.get_event_loop().time() >= deadline: + if asyncio.get_running_loop().time() >= deadline: return False await asyncio.sleep(_POLL_INTERVAL) continue diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 7b40c6be..379684a6 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -221,7 +221,7 @@ async def _recover_one(self, tx_id: str) -> None: # Pass recover_all=True so partial (completed=False) ops are also reversed, # e.g. a directory mv that started but never finished still leaves residue. 
try: - execute_rollback( + await execute_rollback( tx.undo_log, self._agfs, vector_store=self._vector_store, @@ -397,7 +397,7 @@ async def rollback(self, transaction_id: str) -> bool: # Execute undo log (best-effort) if tx.undo_log: try: - execute_rollback( + await execute_rollback( tx.undo_log, self._agfs, vector_store=self._vector_store, diff --git a/openviking/storage/transaction/undo.py b/openviking/storage/transaction/undo.py index a77aa5aa..0b5b3113 100644 --- a/openviking/storage/transaction/undo.py +++ b/openviking/storage/transaction/undo.py @@ -73,7 +73,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "UndoEntry": ) -def execute_rollback( +async def execute_rollback( undo_log: List[UndoEntry], agfs: Any, vector_store: Optional[Any] = None, @@ -102,7 +102,7 @@ def execute_rollback( for entry in entries: try: - _rollback_entry(entry, agfs, vector_store, ctx) + await _rollback_entry(entry, agfs, vector_store, ctx) logger.info(f"[Rollback] Reversed {entry.op_type} seq={entry.sequence}") except Exception as e: logger.warning( @@ -110,15 +110,13 @@ def execute_rollback( ) -def _rollback_entry( +async def _rollback_entry( entry: UndoEntry, agfs: Any, vector_store: Optional[Any], ctx: Optional[Any], ) -> None: """Dispatch rollback for a single undo entry.""" - from openviking_cli.utils import run_async - op = entry.op_type params = entry.params @@ -146,7 +144,7 @@ def _rollback_entry( if record_id: restored_ctx = _reconstruct_ctx(params) if restored_ctx: - run_async(vector_store.delete([record_id], ctx=restored_ctx)) + await vector_store.delete([record_id], ctx=restored_ctx) else: logger.warning("[Rollback] vectordb_upsert: cannot reconstruct ctx, skipping") @@ -159,7 +157,7 @@ def _rollback_entry( records_snapshot = params.get("records_snapshot", []) for record in records_snapshot: try: - run_async(vector_store.upsert(record, ctx=restored_ctx)) + await vector_store.upsert(record, ctx=restored_ctx) except Exception as e: logger.warning(f"[Rollback] Failed to 
restore vector record: {e}") @@ -169,13 +167,11 @@ def _rollback_entry( if restored_ctx is None: logger.warning("[Rollback] vectordb_update_uri: cannot reconstruct ctx, skipping") else: - run_async( - vector_store.update_uri_mapping( - ctx=restored_ctx, - uri=params["new_uri"], - new_uri=params["old_uri"], - new_parent_uri=params.get("old_parent_uri", ""), - ) + await vector_store.update_uri_mapping( + ctx=restored_ctx, + uri=params["new_uri"], + new_uri=params["old_uri"], + new_parent_uri=params.get("old_parent_uri", ""), ) else: diff --git a/tests/agfs/test_fs_binding.py b/tests/agfs/test_fs_binding.py index ed8d3d33..e55ff6fd 100644 --- a/tests/agfs/test_fs_binding.py +++ b/tests/agfs/test_fs_binding.py @@ -13,6 +13,7 @@ import pytest +from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -32,16 +33,16 @@ async def viking_fs_binding_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize TransactionManager and VikingFS with client + init_transaction_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) # make sure default/temp directory exists await vfs.mkdir("viking://temp/", exist_ok=True) - # Ensure test directory exists - await vfs.mkdir("viking://temp/", exist_ok=True) - yield vfs + reset_transaction_manager() + @pytest.mark.asyncio class TestVikingFSBindingLocal: diff --git a/tests/agfs/test_fs_binding_s3.py b/tests/agfs/test_fs_binding_s3.py index 692b869d..aa7a753b 100644 --- a/tests/agfs/test_fs_binding_s3.py +++ b/tests/agfs/test_fs_binding_s3.py @@ -13,6 +13,7 @@ import pytest +from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -57,11 +58,14 @@ 
async def viking_fs_binding_s3_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize TransactionManager and VikingFS with client + init_transaction_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) yield vfs + reset_transaction_manager() + @pytest.mark.asyncio class TestVikingFSBindingS3: diff --git a/tests/agfs/test_fs_local.py b/tests/agfs/test_fs_local.py index 3a428ed6..9e59f610 100644 --- a/tests/agfs/test_fs_local.py +++ b/tests/agfs/test_fs_local.py @@ -10,6 +10,7 @@ import pytest from openviking.agfs_manager import AGFSManager +from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -39,13 +40,15 @@ async def viking_fs_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize TransactionManager and VikingFS with client + init_transaction_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) # make sure default/temp directory exists await vfs.mkdir("viking://temp/", exist_ok=True) yield vfs + reset_transaction_manager() # AGFSManager.stop is synchronous manager.stop() diff --git a/tests/agfs/test_fs_s3.py b/tests/agfs/test_fs_s3.py index ff9647e4..67a54e40 100644 --- a/tests/agfs/test_fs_s3.py +++ b/tests/agfs/test_fs_s3.py @@ -13,6 +13,7 @@ import pytest from openviking.agfs_manager import AGFSManager +from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager from openviking.storage.viking_fs import VikingFS, init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -82,11 +83,13 @@ async def viking_fs_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize VikingFS with client + # Initialize TransactionManager and VikingFS with client + 
init_transaction_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) yield vfs + reset_transaction_manager() # AGFSManager.stop is synchronous manager.stop() diff --git a/tests/integration/test_add_resource_index.py b/tests/integration/test_add_resource_index.py index 2a35462a..84e1ebbe 100644 --- a/tests/integration/test_add_resource_index.py +++ b/tests/integration/test_add_resource_index.py @@ -1,6 +1,6 @@ import json import os -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -86,6 +86,19 @@ async def test_add_resource_indexing_logic(test_config, tmp_path): mock_agfs = MockLocalAGFS(root_path=tmp_path / "mock_agfs_root") + # Create mock parse result for Phase 1 (media processor) + mock_parse_result = MagicMock() + mock_parse_result.source_path = str(resource_file) + mock_parse_result.meta = {} + mock_parse_result.temp_dir_path = "/tmp/fake_temp_dir" + mock_parse_result.warnings = [] + mock_parse_result.source_format = "markdown" + + # Create mock context tree for Phase 2/3 (tree builder) + mock_context_tree = MagicMock() + mock_context_tree.root = MagicMock() + mock_context_tree.root.uri = "viking://resources/test_doc" + # Patch the Summarizer and IndexBuilder to verify calls with ( patch( @@ -94,6 +107,16 @@ async def test_add_resource_indexing_logic(test_config, tmp_path): patch("openviking.utils.agfs_utils.create_agfs_client", return_value=mock_agfs), patch("openviking.agfs_manager.AGFSManager.start"), patch("openviking.agfs_manager.AGFSManager.stop"), + patch( + "openviking.utils.media_processor.UnifiedResourceProcessor.process", + new_callable=AsyncMock, + return_value=mock_parse_result, + ), + patch( + "openviking.parse.tree_builder.TreeBuilder.finalize_from_temp", + new_callable=AsyncMock, + return_value=mock_context_tree, + ), ): mock_summarize.return_value = {"status": "success"} diff --git a/tests/session/test_memory_dedup_actions.py 
b/tests/session/test_memory_dedup_actions.py index e7bb1a80..0f8f94e6 100644 --- a/tests/session/test_memory_dedup_actions.py +++ b/tests/session/test_memory_dedup_actions.py @@ -179,7 +179,6 @@ async def test_find_similar_memories_uses_path_must_filter_and__score(self): assert len(similar) == 1 assert similar[0].uri == existing.uri call = vikingdb.search_similar_memories.await_args.kwargs - assert call["account_id"] == "acc1" assert call["owner_space"] == _make_user().user_space_name() assert call["category_uri_prefix"] == ( f"viking://user/{_make_user().user_space_name()}/memories/preferences/" diff --git a/tests/session/test_session_commit.py b/tests/session/test_session_commit.py index 60a42d02..efa57fc7 100644 --- a/tests/session/test_session_commit.py +++ b/tests/session/test_session_commit.py @@ -6,9 +6,6 @@ from openviking import AsyncOpenViking from openviking.message import TextPart from openviking.session import Session -from tests.utils.mock_context import make_test_ctx - -ctx = make_test_ctx() class TestCommit: @@ -98,12 +95,14 @@ async def test_active_count_incremented_after_commit(self, client_with_resource_ """ client, uri = client_with_resource_sync vikingdb = client._client.service.vikingdb_manager + # Use the client's own context to match the account_id used when adding the resource + client_ctx = client._client._ctx # Look up the record by URI records_before = await vikingdb.get_context_by_uri( uri=uri, limit=1, - ctx=ctx, + ctx=client_ctx, ) assert records_before, f"Resource not found for URI: {uri}" count_before = records_before[0].get("active_count") or 0 @@ -121,7 +120,7 @@ async def test_active_count_incremented_after_commit(self, client_with_resource_ records_after = await vikingdb.get_context_by_uri( uri=uri, limit=1, - ctx=ctx, + ctx=client_ctx, ) assert records_after, f"Record disappeared after commit for URI: {uri}" count_after = records_after[0].get("active_count") or 0 diff --git a/tests/storage/test_semantic_dag_skip_files.py 
b/tests/storage/test_semantic_dag_skip_files.py index 75b23314..3c6fdd61 100644 --- a/tests/storage/test_semantic_dag_skip_files.py +++ b/tests/storage/test_semantic_dag_skip_files.py @@ -1,6 +1,8 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. # SPDX-License-Identifier: Apache-2.0 +from unittest.mock import AsyncMock, MagicMock + import pytest from openviking.server.identity import RequestContext, Role @@ -8,6 +10,24 @@ from openviking_cli.session.user_id import UserIdentifier +def _mock_transaction_layer(monkeypatch): + """Patch transaction layer to no-op for DAG tests.""" + mock_tx = MagicMock() + mock_tx.commit = AsyncMock() + monkeypatch.setattr( + "openviking.storage.transaction.context_manager.TransactionContext.__aenter__", + AsyncMock(return_value=mock_tx), + ) + monkeypatch.setattr( + "openviking.storage.transaction.context_manager.TransactionContext.__aexit__", + AsyncMock(return_value=False), + ) + monkeypatch.setattr( + "openviking.storage.transaction.get_transaction_manager", + lambda: MagicMock(), + ) + + class _FakeVikingFS: def __init__(self, tree): self._tree = tree @@ -19,6 +39,9 @@ async def ls(self, uri, ctx=None): async def write_file(self, path, content, ctx=None): self.writes.append((path, content)) + def _uri_to_path(self, uri, ctx=None): + return uri.replace("viking://", "/local/acc1/") + class _FakeProcessor: def __init__(self): @@ -47,7 +70,8 @@ async def _vectorize_single_file( @pytest.mark.asyncio async def test_messages_jsonl_excluded_from_summary(monkeypatch): """messages.jsonl should be skipped by _list_dir and never summarized.""" - root_uri = "viking://sessions/test-session" + _mock_transaction_layer(monkeypatch) + root_uri = "viking://session/test-session" tree = { root_uri: [ {"name": "messages.jsonl", "isDir": False}, @@ -77,7 +101,8 @@ async def test_messages_jsonl_excluded_from_summary(monkeypatch): @pytest.mark.asyncio async def test_messages_jsonl_excluded_in_subdirectory(monkeypatch): """messages.jsonl 
in a subdirectory should also be skipped.""" - root_uri = "viking://sessions/test-session" + _mock_transaction_layer(monkeypatch) + root_uri = "viking://session/test-session" tree = { root_uri: [ {"name": "subdir", "isDir": True}, diff --git a/tests/storage/test_semantic_dag_stats.py b/tests/storage/test_semantic_dag_stats.py index 202db790..85f4cb8b 100644 --- a/tests/storage/test_semantic_dag_stats.py +++ b/tests/storage/test_semantic_dag_stats.py @@ -76,7 +76,7 @@ async def test_semantic_dag_stats_collects_nodes(monkeypatch): AsyncMock(return_value=False), ) monkeypatch.setattr( - "openviking.storage.transaction.transaction_manager.get_transaction_manager", + "openviking.storage.transaction.get_transaction_manager", lambda: MagicMock(), ) diff --git a/tests/test_session_task_tracking.py b/tests/test_session_task_tracking.py index 8a61fe4d..1306d500 100644 --- a/tests/test_session_task_tracking.py +++ b/tests/test_session_task_tracking.py @@ -181,7 +181,7 @@ async def test_task_failed_when_memory_extraction_raises(api_client): async def failing_extract(_context, _user, _session_id): raise RuntimeError("memory_extraction_failed: synthetic extractor error") - service.sessions._session_compressor.extractor.extract_strict = failing_extract + service.sessions._session_compressor.extractor.extract = failing_extract resp = await client.post(f"/api/v1/sessions/{session_id}/commit", params={"wait": False}) task_id = resp.json()["result"]["task_id"] diff --git a/tests/transaction/test_rm_rollback.py b/tests/transaction/test_rm_rollback.py index 68f5e8b4..604b5f50 100644 --- a/tests/transaction/test_rm_rollback.py +++ b/tests/transaction/test_rm_rollback.py @@ -10,7 +10,7 @@ class TestRmRollback: - def test_fs_rm_not_reversible(self, agfs_client, test_dir): + async def test_fs_rm_not_reversible(self, agfs_client, test_dir): """fs_rm is intentionally irreversible: even completed=True is a no-op.""" path = f"{test_dir}/rm-target" _mkdir_ok(agfs_client, path) @@ -18,14 +18,14 
@@ def test_fs_rm_not_reversible(self, agfs_client, test_dir): undo_log = [ UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # Directory still exists — fs_rm rollback does nothing assert file_exists(agfs_client, path) class TestMvRollback: - def test_mv_reversed_on_rollback(self, agfs_client, test_dir): + async def test_mv_reversed_on_rollback(self, agfs_client, test_dir): """Real mv → rollback → content back at original location.""" src = f"{test_dir}/mv-src" dst = f"{test_dir}/mv-dst" @@ -46,7 +46,7 @@ def test_mv_reversed_on_rollback(self, agfs_client, test_dir): completed=True, ), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) assert file_exists(agfs_client, src) restored = agfs_client.cat(f"{src}/payload.txt") @@ -54,7 +54,7 @@ def test_mv_reversed_on_rollback(self, agfs_client, test_dir): class TestRecoverAll: - def test_recover_all_reverses_incomplete(self, agfs_client, test_dir): + async def test_recover_all_reverses_incomplete(self, agfs_client, test_dir): """recover_all=True also reverses entries with completed=False.""" new_dir = f"{test_dir}/recover-all-dir" _mkdir_ok(agfs_client, new_dir) @@ -62,11 +62,11 @@ def test_recover_all_reverses_incomplete(self, agfs_client, test_dir): undo_log = [ UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), ] - execute_rollback(undo_log, agfs_client, recover_all=True) + await execute_rollback(undo_log, agfs_client, recover_all=True) assert not file_exists(agfs_client, new_dir) - def test_recover_all_false_skips_incomplete(self, agfs_client, test_dir): + async def test_recover_all_false_skips_incomplete(self, agfs_client, test_dir): """recover_all=False skips entries with completed=False.""" new_dir = f"{test_dir}/skip-incomplete" _mkdir_ok(agfs_client, new_dir) @@ -74,13 +74,13 @@ def 
test_recover_all_false_skips_incomplete(self, agfs_client, test_dir): undo_log = [ UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), ] - execute_rollback(undo_log, agfs_client, recover_all=False) + await execute_rollback(undo_log, agfs_client, recover_all=False) assert file_exists(agfs_client, new_dir) class TestMultiStepRollback: - def test_reverse_order_nested_dirs(self, agfs_client, test_dir): + async def test_reverse_order_nested_dirs(self, agfs_client, test_dir): """parent + child → rollback reverses in reverse sequence order.""" parent = f"{test_dir}/multi-parent" child = f"{test_dir}/multi-parent/child" @@ -91,12 +91,12 @@ def test_reverse_order_nested_dirs(self, agfs_client, test_dir): UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) assert not file_exists(agfs_client, child) assert not file_exists(agfs_client, parent) - def test_write_new_rollback(self, agfs_client, test_dir): + async def test_write_new_rollback(self, agfs_client, test_dir): """New file → rollback → file deleted.""" file_path = f"{test_dir}/new-file.txt" agfs_client.write(file_path, b"new content") @@ -107,11 +107,11 @@ def test_write_new_rollback(self, agfs_client, test_dir): sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True ), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) assert not file_exists(agfs_client, file_path) - def test_best_effort_continues(self, agfs_client, test_dir): + async def test_best_effort_continues(self, agfs_client, test_dir): """If one step fails, subsequent steps still execute.""" real_dir = f"{test_dir}/best-effort-real" _mkdir_ok(agfs_client, real_dir) @@ -127,12 +127,12 @@ def test_best_effort_continues(self, agfs_client, test_dir): completed=True, ), 
] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # seq=0 still executed despite seq=1 failure (reversed order: 1 runs first, then 0) assert not file_exists(agfs_client, real_dir) - def test_unknown_op_type_no_crash(self, agfs_client, test_dir): + async def test_unknown_op_type_no_crash(self, agfs_client, test_dir): """Unknown op_type is logged but doesn't raise.""" undo_log = [ UndoEntry( @@ -143,7 +143,7 @@ def test_unknown_op_type_no_crash(self, agfs_client, test_dir): ), ] # Should not raise - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) class TestVectorDBRollback: @@ -186,7 +186,7 @@ async def test_vectordb_delete_rollback_restores(self, agfs_client, vector_store completed=True, ), ] - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await execute_rollback(undo_log, agfs_client, vector_store=vector_store) results = await vector_store.get([record_id], ctx=request_ctx) assert len(results) == 1 @@ -233,7 +233,7 @@ async def test_vectordb_delete_multi_record(self, agfs_client, vector_store, req completed=True, ), ] - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await execute_rollback(undo_log, agfs_client, vector_store=vector_store) results = await vector_store.get(ids, ctx=request_ctx) assert len(results) == 3 @@ -255,7 +255,7 @@ async def test_vectordb_delete_empty_snapshot(self, agfs_client, vector_store, r ), ] # Should not raise - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await execute_rollback(undo_log, agfs_client, vector_store=vector_store) async def test_vectordb_upsert_rollback_deletes(self, agfs_client, vector_store, request_ctx): """upsert → rollback(vectordb_upsert) → record deleted.""" @@ -288,7 +288,7 @@ async def test_vectordb_upsert_rollback_deletes(self, agfs_client, vector_store, completed=True, ), ] - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await 
execute_rollback(undo_log, agfs_client, vector_store=vector_store) results = await vector_store.get([record_id], ctx=request_ctx) assert len(results) == 0 diff --git a/tests/transaction/test_undo.py b/tests/transaction/test_undo.py index 1a68fe6a..aff57887 100644 --- a/tests/transaction/test_undo.py +++ b/tests/transaction/test_undo.py @@ -39,7 +39,7 @@ def test_roundtrip(self): class TestExecuteRollback: """Integration tests for execute_rollback using real AGFS and VectorDB backends.""" - def test_rollback_fs_mv(self, agfs_client, test_dir): + async def test_rollback_fs_mv(self, agfs_client, test_dir): src = f"{test_dir}/src" dst = f"{test_dir}/dst" _mkdir_ok(agfs_client, src) @@ -58,25 +58,25 @@ def test_rollback_fs_mv(self, agfs_client, test_dir): completed=True, ), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # src restored, dst gone assert file_exists(agfs_client, src) assert not file_exists(agfs_client, dst) - def test_rollback_fs_rm_skipped(self, agfs_client, test_dir): + async def test_rollback_fs_rm_skipped(self, agfs_client, test_dir): path = f"{test_dir}/will-not-delete" _mkdir_ok(agfs_client, path) undo_log = [ UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # fs_rm rollback is a no-op; directory still exists assert file_exists(agfs_client, path) - def test_rollback_fs_mkdir(self, agfs_client, test_dir): + async def test_rollback_fs_mkdir(self, agfs_client, test_dir): new_dir = f"{test_dir}/created" _mkdir_ok(agfs_client, new_dir) assert file_exists(agfs_client, new_dir) @@ -84,11 +84,11 @@ def test_rollback_fs_mkdir(self, agfs_client, test_dir): undo_log = [ UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=True), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) assert not file_exists(agfs_client, new_dir) - def 
test_rollback_fs_write_new(self, agfs_client, test_dir): + async def test_rollback_fs_write_new(self, agfs_client, test_dir): file_path = f"{test_dir}/new-file.txt" agfs_client.write(file_path, b"content") assert file_exists(agfs_client, file_path) @@ -98,11 +98,11 @@ def test_rollback_fs_write_new(self, agfs_client, test_dir): sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True ), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) assert not file_exists(agfs_client, file_path) - def test_rollback_reverse_order(self, agfs_client, test_dir): + async def test_rollback_reverse_order(self, agfs_client, test_dir): """mkdir parent + child → rollback → both removed in reverse order.""" parent = f"{test_dir}/parent" child = f"{test_dir}/parent/child" @@ -113,25 +113,25 @@ def test_rollback_reverse_order(self, agfs_client, test_dir): UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # child removed first (seq=1), then parent (seq=0) assert not file_exists(agfs_client, child) assert not file_exists(agfs_client, parent) - def test_rollback_skips_incomplete(self, agfs_client, test_dir): + async def test_rollback_skips_incomplete(self, agfs_client, test_dir): new_dir = f"{test_dir}/incomplete" _mkdir_ok(agfs_client, new_dir) undo_log = [ UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), ] - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # completed=False → not rolled back assert file_exists(agfs_client, new_dir) - def test_rollback_best_effort(self, agfs_client, test_dir): + async def test_rollback_best_effort(self, agfs_client, test_dir): """A failing rollback entry should not prevent others from running.""" real_dir = 
f"{test_dir}/real-dir" _mkdir_ok(agfs_client, real_dir) @@ -152,7 +152,7 @@ def test_rollback_best_effort(self, agfs_client, test_dir): ), ] # Should not raise - execute_rollback(undo_log, agfs_client) + await execute_rollback(undo_log, agfs_client) # seq=0 mv rollback should have executed (dst → src) assert file_exists(agfs_client, src) @@ -191,7 +191,7 @@ async def test_rollback_vectordb_upsert(self, agfs_client, vector_store, request completed=True, ), ] - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await execute_rollback(undo_log, agfs_client, vector_store=vector_store) results = await vector_store.get([record_id], ctx=request_ctx) assert len(results) == 0 @@ -242,7 +242,7 @@ async def test_rollback_vectordb_update_uri(self, agfs_client, vector_store, req completed=True, ), ] - execute_rollback(undo_log, agfs_client, vector_store=vector_store) + await execute_rollback(undo_log, agfs_client, vector_store=vector_store) # URI should be restored to old_uri result = await vector_store.fetch_by_uri(old_uri, ctx=request_ctx) From 74cba6b6e17d5c3419983c0abca77f431a045e3a Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 18:11:00 +0800 Subject: [PATCH 08/18] fix: tests --- .../storage/viking_vector_index_backend.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index d7a0ebe8..fd3b6e11 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -851,13 +851,21 @@ async def update_uri_mapping( async def increment_active_count(self, ctx: RequestContext, uris: List[str]) -> int: updated = 0 for uri in uris: - records = await self.get_context_by_uri(uri=uri, limit=1, ctx=ctx) + records = await self.get_context_by_uri(uri=uri, limit=100, ctx=ctx) if not records: continue - record = records[0] - current = int(record.get("active_count", 0) 
or 0) - record["active_count"] = current + 1 - if await self.upsert(record, ctx=ctx): + record_ids = [r["id"] for r in records if r.get("id")] + if not record_ids: + continue + # Re-fetch by ID to get full records including vectors + full_records = await self.get(record_ids, ctx=ctx) + uri_updated = False + for record in full_records: + current = int(record.get("active_count", 0) or 0) + record["active_count"] = current + 1 + if await self.upsert(record, ctx=ctx): + uri_updated = True + if uri_updated: updated += 1 return updated From c460ac6bccea8c6096759203af2e767d6ef2accc Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 21:15:24 +0800 Subject: [PATCH 09/18] refactor(transaction): simplify session commit and add redo-based crash recovery Session commit no longer wraps archive phase in a transaction. Phase 2 uses redo semantics so crashed memory-extraction can be replayed from archive. PathLock stale-lock cleanup no longer redundantly re-checks timeout. Semantic processor vectorization runs concurrently via asyncio.gather. --- openviking/session/session.py | 94 +++++++------ .../storage/queuefs/semantic_processor.py | 51 ++----- .../storage/transaction/context_manager.py | 10 +- openviking/storage/transaction/path_lock.py | 26 ---- .../transaction/transaction_manager.py | 132 ++++++++++++++++-- .../storage/viking_vector_index_backend.py | 2 +- 6 files changed, 192 insertions(+), 123 deletions(-) diff --git a/openviking/session/session.py b/openviking/session/session.py index b861deea..5726b8e3 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -224,11 +224,10 @@ def commit(self) -> Dict[str, Any]: return run_async(self.commit_async()) async def commit_async(self) -> Dict[str, Any]: - """Async commit session: two-phase transaction with checkpoint. + """Async commit session: two-phase approach. - Phase 1 (Archive): Lock session, write archive, clear messages, write checkpoint. 
- LLM call (no transaction): Extract long-term memories. - Phase 2 (Memory): Lock session, write memories + relations, update checkpoint. + Phase 1 (Archive, no transaction): Write archive, clear messages. + Phase 2 (Memory, transaction with redo semantics): Extract memories, write, enqueue. """ from openviking.storage.transaction import TransactionContext, get_transaction_manager @@ -245,9 +244,8 @@ async def commit_async(self) -> Dict[str, Any]: return result tx_manager = get_transaction_manager() - session_path = self._viking_fs._uri_to_path(self._session_uri, ctx=self.ctx) - # ===== Phase 1: Archive ===== + # ===== Preparation (no transaction) ===== self._compression.compression_index += 1 messages_to_archive = self._messages.copy() @@ -255,58 +253,62 @@ async def commit_async(self) -> Dict[str, Any]: archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary - async with TransactionContext( - tx_manager, "session_archive", [session_path], lock_mode="point" - ) as tx: - archive_uri = ( - f"{self._session_uri}/history/archive_{self._compression.compression_index:03d}" - ) - archive_path = self._viking_fs._uri_to_path(archive_uri, ctx=self.ctx) - seq = tx.record_undo("fs_write_new", {"uri": archive_path}) - await self._write_archive_async( - index=self._compression.compression_index, - messages=messages_to_archive, - abstract=archive_abstract, - overview=archive_overview, - ) - await self._write_to_agfs_async(messages=[]) - await self._write_checkpoint_async( - {"status": "archived", "archive_index": self._compression.compression_index} - ) - tx.mark_completed(seq) - await tx.commit() + # ===== Phase 1: Archive (no transaction, no lock) ===== + archive_uri = ( + f"{self._session_uri}/history/archive_{self._compression.compression_index:03d}" + ) + await self._write_archive_async( + index=self._compression.compression_index, + messages=messages_to_archive, + abstract=archive_abstract, + overview=archive_overview, + ) + await 
self._write_to_agfs_async(messages=[]) + self._messages.clear() self._compression.original_count += len(messages_to_archive) result["archived"] = True - self._messages.clear() logger.info( f"Archived: {len(messages_to_archive)} messages → " f"history/archive_{self._compression.compression_index:03d}/" ) - # ===== LLM call (no transaction) ===== - if self._session_compressor: - logger.info( - f"Starting memory extraction from {len(messages_to_archive)} archived messages" - ) - memories = await self._session_compressor.extract_long_term_memories( - messages=messages_to_archive, - user=self.user, - session_id=self.session_id, - ctx=self.ctx, - ) - logger.info(f"Extracted {len(memories)} memories") - result["memories_extracted"] = len(memories) - self._stats.memories_extracted += len(memories) - get_current_telemetry().set("memory.extracted", len(memories)) - - # ===== Phase 2: Memory write ===== + # ===== Phase 2: Memory extraction + write (transaction, redo semantics) ===== async with TransactionContext( - tx_manager, "session_memory", [session_path], lock_mode="point" + tx_manager, + "session_memory", + [], + lock_mode="none", ) as tx: + # Store redo info so _recover_one can redo from archive on crash + tx.record.init_info.update( + { + "archive_uri": archive_uri, + "session_uri": self._session_uri, + "account_id": self.ctx.account_id, + "user_id": self.ctx.user.user_id, + "agent_id": self.ctx.user.agent_id, + "role": self.ctx.role.value, + } + ) + + if self._session_compressor: + logger.info( + f"Starting memory extraction from {len(messages_to_archive)} archived messages" + ) + memories = await self._session_compressor.extract_long_term_memories( + messages=messages_to_archive, + user=self.user, + session_id=self.session_id, + ctx=self.ctx, + ) + logger.info(f"Extracted {len(memories)} memories") + result["memories_extracted"] = len(memories) + self._stats.memories_extracted += len(memories) + get_current_telemetry().set("memory.extracted", len(memories)) + await 
self._write_to_agfs_async(self._messages) await self._write_relations_async() - await self._write_checkpoint_async({"status": "completed"}) tx.add_post_action( "enqueue_semantic", { diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index e7e108f2..c7d93eda 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -343,10 +343,8 @@ async def _process_single_directory( # 1. Collect .abstract.md from subdirectories children_abstracts = await self._collect_children_abstracts(children_uris) - # 2. Generate file summaries (vectorize inline, not via enqueue) - file_summaries = await self._generate_file_summaries( - file_paths, context_type=context_type, parent_uri=uri, enqueue_files=False - ) + # 2. Concurrently generate summaries for files in directory + file_summaries = await self._generate_file_summaries(file_paths) # 3. Generate .overview.md overview = await self._generate_overview(uri, file_summaries, children_abstracts) @@ -360,22 +358,23 @@ async def _process_single_directory( logger.debug(f"Generated overview and abstract for {uri}") - # 6. Vectorize directory and files (all inside the lock) - try: - await self._vectorize_directory_simple(uri, context_type, abstract, overview) - except Exception as e: - logger.error(f"Failed to vectorize directory {uri}: {e}", exc_info=True) - - for fp, summary in zip(file_paths, file_summaries): - try: - await self._vectorize_single_file( + # 6. 
Vectorize directory and files concurrently + vectorize_tasks = [ + self._vectorize_directory_simple(uri, context_type, abstract, overview), + *( + self._vectorize_single_file( parent_uri=uri, context_type=context_type, file_path=fp, summary_dict=summary, ) - except Exception as e: - logger.error(f"Failed to vectorize file {fp}: {e}", exc_info=True) + for fp, summary in zip(file_paths, file_summaries) + ), + ] + results = await asyncio.gather(*vectorize_tasks, return_exceptions=True) + for result in results: + if isinstance(result, Exception): + logger.error(f"Vectorization failed: {result}", exc_info=True) await tx.commit() except LockAcquisitionError: @@ -395,32 +394,12 @@ async def _collect_children_abstracts(self, children_uris: List[str]) -> List[Di async def _generate_file_summaries( self, file_paths: List[str], - context_type: Optional[str] = None, - parent_uri: Optional[str] = None, - enqueue_files: bool = False, ) -> List[Dict[str, str]]: """Concurrently generate file summaries.""" if not file_paths: return [] - async def generate_one_summary(file_path: str) -> Dict[str, str]: - summary = await self._generate_single_file_summary(file_path, ctx=self._current_ctx) - if enqueue_files and context_type and parent_uri: - try: - await self._vectorize_single_file( - parent_uri=parent_uri, - context_type=context_type, - file_path=file_path, - summary_dict=summary, - ) - except Exception as e: - logger.error( - f"Failed to vectorize file {file_path}: {e}", - exc_info=True, - ) - return summary - - tasks = [generate_one_summary(fp) for fp in file_paths] + tasks = [self._generate_single_file_summary(fp, ctx=self._current_ctx) for fp in file_paths] return await asyncio.gather(*tasks) async def _generate_text_summary( diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py index 8272b91c..09697e10 100644 --- a/openviking/storage/transaction/context_manager.py +++ 
b/openviking/storage/transaction/context_manager.py @@ -74,7 +74,15 @@ async def __aenter__(self) -> "TransactionContext": logger.warning(f"[Transaction] Failed to write journal for {tx_id}: {e}") success = False - if self._lock_mode == "subtree": + if self._lock_mode == "none": + # No lock acquisition — transition directly to EXEC status + tx = self._tx_manager.get_transaction(tx_id) + if tx: + from openviking.storage.transaction.transaction_record import TransactionStatus + + tx.update_status(TransactionStatus.EXEC) + success = True + elif self._lock_mode == "subtree": for path in self._lock_paths: success = await self._tx_manager.acquire_lock_subtree(tx_id, path) if not success: diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index 8b412a67..1097b0da 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -172,10 +172,6 @@ async def acquire_point( if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[POINT] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) - if asyncio.get_running_loop().time() >= deadline: - logger.warning(f"[POINT] Timeout waiting for lock on: {path}") - return False - await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[POINT] Timeout waiting for lock on: {path}") @@ -190,12 +186,6 @@ async def acquire_point( f"[POINT] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" ) await self._remove_lock_file(ancestor_conflict) - if asyncio.get_running_loop().time() >= deadline: - logger.warning( - f"[POINT] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" - ) - return False - await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_running_loop().time() >= deadline: logger.warning( @@ -261,10 +251,6 @@ async def acquire_subtree( if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[SUBTREE] Removing 
stale lock: {lock_path}") await self._remove_lock_file(lock_path) - if asyncio.get_running_loop().time() >= deadline: - logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") - return False - await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_running_loop().time() >= deadline: logger.warning(f"[SUBTREE] Timeout waiting for lock on: {path}") @@ -280,12 +266,6 @@ async def acquire_subtree( f"[SUBTREE] Removing stale ancestor SUBTREE lock: {ancestor_conflict}" ) await self._remove_lock_file(ancestor_conflict) - if asyncio.get_running_loop().time() >= deadline: - logger.warning( - f"[SUBTREE] Timeout waiting for ancestor SUBTREE lock: {ancestor_conflict}" - ) - return False - await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_running_loop().time() >= deadline: logger.warning( @@ -300,12 +280,6 @@ async def acquire_subtree( if self.is_lock_stale(desc_conflict, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") await self._remove_lock_file(desc_conflict) - if asyncio.get_running_loop().time() >= deadline: - logger.warning( - f"[SUBTREE] Timeout waiting for descendant lock: {desc_conflict}" - ) - return False - await asyncio.sleep(_POLL_INTERVAL) continue if asyncio.get_running_loop().time() >= deadline: logger.warning( diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 379684a6..cb395432 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -184,7 +184,8 @@ async def _recover_one(self, tx_id: str) -> None: Recovery strategy by status: COMMITTED + post_actions → replay post_actions (enqueue etc.), then clean up COMMITTED, no post_actions / RELEASED → just clean up - EXEC / FAIL / RELEASING → rollback completed+partial ops, then clean up + EXEC / FAIL / RELEASING, all ops completed → roll forward (commit), then clean up + EXEC / FAIL / RELEASING, 
partial ops → rollback completed+partial ops, then clean up INIT / ACQUIRE → nothing executed yet, just clean up """ from openviking.storage.transaction.undo import execute_rollback @@ -217,18 +218,43 @@ async def _recover_one(self, tx_id: str) -> None: if not tx.locks: await self._cleanup_orphan_locks_from_init_info(tx_id, tx.init_info) else: - # EXEC / FAIL / RELEASING: process crashed mid-operation — rollback - # Pass recover_all=True so partial (completed=False) ops are also reversed, - # e.g. a directory mv that started but never finished still leaves residue. - try: - await execute_rollback( - tx.undo_log, - self._agfs, - vector_store=self._vector_store, - recover_all=True, - ) - except Exception as e: - logger.warning(f"Rollback during recovery failed for tx {tx_id}: {e}") + # EXEC / FAIL / RELEASING: process crashed mid-operation + operation = tx.init_info.get("operation", "") + if operation == "session_memory": + # Redo: re-extract memories from archive and write + try: + await self._redo_session_memory(tx) + except Exception as e: + logger.warning(f"Redo session_memory failed for tx {tx_id}: {e}") + elif ( + tx.status == TransactionStatus.EXEC + and tx.undo_log + and all(e.completed for e in tx.undo_log) + ): + # All operations completed successfully but commit didn't persist. + # Roll forward: treat as committed to avoid data loss from rollback + # of irreversible operations (e.g. mv's fs_rm). + logger.info(f"All ops completed for tx {tx_id}, rolling forward (commit)") + if tx.post_actions: + try: + await self._execute_post_actions(tx.post_actions) + except Exception as e: + logger.warning( + f"Post-action replay during roll-forward failed for tx {tx_id}: {e}" + ) + else: + # Default: rollback completed+partial ops + # Pass recover_all=True so partial (completed=False) ops are also reversed, + # e.g. a directory mv that started but never finished still leaves residue. 
+ try: + await execute_rollback( + tx.undo_log, + self._agfs, + vector_store=self._vector_store, + recover_all=True, + ) + except Exception as e: + logger.warning(f"Rollback during recovery failed for tx {tx_id}: {e}") # Release any lock files still present await self._path_lock.release(tx) @@ -275,6 +301,86 @@ async def _cleanup_orphan_locks_from_init_info( except Exception as e: logger.warning(f"Failed to check orphan lock {lock_file}: {e}") + async def _redo_session_memory(self, tx: TransactionRecord) -> None: + """Redo a session_memory transaction from its archived messages. + + On crash during Phase 2 of session commit, we redo memory extraction + from the archive rather than rolling back. + """ + import json + + from openviking.message import Message + from openviking.server.identity import RequestContext, Role + from openviking_cli.session.user_id import UserIdentifier + + archive_uri = tx.init_info.get("archive_uri") + session_uri = tx.init_info.get("session_uri") + account_id = tx.init_info.get("account_id", "default") + user_id = tx.init_info.get("user_id", "default") + agent_id = tx.init_info.get("agent_id", "default") + role_str = tx.init_info.get("role", "root") + + if not archive_uri or not session_uri: + logger.warning("Cannot redo session_memory: missing archive_uri or session_uri") + return + + # 1. Read archived messages from AGFS + messages_path = f"{archive_uri}/messages.jsonl" + try: + agfs_path = messages_path.replace("viking://", "") + content = self._agfs.cat(agfs_path) + if isinstance(content, bytes): + content = content.decode("utf-8") + except Exception as e: + logger.warning(f"Cannot read archive for redo: {messages_path}: {e}") + return + + messages = [] + for line in content.strip().split("\n"): + if line.strip(): + try: + messages.append(Message.from_dict(json.loads(line))) + except Exception: + pass + + if not messages: + logger.warning(f"No messages found in archive for redo: {archive_uri}") + return + + # 2. 
Build request context for memory extraction + user = UserIdentifier(user_id=user_id, agent_id=agent_id) + ctx = RequestContext(user=user, role=Role(role_str), account_id=account_id) + + # 3. Re-extract memories + from openviking.session.compressor import SessionCompressor + + compressor = SessionCompressor() + session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] + memories = await compressor.extract_long_term_memories( + messages=messages, + user=user, + session_id=session_id, + ctx=ctx, + ) + logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") + + # 4. Enqueue semantic processing + await self._execute_post_actions( + [ + { + "type": "enqueue_semantic", + "params": { + "uri": session_uri, + "context_type": "memory", + "account_id": account_id, + "user_id": user_id, + "agent_id": agent_id, + "role": role_str, + }, + } + ] + ) + def create_transaction(self, init_info: Optional[Dict[str, Any]] = None) -> TransactionRecord: """Create a new transaction. diff --git a/openviking/storage/viking_vector_index_backend.py b/openviking/storage/viking_vector_index_backend.py index fd3b6e11..9bdf5976 100644 --- a/openviking/storage/viking_vector_index_backend.py +++ b/openviking/storage/viking_vector_index_backend.py @@ -834,7 +834,7 @@ async def update_uri_mapping( # so fetch and update all of them. 
records = await self.filter( filter=And([Eq("uri", uri), Eq("account_id", ctx.account_id)]), - limit=1, + limit=100, ctx=ctx, ) if not records: From e494f38fe9e54c25479a886ec6e3178258e38dfc Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Mon, 16 Mar 2026 21:47:07 +0800 Subject: [PATCH 10/18] fix: transaction --- docs/en/concepts/09-transaction.md | 50 +++++++++++-------- docs/zh/concepts/09-transaction.md | 50 +++++++++++-------- .../transaction/transaction_manager.py | 19 +------ 3 files changed, 61 insertions(+), 58 deletions(-) diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 48e6d355..53e1a4a1 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -97,29 +97,36 @@ Crash recovery: Journal records the post_action; replayed automatically on resta | Problem | Solution | |---------|----------| -| Messages cleared but archive not written -> conversation data lost | Split into two transactions + checkpoint | +| Messages cleared but archive not written -> conversation data lost | Phase 1 without transaction (incomplete archive has no side effects) + Phase 2 with redo transaction | -LLM calls have unpredictable latency (5s~60s+), so they cannot be inside a transaction. Split into: +LLM calls have unpredictable latency (5s~60s+) and cannot be inside a lock-holding transaction. The design splits into two phases: ``` -Transaction 1 (Archive): - 1. Write archive (history/archive_N/messages.jsonl + summaries) - 2. Clear messages.jsonl - 3. Write checkpoint (status="archived") - 4. Commit - -LLM call (no transaction): - Extract memories from archived messages - -Transaction 2 (Memory write): - 1. Write memory files - 2. Write relations - 3. Update checkpoint (status="completed") - 4. Register post_action: enqueue SemanticQueue - 5. Commit +Phase 1 — Archive (no transaction, no lock): + 1. Generate archive summary (LLM) + 2. Write archive (history/archive_N/messages.jsonl + summaries) + 3. 
Clear messages.jsonl + 4. Clear in-memory message list + +Phase 2 — Memory extraction + write (transaction, lock_mode="none", redo semantics): + 1. Record init_info (archive_uri, session_uri, user identity) + 2. Extract memories from archived messages (LLM) + 3. Write current message state + 4. Write relations + 5. Register post_action: enqueue SemanticQueue + 6. Commit ``` -Crash recovery: Read checkpoint, resume from the appropriate step based on status. +**Redo semantics**: Phase 2 does not register undo log entries. On crash recovery, memory extraction and writing are re-executed from the archive (`_redo_session_memory`) instead of being rolled back. + +**Crash recovery analysis**: + +| Crash point | State | Recovery action | +|------------|-------|----------------| +| During Phase 1 archive write | No transaction | Incomplete archive; next commit scans history/ for index, unaffected | +| Phase 1 archive complete but messages not cleared | No transaction | Archive complete + messages still present = redundant but safe | +| During Phase 2 memory extraction/write | Journal EXEC | On startup: `_redo_session_memory` redoes extraction + write + enqueue from archive | +| After Phase 2 commit | Journal COMMIT | On startup: replay `post_action("enqueue_semantic")` | ## TransactionContext @@ -153,6 +160,7 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | `point` | Write operations | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | | `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path, any lock on descendants, and any SUBTREE lock on ancestors | | `mv` | Move operations | Directory move: SUBTREE lock on both source and destination; File move: POINT lock on source parent and destination (controlled by `src_is_dir`) | +| `none` | Lock-free operations | Skip lock acquisition, transition directly to EXEC status. 
Used for session.commit Phase 2 and other scenarios that don't require path mutual exclusion | ## Lock Types (POINT vs SUBTREE) @@ -288,7 +296,9 @@ Rollback -> execute undo log -> release locks -> delete journal |------------------------|----------------| | `COMMIT` + non-empty post_actions | Replay post_actions -> release locks -> delete journal | | `COMMIT` + empty post_actions / `RELEASED` | Release locks -> delete journal | -| `EXEC` / `FAIL` / `RELEASING` | Execute undo log rollback (`recover_all=True`) -> release locks -> delete journal | +| `EXEC` / `FAIL` / `RELEASING` (`session_memory` operation) | Redo memory extraction + write from archive (`_redo_session_memory`) -> release locks -> delete journal | +| `EXEC` / `FAIL` / `RELEASING` (all undo entries completed) | Roll forward (treat as committed, replay post_actions) -> release locks -> delete journal | +| `EXEC` / `FAIL` / `RELEASING` (other) | Execute undo log rollback (`recover_all=True`) -> release locks -> delete journal | | `INIT` / `ACQUIRE` | Clean up orphan locks (using init_info.lock_paths) -> delete journal (no changes were made) | ### Defense Summary @@ -298,7 +308,7 @@ Rollback -> execute undo log -> release locks -> delete journal | Crash during transaction | Journal + undo log rollback | On restart | | Crash after commit, before enqueue | Journal post_actions replay | On restart | | Crash after enqueue, before worker processes | QueueFS SQLite persistence | Worker auto-pulls after restart | -| Crash during session.commit LLM call | Checkpoint file recovery | On restart, re-invoke LLM | +| Crash during session.commit Phase 2 | Journal + redo (re-extract memories from archive) | On restart | | Orphan index | Cleaned on L2 on-demand load | When user accesses | | Crash between lock creation and journal update | init_info records intended lock paths; recovery checks and cleans orphan locks | On restart | diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 
e4f1b8d0..ccd98ffc 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -97,29 +97,36 @@ Storage Layer (VikingFS, VectorDB, QueueManager) | 问题 | 方案 | |------|------| -| 消息已清空但 archive 未写入 → 对话数据丢失 | 拆为两段事务 + checkpoint | +| 消息已清空但 archive 未写入 → 对话数据丢失 | Phase 1 无事务(archive 不完整无副作用)+ Phase 2 redo 事务 | -LLM 调用耗时不可控(5s~60s+),放在事务内会长时间持锁。因此拆为: +LLM 调用耗时不可控(5s~60s+),不能放在持锁事务内。设计拆为两个阶段: ``` -第一段事务(归档): - 1. 写 archive(history/archive_N/messages.jsonl + 摘要) - 2. 清空 messages.jsonl - 3. 写 checkpoint(status="archived") - 4. 提交 - -LLM 调用(无事务): - 从归档消息提取 memories - -第二段事务(memory 写入): - 1. 写 memory 文件 - 2. 写 relations - 3. 更新 checkpoint(status="completed") - 4. 注册 post_action: enqueue SemanticQueue - 5. 提交 +Phase 1 — 归档(无事务、无锁): + 1. 生成归档摘要(LLM) + 2. 写 archive(history/archive_N/messages.jsonl + 摘要) + 3. 清空 messages.jsonl + 4. 清空内存中的消息列表 + +Phase 2 — 记忆提取 + 写入(事务,lock_mode="none",redo 语义): + 1. 记录 init_info(archive_uri、session_uri、用户身份信息) + 2. 从归档消息提取 memories(LLM) + 3. 写当前消息状态 + 4. 写 relations + 5. 注册 post_action: enqueue SemanticQueue + 6. 
提交 ``` -崩溃恢复:读 checkpoint,根据 status 决定从哪一步继续。 +**Redo 语义**:Phase 2 不注册 undo log。崩溃恢复时从 archive 重新执行记忆提取和写入(`_redo_session_memory`),而非回滚。 + +**崩溃恢复分析**: + +| 崩溃时间点 | 状态 | 恢复动作 | +|-----------|------|---------| +| Phase 1 写 archive 中途 | 无事务 | archive 不完整,下次 commit 从 history/ 扫描 index,不受影响 | +| Phase 1 archive 完成但 messages 未清空 | 无事务 | archive 完整 + messages 仍在 = 数据冗余但安全 | +| Phase 2 记忆提取/写入中途 | journal EXEC | 启动恢复:`_redo_session_memory` 从 archive 重做提取+写入+入队 | +| Phase 2 commit 后 | journal COMMIT | 启动恢复:重放 `post_action("enqueue_semantic")` | ## TransactionContext @@ -153,6 +160,7 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as | `point` | 写操作 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | | `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁、后代目录的任何锁和祖先目录的 SUBTREE 锁冲突 | | `mv` | 移动操作 | 目录移动:源和目标均加 SUBTREE 锁;文件移动:源父目录和目标均加 POINT 锁(通过 `src_is_dir` 控制) | +| `none` | 无锁操作 | 跳过锁获取,直接进入 EXEC 状态。用于 session.commit Phase 2 等不需要路径互斥的场景 | ## 锁类型(POINT vs SUBTREE) @@ -288,7 +296,9 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age |---------------------|---------| | `COMMIT` + post_actions 非空 | 重放 post_actions → 删锁 → 删 journal | | `COMMIT` + post_actions 为空 / `RELEASED` | 删锁 → 删 journal | -| `EXEC` / `FAIL` / `RELEASING` | 执行 undo log 回滚(`recover_all=True`) → 删锁 → 删 journal | +| `EXEC` / `FAIL` / `RELEASING`(`session_memory` 操作) | 从 archive 重做记忆提取+写入(`_redo_session_memory`) → 删锁 → 删 journal | +| `EXEC` / `FAIL` / `RELEASING`(所有 undo 均 completed) | 前滚(视为已提交,重放 post_actions) → 删锁 → 删 journal | +| `EXEC` / `FAIL` / `RELEASING`(其他) | 执行 undo log 回滚(`recover_all=True`) → 删锁 → 删 journal | | `INIT` / `ACQUIRE` | 通过 init_info.lock_paths 清理孤儿锁 → 删 journal(变更未执行) | ### 防线总结 @@ -298,7 +308,7 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age | 事务内崩溃 | journal + undo log 回滚 | 重启时 | | 提交后 enqueue 前崩溃 | journal post_actions 重放 | 重启时 | | enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后自动拉取 | -| session.commit LLM 调用中崩溃 | checkpoint 文件恢复 | 重启时重新调用 LLM | +| 
session.commit Phase 2 中崩溃 | journal + redo(从 archive 重做记忆提取) | 重启时 | | 孤儿索引 | L2 按需加载时清理 | 用户访问时 | | 加锁后 journal 更新前崩溃 | init_info 记录预期锁路径,恢复时检查并清理孤儿锁 | 重启时 | diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index cb395432..041b8423 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -184,8 +184,7 @@ async def _recover_one(self, tx_id: str) -> None: Recovery strategy by status: COMMITTED + post_actions → replay post_actions (enqueue etc.), then clean up COMMITTED, no post_actions / RELEASED → just clean up - EXEC / FAIL / RELEASING, all ops completed → roll forward (commit), then clean up - EXEC / FAIL / RELEASING, partial ops → rollback completed+partial ops, then clean up + EXEC / FAIL / RELEASING → rollback completed+partial ops, then clean up INIT / ACQUIRE → nothing executed yet, just clean up """ from openviking.storage.transaction.undo import execute_rollback @@ -226,22 +225,6 @@ async def _recover_one(self, tx_id: str) -> None: await self._redo_session_memory(tx) except Exception as e: logger.warning(f"Redo session_memory failed for tx {tx_id}: {e}") - elif ( - tx.status == TransactionStatus.EXEC - and tx.undo_log - and all(e.completed for e in tx.undo_log) - ): - # All operations completed successfully but commit didn't persist. - # Roll forward: treat as committed to avoid data loss from rollback - # of irreversible operations (e.g. mv's fs_rm). 
- logger.info(f"All ops completed for tx {tx_id}, rolling forward (commit)") - if tx.post_actions: - try: - await self._execute_post_actions(tx.post_actions) - except Exception as e: - logger.warning( - f"Post-action replay during roll-forward failed for tx {tx_id}: {e}" - ) else: # Default: rollback completed+partial ops # Pass recover_all=True so partial (completed=False) ops are also reversed, From c61978f5df85cc823dc41211ac95ccd3ce8ccd55 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Tue, 17 Mar 2026 15:32:45 +0800 Subject: [PATCH 11/18] fix: UserIdentifier --- openviking/storage/transaction/transaction_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py index 041b8423..e80df09b 100644 --- a/openviking/storage/transaction/transaction_manager.py +++ b/openviking/storage/transaction/transaction_manager.py @@ -331,7 +331,7 @@ async def _redo_session_memory(self, tx: TransactionRecord) -> None: return # 2. Build request context for memory extraction - user = UserIdentifier(user_id=user_id, agent_id=agent_id) + user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) ctx = RequestContext(user=user, role=Role(role_str), account_id=account_id) # 3. Re-extract memories From 4e44a5d6163531bdeecd9a27854183ce17926bc7 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Tue, 17 Mar 2026 19:44:22 +0800 Subject: [PATCH 12/18] refactor(transaction): replace undo-based transaction manager with lightweight lock + redo-log Remove the heavyweight TransactionManager/Journal/UndoEntry system (~4000 lines) and replace it with a simpler architecture: LockManager for path locking, LockContext as the async context manager, LockHandle/LockOwner protocol, and a RedoLog for crash recovery of session_memory operations. VikingFS rm/mv now use inline error handling instead of rollback semantics. Updated docs, observers, and tests accordingly. 
Co-Authored-By: Claude Opus 4.6 --- docs/design/multi-tenant-design.md | 2 +- docs/en/concepts/09-transaction.md | 306 ++++---- docs/en/guides/01-configuration.md | 11 +- docs/zh/concepts/09-transaction.md | 341 ++++---- docs/zh/guides/01-configuration.md | 13 +- openviking/async_client.py | 6 +- openviking/server/routers/content.py | 26 +- openviking/server/routers/observer.py | 8 +- openviking/service/core.py | 32 +- openviking/service/debug_service.py | 18 +- openviking/session/session.py | 109 +-- openviking/storage/errors.py | 10 +- openviking/storage/observers/__init__.py | 4 +- openviking/storage/observers/lock_observer.py | 71 ++ .../storage/observers/transaction_observer.py | 222 ------ openviking/storage/queuefs/semantic_dag.py | 9 +- .../storage/queuefs/semantic_processor.py | 10 +- openviking/storage/transaction/__init__.py | 42 +- .../storage/transaction/context_manager.py | 159 ---- openviking/storage/transaction/journal.py | 113 --- .../storage/transaction/lock_context.py | 68 ++ openviking/storage/transaction/lock_handle.py | 37 + .../storage/transaction/lock_manager.py | 247 ++++++ openviking/storage/transaction/path_lock.py | 110 ++- openviking/storage/transaction/redo_log.py | 76 ++ .../transaction/transaction_manager.py | 739 ------------------ .../storage/transaction/transaction_record.py | 139 ---- openviking/storage/transaction/undo.py | 178 ----- openviking/storage/viking_fs.py | 119 +-- openviking/utils/resource_processor.py | 17 +- .../utils/config/transaction_config.py | 5 - tests/agfs/test_fs_binding.py | 8 +- tests/agfs/test_fs_binding_s3.py | 8 +- tests/agfs/test_fs_local.py | 8 +- tests/agfs/test_fs_s3.py | 8 +- tests/server/conftest.py | 8 +- tests/storage/test_semantic_dag_skip_files.py | 16 +- tests/storage/test_semantic_dag_stats.py | 16 +- tests/transaction/conftest.py | 19 +- tests/transaction/test_concurrent_lock.py | 12 +- tests/transaction/test_context_manager.py | 226 ------ tests/transaction/test_crash_recovery.py | 561 
------------- tests/transaction/test_e2e.py | 448 ++--------- tests/transaction/test_journal.py | 215 ----- tests/transaction/test_lock_context.py | 85 ++ tests/transaction/test_lock_manager.py | 88 +++ tests/transaction/test_path_lock.py | 52 +- tests/transaction/test_post_actions.py | 112 --- tests/transaction/test_redo_log.py | 78 ++ tests/transaction/test_rm_rollback.py | 294 ------- tests/transaction/test_transaction_manager.py | 323 -------- tests/transaction/test_undo.py | 249 ------ 52 files changed, 1408 insertions(+), 4673 deletions(-) create mode 100644 openviking/storage/observers/lock_observer.py delete mode 100644 openviking/storage/observers/transaction_observer.py delete mode 100644 openviking/storage/transaction/context_manager.py delete mode 100644 openviking/storage/transaction/journal.py create mode 100644 openviking/storage/transaction/lock_context.py create mode 100644 openviking/storage/transaction/lock_handle.py create mode 100644 openviking/storage/transaction/lock_manager.py create mode 100644 openviking/storage/transaction/redo_log.py delete mode 100644 openviking/storage/transaction/transaction_manager.py delete mode 100644 openviking/storage/transaction/transaction_record.py delete mode 100644 openviking/storage/transaction/undo.py delete mode 100644 tests/transaction/test_context_manager.py delete mode 100644 tests/transaction/test_crash_recovery.py delete mode 100644 tests/transaction/test_journal.py create mode 100644 tests/transaction/test_lock_context.py create mode 100644 tests/transaction/test_lock_manager.py delete mode 100644 tests/transaction/test_post_actions.py create mode 100644 tests/transaction/test_redo_log.py delete mode 100644 tests/transaction/test_rm_rollback.py delete mode 100644 tests/transaction/test_transaction_manager.py delete mode 100644 tests/transaction/test_undo.py diff --git a/docs/design/multi-tenant-design.md b/docs/design/multi-tenant-design.md index 6c5ad43d..9a131ac2 100644 --- 
a/docs/design/multi-tenant-design.md +++ b/docs/design/multi-tenant-design.md @@ -283,7 +283,7 @@ def agent_space_name(self) -> str: | `agent/instructions` | `/{account_id}/agent/{agent_space}/instructions/` | account + user + agent | agent 的行为规则,每用户独立 | | `resources/` | `/{account_id}/resources/` | account | account 内共享的知识资源 | | `session/` | `/{account_id}/session/{user_space}/{session_id}/` | account + user | 用户的对话记录 | -| `transactions/` | `/{account_id}/transactions/` | account | 账户级事务记录 | +| `redo/` | `/{account_id}/_system/redo/` | account | 崩溃恢复 redo 标记 | | `_system/`(全局) | `/_system/` | 系统级 | 全局工作区列表 | | `_system/`(per-account) | `/{account_id}/_system/` | account | 用户注册表 | diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 3d7ced04..1ada00dc 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -1,6 +1,6 @@ -# Transaction Mechanism +# Path Locks and Crash Recovery -OpenViking's transaction mechanism protects the consistency of core write operations (`rm`, `mv`, `add_resource`, `session.commit`), ensuring that VikingFS, VectorDB, and QueueManager remain consistent even when failures occur. +OpenViking uses two simple primitives — **path locks** and **redo log** — to protect the consistency of core write operations (`rm`, `mv`, `add_resource`, `session.commit`), ensuring that VikingFS, VectorDB, and QueueManager remain consistent even when failures occur. ## Design Philosophy @@ -10,11 +10,11 @@ OpenViking is a context database where FS is the source of truth and VectorDB is ## Design Principles -1. **Transactions cover synchronous operations only**: FS + VectorDB operations run inside transactions; SemanticQueue/EmbeddingQueue enqueue runs after commit (as post_actions) — they are idempotent and retriable -2. **On by default**: All data operations automatically use transactions; no extra configuration needed -3. 
**Write-exclusive**: Path locks ensure only one write transaction can operate on a path at a time -4. **Undo Log model**: Record reverse operations before each change; replay them in reverse order on failure -5. **Persistent journal**: Each transaction writes a journal file to AGFS for crash recovery +1. **Write-exclusive**: Path locks ensure only one write operation can operate on a path at a time +2. **On by default**: All data operations automatically acquire locks; no extra configuration needed +3. **Lock as protection**: LockContext acquires locks on entry, releases on exit — no undo/journal/commit semantics +4. **Only session_memory needs crash recovery**: RedoLog re-executes memory extraction after a process crash +5. **Queue operations run outside locks**: SemanticQueue/EmbeddingQueue enqueue operations are idempotent and retriable ## Architecture @@ -22,23 +22,63 @@ OpenViking is a context database where FS is the source of truth and VectorDB is Service Layer (rm / mv / add_resource / session.commit) | v -+--[TransactionContext async context manager]--+ -| | -| 1. Create transaction + write journal | -| 2. Acquire path lock (poll + timeout) | -| 3. Execute operations (FS + VectorDB) | -| 4. Record Undo Log (mark completed) | -| 5. Commit / Rollback | -| 6. Execute post_actions (enqueue etc) | -| 7. Release lock + clean up journal | -| | -| On exception: reverse Undo Log + unlock | -+----------------------------------------------+ ++--[LockContext async context manager]--+ +| | +| 1. Create LockHandle | +| 2. Acquire path lock (poll+timeout) | +| 3. Execute operations (FS+VectorDB) | +| 4. 
Release lock | +| | +| On exception: auto-release lock, | +| exception propagates unchanged | ++---------------------------------------+ | v Storage Layer (VikingFS, VectorDB, QueueManager) ``` +## Two Core Components + +### Component 1: PathLock + LockManager + LockContext (Path Lock System) + +**PathLock** implements file-based distributed locks with two lock types — POINT and SUBTREE — using fencing tokens to prevent TOCTOU races and automatic stale lock detection and cleanup. + +**LockHandle** is a lightweight lock holder token: + +```python +@dataclass +class LockHandle: + id: str # Unique ID used to generate fencing tokens + locks: list[str] # Acquired lock file paths + created_at: float # Creation time +``` + +**LockManager** is a global singleton managing lock lifecycle: +- Creates/releases LockHandles +- Background cleanup of leaked locks (in-process safety net) +- Executes RedoLog recovery on startup + +**LockContext** is an async context manager encapsulating the lock/unlock lifecycle: + +```python +from openviking.storage.transaction import LockContext, get_lock_manager + +async with LockContext(get_lock_manager(), [path], lock_mode="point") as handle: + # Perform operations under lock protection + ... +# Lock automatically released on exit (including exceptions) +``` + +### Component 2: RedoLog (Crash Recovery) + +Used only for the memory extraction phase of `session.commit`. Writes a marker before the operation, deletes it after success, and scans for leftover markers on startup to redo. + +``` +/local/_system/redo/{task_id}/redo.json +``` + +Memory extraction is idempotent — re-extracting from the same archive produces the same result. + ## Consistency Issues and Solutions ### rm(uri) @@ -47,134 +87,138 @@ Storage Layer (VikingFS, VectorDB, QueueManager) |---------|----------| | Delete file first, then index -> file gone but index remains -> search returns non-existent file | **Reverse order**: delete index first, then file. 
Index deletion failure -> both file and index intact | -Transaction flow: +**Locking strategy** (depends on target type): +- Deleting a **directory**: `lock_mode="subtree"`, locks the directory itself +- Deleting a **file**: `lock_mode="point"`, locks the file's parent directory + +Operation flow: ``` -1. Begin transaction, acquire lock (lock_mode="subtree") -2. Snapshot VectorDB records (for rollback recovery) +1. Check whether target is a directory or file, choose lock mode +2. Acquire lock 3. Delete VectorDB index -> immediately invisible to search 4. Delete FS file -5. Commit -> release lock -> delete journal +5. Release lock ``` -Rollback: Step 4 fails -> restore VectorDB records from snapshot. +VectorDB deletion fails -> exception thrown, lock auto-released, file and index both intact. FS deletion fails -> VectorDB already deleted but file remains, retry is safe. ### mv(old_uri, new_uri) | Problem | Solution | |---------|----------| -| File moved to new path but index points to old path -> search returns old path (doesn't exist) | Transaction wrapper; rollback on failure | +| File moved to new path but index points to old path -> search returns old path (doesn't exist) | Copy first then update index; clean up copy on failure | + +**Locking strategy** (handled automatically via `lock_mode="mv"`): +- Moving a **directory**: SUBTREE lock on both source path and destination parent +- Moving a **file**: POINT lock on both source's parent and destination parent -Transaction flow: +Operation flow: ``` -1. Begin transaction, acquire lock (lock_mode="mv", SUBTREE on both source and destination for directories) -2. Move FS file -3. Update VectorDB URIs -4. Commit -> release lock -> delete journal +1. Check whether source is a directory or file, set src_is_dir +2. Acquire mv lock (internally chooses SUBTREE or POINT based on src_is_dir) +3. Copy to new location (source still intact, safe) +4. If directory, remove the lock file carried over by cp into the copy +5. 
Update VectorDB URIs + - Failure -> clean up copy, source and old index intact, consistent state +6. Delete source +7. Release lock ``` -Rollback: Step 3 fails -> move file back to original location. - ### add_resource | Problem | Solution | |---------|----------| | File moved from temp to final directory, then crash -> file exists but never searchable | Two separate paths for first-time add vs incremental update | -First-time add and incremental update are two independent paths: - **First-time add** (target does not exist) — handled in `ResourceProcessor.process_resource` Phase 3.5: ``` -1. Begin transaction, lock parent_path of final_uri (lock_mode="point") -2. Record undo: fs_write_new (uri=dst_path) -3. agfs.mv temp directory -> final location -4. Commit -> release lock -> delete journal -5. Clean up temp directory -6. Enqueue SemanticMsg(uri=final, target_uri=None) -> DAG runs on final, no callback +1. Acquire lock on parent_path of final_uri (lock_mode="point") +2. agfs.mv temp directory -> final location +3. Release lock +4. Clean up temp directory +5. Enqueue SemanticMsg -> DAG runs on final ``` -Crash recovery: Undo deletes the incomplete dst_path; re-run `add_resource` to retry. - **Incremental update** (target already exists) — temp stays in place: ``` 1. Enqueue SemanticMsg(uri=temp, target_uri=final) -> DAG runs on temp 2. DAG completion triggers sync_diff_callback or move_temp_to_target_callback -3. Each VikingFS.rm / VikingFS.mv inside callbacks creates its own independent transaction +3. Each VikingFS.rm / VikingFS.mv inside callbacks acquires its own lock ``` -Note: DAG callbacks do NOT wrap operations in an outer TransactionContext. Each `VikingFS.rm` and `VikingFS.mv` has its own transaction internally. An outer lock would conflict with these inner locks (e.g. outer POINT lock on target_path vs inner SUBTREE lock from `rm`) causing deadlock. +Note: DAG callbacks do NOT wrap operations in an outer lock. 
Each `VikingFS.rm` and `VikingFS.mv` has its own lock internally. An outer lock would conflict with these inner locks causing deadlock. ### session.commit() | Problem | Solution | |---------|----------| -| Messages cleared but archive not written -> conversation data lost | Phase 1 without transaction (incomplete archive has no side effects) + Phase 2 with redo transaction | +| Messages cleared but archive not written -> conversation data lost | Phase 1 without lock (incomplete archive has no side effects) + Phase 2 with RedoLog | -LLM calls have unpredictable latency (5s~60s+) and cannot be inside a lock-holding transaction. The design splits into two phases: +LLM calls have unpredictable latency (5s~60s+) and cannot be inside a lock-holding operation. The design splits into two phases: ``` -Phase 1 — Archive (no transaction, no lock): +Phase 1 — Archive (no lock): 1. Generate archive summary (LLM) 2. Write archive (history/archive_N/messages.jsonl + summaries) 3. Clear messages.jsonl 4. Clear in-memory message list -Phase 2 — Memory extraction + write (transaction, lock_mode="none", redo semantics): - 1. Record init_info (archive_uri, session_uri, user identity) +Phase 2 — Memory extraction + write (RedoLog): + 1. Write redo marker (archive_uri, session_uri, user identity) 2. Extract memories from archived messages (LLM) 3. Write current message state 4. Write relations - 5. Register post_action: enqueue SemanticQueue - 6. Commit + 5. Directly enqueue SemanticQueue + 6. Delete redo marker ``` -**Redo semantics**: Phase 2 does not register undo log entries. On crash recovery, memory extraction and writing are re-executed from the archive (`_redo_session_memory`) instead of being rolled back. 
- **Crash recovery analysis**: | Crash point | State | Recovery action | |------------|-------|----------------| -| During Phase 1 archive write | No transaction | Incomplete archive; next commit scans history/ for index, unaffected | -| Phase 1 archive complete but messages not cleared | No transaction | Archive complete + messages still present = redundant but safe | -| During Phase 2 memory extraction/write | Journal EXEC | On startup: `_redo_session_memory` redoes extraction + write + enqueue from archive | -| After Phase 2 commit | Journal COMMIT | On startup: replay `post_action("enqueue_semantic")` | +| During Phase 1 archive write | No marker | Incomplete archive; next commit scans history/ for index, unaffected | +| Phase 1 archive complete but messages not cleared | No marker | Archive complete + messages still present = redundant but safe | +| During Phase 2 memory extraction/write | Redo marker exists | On startup: redo extraction + write + enqueue from archive | +| Phase 2 complete | Redo marker deleted | No recovery needed | -## TransactionContext +## LockContext -`TransactionContext` is an **async** context manager that encapsulates the full transaction lifecycle: +`LockContext` is an **async** context manager that encapsulates lock acquisition and release: ```python -from openviking.storage.transaction import TransactionContext, get_transaction_manager +from openviking.storage.transaction import LockContext, get_lock_manager -tx_manager = get_transaction_manager() +lock_manager = get_lock_manager() -async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: - # Record undo (call before making changes) - seq = tx.record_undo("vectordb_delete", {"record_ids": ids, "records_snapshot": snapshot}) - # Execute change - delete_from_vector_store(uris) - # Mark completed - tx.mark_completed(seq) +# Point lock (write operations, semantic processing) +async with LockContext(lock_manager, [path], lock_mode="point"): + # Perform 
operations... + pass - # Register post-commit action (optional) - tx.add_post_action("enqueue_semantic", {"uri": uri, ...}) +# Subtree lock (delete operations) +async with LockContext(lock_manager, [path], lock_mode="subtree"): + # Perform operations... + pass - # Commit - await tx.commit() -# Auto-rollback if commit() not called +# MV lock (move operations) +async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): + # Perform operations... + pass ``` **Lock modes**: | lock_mode | Use case | Behavior | |-----------|----------|----------| -| `point` | Write operations | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | +| `point` | Write operations, semantic processing | Lock the specified path; conflicts with any lock on the same path and any SUBTREE lock on ancestors | | `subtree` | Delete operations | Lock the subtree root; conflicts with any lock on the same path, any lock on descendants, and any SUBTREE lock on ancestors | | `mv` | Move operations | Directory move: SUBTREE lock on both source and destination; File move: POINT lock on source parent and destination (controlled by `src_is_dir`) | -| `none` | Lock-free operations | Skip lock acquisition, transition directly to EXEC status. Used for session.commit Phase 2 and other scenarios that don't require path mutual exclusion | + +**Exception handling**: `__aexit__` always releases locks and does not swallow exceptions. Lock acquisition failure raises `LockAcquisitionError`. ## Lock Types (POINT vs SUBTREE) @@ -188,33 +232,6 @@ The lock mechanism uses two lock types to handle different conflict patterns: - **POINT (P)**: Used for write and semantic-processing operations. Only locks a single directory. Blocks if any ancestor holds a SUBTREE lock. - **SUBTREE (S)**: Used for rm and mv operations. Logically covers the entire subtree but only writes **one lock file** at the root. 
Before acquiring, scans all descendants and ancestor directories for conflicting locks. -## Undo Log - -Each transaction maintains an Undo Log recording the reverse action for each step: - -| op_type | Forward operation | Rollback action | -|---------|-------------------|-----------------| -| `fs_mv` | Move file | Move back | -| `fs_rm` | Delete file | Skip (irreversible; rm is always the last step by design) | -| `fs_write_new` | Create new file/directory | Delete | -| `fs_mkdir` | Create directory | Delete | -| `vectordb_delete` | Delete index records | Restore from snapshot | -| `vectordb_upsert` | Insert index records | Delete | -| `vectordb_update_uri` | Update URI | Restore old value | - -Rollback rules: Only entries with `completed=True` are rolled back, in **reverse order**. Each step has independent try-catch (best-effort). During crash recovery, `recover_all=True` also reverses uncompleted entries to clean up partial operations. - -### Context Reconstruction - -VectorDB rollback operations require a `RequestContext` (containing account_id, user_id, agent_id, role). Since the original context is unavailable during crash recovery, `_ctx_*` fields are serialized into undo params when calling record_undo: - -- `_ctx_account_id`: Account ID -- `_ctx_user_id`: User ID -- `_ctx_agent_id`: Agent ID -- `_ctx_role`: Role - -During rollback, `_reconstruct_ctx()` rebuilds the context from these fields. If reconstruction fails (missing fields), the VectorDB rollback step is skipped with a warning. - ## Lock Mechanism ### Lock Protocol @@ -223,7 +240,7 @@ Lock file path: `{path}/.path.ovlock` Lock file content (Fencing Token): ``` -{transaction_id}:{time_ns}:{lock_type} +{handle_id}:{time_ns}:{lock_type} ``` Where `lock_type` is `P` (POINT) or `S` (SUBTREE). @@ -233,7 +250,7 @@ Where `lock_type` is `P` (POINT) or `S` (SUBTREE). ``` loop until timeout (poll interval: 200ms): 1. Check target directory exists - 2. 
Check if target directory is locked by another transaction + 2. Check if target directory is locked by another operation - Stale lock? -> remove and retry - Active lock? -> wait 3. Check all ancestor directories for SUBTREE locks @@ -241,8 +258,8 @@ loop until timeout (poll interval: 200ms): - Active lock? -> wait 4. Write POINT (P) lock file 5. TOCTOU double-check: re-scan ancestors for SUBTREE locks - - Conflict found: compare (timestamp, tx_id) - - Later one (larger timestamp/tx_id) backs off (removes own lock) to prevent livelock + - Conflict found: compare (timestamp, handle_id) + - Later one (larger timestamp/handle_id) backs off (removes own lock) to prevent livelock - Wait and retry 6. Verify lock file ownership (fencing token matches) 7. Success @@ -255,19 +272,19 @@ Timeout (default 0 = no-wait) raises LockAcquisitionError ``` loop until timeout (poll interval: 200ms): 1. Check target directory exists - 2. Check if target directory is locked by another transaction + 2. Check if target directory is locked by another operation - Stale lock? -> remove and retry - Active lock? -> wait 3. Check all ancestor directories for SUBTREE locks - Stale lock? -> remove and retry - Active lock? -> wait - 4. Scan all descendant directories for any locks by other transactions + 4. Scan all descendant directories for any locks by other operations - Stale lock? -> remove and retry - Active lock? -> wait 5. Write SUBTREE (S) lock file (only one file, at the root path) 6. TOCTOU double-check: re-scan descendants and ancestors - - Conflict found: compare (timestamp, tx_id) - - Later one (larger timestamp/tx_id) backs off (removes own lock) to prevent livelock + - Conflict found: compare (timestamp, handle_id) + - Later one (larger timestamp/handle_id) backs off (removes own lock) to prevent livelock - Wait and retry 7. Verify lock file ownership (fencing token matches) 8. 
Success @@ -279,72 +296,33 @@ Timeout (default 0 = no-wait) raises LockAcquisitionError **Stale lock detection**: PathLock checks the fencing token timestamp. Locks older than `lock_expire` (default 300s) are considered stale and are removed automatically during acquisition. -**Transaction timeout**: TransactionManager checks active transactions every 60 seconds. Transactions with `updated_at` exceeding the transaction timeout (default 3600s) are rolled back. - -## Transaction Journal - -Each transaction persists a journal in AGFS: +**In-process cleanup**: LockManager checks active LockHandles every 60 seconds. Handles created more than 3600 seconds ago are force-released. -``` -/local/_system/transactions/{tx_id}/journal.json -``` - -Contains: transaction ID, status, lock paths, init_info, undo_log, post_actions. - -### Lifecycle - -``` -Create transaction -> write journal (INIT) -Acquire lock -> update journal (ACQUIRE -> EXEC) -Execute changes -> update journal per step (mark undo entry completed) -Commit -> update journal (COMMIT + post_actions) - -> execute post_actions -> release locks -> delete journal -Rollback -> execute undo log -> release locks -> delete journal -``` +**Orphan locks**: Lock files left behind after a process crash are automatically removed via stale lock detection when any operation next attempts to acquire a lock on the same path. 
## Crash Recovery -`TransactionManager.start()` automatically scans for residual journals on startup: +`LockManager.start()` automatically scans for leftover markers in `/local/_system/redo/` on startup: -| Journal status at crash | Recovery action | -|------------------------|----------------| -| `COMMIT` + non-empty post_actions | Replay post_actions -> release locks -> delete journal | -| `COMMIT` + empty post_actions / `RELEASED` | Release locks -> delete journal | -| `EXEC` / `FAIL` / `RELEASING` (`session_memory` operation) | Redo memory extraction + write from archive (`_redo_session_memory`) -> release locks -> delete journal | -| `EXEC` / `FAIL` / `RELEASING` (all undo entries completed) | Roll forward (treat as committed, replay post_actions) -> release locks -> delete journal | -| `EXEC` / `FAIL` / `RELEASING` (other) | Execute undo log rollback (`recover_all=True`) -> release locks -> delete journal | -| `INIT` / `ACQUIRE` | Clean up orphan locks (using init_info.lock_paths) -> delete journal (no changes were made) | +| Scenario | Recovery action | +|----------|----------------| +| session_memory extraction crash | Redo memory extraction + write + enqueue from archive | +| Crash while holding lock | Lock file remains in AGFS; stale detection auto-cleans on next acquisition (default 300s expiry) | +| Crash after enqueue, before worker processes | QueueFS SQLite persistence; worker auto-pulls after restart | +| Orphan index | Cleaned on L2 on-demand load | ### Defense Summary | Failure scenario | Defense | Recovery timing | |-----------------|--------|-----------------| -| Crash during transaction | Journal + undo log rollback | On restart | -| Crash after commit, before enqueue | Journal post_actions replay | On restart | -| Crash after enqueue, before worker processes | QueueFS SQLite persistence | Worker auto-pulls after restart | -| Crash during session.commit Phase 2 | Journal + redo (re-extract memories from archive) | On restart | -| Orphan index | 
Cleaned on L2 on-demand load | When user accesses | -| Crash between lock creation and journal update | init_info records intended lock paths; recovery checks and cleans orphan locks | On restart | - -## Transaction State Machine - -``` -INIT -> ACQUIRE -> EXEC -> COMMIT -> RELEASING -> RELEASED - | - FAIL -> RELEASING -> RELEASED -``` - -- `INIT`: Transaction created, waiting for lock -- `ACQUIRE`: Acquiring lock -- `EXEC`: Transaction operations executing -- `COMMIT`: Committed, post_actions may be pending -- `FAIL`: Execution failed, entering rollback -- `RELEASING`: Releasing locks -- `RELEASED`: Locks released, transaction complete +| Crash during operation | Lock auto-expires + stale detection | Next acquisition of same path lock | +| Crash during session.commit Phase 2 | RedoLog marker + redo | On restart | +| Crash after enqueue, before worker | QueueFS SQLite persistence | Worker restart | +| Orphan index | L2 on-demand load cleanup | When user accesses | ## Configuration -The transaction mechanism is enabled by default with no extra configuration needed. **The default behavior is no-wait**: if the path is locked, `LockAcquisitionError` is raised immediately. To allow wait/retry, configure the `storage.transaction` section: +Path locks are enabled by default with no extra configuration needed. **The default behavior is no-wait**: if the path is locked, `LockAcquisitionError` is raised immediately. To allow wait/retry, configure the `storage.transaction` section: ```json { @@ -364,7 +342,7 @@ The transaction mechanism is enabled by default with no extra configuration need ### QueueFS Persistence -The transaction mechanism relies on QueueFS using the SQLite backend to ensure enqueued tasks survive process restarts. This is the default configuration and requires no manual setup. +The lock mechanism relies on QueueFS using the SQLite backend to ensure enqueued tasks survive process restarts. This is the default configuration and requires no manual setup. 
## Related Documentation diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index d9ddd547..1a60fa89 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -640,15 +640,14 @@ For startup and deployment details see [Deployment](./03-deployment.md), for aut ## storage.transaction Section -The transaction mechanism is enabled by default and usually requires no configuration. **The default behavior is no-wait**: if the target path is already locked by another transaction, the operation fails immediately with `LockAcquisitionError`. Set `lock_timeout` to a positive value to allow polling/retry. +Path locks are enabled by default and usually require no configuration. **The default behavior is no-wait**: if the target path is already locked by another operation, the operation fails immediately with `LockAcquisitionError`. Set `lock_timeout` to a positive value to allow polling/retry. ```json { "storage": { "transaction": { "lock_timeout": 5.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 } } } @@ -658,9 +657,8 @@ The transaction mechanism is enabled by default and usually requires no configur |-----------|------|-------------|---------| | `lock_timeout` | float | Path lock acquisition timeout (seconds). `0` = fail immediately if locked (default). `> 0` = wait/retry up to this many seconds, then raise `LockAcquisitionError`. | `0.0` | | `lock_expire` | float | Stale lock expiry threshold (seconds). Locks held longer than this by a crashed process are force-released. | `300.0` | -| `max_parallel_locks` | int | Max parallel locks during recursive locking for rm/mv operations | `8` | -For details on the transaction mechanism, see [Transaction Mechanism](../concepts/09-transaction.md). +For details on the lock mechanism, see [Path Locks and Crash Recovery](../concepts/09-transaction.md). 
## Full Schema @@ -698,8 +696,7 @@ For details on the transaction mechanism, see [Transaction Mechanism](../concept }, "transaction": { "lock_timeout": 0.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 }, "vectordb": { "backend": "local|remote", diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 2d42815a..31d09c54 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -1,6 +1,6 @@ -# 事务机制 +# 路径锁与崩溃恢复 -OpenViking 的事务机制保护核心写操作(`rm`、`mv`、`add_resource`、`session.commit`)的一致性,确保 VikingFS、VectorDB、QueueManager 三个子系统在故障时不会出现数据不一致。 +OpenViking 通过**路径锁**和**Redo Log** 两个简单原语保护核心写操作(`rm`、`mv`、`add_resource`、`session.commit`)的一致性,确保 VikingFS、VectorDB、QueueManager 三个子系统在故障时不会出现数据不一致。 ## 设计哲学 @@ -10,171 +10,214 @@ OpenViking 是上下文数据库,FS 是源数据,VectorDB 是派生索引。 ## 设计原则 -1. **事务只覆盖同步部分**:FS + VectorDB 操作在事务内;SemanticQueue/EmbeddingQueue 的 enqueue 在事务提交后执行(post_actions),它们是幂等的,失败可重试 -2. **默认生效**:所有数据操作命令自动开启事务机制,用户无需额外配置 -3. **写互斥**:通过路径锁保证同一路径同一时间只有一个写事务 -4. **Undo Log 模型**:变更前记录反向操作,失败时反序执行回滚 -5. **事务日志持久化**:每个事务在 AGFS 中写入 journal 文件,支持崩溃恢复 +1. **写互斥**:通过路径锁保证同一路径同一时间只有一个写操作 +2. **默认生效**:所有数据操作命令自动加锁,用户无需额外配置 +3. **锁即保护**:进入 LockContext 时加锁,退出时释放,没有 undo/journal/commit 语义 +4. **仅 session_memory 需要崩溃恢复**:通过 RedoLog 在进程崩溃后重做记忆提取 +5. **Queue 操作在锁外执行**:SemanticQueue/EmbeddingQueue 的 enqueue 是幂等的,失败可重试 ## 架构 ``` Service Layer (rm / mv / add_resource / session.commit) - │ - ▼ -┌──[TransactionContext 异步上下文管理器]──┐ -│ │ -│ 1. 创建事务 + 写 journal │ -│ 2. 获取路径锁(轮询 + 超时) │ -│ 3. 执行操作(FS + VectorDB) │ -│ 4. 记录 Undo Log(每步完成后标记) │ -│ 5. Commit / Rollback │ -│ 6. 执行 post_actions(enqueue 等) │ -│ 7. 释放锁 + 清理 journal │ -│ │ -│ 异常时:反序执行 Undo Log → 释放锁 │ -└─────────────────────────────────────────┘ - │ - ▼ + | + v ++--[LockContext 异步上下文管理器]-------+ +| | +| 1. 创建 LockHandle | +| 2. 获取路径锁(轮询 + 超时) | +| 3. 执行操作(FS + VectorDB) | +| 4. 
释放锁 | +| | +| 异常时:自动释放锁,异常原样传播 | ++---------------------------------------+ + | + v Storage Layer (VikingFS, VectorDB, QueueManager) ``` +## 两个核心组件 + +### 组件 1:PathLock + LockManager + LockContext(路径锁系统) + +**PathLock** 实现基于文件的分布式锁,支持 POINT 和 SUBTREE 两种锁类型,使用 fencing token 防止 TOCTOU 竞争,自动检测并清理过期锁。 + +**LockHandle** 是轻量的锁持有者令牌: + +```python +@dataclass +class LockHandle: + id: str # 唯一标识,用于生成 fencing token + locks: list[str] # 已获取的锁文件路径 + created_at: float # 创建时间 +``` + +**LockManager** 是全局单例,管理锁生命周期: +- 创建/释放 LockHandle +- 后台清理泄漏的锁(进程内安全网) +- 启动时执行 RedoLog 恢复 + +**LockContext** 是异步上下文管理器,封装加锁/解锁生命周期: + +```python +from openviking.storage.transaction import LockContext, get_lock_manager + +async with LockContext(get_lock_manager(), [path], lock_mode="point") as handle: + # 在锁保护下执行操作 + ... +# 退出时自动释放锁(包括异常情况) +``` + +### 组件 2:RedoLog(崩溃恢复) + +仅用于 `session.commit` 的记忆提取阶段。操作前写标记,成功后删标记,启动时扫描遗留标记并重做。 + +``` +/local/_system/redo/{task_id}/redo.json +``` + +Memory 提取是幂等的 — 从同一个 archive 重新提取会得到相同结果。 + ## 一致性问题与解决方案 ### rm(uri) | 问题 | 方案 | |------|------| -| 先删文件再删索引 → 文件已删但索引残留 → 搜索返回不存在的文件 | **调换顺序**:先删索引再删文件。索引删除失败 → 文件和索引都在,搜索正常 | +| 先删文件再删索引 -> 文件已删但索引残留 -> 搜索返回不存在的文件 | **调换顺序**:先删索引再删文件。索引删除失败 -> 文件和索引都在,搜索正常 | + +**加锁策略**(根据目标类型区分): +- 删除**目录**:`lock_mode="subtree"`,锁目录自身 +- 删除**文件**:`lock_mode="point"`,锁文件的父目录 -事务流程: +操作流程: ``` -1. 开始事务,加锁(lock_mode="subtree") -2. 快照 VectorDB 中受影响的记录(用于回滚恢复) -3. 删除 VectorDB 索引 → 搜索立刻不可见 +1. 检查目标是目录还是文件,选择锁模式 +2. 获取锁 +3. 删除 VectorDB 索引 -> 搜索立刻不可见 4. 删除 FS 文件 -5. 提交 → 删锁 → 删 journal +5. 释放锁 ``` -回滚:第 4 步失败 → 从快照恢复 VectorDB 记录,文件和索引都在。 +VectorDB 删除失败 -> 直接抛异常,锁自动释放,文件和索引都在。FS 删除失败 -> VectorDB 已删但文件还在,重试即可。 ### mv(old_uri, new_uri) | 问题 | 方案 | |------|------| -| 文件移到新路径但索引指向旧路径 → 搜索返回旧路径(不存在) | 事务包装,移动失败则回滚 | +| 文件移到新路径但索引指向旧路径 -> 搜索返回旧路径(不存在) | 先 copy 再更新索引,失败时清理副本 | + +**加锁策略**(通过 `lock_mode="mv"` 自动处理): +- 移动**目录**:源路径和目标父目录各加 SUBTREE 锁 +- 移动**文件**:源的父目录和目标父目录各加 POINT 锁 -事务流程: +操作流程: ``` -1. 
开始事务,加锁(lock_mode="mv",目录移动时源和目标均 SUBTREE) -2. 移动 FS 文件 -3. 更新 VectorDB 中的 URI -4. 提交 → 删锁 → 删 journal +1. 检查源是目录还是文件,确定 src_is_dir +2. 获取 mv 锁(内部根据 src_is_dir 选择 SUBTREE 或 POINT) +3. Copy 到新位置(源还在,安全) +4. 如果是目录,删除副本中被 cp 带过去的锁文件 +5. 更新 VectorDB 中的 URI + - 失败 -> 清理副本,源和旧索引都在,一致状态 +6. 删除源 +7. 释放锁 ``` -回滚:第 3 步失败 → 把文件移回原位。 - ### add_resource | 问题 | 方案 | |------|------| -| 文件从临时目录移到正式目录后崩溃 → 文件存在但永远搜不到 | 首次添加与增量更新分离为两条独立路径 | - -首次添加和增量更新是两条独立路径: +| 文件从临时目录移到正式目录后崩溃 -> 文件存在但永远搜不到 | 首次添加与增量更新分离为两条独立路径 | **首次添加**(target 不存在)— 在 `ResourceProcessor.process_resource` Phase 3.5 中处理: ``` -1. 开始事务,锁 final_uri 的父目录(lock_mode="point") -2. 记录 undo: fs_write_new(uri=dst_path) -3. agfs.mv 临时目录 → 正式位置 -4. 提交 → 删锁 → 删 journal -5. 清理临时目录 -6. 入队 SemanticMsg(uri=final, target_uri=None) → DAG 在 final 上跑,无 callback +1. 获取锁,锁 final_uri 的父目录(lock_mode="point") +2. agfs.mv 临时目录 -> 正式位置 +3. 释放锁 +4. 清理临时目录 +5. 入队 SemanticMsg -> DAG 在 final 上跑 ``` -崩溃恢复:undo 删除不完整的 dst_path;重新执行 `add_resource` 即可重试。 - **增量更新**(target 已存在)— temp 保持不动: ``` -1. 入队 SemanticMsg(uri=temp, target_uri=final) → DAG 在 temp 上跑 +1. 入队 SemanticMsg(uri=temp, target_uri=final) -> DAG 在 temp 上跑 2. DAG 完成后触发 sync_diff_callback 或 move_temp_to_target_callback -3. callback 内的每个 VikingFS.rm / VikingFS.mv 各自创建独立事务 +3. callback 内的每个 VikingFS.rm / VikingFS.mv 各自独立加锁 ``` -注意:DAG callback 不在外层包裹 TransactionContext。每个 `VikingFS.rm` 和 `VikingFS.mv` 内部各自有独立事务保护。外层锁会与内部锁冲突(如外层 POINT lock on target_path 与内部 `rm` 的 SUBTREE lock 冲突)导致死锁。 +注意:DAG callback 不在外层加锁。每个 `VikingFS.rm` 和 `VikingFS.mv` 内部各自有独立锁保护。外层锁会与内部锁冲突导致死锁。 ### session.commit() | 问题 | 方案 | |------|------| -| 消息已清空但 archive 未写入 → 对话数据丢失 | Phase 1 无事务(archive 不完整无副作用)+ Phase 2 redo 事务 | +| 消息已清空但 archive 未写入 -> 对话数据丢失 | Phase 1 无锁(archive 不完整无副作用)+ Phase 2 RedoLog | -LLM 调用耗时不可控(5s~60s+),不能放在持锁事务内。设计拆为两个阶段: +LLM 调用耗时不可控(5s~60s+),不能放在持锁操作内。设计拆为两个阶段: ``` -Phase 1 — 归档(无事务、无锁): +Phase 1 — 归档(无锁): 1. 生成归档摘要(LLM) 2. 写 archive(history/archive_N/messages.jsonl + 摘要) 3. 
清空 messages.jsonl 4. 清空内存中的消息列表 -Phase 2 — 记忆提取 + 写入(事务,lock_mode="none",redo 语义): - 1. 记录 init_info(archive_uri、session_uri、用户身份信息) +Phase 2 — 记忆提取 + 写入(RedoLog): + 1. 写 redo 标记(archive_uri、session_uri、用户身份信息) 2. 从归档消息提取 memories(LLM) 3. 写当前消息状态 4. 写 relations - 5. 注册 post_action: enqueue SemanticQueue - 6. 提交 + 5. 直接 enqueue SemanticQueue + 6. 删除 redo 标记 ``` -**Redo 语义**:Phase 2 不注册 undo log。崩溃恢复时从 archive 重新执行记忆提取和写入(`_redo_session_memory`),而非回滚。 - **崩溃恢复分析**: | 崩溃时间点 | 状态 | 恢复动作 | |-----------|------|---------| -| Phase 1 写 archive 中途 | 无事务 | archive 不完整,下次 commit 从 history/ 扫描 index,不受影响 | -| Phase 1 archive 完成但 messages 未清空 | 无事务 | archive 完整 + messages 仍在 = 数据冗余但安全 | -| Phase 2 记忆提取/写入中途 | journal EXEC | 启动恢复:`_redo_session_memory` 从 archive 重做提取+写入+入队 | -| Phase 2 commit 后 | journal COMMIT | 启动恢复:重放 `post_action("enqueue_semantic")` | +| Phase 1 写 archive 中途 | 无标记 | archive 不完整,下次 commit 从 history/ 扫描 index,不受影响 | +| Phase 1 archive 完成但 messages 未清空 | 无标记 | archive 完整 + messages 仍在 = 数据冗余但安全 | +| Phase 2 记忆提取/写入中途 | redo 标记存在 | 启动恢复:从 archive 重做提取+写入+入队 | +| Phase 2 完成 | redo 标记已删 | 无需恢复 | -## TransactionContext +## LockContext -`TransactionContext` 是**异步**上下文管理器,封装事务的完整生命周期: +`LockContext` 是**异步**上下文管理器,封装锁的获取和释放: ```python -from openviking.storage.transaction import TransactionContext, get_transaction_manager +from openviking.storage.transaction import LockContext, get_lock_manager -tx_manager = get_transaction_manager() +lock_manager = get_lock_manager() -async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: - # 记录 undo(变更前调用) - seq = tx.record_undo("vectordb_delete", {"record_ids": ids, "records_snapshot": snapshot}) - # 执行变更 - delete_from_vector_store(uris) - # 标记完成 - tx.mark_completed(seq) +# Point 锁(写操作、语义处理) +async with LockContext(lock_manager, [path], lock_mode="point"): + # 执行操作... 
+ pass - # 注册提交后动作(可选) - tx.add_post_action("enqueue_semantic", {"uri": uri, ...}) +# Subtree 锁(删除操作) +async with LockContext(lock_manager, [path], lock_mode="subtree"): + # 执行操作... + pass - # 提交 - await tx.commit() -# 未 commit 时自动回滚 +# MV 锁(移动操作) +async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): + # 执行操作... + pass ``` **锁模式**: | lock_mode | 用途 | 行为 | |-----------|------|------| -| `point` | 写操作 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | +| `point` | 写操作、语义处理 | 锁定指定路径;与同路径的任何锁和祖先目录的 SUBTREE 锁冲突 | | `subtree` | 删除操作 | 锁定子树根节点;与同路径的任何锁、后代目录的任何锁和祖先目录的 SUBTREE 锁冲突 | | `mv` | 移动操作 | 目录移动:源和目标均加 SUBTREE 锁;文件移动:源父目录和目标均加 POINT 锁(通过 `src_is_dir` 控制) | -| `none` | 无锁操作 | 跳过锁获取,直接进入 EXEC 状态。用于 session.commit Phase 2 等不需要路径互斥的场景 | + +**异常处理**:`__aexit__` 总是释放锁,不吞异常。获取锁失败时抛出 `LockAcquisitionError`。 ## 锁类型(POINT vs SUBTREE) @@ -188,33 +231,6 @@ async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as - **POINT (P)**:用于写操作和语义处理。只锁单个目录。若祖先目录持有 SUBTREE 锁则阻塞。 - **SUBTREE (S)**:用于删除和移动操作。逻辑上覆盖整个子树,但只在根目录写**一个锁文件**。获取前扫描所有后代和祖先目录确认无冲突锁。 -## Undo Log - -每个事务维护一个 Undo Log,记录每步操作的反向动作: - -| op_type | 正向操作 | 回滚动作 | -|---------|---------|---------| -| `fs_mv` | 移动文件 | 移回原位 | -| `fs_rm` | 删除文件 | 跳过(不可逆,设计上 rm 是最后一步) | -| `fs_write_new` | 创建新文件/目录 | 删除 | -| `fs_mkdir` | 创建目录 | 删除 | -| `vectordb_delete` | 删除索引记录 | 从快照恢复 | -| `vectordb_upsert` | 插入索引记录 | 删除 | -| `vectordb_update_uri` | 更新 URI | 恢复旧值 | - -回滚规则:只回滚 `completed=True` 的条目,**反序执行**。每步独立 try-catch(best-effort)。崩溃恢复时使用 `recover_all=True`,也会回滚未完成的条目以清理部分操作残留。 - -### 上下文重建 - -VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、agent_id、role)。由于崩溃恢复时原始上下文不可用,record_undo 时在 undo params 中序列化 `_ctx_*` 字段: - -- `_ctx_account_id`:账户 ID -- `_ctx_user_id`:用户 ID -- `_ctx_agent_id`:代理 ID -- `_ctx_role`:角色 - -回滚时通过 `_reconstruct_ctx()` 从这些字段重建上下文。若重建失败(字段缺失),该 VectorDB 回滚步骤将被跳过并记录警告。 - ## 锁机制 ### 锁协议 @@ -223,7 +239,7 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age 锁文件内容(Fencing 
Token): ``` -{transaction_id}:{time_ns}:{lock_type} +{handle_id}:{time_ns}:{lock_type} ``` 其中 `lock_type` 为 `P`(POINT)或 `S`(SUBTREE)。 @@ -233,16 +249,16 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age ``` 循环直到超时(轮询间隔:200ms): 1. 检查目标目录存在 - 2. 检查目标路径是否被其他事务锁定 - - 陈旧锁? → 移除后重试 - - 活跃锁? → 等待 + 2. 检查目标路径是否被其他操作锁定 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 3. 检查所有祖先目录是否有 SUBTREE 锁 - - 陈旧锁? → 移除后重试 - - 活跃锁? → 等待 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 4. 写入 POINT (P) 锁文件 5. TOCTOU 双重检查:重新扫描祖先目录的 SUBTREE 锁 - - 发现冲突:比较 (timestamp, tx_id) - - 后到者(更大的 timestamp/tx_id)主动让步(删除自己的锁),防止活锁 + - 发现冲突:比较 (timestamp, handle_id) + - 后到者(更大的 timestamp/handle_id)主动让步(删除自己的锁),防止活锁 - 等待后重试 6. 验证锁文件归属(fencing token 匹配) 7. 成功 @@ -255,19 +271,19 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age ``` 循环直到超时(轮询间隔:200ms): 1. 检查目标目录存在 - 2. 检查目标路径是否被其他事务锁定 - - 陈旧锁? → 移除后重试 - - 活跃锁? → 等待 + 2. 检查目标路径是否被其他操作锁定 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 3. 检查所有祖先目录是否有 SUBTREE 锁 - - 陈旧锁? → 移除后重试 - - 活跃锁? → 等待 - 4. 扫描所有后代目录,检查是否有其他事务持有的锁 - - 陈旧锁? → 移除后重试 - - 活跃锁? → 等待 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 + 4. 扫描所有后代目录,检查是否有其他操作持有的锁 + - 陈旧锁? -> 移除后重试 + - 活跃锁? -> 等待 5. 写入 SUBTREE (S) 锁文件(只写一个文件,在根路径) 6. TOCTOU 双重检查:重新扫描后代目录和祖先目录 - - 发现冲突:比较 (timestamp, tx_id) - - 后到者(更大的 timestamp/tx_id)主动让步(删除自己的锁),防止活锁 + - 发现冲突:比较 (timestamp, handle_id) + - 后到者(更大的 timestamp/handle_id)主动让步(删除自己的锁),防止活锁 - 等待后重试 7. 验证锁文件归属(fencing token 匹配) 8. 
成功 @@ -279,72 +295,33 @@ VectorDB 回滚操作需要 `RequestContext`(包含 account_id、user_id、age **陈旧锁检测**:PathLock 检查 fencing token 中的时间戳。超过 `lock_expire`(默认 300s)的锁被视为陈旧锁,在加锁过程中自动移除。 -**事务超时**:TransactionManager 每 60 秒检查活跃事务,`updated_at` 超过事务超时时间(默认 3600s)的事务强制回滚。 - -## 事务日志(Journal) - -每个事务在 AGFS 持久化一份 journal: +**进程内清理**:LockManager 每 60 秒检查活跃的 LockHandle,创建超过 3600 秒的 handle 强制释放。 -``` -/local/_system/transactions/{tx_id}/journal.json -``` - -内容包含:事务 ID、状态、锁路径、init_info、undo_log、post_actions。 - -### 生命周期 - -``` -创建事务 → 写 journal(INIT) -获取锁 → 更新 journal(ACQUIRE → EXEC) -执行变更 → 每步更新 journal(标记 undo entry completed) -提交 → 更新 journal(COMMIT + post_actions) - → 执行 post_actions → 删锁 → 删 journal -回滚 → 执行 undo log → 删锁 → 删 journal -``` +**孤儿锁**:进程崩溃后遗留的锁文件,在下次任何操作尝试获取同一路径锁时,通过 stale lock 检测自动移除。 ## 崩溃恢复 -`TransactionManager.start()` 启动时自动扫描残留 journal: +`LockManager.start()` 启动时自动扫描 `/local/_system/redo/` 目录中的遗留标记: -| 崩溃时 journal 状态 | 恢复方式 | -|---------------------|---------| -| `COMMIT` + post_actions 非空 | 重放 post_actions → 删锁 → 删 journal | -| `COMMIT` + post_actions 为空 / `RELEASED` | 删锁 → 删 journal | -| `EXEC` / `FAIL` / `RELEASING`(`session_memory` 操作) | 从 archive 重做记忆提取+写入(`_redo_session_memory`) → 删锁 → 删 journal | -| `EXEC` / `FAIL` / `RELEASING`(所有 undo 均 completed) | 前滚(视为已提交,重放 post_actions) → 删锁 → 删 journal | -| `EXEC` / `FAIL` / `RELEASING`(其他) | 执行 undo log 回滚(`recover_all=True`) → 删锁 → 删 journal | -| `INIT` / `ACQUIRE` | 通过 init_info.lock_paths 清理孤儿锁 → 删 journal(变更未执行) | +| 场景 | 恢复方式 | +|------|---------| +| session_memory 提取中途崩溃 | 从 archive 重做记忆提取 + 写入 + enqueue | +| 锁持有期间崩溃 | 锁文件留在 AGFS,下次获取时 stale 检测自动清理(默认 300s 过期)| +| enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化,worker 重启后自动拉取 | +| 孤儿索引 | L2 按需加载时清理 | ### 防线总结 | 异常场景 | 防线 | 恢复时机 | |---------|------|---------| -| 事务内崩溃 | journal + undo log 回滚 | 重启时 | -| 提交后 enqueue 前崩溃 | journal post_actions 重放 | 重启时 | -| enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后自动拉取 | -| session.commit Phase 2 中崩溃 | journal + redo(从 
archive 重做记忆提取) | 重启时 | +| 操作中途崩溃 | 锁自动过期 + stale 检测 | 下次获取同路径锁时 | +| session.commit Phase 2 崩溃 | RedoLog 标记 + 重做 | 重启时 | +| enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后 | | 孤儿索引 | L2 按需加载时清理 | 用户访问时 | -| 加锁后 journal 更新前崩溃 | init_info 记录预期锁路径,恢复时检查并清理孤儿锁 | 重启时 | - -## 事务状态机 - -``` -INIT → ACQUIRE → EXEC → COMMIT → RELEASING → RELEASED - ↓ - FAIL → RELEASING → RELEASED -``` - -- `INIT`:事务已创建,等待锁获取 -- `ACQUIRE`:正在获取锁 -- `EXEC`:事务操作执行中 -- `COMMIT`:已提交,可能有 post_actions 待执行 -- `FAIL`:执行失败,进入回滚 -- `RELEASING`:正在释放锁 -- `RELEASED`:锁已释放,事务结束 ## 配置 -事务机制默认启用,无需额外配置。**默认不等待**:若路径被锁定则立即抛出 `LockAcquisitionError`。如需允许等待重试,可通过 `storage.transaction` 段配置: +路径锁默认启用,无需额外配置。**默认不等待**:若路径被锁定则立即抛出 `LockAcquisitionError`。如需允许等待重试,可通过 `storage.transaction` 段配置: ```json { @@ -360,11 +337,11 @@ INIT → ACQUIRE → EXEC → COMMIT → RELEASING → RELEASED | 参数 | 类型 | 说明 | 默认值 | |------|------|------|--------| | `lock_timeout` | float | 获取锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间 | `0.0` | -| `lock_expire` | float | 锁过期时间(秒),超过此时间的事务锁将被视为陈旧锁并强制释放 | `300.0` | +| `lock_expire` | float | 锁过期时间(秒),超过此时间的锁将被视为陈旧锁并强制释放 | `300.0` | ### QueueFS 持久化 -事务机制依赖 QueueFS 使用 SQLite 后端,确保 enqueue 的任务在进程重启后可恢复。这是默认配置,无需手动设置。 +路径锁机制依赖 QueueFS 使用 SQLite 后端,确保 enqueue 的任务在进程重启后可恢复。这是默认配置,无需手动设置。 ## 相关文档 diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md index e4befcde..b0954bbf 100644 --- a/docs/zh/guides/01-configuration.md +++ b/docs/zh/guides/01-configuration.md @@ -615,15 +615,14 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 ## storage.transaction 段 -事务机制默认启用,通常无需配置。**默认行为是不等待**:若目标路径已被其他事务锁定,操作立即失败并抛出 `LockAcquisitionError`。若需要等待重试,请将 `lock_timeout` 设为正数。 +路径锁默认启用,通常无需配置。**默认行为是不等待**:若目标路径已被其他操作锁定,操作立即失败并抛出 `LockAcquisitionError`。若需要等待重试,请将 `lock_timeout` 设为正数。 ```json { "storage": { "transaction": { "lock_timeout": 5.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 } } } @@ -632,10 +631,9 @@ HTTP 客户端(`SyncHTTPClient` / 
`AsyncHTTPClient`)和 CLI 工具连接远 | 参数 | 类型 | 说明 | 默认值 | |------|------|------|--------| | `lock_timeout` | float | 获取路径锁的等待超时(秒)。`0` = 立即失败(默认);`> 0` = 最多等待此时间后抛出 `LockAcquisitionError` | `0.0` | -| `lock_expire` | float | 锁过期时间(秒)。超过此时间的事务锁将被视为崩溃进程遗留的陈旧锁并强制释放 | `300.0` | -| `max_parallel_locks` | int | rm/mv 操作递归加锁时的最大并行数 | `8` | +| `lock_expire` | float | 锁过期时间(秒)。超过此时间的锁将被视为崩溃进程遗留的陈旧锁并强制释放 | `300.0` | -事务机制的详细说明见 [事务机制](../concepts/09-transaction.md)。 +路径锁机制的详细说明见 [路径锁与崩溃恢复](../concepts/09-transaction.md)。 ## 完整 Schema @@ -673,8 +671,7 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 }, "transaction": { "lock_timeout": 0.0, - "lock_expire": 300.0, - "max_parallel_locks": 8 + "lock_expire": 300.0 }, "vectordb": { "backend": "local|remote", diff --git a/openviking/async_client.py b/openviking/async_client.py index 680b6ee8..b87b05ff 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -97,10 +97,10 @@ async def reset(cls) -> None: await cls._instance.close() cls._instance = None - # Also reset transaction manager singleton - from openviking.storage.transaction import reset_transaction_manager + # Also reset lock manager singleton + from openviking.storage.transaction import reset_lock_manager - reset_transaction_manager() + reset_lock_manager() # ============= Session methods ============= diff --git a/openviking/server/routers/content.py b/openviking/server/routers/content.py index 9b4d0279..2463cc08 100644 --- a/openviking/server/routers/content.py +++ b/openviking/server/routers/content.py @@ -102,7 +102,7 @@ async def reindex( database. If regenerate=True, also regenerates L0/L1 summaries via LLM before re-embedding. - Uses transaction locking to prevent concurrent reindexes on the same URI. + Uses path locking to prevent concurrent reindexes on the same URI. Set wait=False to run in the background and track progress via task API. 
""" from openviking.service.task_tracker import get_task_tracker @@ -164,27 +164,17 @@ async def _do_reindex( regenerate: bool, ctx: RequestContext, ) -> dict: - """Execute reindex within a transaction.""" - from openviking.storage.transaction import get_transaction_manager + """Execute reindex within a lock scope.""" + from openviking.storage.transaction import LockContext, get_lock_manager - tm = get_transaction_manager() - tx = tm.create_transaction(init_info={"uri": uri, "regenerate": regenerate}) - await tm.begin(tx.id) + viking_fs = service.viking_fs + path = viking_fs._uri_to_path(uri, ctx=ctx) - try: - await tm.acquire_lock_normal(tx.id, uri) + async with LockContext(get_lock_manager(), [path], lock_mode="point"): if regenerate: - result = await service.resources.summarize([uri], ctx=ctx) + return await service.resources.summarize([uri], ctx=ctx) else: - result = await service.resources.build_index([uri], ctx=ctx) - await tm.commit(tx.id) - return result - except Exception: - try: - await tm.rollback(tx.id) - except Exception: - pass - raise + return await service.resources.build_index([uri], ctx=ctx) async def _background_reindex_tracked( diff --git a/openviking/server/routers/observer.py b/openviking/server/routers/observer.py index 4d214cbf..e1910596 100644 --- a/openviking/server/routers/observer.py +++ b/openviking/server/routers/observer.py @@ -72,13 +72,13 @@ async def observer_vlm( return Response(status="ok", result=_component_to_dict(component)) -@router.get("/transaction") -async def observer_transaction( +@router.get("/lock") +async def observer_lock( _ctx: RequestContext = Depends(get_request_context), ): - """Get transaction system status.""" + """Get lock system status.""" service = get_service() - component = service.debug.observer.transaction + component = service.debug.observer.lock return Response(status="ok", result=_component_to_dict(component)) diff --git a/openviking/service/core.py b/openviking/service/core.py index 
4c5a2670..8fd1c701 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -23,7 +23,7 @@ from openviking.storage import VikingDBManager from openviking.storage.collection_schemas import init_context_collection from openviking.storage.queuefs.queue_manager import QueueManager, init_queue_manager -from openviking.storage.transaction import TransactionManager, init_transaction_manager +from openviking.storage.transaction import LockManager, init_lock_manager from openviking.storage.viking_fs import VikingFS, init_viking_fs from openviking.utils.resource_processor import ResourceProcessor from openviking.utils.skill_processor import SkillProcessor @@ -75,7 +75,7 @@ def __init__( self._resource_processor: Optional[ResourceProcessor] = None self._skill_processor: Optional[SkillProcessor] = None self._session_compressor: Optional[SessionCompressor] = None - self._transaction_manager: Optional[TransactionManager] = None + self._lock_manager: Optional[LockManager] = None self._directory_initializer: Optional[DirectoryInitializer] = None # Sub-services @@ -142,16 +142,14 @@ def _init_storage( if self._queue_manager: self._queue_manager.setup_standard_queues(self._vikingdb_manager, start=False) - # Initialize TransactionManager (fail-fast if AGFS missing) + # Initialize LockManager (fail-fast if AGFS missing) if self._agfs_client is None: - raise RuntimeError("AGFS client not initialized for TransactionManager") + raise RuntimeError("AGFS client not initialized for LockManager") tx_cfg = config.transaction - self._transaction_manager = init_transaction_manager( + self._lock_manager = init_lock_manager( agfs=self._agfs_client, - max_parallel_locks=tx_cfg.max_parallel_locks, lock_timeout=tx_cfg.lock_timeout, lock_expire=tx_cfg.lock_expire, - vector_store=self._vikingdb_manager, ) @property @@ -170,9 +168,9 @@ def vikingdb_manager(self) -> Optional[VikingDBManager]: return self._vikingdb_manager @property - def transaction_manager(self) -> 
Optional[TransactionManager]: - """Get TransactionManager instance.""" - return self._transaction_manager + def lock_manager(self) -> Optional[LockManager]: + """Get LockManager instance.""" + return self._lock_manager @property def session_compressor(self) -> Optional[SessionCompressor]: @@ -293,10 +291,10 @@ async def initialize(self) -> None: self._skill_processor = SkillProcessor(vikingdb=self._vikingdb_manager) self._session_compressor = SessionCompressor(vikingdb=self._vikingdb_manager) - # Start TransactionManager if initialized - if self._transaction_manager: - await self._transaction_manager.start() - logger.info("TransactionManager started") + # Start LockManager if initialized + if self._lock_manager: + await self._lock_manager.start() + logger.info("LockManager started") # Wire up sub-services self._fs_service.set_viking_fs(self._viking_fs) @@ -324,9 +322,9 @@ async def initialize(self) -> None: async def close(self) -> None: """Close OpenViking and release resources.""" - if self._transaction_manager: - await self._transaction_manager.stop() - self._transaction_manager = None + if self._lock_manager: + await self._lock_manager.stop() + self._lock_manager = None if self._vikingdb_manager: self._vikingdb_manager.mark_closing() diff --git a/openviking/service/debug_service.py b/openviking/service/debug_service.py index 7dffff65..b99b4e73 100644 --- a/openviking/service/debug_service.py +++ b/openviking/service/debug_service.py @@ -9,14 +9,14 @@ from openviking.storage import VikingDBManager from openviking.storage.observers import ( + LockObserver, QueueObserver, RetrievalObserver, - TransactionObserver, VikingDBObserver, VLMObserver, ) from openviking.storage.queuefs import get_queue_manager -from openviking.storage.transaction import get_transaction_manager +from openviking.storage.transaction import get_lock_manager from openviking_cli.utils.config import OpenVikingConfig @@ -136,20 +136,20 @@ def vlm(self) -> ComponentStatus: ) @property - def 
transaction(self) -> ComponentStatus: - """Get transaction status.""" + def lock(self) -> ComponentStatus: + """Get lock system status.""" try: - transaction_manager = get_transaction_manager() + lock_manager = get_lock_manager() except Exception: return ComponentStatus( - name="transaction", + name="lock", is_healthy=False, has_errors=True, status="Not initialized", ) - observer = TransactionObserver(transaction_manager) + observer = LockObserver(lock_manager) return ComponentStatus( - name="transaction", + name="lock", is_healthy=observer.is_healthy(), has_errors=observer.has_errors(), status=observer.get_status_table(), @@ -173,7 +173,7 @@ def system(self) -> SystemStatus: "queue": self.queue, "vikingdb": self.vikingdb, "vlm": self.vlm, - "transaction": self.transaction, + "lock": self.lock, "retrieval": self.retrieval, } errors = [f"{c.name} has errors" for c in components.values() if c.has_errors] diff --git a/openviking/session/session.py b/openviking/session/session.py index 5726b8e3..c0f87bd9 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -226,10 +226,12 @@ def commit(self) -> Dict[str, Any]: async def commit_async(self) -> Dict[str, Any]: """Async commit session: two-phase approach. - Phase 1 (Archive, no transaction): Write archive, clear messages. - Phase 2 (Memory, transaction with redo semantics): Extract memories, write, enqueue. + Phase 1 (Archive): Write archive, clear messages. + Phase 2 (Memory, redo-log protected): Extract memories, write, enqueue. 
""" - from openviking.storage.transaction import TransactionContext, get_transaction_manager + import uuid + + from openviking.storage.transaction import get_lock_manager result = { "session_id": self.session_id, @@ -243,9 +245,7 @@ async def commit_async(self) -> Dict[str, Any]: get_current_telemetry().set("memory.extracted", 0) return result - tx_manager = get_transaction_manager() - - # ===== Preparation (no transaction) ===== + # ===== Preparation ===== self._compression.compression_index += 1 messages_to_archive = self._messages.copy() @@ -253,7 +253,7 @@ async def commit_async(self) -> Dict[str, Any]: archive_abstract = self._extract_abstract_from_summary(summary) archive_overview = summary - # ===== Phase 1: Archive (no transaction, no lock) ===== + # ===== Phase 1: Archive (no lock) ===== archive_uri = ( f"{self._session_uri}/history/archive_{self._compression.compression_index:03d}" ) @@ -273,54 +273,57 @@ async def commit_async(self) -> Dict[str, Any]: f"history/archive_{self._compression.compression_index:03d}/" ) - # ===== Phase 2: Memory extraction + write (transaction, redo semantics) ===== - async with TransactionContext( - tx_manager, - "session_memory", - [], - lock_mode="none", - ) as tx: - # Store redo info so _recover_one can redo from archive on crash - tx.record.init_info.update( - { - "archive_uri": archive_uri, - "session_uri": self._session_uri, - "account_id": self.ctx.account_id, - "user_id": self.ctx.user.user_id, - "agent_id": self.ctx.user.agent_id, - "role": self.ctx.role.value, - } - ) + # ===== Phase 2: Memory extraction + write (redo-log protected) ===== + redo_log = get_lock_manager().redo_log + task_id = str(uuid.uuid4()) + redo_log.write_pending( + task_id, + { + "archive_uri": archive_uri, + "session_uri": self._session_uri, + "account_id": self.ctx.account_id, + "user_id": self.ctx.user.user_id, + "agent_id": self.ctx.user.agent_id, + "role": self.ctx.role.value, + }, + ) - if self._session_compressor: - logger.info( - 
f"Starting memory extraction from {len(messages_to_archive)} archived messages" - ) - memories = await self._session_compressor.extract_long_term_memories( - messages=messages_to_archive, - user=self.user, - session_id=self.session_id, - ctx=self.ctx, - ) - logger.info(f"Extracted {len(memories)} memories") - result["memories_extracted"] = len(memories) - self._stats.memories_extracted += len(memories) - get_current_telemetry().set("memory.extracted", len(memories)) - - await self._write_to_agfs_async(self._messages) - await self._write_relations_async() - tx.add_post_action( - "enqueue_semantic", - { - "uri": self._session_uri, - "context_type": "memory", - "account_id": self.ctx.account_id, - "user_id": self.ctx.user.user_id, - "agent_id": self.ctx.user.agent_id, - "role": self.ctx.role.value, - }, + if self._session_compressor: + logger.info( + f"Starting memory extraction from {len(messages_to_archive)} archived messages" + ) + memories = await self._session_compressor.extract_long_term_memories( + messages=messages_to_archive, + user=self.user, + session_id=self.session_id, + ctx=self.ctx, ) - await tx.commit() + logger.info(f"Extracted {len(memories)} memories") + result["memories_extracted"] = len(memories) + self._stats.memories_extracted += len(memories) + get_current_telemetry().set("memory.extracted", len(memories)) + + await self._write_to_agfs_async(self._messages) + await self._write_relations_async() + + # Enqueue semantic processing directly + from openviking.storage.queuefs import get_queue_manager + from openviking.storage.queuefs.semantic_msg import SemanticMsg + + queue_manager = get_queue_manager() + if queue_manager: + msg = SemanticMsg( + uri=self._session_uri, + context_type="memory", + account_id=self.ctx.account_id, + user_id=self.ctx.user.user_id, + agent_id=self.ctx.user.agent_id, + role=self.ctx.role.value, + ) + semantic_queue = queue_manager.get_queue(queue_manager.SEMANTIC) + await semantic_queue.enqueue(msg) + + 
redo_log.mark_done(task_id) # Update active_count active_count_updated = await self._update_active_counts_async() diff --git a/openviking/storage/errors.py b/openviking/storage/errors.py index 7f6a483b..010200e7 100644 --- a/openviking/storage/errors.py +++ b/openviking/storage/errors.py @@ -31,13 +31,9 @@ class SchemaError(StorageException): """Raised when schema validation fails.""" -class TransactionError(VikingDBException): - """Raised when a transaction operation fails.""" +class LockError(VikingDBException): + """Raised when a lock operation fails.""" -class LockAcquisitionError(TransactionError): +class LockAcquisitionError(LockError): """Raised when lock acquisition fails.""" - - -class TransactionRollbackError(TransactionError): - """Raised when transaction rollback fails.""" diff --git a/openviking/storage/observers/__init__.py b/openviking/storage/observers/__init__.py index 4a36700a..dae5aed3 100644 --- a/openviking/storage/observers/__init__.py +++ b/openviking/storage/observers/__init__.py @@ -1,17 +1,17 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. # SPDX-License-Identifier: Apache-2.0 from .base_observer import BaseObserver +from .lock_observer import LockObserver from .queue_observer import QueueObserver from .retrieval_observer import RetrievalObserver -from .transaction_observer import TransactionObserver from .vikingdb_observer import VikingDBObserver from .vlm_observer import VLMObserver __all__ = [ "BaseObserver", + "LockObserver", "QueueObserver", "RetrievalObserver", - "TransactionObserver", "VikingDBObserver", "VLMObserver", ] diff --git a/openviking/storage/observers/lock_observer.py b/openviking/storage/observers/lock_observer.py new file mode 100644 index 00000000..92521790 --- /dev/null +++ b/openviking/storage/observers/lock_observer.py @@ -0,0 +1,71 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""LockObserver: Lock system observability.""" + +import time +from typing import Any, Dict, List + +from openviking.storage.observers.base_observer import BaseObserver +from openviking.storage.transaction.lock_manager import LockManager +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +class LockObserver(BaseObserver): + """Observability tool for the lock system.""" + + def __init__(self, lock_manager: LockManager): + self._manager = lock_manager + + def get_active_locks(self) -> List[Dict[str, Any]]: + """Return info about every active lock handle.""" + now = time.time() + return [ + { + "id": h.id, + "lock_count": len(h.locks), + "created_at": h.created_at, + "duration_seconds": round(now - h.created_at, 1), + } + for h in self._manager.get_active_handles().values() + ] + + def get_hanging_locks(self, threshold: float = 600) -> List[Dict[str, Any]]: + """Return locks that have been held longer than *threshold* seconds.""" + now = time.time() + return [lock for lock in self.get_active_locks() if now - lock["created_at"] > threshold] + + # ------ BaseObserver interface ------ + + def get_status_table(self) -> str: + locks = self.get_active_locks() + if not locks: + return "No active locks." 
+ + from tabulate import tabulate + + data = [ + { + "Handle ID": l["id"][:8] + "...", + "Locks": l["lock_count"], + "Duration": f"{l['duration_seconds']}s", + "Created": time.strftime("%H:%M:%S", time.localtime(l["created_at"])), + } + for l in locks + ] + data.append( + { + "Handle ID": f"TOTAL ({len(locks)})", + "Locks": sum(l["lock_count"] for l in locks), + "Duration": "", + "Created": "", + } + ) + return tabulate(data, headers="keys", tablefmt="pretty") + + def is_healthy(self) -> bool: + return not self.get_hanging_locks(600) + + def has_errors(self) -> bool: + return bool(self.get_hanging_locks(600)) diff --git a/openviking/storage/observers/transaction_observer.py b/openviking/storage/observers/transaction_observer.py deleted file mode 100644 index e29b7665..00000000 --- a/openviking/storage/observers/transaction_observer.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -TransactionObserver: Transaction system observability tool. - -Provides methods to observe and report transaction manager status. -""" - -import time -from typing import Any, Dict - -from openviking.storage.observers.base_observer import BaseObserver -from openviking.storage.transaction import TransactionManager -from openviking.storage.transaction.transaction_record import TransactionStatus -from openviking_cli.utils import run_async -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - - -class TransactionObserver(BaseObserver): - """ - TransactionObserver: System observability tool for transaction management. - - Provides methods to query transaction status and format output. - """ - - def __init__(self, transaction_manager: TransactionManager): - """Initialize transaction observer. 
- - Args: - transaction_manager: Transaction manager instance to observe - """ - self._transaction_manager = transaction_manager - - async def get_status_table_async(self) -> str: - """Get transaction status table asynchronously. - - Returns: - Formatted table string showing transaction status - """ - if not self._transaction_manager: - return "Transaction manager not initialized." - - transactions = self._transaction_manager.get_active_transactions() - - if not transactions: - return "No active transactions." - - return self._format_status_as_table(transactions) - - def get_status_table(self) -> str: - """Get transaction status table synchronously. - - Returns: - Formatted table string showing transaction status - """ - return run_async(self.get_status_table_async()) - - def __str__(self) -> str: - """String representation returns status table. - - Returns: - Formatted table string - """ - return self.get_status_table() - - def _format_status_as_table(self, transactions: Dict[str, Any]) -> str: - """Format transaction statuses as a table. 
- - Args: - transactions: Dict mapping transaction IDs to TransactionRecord - - Returns: - Formatted table string - """ - from tabulate import tabulate - - data = [] - - # Group transactions by status - status_counts = { - TransactionStatus.INIT: 0, - TransactionStatus.ACQUIRE: 0, - TransactionStatus.EXEC: 0, - TransactionStatus.COMMIT: 0, - TransactionStatus.FAIL: 0, - TransactionStatus.RELEASING: 0, - TransactionStatus.RELEASED: 0, - } - - for tx_id, tx in transactions.items(): - duration = time.time() - tx.created_at - duration_str = f"{duration:.1f}s" - - status_counts[tx.status] += 1 - - data.append( - { - "Transaction ID": tx_id[:8] + "...", - "Status": str(tx.status), - "Locks": len(tx.locks), - "Duration": duration_str, - "Created": time.strftime("%H:%M:%S", time.localtime(tx.created_at)), - } - ) - - status_priority = { - TransactionStatus.EXEC: 0, - TransactionStatus.ACQUIRE: 1, - TransactionStatus.RELEASING: 2, - TransactionStatus.INIT: 3, - TransactionStatus.COMMIT: 4, - TransactionStatus.FAIL: 5, - TransactionStatus.RELEASED: 6, - } - - data.sort(key=lambda x: status_priority.get(TransactionStatus(x["Status"]), 99)) - - total = len(transactions) - total_locks = sum(len(tx.locks) for tx in transactions.values()) - - summary_row = { - "Transaction ID": f"TOTAL ({total})", - "Status": "", - "Locks": total_locks, - "Duration": "", - "Created": "", - } - data.append(summary_row) - - return tabulate(data, headers="keys", tablefmt="pretty") - - def is_healthy(self) -> bool: - """Check if transaction system is healthy. - - Returns: - True if system is healthy, False otherwise - """ - return not self.has_errors() - - def has_errors(self) -> bool: - """Check if transaction system has any errors. 
- - Returns: - True if errors (failed transactions) exist, False otherwise - """ - if not self._transaction_manager: - return True - - transactions = self._transaction_manager.get_active_transactions() - - # Check for failed transactions - for tx_id, tx in transactions.items(): - if tx.status == TransactionStatus.FAIL: - logger.warning(f"Found failed transaction: {tx_id}") - return True - - return False - - def get_failed_transactions(self) -> Dict[str, Any]: - """Get all failed transactions. - - Returns: - Dict mapping transaction IDs to failed TransactionRecord - """ - if not self._transaction_manager: - return {} - - transactions = self._transaction_manager.get_active_transactions() - return { - tx_id: tx for tx_id, tx in transactions.items() if tx.status == TransactionStatus.FAIL - } - - def get_hanging_transactions(self, timeout_threshold: int = 300) -> Dict[str, Any]: - """Get transactions that have been running longer than threshold. - - Args: - timeout_threshold: Timeout threshold in seconds (default: 300 = 5 minutes) - - Returns: - Dict mapping transaction IDs to TransactionRecord that exceed threshold - """ - if not self._transaction_manager: - return {} - - transactions = self._transaction_manager.get_active_transactions() - current_time = time.time() - - return { - tx_id: tx - for tx_id, tx in transactions.items() - if current_time - tx.created_at > timeout_threshold - } - - def get_status_summary(self) -> Dict[str, int]: - """Get summary of transaction counts by status. 
- - Returns: - Dict mapping status strings to counts - """ - if not self._transaction_manager: - return {} - - transactions = self._transaction_manager.get_active_transactions() - - summary = { - "INIT": 0, - "ACQUIRE": 0, - "EXEC": 0, - "COMMIT": 0, - "FAIL": 0, - "RELEASING": 0, - "RELEASED": 0, - "TOTAL": 0, - } - - for tx in transactions.values(): - summary[str(tx.status)] += 1 - summary["TOTAL"] += 1 - - return summary diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index 32ac0501..4ee10a93 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -506,7 +506,7 @@ def _finalize_children_abstracts(self, node: DirNode) -> List[Dict[str, str]]: async def _overview_task(self, dir_uri: str) -> None: from openviking.storage.errors import LockAcquisitionError - from openviking.storage.transaction import TransactionContext, get_transaction_manager + from openviking.storage.transaction import LockContext, get_lock_manager node = self._nodes.get(dir_uri) if not node: @@ -538,18 +538,13 @@ async def _overview_task(self, dir_uri: str) -> None: dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) try: - # No undo entries recorded: semantic files (.overview.md / .abstract.md) are - # regenerable, so residual writes after a crash are acceptable. 
- async with TransactionContext( - get_transaction_manager(), "semantic_dag", [dir_path], lock_mode="point" - ) as tx: + async with LockContext(get_lock_manager(), [dir_path], lock_mode="point"): await self._viking_fs.write_file( f"{dir_uri}/.overview.md", overview, ctx=self._ctx ) await self._viking_fs.write_file( f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx ) - await tx.commit() except LockAcquisitionError: logger.info(f"[SemanticDag] {dir_uri} does not exist or is locked, skipping") diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 830dbd12..0db4019b 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -285,17 +285,13 @@ async def _process_single_directory( ) -> None: """Process single directory, generate .abstract.md and .overview.md.""" from openviking.storage.errors import LockAcquisitionError - from openviking.storage.transaction import TransactionContext, get_transaction_manager + from openviking.storage.transaction import LockContext, get_lock_manager viking_fs = get_viking_fs() dir_path = viking_fs._uri_to_path(uri, ctx=self._current_ctx) try: - # No undo entries recorded: semantic files (.overview.md / .abstract.md) are - # regenerable, so residual writes after a crash are acceptable. - async with TransactionContext( - get_transaction_manager(), "semantic", [dir_path], lock_mode="point" - ) as tx: + async with LockContext(get_lock_manager(), [dir_path], lock_mode="point"): # 1. 
Collect .abstract.md from subdirectories children_abstracts = await self._collect_children_abstracts(children_uris) @@ -335,8 +331,6 @@ async def _process_single_directory( for result in results: if isinstance(result, Exception): logger.error(f"Vectorization failed: {result}", exc_info=True) - - await tx.commit() except LockAcquisitionError: logger.info(f"[SemanticProcessor] {uri} does not exist or is locked, skipping") diff --git a/openviking/storage/transaction/__init__.py b/openviking/storage/transaction/__init__.py index afbc3e1e..0fca8816 100644 --- a/openviking/storage/transaction/__init__.py +++ b/openviking/storage/transaction/__init__.py @@ -3,34 +3,28 @@ """ Transaction module for OpenViking. -Provides transaction management and lock mechanisms for data operations. +Provides path-lock management and redo-log crash recovery. """ -from openviking.storage.transaction.context_manager import TransactionContext -from openviking.storage.transaction.journal import TransactionJournal -from openviking.storage.transaction.path_lock import PathLock -from openviking.storage.transaction.transaction_manager import ( - TransactionManager, - get_transaction_manager, - init_transaction_manager, - reset_transaction_manager, -) -from openviking.storage.transaction.transaction_record import ( - TransactionRecord, - TransactionStatus, +from openviking.storage.transaction.lock_context import LockContext +from openviking.storage.transaction.lock_handle import LockHandle, LockOwner +from openviking.storage.transaction.lock_manager import ( + LockManager, + get_lock_manager, + init_lock_manager, + reset_lock_manager, ) -from openviking.storage.transaction.undo import UndoEntry, execute_rollback +from openviking.storage.transaction.path_lock import PathLock +from openviking.storage.transaction.redo_log import RedoLog __all__ = [ + "LockContext", + "LockHandle", + "LockManager", + "LockOwner", "PathLock", - "TransactionContext", - "TransactionJournal", - "TransactionManager", - 
"TransactionRecord", - "TransactionStatus", - "UndoEntry", - "execute_rollback", - "get_transaction_manager", - "init_transaction_manager", - "reset_transaction_manager", + "RedoLog", + "get_lock_manager", + "init_lock_manager", + "reset_lock_manager", ] diff --git a/openviking/storage/transaction/context_manager.py b/openviking/storage/transaction/context_manager.py deleted file mode 100644 index 09697e10..00000000 --- a/openviking/storage/transaction/context_manager.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Transaction context manager for OpenViking. - -Provides an async context manager that wraps a set of operations in a -transaction with automatic rollback on failure. -""" - -from typing import Any, Dict, List, Optional - -from openviking.storage.errors import LockAcquisitionError, TransactionError -from openviking.storage.transaction.transaction_record import TransactionRecord -from openviking.storage.transaction.undo import UndoEntry -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - - -class TransactionContext: - """Async context manager for transactional operations. - - Usage:: - - async with TransactionContext(tx_manager, "rm", [path], lock_mode="subtree") as tx: - seq = tx.record_undo("fs_rm", {"uri": uri}) - # ... do work ... 
- tx.mark_completed(seq) - await tx.commit() - """ - - def __init__( - self, - tx_manager: Any, - operation: str, - lock_paths: List[str], - lock_mode: str = "point", - mv_dst_path: Optional[str] = None, - src_is_dir: bool = True, - ): - self._tx_manager = tx_manager - self._operation = operation - self._lock_paths = lock_paths - self._lock_mode = lock_mode - self._mv_dst_path = mv_dst_path - self._src_is_dir = src_is_dir - self._record: Optional[TransactionRecord] = None - self._committed = False - self._sequence = 0 - - @property - def record(self) -> TransactionRecord: - if self._record is None: - raise TransactionError("Transaction not started") - return self._record - - async def __aenter__(self) -> "TransactionContext": - self._record = self._tx_manager.create_transaction( - init_info={ - "operation": self._operation, - "lock_paths": self._lock_paths, - "lock_mode": self._lock_mode, - "mv_dst_path": self._mv_dst_path, - } - ) - tx_id = self._record.id - - # Write journal BEFORE acquiring locks so that crash recovery can - # find orphan locks via init_info even if the process dies between - # lock creation and journal update. 
- try: - self._tx_manager.journal.write(self._record.to_journal()) - except Exception as e: - logger.warning(f"[Transaction] Failed to write journal for {tx_id}: {e}") - - success = False - if self._lock_mode == "none": - # No lock acquisition — transition directly to EXEC status - tx = self._tx_manager.get_transaction(tx_id) - if tx: - from openviking.storage.transaction.transaction_record import TransactionStatus - - tx.update_status(TransactionStatus.EXEC) - success = True - elif self._lock_mode == "subtree": - for path in self._lock_paths: - success = await self._tx_manager.acquire_lock_subtree(tx_id, path) - if not success: - break - elif self._lock_mode == "mv": - if len(self._lock_paths) < 1 or not self._mv_dst_path: - raise TransactionError("mv lock mode requires lock_paths[0] and mv_dst_path") - success = await self._tx_manager.acquire_lock_mv( - tx_id, - self._lock_paths[0], - self._mv_dst_path, - src_is_dir=self._src_is_dir, - ) - else: - # "point" mode (default) - for path in self._lock_paths: - success = await self._tx_manager.acquire_lock_point(tx_id, path) - if not success: - break - - if not success: - await self._tx_manager.rollback(tx_id) - raise LockAcquisitionError( - f"Failed to acquire {self._lock_mode} lock for {self._lock_paths}" - ) - - # Update journal with actual lock paths now populated in the record. 
- try: - self._tx_manager.journal.update(self._record.to_journal()) - except Exception as e: - logger.warning(f"[Transaction] Failed to update journal for {tx_id}: {e}") - - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - if not self._committed: - try: - await self._tx_manager.rollback(self._record.id) - except Exception as e: - logger.error(f"Rollback failed during __aexit__: {e}") - return False - - def record_undo(self, op_type: str, params: Dict[str, Any]) -> int: - seq = self._sequence - self._sequence += 1 - entry = UndoEntry(sequence=seq, op_type=op_type, params=params) - self.record.undo_log.append(entry) - - try: - self._tx_manager.journal.update(self.record.to_journal()) - except Exception as e: - logger.debug(f"[Transaction] Failed to persist journal: {e}") - - return seq - - def mark_completed(self, sequence: int) -> None: - for entry in self.record.undo_log: - if entry.sequence == sequence: - entry.completed = True - break - - try: - self._tx_manager.journal.update(self.record.to_journal()) - except Exception as e: - logger.debug(f"[Transaction] Failed to persist journal: {e}") - - def add_post_action(self, action_type: str, params: Dict[str, Any]) -> None: - self.record.post_actions.append({"type": action_type, "params": params}) - - async def commit(self) -> None: - success = await self._tx_manager.commit(self._record.id) - if not success: - raise TransactionError(f"Failed to commit transaction {self._record.id}") - self._committed = True diff --git a/openviking/storage/transaction/journal.py b/openviking/storage/transaction/journal.py deleted file mode 100644 index 6cb14474..00000000 --- a/openviking/storage/transaction/journal.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Transaction journal for crash recovery. 
- -Persists transaction state to AGFS so that incomplete transactions can be -detected and recovered after a process restart. -""" - -import json -from typing import Any, Dict, List - -from openviking.pyagfs import AGFSClient -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - -# Journal root path (global, not behind VikingFS URI mapping) -_JOURNAL_ROOT = "/local/_system/transactions" - - -class TransactionJournal: - """Persists transaction records to AGFS for crash recovery. - - Journal files live at ``/local/_system/transactions/{tx_id}/journal.json``. - """ - - def __init__(self, agfs: AGFSClient): - self._agfs = agfs - - def _tx_dir(self, tx_id: str) -> str: - return f"{_JOURNAL_ROOT}/{tx_id}" - - def _journal_path(self, tx_id: str) -> str: - return f"{_JOURNAL_ROOT}/{tx_id}/journal.json" - - def _ensure_dir(self, path: str) -> None: - """Create directory, ignoring already-exists errors.""" - try: - self._agfs.mkdir(path) - except Exception as e: - logger.warning(f"[Journal] mkdir {path}: {e}") - - def write(self, data: Dict[str, Any]) -> None: - """Create a new journal entry for a transaction. - - Args: - data: Serialized transaction record (from TransactionRecord.to_journal()). - """ - tx_id = data["id"] - self._ensure_dir("/local/_system") - self._ensure_dir(_JOURNAL_ROOT) - self._ensure_dir(self._tx_dir(tx_id)) - payload = json.dumps(data, ensure_ascii=False, default=str).encode("utf-8") - self._agfs.write(self._journal_path(tx_id), payload) - logger.info(f"[Journal] Written: {self._journal_path(tx_id)}") - - def update(self, data: Dict[str, Any]) -> None: - """Overwrite an existing journal entry. - - Args: - data: Updated serialized transaction record. - """ - tx_id = data["id"] - payload = json.dumps(data, ensure_ascii=False, default=str).encode("utf-8") - self._agfs.write(self._journal_path(tx_id), payload) - - def read(self, tx_id: str) -> Dict[str, Any]: - """Read a journal entry. - - Args: - tx_id: Transaction ID. 
- - Returns: - Parsed journal data. - - Raises: - FileNotFoundError: If journal does not exist. - """ - content = self._agfs.cat(self._journal_path(tx_id)) - if isinstance(content, bytes): - content = content.decode("utf-8") - return json.loads(content) - - def delete(self, tx_id: str) -> None: - """Delete a transaction's journal directory. - - Args: - tx_id: Transaction ID. - """ - try: - self._agfs.rm(self._tx_dir(tx_id), recursive=True) - logger.debug(f"[Journal] Deleted journal for tx {tx_id}") - except Exception as e: - logger.warning(f"[Journal] Failed to delete journal for tx {tx_id}: {e}") - - def list_all(self) -> List[str]: - """List all transaction IDs that have journal entries. - - Returns: - List of transaction ID strings. - """ - try: - entries = self._agfs.ls(_JOURNAL_ROOT) - tx_ids = [] - if isinstance(entries, list): - for entry in entries: - name = entry.get("name", "") if isinstance(entry, dict) else str(entry) - if name and name not in (".", "..") and entry.get("isDir", True): - tx_ids.append(name) - return tx_ids - except Exception: - return [] diff --git a/openviking/storage/transaction/lock_context.py b/openviking/storage/transaction/lock_context.py new file mode 100644 index 00000000..62fc15ba --- /dev/null +++ b/openviking/storage/transaction/lock_context.py @@ -0,0 +1,68 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""LockContext — async context manager for acquiring/releasing path locks.""" + +from typing import Optional + +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.lock_handle import LockHandle +from openviking.storage.transaction.lock_manager import LockManager + + +class LockContext: + """``async with LockContext(manager, paths, mode) as handle: ...`` + + Acquires locks on entry, releases them on exit. No undo / journal / commit + semantics — just a lock scope. 
+ """ + + def __init__( + self, + lock_manager: LockManager, + paths: list[str], + lock_mode: str = "point", + mv_dst_path: Optional[str] = None, + src_is_dir: bool = True, + ): + self._manager = lock_manager + self._paths = paths + self._lock_mode = lock_mode + self._mv_dst_path = mv_dst_path + self._src_is_dir = src_is_dir + self._handle: Optional[LockHandle] = None + + async def __aenter__(self) -> LockHandle: + self._handle = self._manager.create_handle() + success = False + + if self._lock_mode == "subtree": + for path in self._paths: + success = await self._manager.acquire_subtree(self._handle, path) + if not success: + break + elif self._lock_mode == "mv": + if self._mv_dst_path is None: + raise LockAcquisitionError("mv lock mode requires mv_dst_path") + success = await self._manager.acquire_mv( + self._handle, + self._paths[0], + self._mv_dst_path, + src_is_dir=self._src_is_dir, + ) + else: # "point" + for path in self._paths: + success = await self._manager.acquire_point(self._handle, path) + if not success: + break + + if not success: + await self._manager.release(self._handle) + raise LockAcquisitionError( + f"Failed to acquire {self._lock_mode} lock for {self._paths}" + ) + return self._handle + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if self._handle: + await self._manager.release(self._handle) + return False diff --git a/openviking/storage/transaction/lock_handle.py b/openviking/storage/transaction/lock_handle.py new file mode 100644 index 00000000..7b5be5d9 --- /dev/null +++ b/openviking/storage/transaction/lock_handle.py @@ -0,0 +1,37 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Lock handle and LockOwner protocol for PathLock integration.""" + +import time +import uuid +from dataclasses import dataclass, field +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class LockOwner(Protocol): + """Minimal interface that PathLock requires from its caller.""" + + id: str + locks: list[str] + + def add_lock(self, path: str) -> None: ... + def remove_lock(self, path: str) -> None: ... + + +@dataclass +class LockHandle: + """Identifies a lock holder. PathLock uses ``id`` to generate fencing tokens + and ``locks`` to track acquired lock files.""" + + id: str = field(default_factory=lambda: str(uuid.uuid4())) + locks: list[str] = field(default_factory=list) + created_at: float = field(default_factory=time.time) + + def add_lock(self, lock_path: str) -> None: + if lock_path not in self.locks: + self.locks.append(lock_path) + + def remove_lock(self, lock_path: str) -> None: + if lock_path in self.locks: + self.locks.remove(lock_path) diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py new file mode 100644 index 00000000..5e2e5076 --- /dev/null +++ b/openviking/storage/transaction/lock_manager.py @@ -0,0 +1,247 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""LockManager — global singleton managing lock lifecycle and redo recovery.""" + +import asyncio +import json +import time +from typing import Any, Dict, Optional + +from openviking.pyagfs import AGFSClient +from openviking.storage.transaction.lock_handle import LockHandle +from openviking.storage.transaction.path_lock import PathLock +from openviking.storage.transaction.redo_log import RedoLog +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +class LockManager: + """Global singleton. 
Manages lock lifecycle and stale cleanup.""" + + def __init__( + self, + agfs: AGFSClient, + lock_timeout: float = 0.0, + lock_expire: float = 300.0, + ): + self._agfs = agfs + self._path_lock = PathLock(agfs, lock_expire=lock_expire) + self._lock_timeout = lock_timeout + self._redo_log = RedoLog(agfs) + self._handles: Dict[str, LockHandle] = {} + self._cleanup_task: Optional[asyncio.Task] = None + self._running = False + + @property + def redo_log(self) -> RedoLog: + return self._redo_log + + def get_active_handles(self) -> Dict[str, LockHandle]: + return dict(self._handles) + + async def start(self) -> None: + """Start background cleanup and redo recovery.""" + self._running = True + self._cleanup_task = asyncio.create_task(self._stale_cleanup_loop()) + await self._recover_pending_redo() + + async def stop(self) -> None: + """Stop cleanup and release all active locks.""" + self._running = False + if self._cleanup_task: + self._cleanup_task.cancel() + try: + await self._cleanup_task + except asyncio.CancelledError: + pass + for handle in list(self._handles.values()): + await self._path_lock.release(handle) + self._handles.clear() + + def create_handle(self) -> LockHandle: + handle = LockHandle() + self._handles[handle.id] = handle + return handle + + async def acquire_point( + self, handle: LockHandle, path: str, timeout: Optional[float] = None + ) -> bool: + return await self._path_lock.acquire_point( + path, handle, timeout=timeout if timeout is not None else self._lock_timeout + ) + + async def acquire_subtree( + self, handle: LockHandle, path: str, timeout: Optional[float] = None + ) -> bool: + return await self._path_lock.acquire_subtree( + path, handle, timeout=timeout if timeout is not None else self._lock_timeout + ) + + async def acquire_mv( + self, + handle: LockHandle, + src: str, + dst: str, + src_is_dir: bool = True, + timeout: Optional[float] = None, + ) -> bool: + return await self._path_lock.acquire_mv( + src, + dst, + handle, + timeout=timeout if 
timeout is not None else self._lock_timeout, + src_is_dir=src_is_dir, + ) + + async def release(self, handle: LockHandle) -> None: + await self._path_lock.release(handle) + self._handles.pop(handle.id, None) + + async def _stale_cleanup_loop(self) -> None: + """Check and release leaked handles every 60 s (in-process safety net).""" + while self._running: + await asyncio.sleep(60) + now = time.time() + stale = [h for h in self._handles.values() if now - h.created_at > 3600] + for handle in stale: + logger.warning(f"Releasing stale lock handle {handle.id}") + await self.release(handle) + + # ------------------------------------------------------------------ + # Redo recovery (session_memory only) + # ------------------------------------------------------------------ + + async def _recover_pending_redo(self) -> None: + pending_ids = self._redo_log.list_pending() + for task_id in pending_ids: + logger.info(f"Recovering pending redo task: {task_id}") + try: + info = self._redo_log.read(task_id) + if info: + await self._redo_session_memory(info) + self._redo_log.mark_done(task_id) + except Exception as e: + logger.error(f"Redo recovery failed for {task_id}: {e}", exc_info=True) + + async def _redo_session_memory(self, info: Dict[str, Any]) -> None: + """Re-extract memories from archive.""" + from openviking.message import Message + from openviking.server.identity import RequestContext, Role + from openviking.session.compressor import SessionCompressor + from openviking_cli.session.user_id import UserIdentifier + + archive_uri = info.get("archive_uri") + session_uri = info.get("session_uri") + account_id = info.get("account_id", "default") + user_id = info.get("user_id", "default") + agent_id = info.get("agent_id", "default") + role_str = info.get("role", "root") + + if not archive_uri or not session_uri: + logger.warning("Cannot redo session_memory: missing archive_uri or session_uri") + return + + # 1. 
Read archived messages + messages_path = f"{archive_uri}/messages.jsonl" + try: + agfs_path = messages_path.replace("viking://", "") + content = self._agfs.cat(agfs_path) + if isinstance(content, bytes): + content = content.decode("utf-8") + except Exception as e: + logger.warning(f"Cannot read archive for redo: {messages_path}: {e}") + return + + messages = [] + for line in content.strip().split("\n"): + if line.strip(): + try: + messages.append(Message.from_dict(json.loads(line))) + except Exception: + pass + + if not messages: + logger.warning(f"No messages found in archive for redo: {archive_uri}") + return + + # 2. Build request context + user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) + ctx = RequestContext(user=user, role=Role(role_str)) + + # 3. Re-extract memories (best-effort: skip if compressor not available) + session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] + try: + compressor = SessionCompressor(vikingdb=None) + memories = await compressor.extract_long_term_memories( + messages=messages, + user=user, + session_id=session_id, + ctx=ctx, + ) + logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") + except Exception as e: + logger.warning(f"Redo: memory extraction skipped ({e}), will retry via queue") + + # 4. 
Enqueue semantic processing + await self._enqueue_semantic( + uri=session_uri, + context_type="memory", + account_id=account_id, + user_id=user_id, + agent_id=agent_id, + role=role_str, + ) + + async def _enqueue_semantic(self, **params: Any) -> None: + from openviking.storage.queuefs import get_queue_manager + from openviking.storage.queuefs.semantic_msg import SemanticMsg + from openviking.storage.queuefs.semantic_queue import SemanticQueue + + queue_manager = get_queue_manager() + if queue_manager is None: + logger.debug("No queue manager available, skipping enqueue_semantic") + return + + uri = params.get("uri") + if not uri: + return + + msg = SemanticMsg( + uri=uri, + context_type=params.get("context_type", "resource"), + account_id=params.get("account_id", "default"), + user_id=params.get("user_id", "default"), + agent_id=params.get("agent_id", "default"), + role=params.get("role", "root"), + ) + semantic_queue: SemanticQueue = queue_manager.get_queue(queue_manager.SEMANTIC) # type: ignore[assignment] + await semantic_queue.enqueue(msg) + + +# --------------------------------------------------------------------------- +# Module-level singleton +# --------------------------------------------------------------------------- + +_lock_manager: Optional[LockManager] = None + + +def init_lock_manager( + agfs: AGFSClient, + lock_timeout: float = 0.0, + lock_expire: float = 300.0, +) -> LockManager: + global _lock_manager + _lock_manager = LockManager(agfs=agfs, lock_timeout=lock_timeout, lock_expire=lock_expire) + return _lock_manager + + +def get_lock_manager() -> LockManager: + if _lock_manager is None: + raise RuntimeError("LockManager not initialized. 
Call init_lock_manager() first.") + return _lock_manager + + +def reset_lock_manager() -> None: + global _lock_manager + _lock_manager = None diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index 5de99743..d9212b3b 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -3,7 +3,7 @@ from typing import Optional, Tuple from openviking.pyagfs import AGFSClient -from openviking.storage.transaction.transaction_record import TransactionRecord +from openviking.storage.transaction.lock_handle import LockOwner from openviking_cli.utils.logger import get_logger logger = get_logger(__name__) @@ -19,8 +19,8 @@ _POLL_INTERVAL = 0.2 -def _make_fencing_token(tx_id: str, lock_type: str = LOCK_TYPE_POINT) -> str: - return f"{tx_id}:{time.time_ns()}:{lock_type}" +def _make_fencing_token(owner_id: str, lock_type: str = LOCK_TYPE_POINT) -> str: + return f"{owner_id}:{time.time_ns()}:{lock_type}" def _parse_fencing_token(token: str) -> Tuple[str, int, str]: @@ -29,20 +29,20 @@ def _parse_fencing_token(token: str) -> Tuple[str, int, str]: rest = token[:-2] idx = rest.rfind(":") if idx >= 0: - tx_id_part = rest[:idx] + owner_id_part = rest[:idx] ts_part = rest[idx + 1 :] try: - return tx_id_part, int(ts_part), lock_type + return owner_id_part, int(ts_part), lock_type except ValueError: pass return rest, 0, lock_type if ":" in token: idx = token.rfind(":") - tx_id_part = token[:idx] + owner_id_part = token[:idx] ts_part = token[idx + 1 :] try: - return tx_id_part, int(ts_part), LOCK_TYPE_POINT + return owner_id_part, int(ts_part), LOCK_TYPE_POINT except ValueError: pass @@ -76,25 +76,25 @@ def _read_token(self, lock_path: str) -> Optional[str]: except Exception: return None - async def _is_locked_by_other(self, lock_path: str, transaction_id: str) -> bool: + async def _is_locked_by_other(self, lock_path: str, owner_id: str) -> bool: token = self._read_token(lock_path) if token is 
None: return False lock_owner, _, _ = _parse_fencing_token(token) - return lock_owner != transaction_id + return lock_owner != owner_id async def _create_lock_file( - self, lock_path: str, transaction_id: str, lock_type: str = LOCK_TYPE_POINT + self, lock_path: str, owner_id: str, lock_type: str = LOCK_TYPE_POINT ) -> None: - token = _make_fencing_token(transaction_id, lock_type) + token = _make_fencing_token(owner_id, lock_type) self._agfs.write(lock_path, token.encode("utf-8")) - async def _verify_lock_ownership(self, lock_path: str, transaction_id: str) -> bool: + async def _verify_lock_ownership(self, lock_path: str, owner_id: str) -> bool: token = self._read_token(lock_path) if token is None: return False lock_owner, _, _ = _parse_fencing_token(token) - return lock_owner == transaction_id + return lock_owner == owner_id async def _remove_lock_file(self, lock_path: str) -> bool: try: @@ -115,19 +115,19 @@ def is_lock_stale(self, lock_path: str, expire_seconds: float = 300.0) -> bool: age = (time.time_ns() - ts) / 1e9 return age > expire_seconds - async def _check_ancestors_for_subtree(self, path: str, exclude_tx_id: str) -> Optional[str]: + async def _check_ancestors_for_subtree(self, path: str, exclude_owner_id: str) -> Optional[str]: parent = self._get_parent_path(path) while parent: lock_path = self._get_lock_path(parent) token = self._read_token(lock_path) if token is not None: owner_id, _, lock_type = _parse_fencing_token(token) - if owner_id != exclude_tx_id and lock_type == LOCK_TYPE_SUBTREE: + if owner_id != exclude_owner_id and lock_type == LOCK_TYPE_SUBTREE: return lock_path parent = self._get_parent_path(parent) return None - async def _scan_descendants_for_locks(self, path: str, exclude_tx_id: str) -> Optional[str]: + async def _scan_descendants_for_locks(self, path: str, exclude_owner_id: str) -> Optional[str]: try: entries = self._agfs.ls(path) if not isinstance(entries, list): @@ -145,19 +145,17 @@ async def _scan_descendants_for_locks(self, 
path: str, exclude_tx_id: str) -> Op token = self._read_token(subdir_lock) if token is not None: owner_id, _, _ = _parse_fencing_token(token) - if owner_id != exclude_tx_id: + if owner_id != exclude_owner_id: return subdir_lock - result = await self._scan_descendants_for_locks(subdir, exclude_tx_id) + result = await self._scan_descendants_for_locks(subdir, exclude_owner_id) if result: return result except Exception as e: logger.warning(f"Failed to scan descendants of {path}: {e}") return None - async def acquire_point( - self, path: str, transaction: TransactionRecord, timeout: float = 0.0 - ) -> bool: - transaction_id = transaction.id + async def acquire_point(self, path: str, owner: LockOwner, timeout: float = 0.0) -> bool: + owner_id = owner.id lock_path = self._get_lock_path(path) deadline = asyncio.get_running_loop().time() + timeout @@ -168,7 +166,7 @@ async def acquire_point( return False while True: - if await self._is_locked_by_other(lock_path, transaction_id): + if await self._is_locked_by_other(lock_path, owner_id): if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[POINT] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) @@ -179,7 +177,7 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - ancestor_conflict = await self._check_ancestors_for_subtree(path, transaction_id) + ancestor_conflict = await self._check_ancestors_for_subtree(path, owner_id) if ancestor_conflict: if self.is_lock_stale(ancestor_conflict, self._lock_expire): logger.warning( @@ -196,22 +194,22 @@ async def acquire_point( continue try: - await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_POINT) + await self._create_lock_file(lock_path, owner_id, LOCK_TYPE_POINT) except Exception as e: logger.error(f"[POINT] Failed to create lock file: {e}") return False backed_off = False - conflict_after = await self._check_ancestors_for_subtree(path, transaction_id) + conflict_after = await 
self._check_ancestors_for_subtree(path, owner_id) if conflict_after: their_token = self._read_token(conflict_after) if their_token: - their_tx_id, their_ts, _ = _parse_fencing_token(their_token) + their_owner_id, their_ts, _ = _parse_fencing_token(their_token) my_token = self._read_token(lock_path) _, my_ts, _ = ( _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_POINT) ) - if (my_ts, transaction_id) > (their_ts, their_tx_id): + if (my_ts, owner_id) > (their_ts, their_owner_id): logger.debug(f"[POINT] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True @@ -222,21 +220,19 @@ async def acquire_point( await asyncio.sleep(_POLL_INTERVAL) continue - if not await self._verify_lock_ownership(lock_path, transaction_id): + if not await self._verify_lock_ownership(lock_path, owner_id): logger.debug(f"[POINT] Lock ownership verification failed: {path}") if asyncio.get_running_loop().time() >= deadline: return False await asyncio.sleep(_POLL_INTERVAL) continue - transaction.add_lock(lock_path) + owner.add_lock(lock_path) logger.debug(f"[POINT] Lock acquired: {lock_path}") return True - async def acquire_subtree( - self, path: str, transaction: TransactionRecord, timeout: float = 0.0 - ) -> bool: - transaction_id = transaction.id + async def acquire_subtree(self, path: str, owner: LockOwner, timeout: float = 0.0) -> bool: + owner_id = owner.id lock_path = self._get_lock_path(path) deadline = asyncio.get_running_loop().time() + timeout @@ -247,7 +243,7 @@ async def acquire_subtree( return False while True: - if await self._is_locked_by_other(lock_path, transaction_id): + if await self._is_locked_by_other(lock_path, owner_id): if self.is_lock_stale(lock_path, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale lock: {lock_path}") await self._remove_lock_file(lock_path) @@ -258,8 +254,8 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - # Check ancestor paths for SUBTREE locks held 
by other transactions - ancestor_conflict = await self._check_ancestors_for_subtree(path, transaction_id) + # Check ancestor paths for SUBTREE locks held by other owners + ancestor_conflict = await self._check_ancestors_for_subtree(path, owner_id) if ancestor_conflict: if self.is_lock_stale(ancestor_conflict, self._lock_expire): logger.warning( @@ -275,7 +271,7 @@ async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - desc_conflict = await self._scan_descendants_for_locks(path, transaction_id) + desc_conflict = await self._scan_descendants_for_locks(path, owner_id) if desc_conflict: if self.is_lock_stale(desc_conflict, self._lock_expire): logger.warning(f"[SUBTREE] Removing stale descendant lock: {desc_conflict}") @@ -290,24 +286,24 @@ async def acquire_subtree( continue try: - await self._create_lock_file(lock_path, transaction_id, LOCK_TYPE_SUBTREE) + await self._create_lock_file(lock_path, owner_id, LOCK_TYPE_SUBTREE) except Exception as e: logger.error(f"[SUBTREE] Failed to create lock file: {e}") return False backed_off = False - conflict_after = await self._scan_descendants_for_locks(path, transaction_id) + conflict_after = await self._scan_descendants_for_locks(path, owner_id) if not conflict_after: - conflict_after = await self._check_ancestors_for_subtree(path, transaction_id) + conflict_after = await self._check_ancestors_for_subtree(path, owner_id) if conflict_after: their_token = self._read_token(conflict_after) if their_token: - their_tx_id, their_ts, _ = _parse_fencing_token(their_token) + their_owner_id, their_ts, _ = _parse_fencing_token(their_token) my_token = self._read_token(lock_path) _, my_ts, _ = ( _parse_fencing_token(my_token) if my_token else ("", 0, LOCK_TYPE_SUBTREE) ) - if (my_ts, transaction_id) > (their_ts, their_tx_id): + if (my_ts, owner_id) > (their_ts, their_owner_id): logger.debug(f"[SUBTREE] Backing off (livelock guard) on {path}") await self._remove_lock_file(lock_path) backed_off = True @@ -318,14 +314,14 @@ 
async def acquire_subtree( await asyncio.sleep(_POLL_INTERVAL) continue - if not await self._verify_lock_ownership(lock_path, transaction_id): + if not await self._verify_lock_ownership(lock_path, owner_id): logger.debug(f"[SUBTREE] Lock ownership verification failed: {path}") if asyncio.get_running_loop().time() >= deadline: return False await asyncio.sleep(_POLL_INTERVAL) continue - transaction.add_lock(lock_path) + owner.add_lock(lock_path) logger.debug(f"[SUBTREE] Lock acquired: {lock_path}") return True @@ -333,35 +329,35 @@ async def acquire_mv( self, src_path: str, dst_path: str, - transaction: TransactionRecord, + owner: LockOwner, timeout: float = 0.0, src_is_dir: bool = True, ) -> bool: if src_is_dir: - if not await self.acquire_subtree(src_path, transaction, timeout=timeout): + if not await self.acquire_subtree(src_path, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") return False - if not await self.acquire_subtree(dst_path, transaction, timeout=timeout): + if not await self.acquire_subtree(dst_path, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire SUBTREE lock on destination: {dst_path}") - await self.release(transaction) + await self.release(owner) return False else: src_parent = src_path.rsplit("/", 1)[0] if "/" in src_path else src_path - if not await self.acquire_point(src_parent, transaction, timeout=timeout): + if not await self.acquire_point(src_parent, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire POINT lock on source parent: {src_parent}") return False - if not await self.acquire_point(dst_path, transaction, timeout=timeout): + if not await self.acquire_point(dst_path, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire POINT lock on destination: {dst_path}") - await self.release(transaction) + await self.release(owner) return False logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_path}") return True - async def release(self, 
transaction: TransactionRecord) -> None: - lock_count = len(transaction.locks) - for lock_path in reversed(transaction.locks): + async def release(self, owner: LockOwner) -> None: + lock_count = len(owner.locks) + for lock_path in reversed(owner.locks): await self._remove_lock_file(lock_path) - transaction.remove_lock(lock_path) + owner.remove_lock(lock_path) - logger.debug(f"Released {lock_count} locks for transaction {transaction.id}") + logger.debug(f"Released {lock_count} locks for owner {owner.id}") diff --git a/openviking/storage/transaction/redo_log.py b/openviking/storage/transaction/redo_log.py new file mode 100644 index 00000000..80d07dff --- /dev/null +++ b/openviking/storage/transaction/redo_log.py @@ -0,0 +1,76 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Lightweight redo log for crash recovery of session_memory operations.""" + +import json +from typing import Any, Dict, List + +from openviking.pyagfs import AGFSClient +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + +_REDO_ROOT = "/local/_system/redo" + + +class RedoLog: + """Lightweight pending-task marker. + + Write a marker before the operation starts; delete it after success. + On startup, scan for leftover markers and redo. 
+ """ + + def __init__(self, agfs: AGFSClient): + self._agfs = agfs + + def _task_path(self, task_id: str) -> str: + return f"{_REDO_ROOT}/{task_id}/redo.json" + + def _ensure_dirs(self, dir_path: str) -> None: + parts = dir_path.strip("/").split("/") + current = "" + for part in parts: + current = f"{current}/{part}" + try: + self._agfs.mkdir(current) + except Exception: + pass + + def write_pending(self, task_id: str, info: Dict[str, Any]) -> None: + """Write a redo marker before the operation starts.""" + dir_path = f"{_REDO_ROOT}/{task_id}" + self._ensure_dirs(dir_path) + data = json.dumps(info, default=str).encode("utf-8") + self._agfs.write(self._task_path(task_id), data) + + def mark_done(self, task_id: str) -> None: + """Delete the redo marker after a successful operation.""" + try: + self._agfs.rm(f"{_REDO_ROOT}/{task_id}", recursive=True) + except Exception as e: + logger.warning(f"Failed to clean redo marker {task_id}: {e}") + + def list_pending(self) -> List[str]: + """Return all pending task IDs (directories under _REDO_ROOT).""" + try: + entries = self._agfs.ls(_REDO_ROOT) + if not isinstance(entries, list): + return [] + return [ + e["name"] + for e in entries + if isinstance(e, dict) and e.get("isDir") and e.get("name") not in (".", "..") + ] + except Exception: + return [] + + def read(self, task_id: str) -> Dict[str, Any]: + """Read the info dict of a pending task.""" + try: + content = self._agfs.cat(self._task_path(task_id)) + if isinstance(content, bytes): + content = content.decode("utf-8") + return json.loads(content) + except Exception as e: + logger.warning(f"Failed to read redo info for {task_id}: {e}") + return {} diff --git a/openviking/storage/transaction/transaction_manager.py b/openviking/storage/transaction/transaction_manager.py deleted file mode 100644 index e80df09b..00000000 --- a/openviking/storage/transaction/transaction_manager.py +++ /dev/null @@ -1,739 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -""" -Transaction manager for OpenViking. - -Global singleton that manages transaction lifecycle and lock mechanisms. -""" - -import asyncio -import threading -import time -from typing import Any, Dict, List, Optional - -from openviking.pyagfs import AGFSClient -from openviking.storage.transaction.path_lock import PathLock -from openviking.storage.transaction.transaction_record import ( - TransactionRecord, - TransactionStatus, -) -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - -# Global singleton instance -_transaction_manager: Optional["TransactionManager"] = None -_lock = threading.Lock() - - -class TransactionManager: - """Transaction manager for OpenViking. - - Global singleton that manages transaction lifecycle and lock mechanisms. - Responsible for: - - Allocating transaction IDs - - Managing transaction lifecycle (start, commit, rollback) - - Providing transaction lock mechanism interface, preventing deadlocks - - Persisting transaction state to journal for crash recovery - """ - - def __init__( - self, - agfs_client: AGFSClient, - timeout: int = 3600, - max_parallel_locks: int = 8, - lock_timeout: float = 0.0, - lock_expire: float = 300.0, - vector_store: Optional[Any] = None, - ): - """Initialize transaction manager. - - Args: - agfs_client: AGFS client for file system operations - timeout: Transaction timeout in seconds (default: 3600) - max_parallel_locks: Maximum number of parallel lock operations (default: 8) - lock_timeout: Path lock acquisition timeout in seconds. - 0 (default) = fail immediately if locked. - > 0 = wait/retry up to this many seconds. - lock_expire: Stale lock expiry threshold in seconds (default: 300s). - vector_store: Optional vector store for VectorDB rollback operations. 
- """ - from openviking.storage.transaction.journal import TransactionJournal - - self._agfs = agfs_client - self._timeout = timeout - self._max_parallel_locks = max_parallel_locks - self._lock_timeout = lock_timeout - self._vector_store = vector_store - self._path_lock = PathLock(agfs_client, lock_expire=lock_expire) - self._journal = TransactionJournal(agfs_client) - - # Active transactions: {transaction_id: TransactionRecord} - self._transactions: Dict[str, TransactionRecord] = {} - - # Background task for timeout cleanup - self._cleanup_task: Optional[asyncio.Task] = None - self._running = False - - logger.info( - f"TransactionManager initialized (timeout={timeout}s, max_parallel_locks={max_parallel_locks})" - ) - - @property - def journal(self): - return self._journal - - async def start(self) -> None: - """Start transaction manager. - - Starts the background cleanup task and recovers any pending transactions - left from a previous process crash. - """ - if self._running: - logger.debug("TransactionManager already running") - return - - self._running = True - self._cleanup_task = asyncio.create_task(self._cleanup_loop()) - - # Recover any transactions that were interrupted by a previous crash. - # Journal entries are written BEFORE lock acquisition, so every orphan - # lock has a corresponding journal entry that recovery can use to clean it up. - await self._recover_pending_transactions() - - logger.info("TransactionManager started") - - async def stop(self) -> None: - """Stop transaction manager. - - Stops the background cleanup task and releases all resources. 
- """ - if not self._running: - logger.debug("TransactionManager already stopped") - return - - self._running = False - - # Cancel cleanup task - if self._cleanup_task: - self._cleanup_task.cancel() - try: - await self._cleanup_task - except asyncio.CancelledError: - pass - self._cleanup_task = None - - # Release all active transactions' locks - for tx_id in list(self._transactions.keys()): - tx = self._transactions.pop(tx_id, None) - if tx: - await self._path_lock.release(tx) - - logger.info("TransactionManager stopped") - - async def _cleanup_loop(self) -> None: - """Background loop for cleaning up timed-out transactions.""" - while self._running: - try: - await asyncio.sleep(60) # Check every minute - await self._cleanup_timed_out() - except asyncio.CancelledError: - break - except Exception as e: - logger.error(f"Error in cleanup loop: {e}") - - async def _cleanup_timed_out(self) -> None: - """Clean up timed-out transactions.""" - current_time = time.time() - timed_out = [] - - for tx_id, tx in self._transactions.items(): - if current_time - tx.updated_at > self._timeout: - timed_out.append(tx_id) - - for tx_id in timed_out: - logger.warning(f"Transaction timed out: {tx_id}") - await self.rollback(tx_id) - - async def _recover_pending_transactions(self) -> None: - """Recover pending transactions from journal after a crash. - - Reads all journal entries and rolls back any transactions that were - not cleanly committed or rolled back. - """ - try: - pending_ids = self._journal.list_all() - except Exception as e: - logger.warning(f"Failed to list journal entries for recovery: {e}") - return - - if not pending_ids: - return - - logger.info(f"Found {len(pending_ids)} pending transaction(s) to recover") - - for tx_id in pending_ids: - try: - await self._recover_one(tx_id) - except Exception as e: - logger.error(f"Failed to recover transaction {tx_id}: {e}") - - async def _recover_one(self, tx_id: str) -> None: - """Recover a single transaction from journal. 
- - Recovery strategy by status: - COMMITTED + post_actions → replay post_actions (enqueue etc.), then clean up - COMMITTED, no post_actions / RELEASED → just clean up - EXEC / FAIL / RELEASING → rollback completed+partial ops, then clean up - INIT / ACQUIRE → nothing executed yet, just clean up - """ - from openviking.storage.transaction.undo import execute_rollback - - try: - data = self._journal.read(tx_id) - except Exception as e: - logger.warning(f"Cannot read journal for tx {tx_id}: {e}") - return - - tx = TransactionRecord.from_journal(data) - logger.info(f"Recovering transaction {tx_id} (status={tx.status})") - - if tx.status == TransactionStatus.COMMIT: - # Transaction was committed — replay any unfinished post_actions - if tx.post_actions: - logger.info( - f"Replaying {len(tx.post_actions)} post_action(s) for committed tx {tx_id}" - ) - try: - await self._execute_post_actions(tx.post_actions) - except Exception as e: - logger.warning(f"Post-action replay failed for tx {tx_id}: {e}") - elif tx.status in (TransactionStatus.INIT, TransactionStatus.ACQUIRE): - # Transaction never executed any operations — nothing to rollback. - # However, locks may have been created before the journal was updated - # with the actual locks list. Use init_info.lock_paths to find and - # clean up orphan lock files owned by this transaction. - logger.info(f"Transaction {tx_id} never executed, cleaning up orphan locks") - if not tx.locks: - await self._cleanup_orphan_locks_from_init_info(tx_id, tx.init_info) - else: - # EXEC / FAIL / RELEASING: process crashed mid-operation - operation = tx.init_info.get("operation", "") - if operation == "session_memory": - # Redo: re-extract memories from archive and write - try: - await self._redo_session_memory(tx) - except Exception as e: - logger.warning(f"Redo session_memory failed for tx {tx_id}: {e}") - else: - # Default: rollback completed+partial ops - # Pass recover_all=True so partial (completed=False) ops are also reversed, - # e.g. 
a directory mv that started but never finished still leaves residue. - try: - await execute_rollback( - tx.undo_log, - self._agfs, - vector_store=self._vector_store, - recover_all=True, - ) - except Exception as e: - logger.warning(f"Rollback during recovery failed for tx {tx_id}: {e}") - - # Release any lock files still present - await self._path_lock.release(tx) - - # Clean up journal - try: - self._journal.delete(tx_id) - except Exception: - pass - - logger.info(f"Recovered transaction {tx_id}") - - async def _cleanup_orphan_locks_from_init_info( - self, tx_id: str, init_info: Dict[str, Any] - ) -> None: - """Clean up orphan lock files using lock path hints from init_info. - - When a crash occurs between lock creation and journal update, the - journal's ``locks`` list is empty but ``init_info.lock_paths`` records - the paths that were intended to be locked. This method checks those - paths and removes any lock files still owned by this transaction. - """ - from openviking.storage.transaction.path_lock import LOCK_FILE_NAME, _parse_fencing_token - - lock_paths = init_info.get("lock_paths", []) - lock_mode = init_info.get("lock_mode", "point") - mv_dst_path = init_info.get("mv_dst_path") - - # Collect all candidate paths to check - paths_to_check = list(lock_paths) - if lock_mode == "mv" and mv_dst_path: - paths_to_check.append(mv_dst_path) - - for path in paths_to_check: - lock_file = f"{path.rstrip('/')}/{LOCK_FILE_NAME}" - try: - token = self._path_lock._read_token(lock_file) - if token is None: - continue - owner_id, _, _ = _parse_fencing_token(token) - if owner_id == tx_id: - await self._path_lock._remove_lock_file(lock_file) - logger.info(f"Removed orphan lock for tx {tx_id}: {lock_file}") - except Exception as e: - logger.warning(f"Failed to check orphan lock {lock_file}: {e}") - - async def _redo_session_memory(self, tx: TransactionRecord) -> None: - """Redo a session_memory transaction from its archived messages. 
- - On crash during Phase 2 of session commit, we redo memory extraction - from the archive rather than rolling back. - """ - import json - - from openviking.message import Message - from openviking.server.identity import RequestContext, Role - from openviking_cli.session.user_id import UserIdentifier - - archive_uri = tx.init_info.get("archive_uri") - session_uri = tx.init_info.get("session_uri") - account_id = tx.init_info.get("account_id", "default") - user_id = tx.init_info.get("user_id", "default") - agent_id = tx.init_info.get("agent_id", "default") - role_str = tx.init_info.get("role", "root") - - if not archive_uri or not session_uri: - logger.warning("Cannot redo session_memory: missing archive_uri or session_uri") - return - - # 1. Read archived messages from AGFS - messages_path = f"{archive_uri}/messages.jsonl" - try: - agfs_path = messages_path.replace("viking://", "") - content = self._agfs.cat(agfs_path) - if isinstance(content, bytes): - content = content.decode("utf-8") - except Exception as e: - logger.warning(f"Cannot read archive for redo: {messages_path}: {e}") - return - - messages = [] - for line in content.strip().split("\n"): - if line.strip(): - try: - messages.append(Message.from_dict(json.loads(line))) - except Exception: - pass - - if not messages: - logger.warning(f"No messages found in archive for redo: {archive_uri}") - return - - # 2. Build request context for memory extraction - user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) - ctx = RequestContext(user=user, role=Role(role_str), account_id=account_id) - - # 3. Re-extract memories - from openviking.session.compressor import SessionCompressor - - compressor = SessionCompressor() - session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] - memories = await compressor.extract_long_term_memories( - messages=messages, - user=user, - session_id=session_id, - ctx=ctx, - ) - logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") - - # 4. 
Enqueue semantic processing - await self._execute_post_actions( - [ - { - "type": "enqueue_semantic", - "params": { - "uri": session_uri, - "context_type": "memory", - "account_id": account_id, - "user_id": user_id, - "agent_id": agent_id, - "role": role_str, - }, - } - ] - ) - - def create_transaction(self, init_info: Optional[Dict[str, Any]] = None) -> TransactionRecord: - """Create a new transaction. - - Args: - init_info: Transaction initialization information - - Returns: - New transaction record - """ - tx = TransactionRecord(init_info=init_info or {}) - self._transactions[tx.id] = tx - logger.debug(f"Transaction created: {tx.id}") - return tx - - def get_transaction(self, transaction_id: str) -> Optional[TransactionRecord]: - """Get transaction by ID. - - Args: - transaction_id: Transaction ID - - Returns: - Transaction record or None if not found - """ - return self._transactions.get(transaction_id) - - async def begin(self, transaction_id: str) -> bool: - """Begin a transaction. - - Args: - transaction_id: Transaction ID - - Returns: - True if transaction started successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.ACQUIRE) - logger.debug(f"Transaction begun: {transaction_id}") - return True - - async def commit(self, transaction_id: str) -> bool: - """Commit a transaction. - - Executes post-actions, releases all locks, and removes the journal entry. 
- - Args: - transaction_id: Transaction ID - - Returns: - True if transaction committed successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - # Update status to COMMIT - tx.update_status(TransactionStatus.COMMIT) - - # Persist final committed state before releasing - try: - self._journal.update(tx.to_journal()) - except Exception: - pass - - # Execute post-actions (best-effort, errors are logged but don't fail commit) - if tx.post_actions: - await self._execute_post_actions(tx.post_actions) - - # Release all locks - tx.update_status(TransactionStatus.RELEASING) - await self._path_lock.release(tx) - - # Update status to RELEASED - tx.update_status(TransactionStatus.RELEASED) - - # Remove from active transactions - self._transactions.pop(transaction_id, None) - - # Clean up journal entry (last step — lock is already released) - try: - self._journal.delete(transaction_id) - except Exception as e: - logger.warning(f"Failed to delete journal on commit for {transaction_id}: {e}") - - logger.debug(f"Transaction committed: {transaction_id}") - return True - - async def rollback(self, transaction_id: str) -> bool: - """Rollback a transaction. - - Executes undo log entries in reverse order, releases all locks, - and removes the journal entry. 
- - Args: - transaction_id: Transaction ID - - Returns: - True if transaction rolled back successfully, False otherwise - """ - from openviking.storage.transaction.undo import execute_rollback - - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - # Update status to FAIL - tx.update_status(TransactionStatus.FAIL) - - # Persist rollback state - try: - self._journal.update(tx.to_journal()) - except Exception: - pass - - # Execute undo log (best-effort) - if tx.undo_log: - try: - await execute_rollback( - tx.undo_log, - self._agfs, - vector_store=self._vector_store, - ) - except Exception as e: - logger.warning( - f"Undo log execution failed during rollback of {transaction_id}: {e}" - ) - - # Release all locks - tx.update_status(TransactionStatus.RELEASING) - await self._path_lock.release(tx) - - # Update status to RELEASED - tx.update_status(TransactionStatus.RELEASED) - - # Remove from active transactions - self._transactions.pop(transaction_id, None) - - # Clean up journal entry (last step — lock is already released) - try: - self._journal.delete(transaction_id) - except Exception as e: - logger.warning(f"Failed to delete journal on rollback for {transaction_id}: {e}") - - logger.debug(f"Transaction rolled back: {transaction_id}") - return True - - async def _execute_post_actions(self, post_actions: List[Dict[str, Any]]) -> None: - """Execute post-commit actions. - - Post-actions are executed after a successful commit. Errors are logged - but do not affect the commit outcome. 
- - Args: - post_actions: List of post-action dicts with 'type' and 'params' keys - """ - for action in post_actions: - action_type = action.get("type", "") - params = action.get("params", {}) - try: - if action_type == "enqueue_semantic": - await self._post_enqueue_semantic(params) - else: - logger.warning(f"Unknown post-action type: {action_type}") - except Exception as e: - logger.warning(f"Post-action '{action_type}' failed: {e}") - - async def _post_enqueue_semantic(self, params: Dict[str, Any]) -> None: - """Execute enqueue_semantic post-action.""" - from openviking.storage.queuefs import get_queue_manager - from openviking.storage.queuefs.semantic_msg import SemanticMsg - - queue_manager = get_queue_manager() - if queue_manager is None: - logger.debug("No queue manager available, skipping enqueue_semantic post-action") - return - - uri = params.get("uri") - context_type = params.get("context_type", "resource") - account_id = params.get("account_id", "default") - user_id = params.get("user_id", "default") - agent_id = params.get("agent_id", "default") - role = params.get("role", "root") - if not uri: - return - - msg = SemanticMsg( - uri=uri, - context_type=context_type, - account_id=account_id, - user_id=user_id, - agent_id=agent_id, - role=role, - ) - semantic_queue = queue_manager.get_queue(queue_manager.SEMANTIC) - await semantic_queue.enqueue(msg) - - async def acquire_lock_point(self, transaction_id: str, path: str) -> bool: - """Acquire POINT lock for write/semantic-processing operations. 
- - Args: - transaction_id: Transaction ID - path: Directory path to lock - - Returns: - True if lock acquired successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.ACQUIRE) - success = await self._path_lock.acquire_point(path, tx, timeout=self._lock_timeout) - - if success: - tx.update_status(TransactionStatus.EXEC) - else: - tx.update_status(TransactionStatus.FAIL) - - return success - - async def acquire_lock_subtree( - self, transaction_id: str, path: str, timeout: Optional[float] = None - ) -> bool: - """Acquire SUBTREE lock for rm/mv-source operations. - - Args: - transaction_id: Transaction ID - path: Directory path to lock (root of the subtree) - timeout: Maximum time to wait for the lock in seconds (default: from config) - - Returns: - True if lock acquired successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.ACQUIRE) - effective_timeout = timeout if timeout is not None else self._lock_timeout - success = await self._path_lock.acquire_subtree(path, tx, timeout=effective_timeout) - - if success: - tx.update_status(TransactionStatus.EXEC) - else: - tx.update_status(TransactionStatus.FAIL) - - return success - - async def acquire_lock_mv( - self, - transaction_id: str, - src_path: str, - dst_path: str, - timeout: Optional[float] = None, - src_is_dir: bool = True, - ) -> bool: - """Acquire path lock for mv operation. 
- - Args: - transaction_id: Transaction ID - src_path: Source path - dst_path: Destination parent directory path - timeout: Maximum time to wait for each lock in seconds (default: from config) - src_is_dir: Whether the source is a directory - - Returns: - True if lock acquired successfully, False otherwise - """ - tx = self.get_transaction(transaction_id) - if not tx: - logger.error(f"Transaction not found: {transaction_id}") - return False - - tx.update_status(TransactionStatus.ACQUIRE) - effective_timeout = timeout if timeout is not None else self._lock_timeout - success = await self._path_lock.acquire_mv( - src_path, dst_path, tx, timeout=effective_timeout, src_is_dir=src_is_dir - ) - - if success: - tx.update_status(TransactionStatus.EXEC) - else: - tx.update_status(TransactionStatus.FAIL) - - return success - - def get_active_transactions(self) -> Dict[str, TransactionRecord]: - """Get all active transactions. - - Returns: - Dictionary of active transactions {transaction_id: TransactionRecord} - """ - return self._transactions.copy() - - def get_transaction_count(self) -> int: - """Get the number of active transactions. - - Returns: - Number of active transactions - """ - return len(self._transactions) - - -def init_transaction_manager( - agfs: AGFSClient, - tx_timeout: int = 3600, - max_parallel_locks: int = 8, - lock_timeout: float = 0.0, - lock_expire: float = 300.0, - vector_store: Optional[Any] = None, -) -> TransactionManager: - """Initialize transaction manager singleton. - - Args: - agfs: AGFS client instance - tx_timeout: Transaction timeout in seconds (default: 3600) - max_parallel_locks: Maximum number of parallel lock operations (default: 8) - lock_timeout: Path lock acquisition timeout in seconds. - 0 (default) = fail immediately if locked. - > 0 = wait/retry up to this many seconds. - lock_expire: Stale lock expiry threshold in seconds (default: 300s). - vector_store: Optional vector store for VectorDB rollback operations. 
- - Returns: - TransactionManager instance - """ - global _transaction_manager - - with _lock: - if _transaction_manager is not None: - logger.debug("TransactionManager already initialized") - return _transaction_manager - - # Create transaction manager - _transaction_manager = TransactionManager( - agfs_client=agfs, - timeout=tx_timeout, - max_parallel_locks=max_parallel_locks, - lock_timeout=lock_timeout, - lock_expire=lock_expire, - vector_store=vector_store, - ) - - logger.info("TransactionManager initialized as singleton") - return _transaction_manager - - -def get_transaction_manager() -> TransactionManager: - """Get transaction manager singleton.""" - if _transaction_manager is None: - raise RuntimeError( - "TransactionManager not initialized. Call init_transaction_manager() first." - ) - return _transaction_manager - - -def reset_transaction_manager() -> None: - """Reset the transaction manager singleton (for testing). - - This function should ONLY be used in tests to clean up state between tests. - It clears the global singleton instance without performing cleanup - make sure - to call stop() first if the manager is still running. - """ - global _transaction_manager - with _lock: - _transaction_manager = None diff --git a/openviking/storage/transaction/transaction_record.py b/openviking/storage/transaction/transaction_record.py deleted file mode 100644 index b9eb0656..00000000 --- a/openviking/storage/transaction/transaction_record.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Transaction record and status definitions. - -Defines the data structures for tracking transaction lifecycle and state. -""" - -import time -import uuid -from dataclasses import dataclass, field -from enum import Enum -from typing import Any, Dict, List - - -class TransactionStatus(str, Enum): - """Transaction status enumeration. 
- - Status machine: INIT -> ACQUIRE -> EXEC -> COMMIT/FAIL -> RELEASING -> RELEASED - """ - - INIT = "INIT" # Transaction initialized, waiting for lock acquisition - ACQUIRE = "ACQUIRE" # Acquiring lock resources - EXEC = "EXEC" # Transaction operation in progress - COMMIT = "COMMIT" # Transaction completed successfully - FAIL = "FAIL" # Transaction failed - RELEASING = "RELEASING" # Releasing lock resources - RELEASED = "RELEASED" # Lock resources fully released, transaction ended - - def __str__(self) -> str: - return self.value - - -@dataclass -class TransactionRecord: - """Transaction record for tracking transaction lifecycle. - - Attributes: - id: Transaction ID in UUID format, uniquely identifies a transaction - locks: List of lock paths held by this transaction - status: Current transaction status - init_info: Transaction initialization information - rollback_info: Information for rollback operations - undo_log: List of undo entries for rollback - post_actions: Actions to execute after successful commit - created_at: Creation timestamp (Unix timestamp in seconds) - updated_at: Last update timestamp (Unix timestamp in seconds) - """ - - id: str = field(default_factory=lambda: str(uuid.uuid4())) - locks: List[str] = field(default_factory=list) - status: TransactionStatus = field(default=TransactionStatus.INIT) - init_info: Dict[str, Any] = field(default_factory=dict) - rollback_info: Dict[str, Any] = field(default_factory=dict) - undo_log: List[Any] = field(default_factory=list) - post_actions: List[Dict[str, Any]] = field(default_factory=list) - created_at: float = field(default_factory=time.time) - updated_at: float = field(default_factory=time.time) - - def update_status(self, status: TransactionStatus) -> None: - """Update transaction status and timestamp.""" - self.status = status - self.updated_at = time.time() - - def add_lock(self, lock_path: str) -> None: - """Add a lock to the transaction.""" - if lock_path not in self.locks: - 
self.locks.append(lock_path) - self.updated_at = time.time() - - def remove_lock(self, lock_path: str) -> None: - """Remove a lock from the transaction.""" - if lock_path in self.locks: - self.locks.remove(lock_path) - self.updated_at = time.time() - - def to_dict(self) -> Dict[str, Any]: - """Convert transaction record to dictionary.""" - return { - "id": self.id, - "locks": self.locks, - "status": str(self.status), - "init_info": self.init_info, - "rollback_info": self.rollback_info, - "created_at": self.created_at, - "updated_at": self.updated_at, - } - - def to_journal(self) -> Dict[str, Any]: - """Serialize to journal format (includes undo_log and post_actions).""" - from openviking.storage.transaction.undo import UndoEntry - - return { - "id": self.id, - "locks": self.locks, - "status": str(self.status), - "init_info": self.init_info, - "undo_log": [e.to_dict() if isinstance(e, UndoEntry) else e for e in self.undo_log], - "post_actions": self.post_actions, - "created_at": self.created_at, - "updated_at": self.updated_at, - } - - @classmethod - def from_journal(cls, data: Dict[str, Any]) -> "TransactionRecord": - """Restore from journal format.""" - from openviking.storage.transaction.undo import UndoEntry - - status_str = data.get("status", "INIT") - status = TransactionStatus(status_str) if isinstance(status_str, str) else status_str - undo_log = [UndoEntry.from_dict(e) for e in data.get("undo_log", [])] - - return cls( - id=data.get("id", str(uuid.uuid4())), - locks=data.get("locks", []), - status=status, - init_info=data.get("init_info", {}), - rollback_info={}, - undo_log=undo_log, - post_actions=data.get("post_actions", []), - created_at=data.get("created_at", time.time()), - updated_at=data.get("updated_at", time.time()), - ) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "TransactionRecord": - """Create transaction record from dictionary.""" - status_str = data.get("status", "INIT") - status = TransactionStatus(status_str) if 
isinstance(status_str, str) else status_str - - return cls( - id=data.get("id", str(uuid.uuid4())), - locks=data.get("locks", []), - status=status, - init_info=data.get("init_info", {}), - rollback_info=data.get("rollback_info", {}), - created_at=data.get("created_at", time.time()), - updated_at=data.get("updated_at", time.time()), - ) diff --git a/openviking/storage/transaction/undo.py b/openviking/storage/transaction/undo.py deleted file mode 100644 index 0b5b3113..00000000 --- a/openviking/storage/transaction/undo.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -""" -Undo log and rollback executor for transaction management. - -Records operations performed within a transaction so they can be reversed -on rollback. Each UndoEntry captures one atomic sub-operation. -""" - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional - -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - - -def _reconstruct_ctx(params: Dict[str, Any]) -> Optional[Any]: - """Reconstruct a RequestContext from serialized _ctx_* fields in undo params. - - Returns None if the required fields are missing. 
- """ - account_id = params.get("_ctx_account_id") - user_id = params.get("_ctx_user_id") - agent_id = params.get("_ctx_agent_id") - role_value = params.get("_ctx_role") - if account_id is None or user_id is None: - return None - try: - from openviking.server.identity import RequestContext, Role - from openviking_cli.session.user_id import UserIdentifier - - role = Role(role_value) if role_value in {r.value for r in Role} else Role.ROOT - user = UserIdentifier(account_id, user_id, agent_id or "default") - return RequestContext(user=user, role=role) - except Exception as e: - logger.warning(f"[Rollback] Failed to reconstruct ctx: {e}") - return None - - -@dataclass -class UndoEntry: - """A single undo log entry representing one reversible sub-operation. - - Attributes: - sequence: Monotonically increasing index within the transaction. - op_type: Operation type (fs_mv, fs_rm, fs_mkdir, fs_write_new, - vectordb_upsert, vectordb_delete, vectordb_update_uri). - params: Parameters needed to reverse the operation. - completed: Whether the forward operation completed successfully. - """ - - sequence: int - op_type: str - params: Dict[str, Any] = field(default_factory=dict) - completed: bool = False - - def to_dict(self) -> Dict[str, Any]: - return { - "sequence": self.sequence, - "op_type": self.op_type, - "params": self.params, - "completed": self.completed, - } - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "UndoEntry": - return cls( - sequence=data.get("sequence", 0), - op_type=data.get("op_type", ""), - params=data.get("params", {}), - completed=data.get("completed", False), - ) - - -async def execute_rollback( - undo_log: List[UndoEntry], - agfs: Any, - vector_store: Optional[Any] = None, - ctx: Optional[Any] = None, - recover_all: bool = False, -) -> None: - """Execute rollback by reversing operations in reverse order. - - Best-effort: each step is wrapped in try-except so a single failure - does not prevent subsequent undo steps from running. 
- - Args: - undo_log: List of undo entries to process. - agfs: AGFS client for filesystem operations. - vector_store: Optional vector store client. - ctx: Optional request context. - recover_all: If True, also attempt to reverse entries that were not - marked completed (used during crash recovery to clean up partial - operations such as a directory mv that only half-finished). - """ - if recover_all: - entries = list(undo_log) - else: - entries = [e for e in undo_log if e.completed] - entries.sort(key=lambda e: e.sequence, reverse=True) - - for entry in entries: - try: - await _rollback_entry(entry, agfs, vector_store, ctx) - logger.info(f"[Rollback] Reversed {entry.op_type} seq={entry.sequence}") - except Exception as e: - logger.warning( - f"[Rollback] Failed to reverse {entry.op_type} seq={entry.sequence}: {e}" - ) - - -async def _rollback_entry( - entry: UndoEntry, - agfs: Any, - vector_store: Optional[Any], - ctx: Optional[Any], -) -> None: - """Dispatch rollback for a single undo entry.""" - op = entry.op_type - params = entry.params - - if op == "fs_mv": - agfs.mv(params["dst"], params["src"]) - - elif op == "fs_rm": - logger.debug("[Rollback] fs_rm is not reversible, skipping") - - elif op == "fs_mkdir": - try: - agfs.rm(params["uri"]) - except Exception: - pass - - elif op == "fs_write_new": - try: - agfs.rm(params["uri"], recursive=True) - except Exception: - pass - - elif op == "vectordb_upsert": - if vector_store: - record_id = params.get("record_id") - if record_id: - restored_ctx = _reconstruct_ctx(params) - if restored_ctx: - await vector_store.delete([record_id], ctx=restored_ctx) - else: - logger.warning("[Rollback] vectordb_upsert: cannot reconstruct ctx, skipping") - - elif op == "vectordb_delete": - if vector_store: - restored_ctx = _reconstruct_ctx(params) - if restored_ctx is None: - logger.warning("[Rollback] vectordb_delete: cannot reconstruct ctx, skipping") - else: - records_snapshot = params.get("records_snapshot", []) - for record in 
records_snapshot: - try: - await vector_store.upsert(record, ctx=restored_ctx) - except Exception as e: - logger.warning(f"[Rollback] Failed to restore vector record: {e}") - - elif op == "vectordb_update_uri": - if vector_store: - restored_ctx = _reconstruct_ctx(params) - if restored_ctx is None: - logger.warning("[Rollback] vectordb_update_uri: cannot reconstruct ctx, skipping") - else: - await vector_store.update_uri_mapping( - ctx=restored_ctx, - uri=params["new_uri"], - new_uri=params["old_uri"], - new_parent_uri=params.get("old_parent_uri", ""), - ) - - else: - logger.warning(f"[Rollback] Unknown op_type: {op}") diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index dc20acd1..72475573 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -289,17 +289,14 @@ async def rm( This method is idempotent: deleting a non-existent file succeeds after cleaning up any orphan index records. - Wrapped in a transaction: deletes VectorDB records first, then FS files. - On rollback, VectorDB records are restored from snapshot. + Acquires a path lock, deletes VectorDB records, then FS files. 
""" - from openviking.storage.transaction import TransactionContext, get_transaction_manager + from openviking.storage.transaction import LockContext, get_lock_manager self._ensure_access(uri, ctx) path = self._uri_to_path(uri, ctx=ctx) target_uri = self._path_to_uri(path, ctx=ctx) - tx_manager = get_transaction_manager() - # Check existence and determine lock strategy try: stat = self.agfs.stat(path) @@ -320,36 +317,11 @@ async def rm( lock_paths = [parent] lock_mode = "point" - async with TransactionContext(tx_manager, "rm", lock_paths, lock_mode=lock_mode) as tx: - # Collect URIs inside the lock to avoid race conditions + async with LockContext(get_lock_manager(), lock_paths, lock_mode=lock_mode): uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) uris_to_delete.append(target_uri) - - # Snapshot vector records for rollback - records_snapshot = await self._snapshot_vector_records(uris_to_delete, ctx=ctx) - - # Step 1: Delete from VectorDB first - real_ctx = self._ctx_or_default(ctx) - seq_vdb = tx.record_undo( - "vectordb_delete", - { - "uris": uris_to_delete, - "records_snapshot": records_snapshot, - "_ctx_account_id": real_ctx.account_id, - "_ctx_user_id": real_ctx.user.user_id, - "_ctx_agent_id": real_ctx.user.agent_id, - "_ctx_role": real_ctx.role.value, - }, - ) await self._delete_from_vector_store(uris_to_delete, ctx=ctx) - tx.mark_completed(seq_vdb) - - # Step 2: Delete from FS - seq_fs = tx.record_undo("fs_rm", {"uri": path, "recursive": recursive}) result = self.agfs.rm(path, recursive=recursive) - tx.mark_completed(seq_fs) - - await tx.commit() return result async def mv( @@ -361,10 +333,10 @@ async def mv( """Move file/directory + recursively update vector index. Implemented as cp + rm to avoid lock files being carried by FS mv. - On rollback, the copy is deleted and the source remains intact. + On VectorDB update failure the copy is cleaned up so the source stays intact. 
""" from openviking.pyagfs.helpers import cp as agfs_cp - from openviking.storage.transaction import TransactionContext, get_transaction_manager + from openviking.storage.transaction import LockContext, get_lock_manager self._ensure_access(old_uri, ctx) self._ensure_access(new_uri, ctx) @@ -372,8 +344,6 @@ async def mv( new_path = self._uri_to_path(new_uri, ctx=ctx) target_uri = self._path_to_uri(old_path, ctx=ctx) - tx_manager = get_transaction_manager() - # Verify source exists and determine type before locking try: stat = self.agfs.stat(old_path) @@ -383,20 +353,17 @@ async def mv( dst_parent = new_path.rsplit("/", 1)[0] if "/" in new_path else new_path - async with TransactionContext( - tx_manager, - "mv", + async with LockContext( + get_lock_manager(), [old_path], lock_mode="mv", mv_dst_path=dst_parent, src_is_dir=is_dir, - ) as tx: - # Collect URIs inside the lock to avoid race conditions + ): uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) uris_to_move.append(target_uri) - # Step 1: Copy source to destination - seq_cp = tx.record_undo("fs_write_new", {"uri": new_path}) + # Copy source to destination (source still intact) try: agfs_cp(self.agfs, old_path, new_path, recursive=is_dir) except Exception as e: @@ -404,9 +371,8 @@ async def mv( await self._delete_from_vector_store(uris_to_move, ctx=ctx) logger.info(f"[VikingFS] mv source not found, cleaned orphan index: {old_uri}") raise - tx.mark_completed(seq_cp) - # Step 2: Remove carried lock file from the copy (directory only) + # Remove carried lock file from the copy (directory only) if is_dir: carried_lock = new_path.rstrip("/") + "/.path.ovlock" try: @@ -414,34 +380,18 @@ async def mv( except Exception: pass - # Step 3: Update VectorDB URIs - old_uri_stripped = old_uri.rstrip("/") - old_parent_uri = ( - old_uri_stripped.rsplit("/", 1)[0] + "/" if "/" in old_uri_stripped else "" - ) - real_ctx = self._ctx_or_default(ctx) - seq_vdb = tx.record_undo( - "vectordb_update_uri", - { - 
"old_uri": old_uri, - "new_uri": new_uri, - "old_parent_uri": old_parent_uri, - "uris": uris_to_move, - "_ctx_account_id": real_ctx.account_id, - "_ctx_user_id": real_ctx.user.user_id, - "_ctx_agent_id": real_ctx.user.agent_id, - "_ctx_role": real_ctx.role.value, - }, - ) - await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) - tx.mark_completed(seq_vdb) + # Update VectorDB URIs (on failure, clean up the copy) + try: + await self._update_vector_store_uris(uris_to_move, old_uri, new_uri, ctx=ctx) + except Exception: + try: + self.agfs.rm(new_path, recursive=is_dir) + except Exception: + pass + raise - # Step 4: Remove source (lock file gets deleted along with it) - seq_rm = tx.record_undo("fs_rm", {"uri": old_path, "recursive": is_dir}) + # Delete source self.agfs.rm(old_path, recursive=is_dir) - tx.mark_completed(seq_rm) - - await tx.commit() return {} async def grep( @@ -1131,7 +1081,7 @@ def _is_accessible(self, uri: str, ctx: RequestContext) -> bool: return True scope = parts[0] - if scope in {"resources", "temp", "transactions"}: + if scope in {"resources", "temp"}: return True if scope == "_system": return False @@ -1206,33 +1156,6 @@ def _infer_context_type(self, uri: str): # ========== Vector Sync Helper Methods ========== - async def _snapshot_vector_records( - self, uris: List[str], ctx: Optional[RequestContext] = None - ) -> List[Dict[str, Any]]: - """Snapshot vector records for the given URIs (for rollback). - - Queries VectorDB metadata (without embedding vectors) so that - records can be restored during rollback. 
- """ - vector_store = self._get_vector_store() - if not vector_store: - return [] - - real_ctx = self._ctx_or_default(ctx) - snapshots = [] - for uri in uris: - try: - records = await vector_store.get_context_by_uri( - uri=uri, - limit=10, - ctx=real_ctx, - ) - if records: - snapshots.extend(records) - except Exception as e: - logger.debug(f"[VikingFS] Failed to snapshot vector record for {uri}: {e}") - return snapshots - async def _collect_uris( self, path: str, recursive: bool, ctx: Optional[RequestContext] = None ) -> List[str]: diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index d6c44194..42ca8752 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -218,11 +218,8 @@ async def process_resource( viking_fs = get_viking_fs() target_exists = await viking_fs.exists(root_uri, ctx=ctx) if not target_exists: - # 第一次添加:事务保护下将 temp 移到 final - from openviking.storage.transaction import ( - TransactionContext, - get_transaction_manager, - ) + # 第一次添加:锁保护下将 temp 移到 final + from openviking.storage.transaction import LockContext, get_lock_manager dst_path = viking_fs._uri_to_path(root_uri, ctx=ctx) parent_path = dst_path.rsplit("/", 1)[0] if "/" in dst_path else dst_path @@ -232,17 +229,9 @@ async def process_resource( if parent_uri: await viking_fs.mkdir(parent_uri, exist_ok=True, ctx=ctx) - async with TransactionContext( - get_transaction_manager(), - "finalize_from_temp", - [parent_path], - lock_mode="point", - ) as tx: - seq = tx.record_undo("fs_write_new", {"uri": dst_path}) + async with LockContext(get_lock_manager(), [parent_path], lock_mode="point"): src_path = viking_fs._uri_to_path(temp_uri, ctx=ctx) await asyncio.to_thread(viking_fs.agfs.mv, src_path, dst_path) - tx.mark_completed(seq) - await tx.commit() # 清理 temp 根目录 try: diff --git a/openviking_cli/utils/config/transaction_config.py b/openviking_cli/utils/config/transaction_config.py index fac8c2aa..86d153f8 100644 --- 
a/openviking_cli/utils/config/transaction_config.py +++ b/openviking_cli/utils/config/transaction_config.py @@ -29,9 +29,4 @@ class TransactionConfig(BaseModel): ), ) - max_parallel_locks: int = Field( - default=8, - description="Maximum parallel lock operations during recursive rm/mv.", - ) - model_config = {"extra": "forbid"} diff --git a/tests/agfs/test_fs_binding.py b/tests/agfs/test_fs_binding.py index e55ff6fd..3e76ee8f 100644 --- a/tests/agfs/test_fs_binding.py +++ b/tests/agfs/test_fs_binding.py @@ -13,7 +13,7 @@ import pytest -from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -33,15 +33,15 @@ async def viking_fs_binding_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize TransactionManager and VikingFS with client - init_transaction_manager(agfs=agfs_client) + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) # make sure default/temp directory exists await vfs.mkdir("viking://temp/", exist_ok=True) yield vfs - reset_transaction_manager() + reset_lock_manager() @pytest.mark.asyncio diff --git a/tests/agfs/test_fs_binding_s3.py b/tests/agfs/test_fs_binding_s3.py index aa7a753b..802d4f6d 100644 --- a/tests/agfs/test_fs_binding_s3.py +++ b/tests/agfs/test_fs_binding_s3.py @@ -13,7 +13,7 @@ import pytest -from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -58,13 +58,13 @@ async def viking_fs_binding_s3_instance(): # Create AGFS client agfs_client = 
create_agfs_client(AGFS_CONF) - # Initialize TransactionManager and VikingFS with client - init_transaction_manager(agfs=agfs_client) + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) yield vfs - reset_transaction_manager() + reset_lock_manager() @pytest.mark.asyncio diff --git a/tests/agfs/test_fs_local.py b/tests/agfs/test_fs_local.py index 9e59f610..41ef0730 100644 --- a/tests/agfs/test_fs_local.py +++ b/tests/agfs/test_fs_local.py @@ -10,7 +10,7 @@ import pytest from openviking.agfs_manager import AGFSManager -from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ -40,15 +40,15 @@ async def viking_fs_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize TransactionManager and VikingFS with client - init_transaction_manager(agfs=agfs_client) + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) # make sure default/temp directory exists await vfs.mkdir("viking://temp/", exist_ok=True) yield vfs - reset_transaction_manager() + reset_lock_manager() # AGFSManager.stop is synchronous manager.stop() diff --git a/tests/agfs/test_fs_s3.py b/tests/agfs/test_fs_s3.py index 67a54e40..00504fad 100644 --- a/tests/agfs/test_fs_s3.py +++ b/tests/agfs/test_fs_s3.py @@ -13,7 +13,7 @@ import pytest from openviking.agfs_manager import AGFSManager -from openviking.storage.transaction import init_transaction_manager, reset_transaction_manager +from openviking.storage.transaction import init_lock_manager, reset_lock_manager from openviking.storage.viking_fs import VikingFS, init_viking_fs from openviking_cli.utils.config.agfs_config import AGFSConfig @@ 
-83,13 +83,13 @@ async def viking_fs_instance(): # Create AGFS client agfs_client = create_agfs_client(AGFS_CONF) - # Initialize TransactionManager and VikingFS with client - init_transaction_manager(agfs=agfs_client) + # Initialize LockManager and VikingFS with client + init_lock_manager(agfs=agfs_client) vfs = init_viking_fs(agfs=agfs_client) yield vfs - reset_transaction_manager() + reset_lock_manager() # AGFSManager.stop is synchronous manager.stop() diff --git a/tests/server/conftest.py b/tests/server/conftest.py index 78dbb63e..98cf606f 100644 --- a/tests/server/conftest.py +++ b/tests/server/conftest.py @@ -20,7 +20,7 @@ from openviking.server.config import ServerConfig from openviking.server.identity import RequestContext, Role from openviking.service.core import OpenVikingService -from openviking.storage.transaction import reset_transaction_manager +from openviking.storage.transaction import reset_lock_manager from openviking_cli.session.user_id import UserIdentifier from openviking_cli.utils.config.embedding_config import EmbeddingConfig from openviking_cli.utils.config.vlm_config import VLMConfig @@ -110,7 +110,7 @@ def sample_markdown_file(temp_dir: Path) -> Path: @pytest_asyncio.fixture(scope="function") async def service(temp_dir: Path, monkeypatch): """Create and initialize an OpenVikingService in embedded mode.""" - reset_transaction_manager() + reset_lock_manager() fake_embedder_cls = _install_fake_embedder(monkeypatch) _install_fake_vlm(monkeypatch) svc = OpenVikingService( @@ -120,7 +120,7 @@ async def service(temp_dir: Path, monkeypatch): svc.viking_fs.query_embedder = fake_embedder_cls() yield svc await svc.close() - reset_transaction_manager() + reset_lock_manager() @pytest_asyncio.fixture(scope="function") @@ -165,7 +165,7 @@ async def client_with_resource(client, service, sample_markdown_file): async def running_server(temp_dir: Path, monkeypatch): """Start a real uvicorn server in a background thread.""" await AsyncOpenViking.reset() - 
reset_transaction_manager() + reset_lock_manager() fake_embedder_cls = _install_fake_embedder(monkeypatch) _install_fake_vlm(monkeypatch) diff --git a/tests/storage/test_semantic_dag_skip_files.py b/tests/storage/test_semantic_dag_skip_files.py index 6fdf30ee..3eaeaa2f 100644 --- a/tests/storage/test_semantic_dag_skip_files.py +++ b/tests/storage/test_semantic_dag_skip_files.py @@ -11,19 +11,18 @@ def _mock_transaction_layer(monkeypatch): - """Patch transaction layer to no-op for DAG tests.""" - mock_tx = MagicMock() - mock_tx.commit = AsyncMock() + """Patch lock layer to no-op for DAG tests.""" + mock_handle = MagicMock() monkeypatch.setattr( - "openviking.storage.transaction.context_manager.TransactionContext.__aenter__", - AsyncMock(return_value=mock_tx), + "openviking.storage.transaction.lock_context.LockContext.__aenter__", + AsyncMock(return_value=mock_handle), ) monkeypatch.setattr( - "openviking.storage.transaction.context_manager.TransactionContext.__aexit__", + "openviking.storage.transaction.lock_context.LockContext.__aexit__", AsyncMock(return_value=False), ) monkeypatch.setattr( - "openviking.storage.transaction.get_transaction_manager", + "openviking.storage.transaction.get_lock_manager", lambda: MagicMock(), ) @@ -58,6 +57,9 @@ async def _generate_overview(self, dir_uri, file_summaries, children_abstracts): def _extract_abstract_from_overview(self, overview): return "abstract" + def _enforce_size_limits(self, overview, abstract): + return overview, abstract + async def _vectorize_directory( self, uri, context_type, abstract, overview, ctx=None, semantic_msg_id=None ): diff --git a/tests/storage/test_semantic_dag_stats.py b/tests/storage/test_semantic_dag_stats.py index 23dde041..94f9441f 100644 --- a/tests/storage/test_semantic_dag_stats.py +++ b/tests/storage/test_semantic_dag_stats.py @@ -40,6 +40,9 @@ async def _generate_overview(self, dir_uri, file_summaries, children_abstracts): def _extract_abstract_from_overview(self, overview): return 
"abstract" + def _enforce_size_limits(self, overview, abstract): + return overview, abstract + async def _vectorize_directory( self, uri, context_type, abstract, overview, ctx=None, semantic_msg_id=None ): @@ -79,19 +82,18 @@ async def test_semantic_dag_stats_collects_nodes(monkeypatch): lambda: _DummyTracker(), ) - # Mock transaction layer: TransactionContext as no-op passthrough - mock_tx = MagicMock() - mock_tx.commit = AsyncMock() + # Mock lock layer: LockContext as no-op passthrough + mock_handle = MagicMock() monkeypatch.setattr( - "openviking.storage.transaction.context_manager.TransactionContext.__aenter__", - AsyncMock(return_value=mock_tx), + "openviking.storage.transaction.lock_context.LockContext.__aenter__", + AsyncMock(return_value=mock_handle), ) monkeypatch.setattr( - "openviking.storage.transaction.context_manager.TransactionContext.__aexit__", + "openviking.storage.transaction.lock_context.LockContext.__aexit__", AsyncMock(return_value=False), ) monkeypatch.setattr( - "openviking.storage.transaction.get_transaction_manager", + "openviking.storage.transaction.get_lock_manager", lambda: MagicMock(), ) diff --git a/tests/transaction/conftest.py b/tests/transaction/conftest.py index 05fac402..a0952289 100644 --- a/tests/transaction/conftest.py +++ b/tests/transaction/conftest.py @@ -11,9 +11,9 @@ from openviking.agfs_manager import AGFSManager from openviking.server.identity import RequestContext, Role from openviking.storage.collection_schemas import CollectionSchemas -from openviking.storage.transaction.journal import TransactionJournal +from openviking.storage.transaction.lock_manager import LockManager from openviking.storage.transaction.path_lock import LOCK_FILE_NAME, _make_fencing_token -from openviking.storage.transaction.transaction_manager import TransactionManager +from openviking.storage.transaction.redo_log import RedoLog from openviking.storage.viking_vector_index_backend import VikingVectorIndexBackend from openviking.utils.agfs_utils 
import create_agfs_client from openviking_cli.session.user_id import UserIdentifier @@ -55,7 +55,6 @@ def _mkdir_ok(agfs_client, path): @pytest.fixture def test_dir(agfs_client): - """每个测试独享隔离目录,自动清理。""" path = f"/local/tx-tests/{uuid.uuid4().hex}" _mkdir_ok(agfs_client, "/local") _mkdir_ok(agfs_client, "/local/tx-tests") @@ -102,20 +101,20 @@ def request_ctx(): # --------------------------------------------------------------------------- -# Transaction fixtures +# Lock fixtures # --------------------------------------------------------------------------- @pytest.fixture -def tx_manager(agfs_client, vector_store): - """Function-scoped TransactionManager with real backends.""" - return TransactionManager(agfs_client=agfs_client, vector_store=vector_store) +def lock_manager(agfs_client): + """Function-scoped LockManager with real AGFS backend.""" + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) @pytest.fixture -def journal(agfs_client): - """Function-scoped TransactionJournal with real AGFS backend.""" - return TransactionJournal(agfs_client) +def redo_log(agfs_client): + """Function-scoped RedoLog with real AGFS backend.""" + return RedoLog(agfs_client) # --------------------------------------------------------------------------- diff --git a/tests/transaction/test_concurrent_lock.py b/tests/transaction/test_concurrent_lock.py index e98279e4..7e25ab57 100644 --- a/tests/transaction/test_concurrent_lock.py +++ b/tests/transaction/test_concurrent_lock.py @@ -5,8 +5,8 @@ import asyncio import uuid +from openviking.storage.transaction.lock_handle import LockHandle from openviking.storage.transaction.path_lock import PathLock -from openviking.storage.transaction.transaction_record import TransactionRecord class TestConcurrentLock: @@ -17,7 +17,7 @@ async def test_point_mutual_exclusion_same_path(self, agfs_client, test_dir): results = {} async def holder(tx_id): - tx = TransactionRecord(id=tx_id) + tx = LockHandle(id=tx_id) ok = await 
lock.acquire_point(test_dir, tx, timeout=5.0) if ok: await asyncio.sleep(0.3) @@ -45,7 +45,7 @@ async def test_subtree_blocks_concurrent_point_child(self, agfs_client, test_dir child_result = {} async def parent_holder(): - tx = TransactionRecord(id="tx-sub-parent") + tx = LockHandle(id="tx-sub-parent") ok = await lock.acquire_subtree(test_dir, tx, timeout=5.0) assert ok is True parent_acquired.set() @@ -55,7 +55,7 @@ async def parent_holder(): async def child_worker(): await parent_acquired.wait() - tx = TransactionRecord(id="tx-sub-child") + tx = LockHandle(id="tx-sub-child") ok = await lock.acquire_point(child, tx, timeout=5.0) child_result["ok"] = ok child_result["after_release"] = parent_released.is_set() @@ -80,7 +80,7 @@ async def test_point_child_blocks_concurrent_subtree_parent(self, agfs_client, t parent_result = {} async def child_holder(): - tx = TransactionRecord(id="tx-rev-child") + tx = LockHandle(id="tx-rev-child") ok = await lock.acquire_point(child, tx, timeout=5.0) assert ok is True child_acquired.set() @@ -90,7 +90,7 @@ async def child_holder(): async def parent_worker(): await child_acquired.wait() - tx = TransactionRecord(id="tx-rev-parent") + tx = LockHandle(id="tx-rev-parent") ok = await lock.acquire_subtree(test_dir, tx, timeout=5.0) parent_result["ok"] = ok parent_result["after_release"] = child_released.is_set() diff --git a/tests/transaction/test_context_manager.py b/tests/transaction/test_context_manager.py deleted file mode 100644 index bf077bf9..00000000 --- a/tests/transaction/test_context_manager.py +++ /dev/null @@ -1,226 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -"""Tests for TransactionContext.""" - -from unittest.mock import AsyncMock, MagicMock - -import pytest - -from openviking.storage.errors import LockAcquisitionError -from openviking.storage.transaction.context_manager import TransactionContext -from openviking.storage.transaction.transaction_record import TransactionRecord, TransactionStatus - - -def _make_tx_manager(lock_succeeds=True): - """Create a mock TransactionManager with async methods.""" - tx_manager = MagicMock() - record = TransactionRecord(id="tx-test", status=TransactionStatus.INIT) - - tx_manager.create_transaction.return_value = record - tx_manager.acquire_lock_point = AsyncMock(return_value=lock_succeeds) - tx_manager.acquire_lock_subtree = AsyncMock(return_value=lock_succeeds) - tx_manager.acquire_lock_mv = AsyncMock(return_value=lock_succeeds) - tx_manager.commit = AsyncMock(return_value=True) - tx_manager.rollback = AsyncMock(return_value=True) - - journal = MagicMock() - tx_manager.journal = journal - - return tx_manager, record - - -class TestTransactionContextNormal: - async def test_commit_success(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "test_op", ["/path"]) as tx: - seq = tx.record_undo("fs_write_new", {"uri": "/path/file"}) - tx.mark_completed(seq) - await tx.commit() - - tx_manager.commit.assert_called_once_with("tx-test") - tx_manager.rollback.assert_not_called() - - async def test_rollback_on_exception(self): - tx_manager, record = _make_tx_manager() - - with pytest.raises(ValueError): - async with TransactionContext(tx_manager, "test_op", ["/path"]) as tx: - seq = tx.record_undo("fs_write_new", {"uri": "/path/file"}) - tx.mark_completed(seq) - raise ValueError("something went wrong") - - tx_manager.rollback.assert_called_once_with("tx-test") - tx_manager.commit.assert_not_called() - - async def test_rollback_on_no_commit(self): - tx_manager, record = _make_tx_manager() - - async with 
TransactionContext(tx_manager, "test_op", ["/path"]) as tx: - tx.record_undo("fs_write_new", {"uri": "/path/file"}) - # Forgot to call tx.commit() - - tx_manager.rollback.assert_called_once_with("tx-test") - - async def test_lock_failure_raises(self): - tx_manager, record = _make_tx_manager(lock_succeeds=False) - - with pytest.raises(LockAcquisitionError): - async with TransactionContext(tx_manager, "test_op", ["/path"]) as _tx: - pass - - -class TestTransactionContextLockModes: - async def test_subtree_lock_mode(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "rm_op", ["/path"], lock_mode="subtree") as tx: - await tx.commit() - - tx_manager.acquire_lock_subtree.assert_called_once() - - async def test_mv_lock_mode(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext( - tx_manager, "mv_op", ["/src"], lock_mode="mv", mv_dst_path="/dst" - ) as tx: - await tx.commit() - - tx_manager.acquire_lock_mv.assert_called_once_with( - "tx-test", "/src", "/dst", src_is_dir=True - ) - - async def test_point_lock_mode(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "write_op", ["/path"], lock_mode="point") as tx: - await tx.commit() - - tx_manager.acquire_lock_point.assert_called_once() - - -class TestTransactionContextUndoLog: - async def test_undo_entries_tracked(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "test", ["/path"]) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": "/a"}) - s1 = tx.record_undo("fs_write_new", {"uri": "/a/f.txt"}) - tx.mark_completed(s0) - tx.mark_completed(s1) - await tx.commit() - - assert len(record.undo_log) == 2 - assert record.undo_log[0].completed is True - assert record.undo_log[1].completed is True - - -class TestTransactionContextPostActions: - async def test_post_actions_added(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, 
"test", ["/path"]) as tx: - tx.add_post_action("enqueue_semantic", {"uri": "viking://test"}) - await tx.commit() - - assert len(record.post_actions) == 1 - assert record.post_actions[0]["type"] == "enqueue_semantic" - - -class TestTransactionContextEdgeCases: - async def test_commit_failure_raises_transaction_error(self): - """When TransactionManager.commit() returns False, TransactionError is raised.""" - from openviking.storage.errors import TransactionError - - tx_manager, record = _make_tx_manager() - tx_manager.commit = AsyncMock(return_value=False) - - with pytest.raises(TransactionError, match="Failed to commit"): - async with TransactionContext(tx_manager, "test", ["/path"]) as tx: - await tx.commit() - - async def test_mv_mode_missing_dst_raises(self): - """mv lock mode without mv_dst_path raises TransactionError.""" - from openviking.storage.errors import TransactionError - - tx_manager, record = _make_tx_manager() - - with pytest.raises(TransactionError, match="mv lock mode requires"): - async with TransactionContext( - tx_manager, "mv_op", ["/src"], lock_mode="mv", mv_dst_path=None - ) as _tx: - pass - - async def test_mark_completed_nonexistent_sequence_is_noop(self): - """mark_completed with a sequence not in undo_log doesn't crash.""" - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "test", ["/path"]) as tx: - seq = tx.record_undo("fs_mkdir", {"uri": "/a"}) - tx.mark_completed(999) # Nonexistent sequence - # Original entry should remain unmarked - assert record.undo_log[0].completed is False - tx.mark_completed(seq) - assert record.undo_log[0].completed is True - await tx.commit() - - async def test_journal_update_failure_does_not_break_transaction(self): - """Journal update failures during record_undo/mark_completed are silently ignored.""" - tx_manager, record = _make_tx_manager() - tx_manager.journal.update.side_effect = Exception("disk full") - - # Should not raise despite journal failures - async with 
TransactionContext(tx_manager, "test", ["/path"]) as tx: - seq = tx.record_undo("fs_mkdir", {"uri": "/a"}) - tx.mark_completed(seq) - await tx.commit() - - assert len(record.undo_log) == 1 - assert record.undo_log[0].completed is True - - async def test_record_property_before_enter_raises(self): - """Accessing tx.record before __aenter__ raises TransactionError.""" - from openviking.storage.errors import TransactionError - - tx_manager, _ = _make_tx_manager() - ctx = TransactionContext(tx_manager, "test", ["/path"]) - - with pytest.raises(TransactionError, match="Transaction not started"): - _ = ctx.record - - async def test_multiple_undo_entries_sequence_increments(self): - tx_manager, record = _make_tx_manager() - - async with TransactionContext(tx_manager, "test", ["/path"]) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": "/a"}) - s1 = tx.record_undo("fs_write_new", {"uri": "/a/f"}) - s2 = tx.record_undo("fs_mv", {"src": "/a", "dst": "/b"}) - assert s0 == 0 - assert s1 == 1 - assert s2 == 2 - await tx.commit() - - async def test_multiple_lock_paths_point_mode(self): - """Multiple lock_paths in point mode: each path gets acquire_lock_point called.""" - tx_manager, record = _make_tx_manager() - - async with TransactionContext( - tx_manager, "multi", ["/path1", "/path2"], lock_mode="point" - ) as tx: - await tx.commit() - - assert tx_manager.acquire_lock_point.call_count == 2 - - async def test_subtree_multiple_paths_stops_on_first_failure(self): - """If acquiring subtree lock on first path fails, second path is not attempted.""" - tx_manager, record = _make_tx_manager(lock_succeeds=False) - - with pytest.raises(LockAcquisitionError): - async with TransactionContext( - tx_manager, "rm", ["/path1", "/path2"], lock_mode="subtree" - ) as _tx: - pass - - # Only called once (failed on first path) - assert tx_manager.acquire_lock_subtree.call_count == 1 diff --git a/tests/transaction/test_crash_recovery.py b/tests/transaction/test_crash_recovery.py deleted file mode 
100644 index 21569edd..00000000 --- a/tests/transaction/test_crash_recovery.py +++ /dev/null @@ -1,561 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -"""Integration test: crash recovery from journal using real AGFS and VectorDB backends.""" - -import uuid -from unittest.mock import AsyncMock, patch - -from openviking.storage.transaction.journal import TransactionJournal -from openviking.storage.transaction.transaction_manager import TransactionManager -from openviking.storage.transaction.transaction_record import ( - TransactionRecord, - TransactionStatus, -) -from openviking.storage.transaction.undo import UndoEntry - -from .conftest import VECTOR_DIM, _mkdir_ok, file_exists, make_lock_file - - -def _write_journal(journal, record): - """Write a TransactionRecord to real journal storage.""" - journal.write(record.to_journal()) - - -class TestCrashRecovery: - """ - Core technique: simulate crash recovery. - - 1. Create real FS state via agfs_client - 2. Build TransactionRecord, write to real journal - 3. Create fresh TransactionManager (simulates process restart) - 4. Call manager._recover_pending_transactions() - 5. 
Verify final state via agfs_client.stat()/cat() and vector_store.get() - """ - - async def test_recover_commit_no_rollback(self, agfs_client, vector_store, test_dir): - """COMMIT status → committed files NOT rolled back, journal cleaned up.""" - # Create a file that was part of a committed transaction - committed_file = f"{test_dir}/committed.txt" - agfs_client.write(committed_file, b"committed data") - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-commit-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.COMMIT, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_write_new", - params={"uri": committed_file}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - # New manager (simulates restart) - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - # File should still exist (no rollback for committed tx) - assert file_exists(agfs_client, committed_file) - # Journal should be cleaned up - assert tx_id not in journal.list_all() - - async def test_recover_commit_replays_post_actions(self, agfs_client, vector_store, test_dir): - """COMMIT + post_actions → replay post_actions.""" - journal = TransactionJournal(agfs_client) - tx_id = f"tx-post-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.COMMIT, - locks=[], - undo_log=[], - post_actions=[ - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://test-post", - "context_type": "resource", - "account_id": "acc", - }, - } - ], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - - with patch.object(manager, "_execute_post_actions", new_callable=AsyncMock) as mock_post: - await manager._recover_pending_transactions() - - mock_post.assert_called_once() - assert tx_id not in journal.list_all() - - async def 
test_recover_exec_rollback_fs_mv(self, agfs_client, vector_store, test_dir): - """EXEC status with fs_mv → recovery rolls back → file moved back.""" - src = f"{test_dir}/exec-mv-src" - dst = f"{test_dir}/exec-mv-dst" - _mkdir_ok(agfs_client, src) - agfs_client.write(f"{src}/data.txt", b"mv-data") - - # Simulate: forward mv happened, then crash - agfs_client.mv(src, dst) - assert not file_exists(agfs_client, src) - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-exec-mv-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mv", - params={"src": src, "dst": dst}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert file_exists(agfs_client, src) - assert not file_exists(agfs_client, dst) - assert tx_id not in journal.list_all() - - async def test_recover_exec_rollback_fs_mkdir(self, agfs_client, vector_store, test_dir): - """EXEC with fs_mkdir → recovery → directory removed.""" - new_dir = f"{test_dir}/exec-mkdir" - _mkdir_ok(agfs_client, new_dir) - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-exec-mkdir-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": new_dir}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, new_dir) - assert tx_id not in journal.list_all() - - async def test_recover_exec_rollback_fs_write_new(self, agfs_client, vector_store, test_dir): - """EXEC with fs_write_new → recovery → file removed.""" - 
file_path = f"{test_dir}/exec-write.txt" - agfs_client.write(file_path, b"to-be-rolled-back") - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-exec-write-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_write_new", - params={"uri": file_path}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, file_path) - assert tx_id not in journal.list_all() - - async def test_recover_exec_rollback_vectordb_upsert( - self, agfs_client, vector_store, request_ctx, test_dir - ): - """EXEC with vectordb_upsert → recovery → record deleted from VectorDB.""" - record_id = str(uuid.uuid4()) - record = { - "id": record_id, - "uri": f"viking://resources/crash-upsert-{record_id}.md", - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.5] * VECTOR_DIM, - "name": "crash-upsert", - "description": "test", - "abstract": "test", - } - await vector_store.upsert(record, ctx=request_ctx) - assert len(await vector_store.get([record_id], ctx=request_ctx)) == 1 - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-exec-vdb-{uuid.uuid4().hex[:8]}" - tx_record = TransactionRecord( - id=tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="vectordb_upsert", - params={ - "record_id": record_id, - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, tx_record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - results = await vector_store.get([record_id], 
ctx=request_ctx) - assert len(results) == 0 - assert tx_id not in journal.list_all() - - async def test_recover_fail_triggers_rollback(self, agfs_client, vector_store, test_dir): - """FAIL status → also triggers rollback.""" - new_dir = f"{test_dir}/fail-dir" - _mkdir_ok(agfs_client, new_dir) - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-fail-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.FAIL, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": new_dir}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, new_dir) - assert tx_id not in journal.list_all() - - async def test_recover_releasing_triggers_rollback(self, agfs_client, vector_store, test_dir): - """RELEASING status → rollback + lock cleanup.""" - new_dir = f"{test_dir}/releasing-dir" - _mkdir_ok(agfs_client, new_dir) - - lock_path = make_lock_file(agfs_client, test_dir, "tx-releasing-placeholder", "S") - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-releasing-{uuid.uuid4().hex[:8]}" - # Rewrite lock with correct tx_id - lock_path = make_lock_file(agfs_client, test_dir, tx_id, "S") - - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.RELEASING, - locks=[lock_path], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": new_dir}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, new_dir) - assert not file_exists(agfs_client, lock_path) - assert tx_id not in journal.list_all() - - async def test_recover_exec_includes_incomplete(self, agfs_client, 
vector_store, test_dir): - """EXEC recovery uses recover_all=True → also reverses incomplete entries.""" - new_dir = f"{test_dir}/exec-incomplete" - _mkdir_ok(agfs_client, new_dir) - - journal = TransactionJournal(agfs_client) - tx_id = f"tx-exec-inc-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": new_dir}, - completed=False, # incomplete, but recover_all=True reverses it - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, new_dir) - assert tx_id not in journal.list_all() - - async def test_recover_init_cleans_locks(self, agfs_client, vector_store, test_dir): - """INIT status → no rollback, just lock cleanup + journal delete.""" - lock_dir = f"{test_dir}/init-lock-dir" - _mkdir_ok(agfs_client, lock_dir) - - tx_id = f"tx-init-{uuid.uuid4().hex[:8]}" - lock_path = make_lock_file(agfs_client, lock_dir, tx_id, "P") - - journal = TransactionJournal(agfs_client) - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.INIT, - locks=[lock_path], - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, lock_path) - assert tx_id not in journal.list_all() - - async def test_recover_acquire_cleans_locks(self, agfs_client, vector_store, test_dir): - """ACQUIRE status → same as INIT, clean up only.""" - lock_dir = f"{test_dir}/acquire-lock-dir" - _mkdir_ok(agfs_client, lock_dir) - - tx_id = f"tx-acq-{uuid.uuid4().hex[:8]}" - lock_path = make_lock_file(agfs_client, lock_dir, tx_id, "P") - - journal = TransactionJournal(agfs_client) - record = 
TransactionRecord( - id=tx_id, - status=TransactionStatus.ACQUIRE, - locks=[lock_path], - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, lock_path) - assert tx_id not in journal.list_all() - - async def test_recover_init_orphan_lock_via_init_info( - self, agfs_client, vector_store, test_dir - ): - """INIT with empty locks but init_info.lock_paths → clean orphan lock owned by tx.""" - orphan_dir = f"{test_dir}/orphan-dir" - _mkdir_ok(agfs_client, orphan_dir) - - tx_id = f"tx-orphan-{uuid.uuid4().hex[:8]}" - lock_path = make_lock_file(agfs_client, orphan_dir, tx_id, "S") - - journal = TransactionJournal(agfs_client) - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.INIT, - locks=[], # Empty — crash happened before journal recorded locks - init_info={ - "operation": "rm", - "lock_paths": [orphan_dir], - "lock_mode": "subtree", - }, - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, lock_path) - assert tx_id not in journal.list_all() - - async def test_recover_init_orphan_lock_other_owner(self, agfs_client, vector_store, test_dir): - """INIT with orphan lock owned by different tx → not removed.""" - orphan_dir = f"{test_dir}/orphan-other" - _mkdir_ok(agfs_client, orphan_dir) - - other_tx_id = f"tx-OTHER-{uuid.uuid4().hex[:8]}" - lock_path = make_lock_file(agfs_client, orphan_dir, other_tx_id, "S") - - tx_id = f"tx-innocent-{uuid.uuid4().hex[:8]}" - journal = TransactionJournal(agfs_client) - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.INIT, - locks=[], - init_info={ - "operation": "rm", - "lock_paths": [orphan_dir], - "lock_mode": "subtree", - 
}, - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - # Lock file should still exist — owned by different tx - assert file_exists(agfs_client, lock_path) - assert tx_id not in journal.list_all() - - async def test_recover_mv_orphan_both_paths(self, agfs_client, vector_store, test_dir): - """INIT mv operation → check both lock_paths and mv_dst_path for orphan locks.""" - src_dir = f"{test_dir}/mv-orphan-src" - dst_dir = f"{test_dir}/mv-orphan-dst" - _mkdir_ok(agfs_client, src_dir) - _mkdir_ok(agfs_client, dst_dir) - - tx_id = f"tx-mv-orphan-{uuid.uuid4().hex[:8]}" - src_lock = make_lock_file(agfs_client, src_dir, tx_id, "S") - dst_lock = make_lock_file(agfs_client, dst_dir, tx_id, "P") - - journal = TransactionJournal(agfs_client) - record = TransactionRecord( - id=tx_id, - status=TransactionStatus.INIT, - locks=[], - init_info={ - "operation": "mv", - "lock_paths": [src_dir], - "lock_mode": "mv", - "mv_dst_path": dst_dir, - }, - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - # Both orphan locks should be cleaned up - assert not file_exists(agfs_client, src_lock) - assert not file_exists(agfs_client, dst_lock) - assert tx_id not in journal.list_all() - - async def test_recover_multiple_transactions(self, agfs_client, vector_store, test_dir): - """Multiple journal entries are all recovered.""" - dir_a = f"{test_dir}/multi-tx-a" - _mkdir_ok(agfs_client, dir_a) - - journal = TransactionJournal(agfs_client) - - # tx-a: EXEC with mkdir → should rollback - tx_a = f"tx-multi-a-{uuid.uuid4().hex[:8]}" - record_a = TransactionRecord( - id=tx_a, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - 
params={"uri": dir_a}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record_a) - - # tx-b: COMMIT → no rollback, just cleanup - tx_b = f"tx-multi-b-{uuid.uuid4().hex[:8]}" - record_b = TransactionRecord( - id=tx_b, - status=TransactionStatus.COMMIT, - locks=[], - undo_log=[], - post_actions=[], - ) - _write_journal(journal, record_b) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - assert not file_exists(agfs_client, dir_a) # rolled back - assert tx_a not in journal.list_all() - assert tx_b not in journal.list_all() - - async def test_recover_corrupted_journal_skips(self, agfs_client, vector_store, test_dir): - """Corrupted journal entry → skipped, others still processed.""" - journal = TransactionJournal(agfs_client) - - # Write a corrupted journal entry (invalid JSON) - bad_tx_id = f"tx-bad-{uuid.uuid4().hex[:8]}" - _mkdir_ok(agfs_client, "/local/_system") - _mkdir_ok(agfs_client, "/local/_system/transactions") - bad_dir = f"/local/_system/transactions/{bad_tx_id}" - _mkdir_ok(agfs_client, bad_dir) - agfs_client.write(f"{bad_dir}/journal.json", b"NOT VALID JSON {{{{") - - # Write a good journal entry - good_dir = f"{test_dir}/good-recovery" - _mkdir_ok(agfs_client, good_dir) - - good_tx_id = f"tx-good-{uuid.uuid4().hex[:8]}" - record = TransactionRecord( - id=good_tx_id, - status=TransactionStatus.EXEC, - locks=[], - undo_log=[ - UndoEntry( - sequence=0, - op_type="fs_mkdir", - params={"uri": good_dir}, - completed=True, - ) - ], - post_actions=[], - ) - _write_journal(journal, record) - - manager = TransactionManager(agfs_client=agfs_client, vector_store=vector_store) - await manager._recover_pending_transactions() - - # Good tx should still be recovered - assert not file_exists(agfs_client, good_dir) - assert good_tx_id not in journal.list_all() diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py index d7b850c4..1c79414d 
100644 --- a/tests/transaction/test_e2e.py +++ b/tests/transaction/test_e2e.py @@ -1,437 +1,125 @@ # Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. # SPDX-License-Identifier: Apache-2.0 -"""End-to-end transaction tests using real AGFS backend. +"""End-to-end lock tests using real AGFS backend. -These tests exercise the full stack: TransactionContext → TransactionManager → -PathLock → Journal → AGFS, verifying the complete acquire → operate → commit/rollback -→ release → journal cleanup lifecycle. +These tests exercise LockContext -> LockManager -> PathLock -> AGFS, +verifying the acquire -> operate -> release lifecycle. """ import uuid import pytest -from openviking.storage.transaction.context_manager import TransactionContext -from openviking.storage.transaction.journal import TransactionJournal +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.lock_context import LockContext +from openviking.storage.transaction.lock_manager import LockManager from openviking.storage.transaction.path_lock import LOCK_FILE_NAME -from openviking.storage.transaction.transaction_manager import TransactionManager -@pytest.fixture -def tx_manager(agfs_client): - """Create a real TransactionManager backed by the test AGFS.""" - manager = TransactionManager( - agfs_client=agfs_client, - timeout=3600, - max_parallel_locks=8, - lock_timeout=1.0, - lock_expire=1.0, - ) - return manager - - -class TestE2ECommit: - async def test_full_commit_lifecycle(self, agfs_client, tx_manager, test_dir): - """Full lifecycle: context enter → record undo → commit → locks released → journal cleaned.""" - async with TransactionContext( - tx_manager, "test_write", [test_dir], lock_mode="point" - ) as tx: - # Lock should be acquired - lock_path = f"{test_dir}/{LOCK_FILE_NAME}" - token = agfs_client.cat(lock_path) - assert token is not None +def _lock_file_gone(agfs_client, lock_path: str) -> bool: + """Return True if the lock file does not exist in 
AGFS.""" + try: + agfs_client.stat(lock_path) + return False + except Exception: + return True - # Record some operations - seq = tx.record_undo("fs_write_new", {"uri": f"{test_dir}/file.txt"}) - agfs_client.write(f"{test_dir}/file.txt", b"hello") - tx.mark_completed(seq) - # Add post action - tx.add_post_action( - "enqueue_semantic", - {"uri": "viking://test", "context_type": "resource", "account_id": "default"}, - ) +@pytest.fixture +def lock_manager(agfs_client): + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) - await tx.commit() - # After commit: lock should be released - try: - agfs_client.cat(lock_path) - raise AssertionError("Lock file should be gone after commit") - except Exception: - pass # Expected +class TestLockContextCommit: + async def test_lock_acquired_and_released(self, agfs_client, lock_manager, test_dir): + """Lock is held inside the context and released after exit.""" + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + async with LockContext(lock_manager, [test_dir], lock_mode="point"): + token = agfs_client.cat(lock_path) + assert token is not None - # Transaction should be removed from manager - assert tx_manager.get_transaction(tx.record.id) is None + assert _lock_file_gone(agfs_client, lock_path) - async def test_commit_file_persists(self, agfs_client, tx_manager, test_dir): - """Files written inside a committed transaction persist.""" + async def test_file_persists_after_context(self, agfs_client, lock_manager, test_dir): + """Files written inside a lock context persist.""" file_path = f"{test_dir}/committed-file.txt" - async with TransactionContext(tx_manager, "write_op", [test_dir], lock_mode="point") as tx: - seq = tx.record_undo("fs_write_new", {"uri": file_path}) + async with LockContext(lock_manager, [test_dir], lock_mode="point"): agfs_client.write(file_path, b"committed data") - tx.mark_completed(seq) - await tx.commit() content = agfs_client.cat(file_path) assert content == b"committed data" -class 
TestE2ERollback: - async def test_explicit_exception_triggers_rollback(self, agfs_client, tx_manager, test_dir): - """Exception inside context → auto-rollback → undo operations reversed.""" - new_dir = f"{test_dir}/to-be-rolled-back-{uuid.uuid4().hex}" +class TestLockContextException: + async def test_lock_released_on_exception(self, agfs_client, lock_manager, test_dir): + """Lock is released even when an exception occurs inside the context.""" + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "failing_op", [test_dir], lock_mode="point" - ) as tx: - seq = tx.record_undo("fs_mkdir", {"uri": new_dir}) - agfs_client.mkdir(new_dir) - tx.mark_completed(seq) - + async with LockContext(lock_manager, [test_dir], lock_mode="point"): + token = agfs_client.cat(lock_path) + assert token is not None raise RuntimeError("simulated failure") - # Directory should be removed by rollback - try: - agfs_client.stat(new_dir) - raise AssertionError("Directory should be removed by rollback") - except Exception: - pass + assert _lock_file_gone(agfs_client, lock_path) - # Lock should be released - lock_path = f"{test_dir}/{LOCK_FILE_NAME}" - try: - agfs_client.cat(lock_path) - raise AssertionError("Lock should be released after rollback") - except Exception: - pass - - async def test_no_commit_triggers_rollback(self, agfs_client, tx_manager, test_dir): - """Exiting context without calling commit() triggers auto-rollback.""" - new_dir = f"{test_dir}/forgot-commit-{uuid.uuid4().hex}" - - async with TransactionContext(tx_manager, "no_commit", [test_dir], lock_mode="point") as tx: - seq = tx.record_undo("fs_mkdir", {"uri": new_dir}) - agfs_client.mkdir(new_dir) - tx.mark_completed(seq) - # Intentionally not calling tx.commit() - - # Directory should be removed by rollback - try: - agfs_client.stat(new_dir) - raise AssertionError("Directory should be removed by rollback") - except Exception: - pass - - -class 
TestE2EMvLock: - async def test_mv_lock_acquires_both_paths(self, agfs_client, tx_manager, test_dir): + async def test_exception_not_swallowed(self, agfs_client, lock_manager, test_dir): + """Exceptions propagate through the context manager.""" + with pytest.raises(ValueError, match="test error"): + async with LockContext(lock_manager, [test_dir], lock_mode="point"): + raise ValueError("test error") + + +class TestLockContextMv: + async def test_mv_lock_acquires_both_paths(self, agfs_client, lock_manager, test_dir): """mv lock mode acquires SUBTREE on both source and destination.""" src = f"{test_dir}/mv-src-{uuid.uuid4().hex}" dst = f"{test_dir}/mv-dst-{uuid.uuid4().hex}" agfs_client.mkdir(src) agfs_client.mkdir(dst) - async with TransactionContext( - tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst - ) as tx: - # Both lock files should exist + async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): src_token = agfs_client.cat(f"{src}/{LOCK_FILE_NAME}") dst_token = agfs_client.cat(f"{dst}/{LOCK_FILE_NAME}") src_token_str = src_token.decode("utf-8") if isinstance(src_token, bytes) else src_token dst_token_str = dst_token.decode("utf-8") if isinstance(dst_token, bytes) else dst_token + assert ":S" in src_token_str + assert ":S" in dst_token_str - assert ":S" in src_token_str # SUBTREE on source - assert ":S" in dst_token_str # SUBTREE on destination - - await tx.commit() - - # Both locks released for path in [f"{src}/{LOCK_FILE_NAME}", f"{dst}/{LOCK_FILE_NAME}"]: - try: - agfs_client.cat(path) - raise AssertionError(f"Lock {path} should be gone") - except Exception: - pass + assert _lock_file_gone(agfs_client, path) -class TestE2ESubtreeRollback: - async def test_subtree_lock_with_rollback(self, agfs_client, tx_manager, test_dir): - """Subtree lock + rollback: undo is executed and lock released.""" - target = f"{test_dir}/sub-rb-{uuid.uuid4().hex}" +class TestLockContextSubtree: + async def test_subtree_lock_and_release(self, 
agfs_client, lock_manager, test_dir): + """Subtree lock is acquired and released.""" + target = f"{test_dir}/sub-{uuid.uuid4().hex}" agfs_client.mkdir(target) - child = f"{target}/child-{uuid.uuid4().hex}" - - with pytest.raises(ValueError): - async with TransactionContext(tx_manager, "rm_op", [target], lock_mode="subtree") as tx: - seq = tx.record_undo("fs_mkdir", {"uri": child}) - agfs_client.mkdir(child) - tx.mark_completed(seq) - - raise ValueError("abort rm") - - # Child dir should be removed by rollback - try: - agfs_client.stat(child) - raise AssertionError("Child should be cleaned up") - except Exception: - pass - - # Lock released - try: - agfs_client.cat(f"{target}/{LOCK_FILE_NAME}") - raise AssertionError("Lock should be released") - except Exception: - pass - - -class TestE2EJournalCleanup: - async def test_journal_cleaned_after_commit(self, agfs_client, tx_manager, test_dir): - """After successful commit, the journal entry for the transaction is deleted.""" - journal = TransactionJournal(agfs_client) - - async with TransactionContext( - tx_manager, "journal_test", [test_dir], lock_mode="point" - ) as tx: - tx_id = tx.record.id - await tx.commit() - - # Journal should be cleaned up - all_ids = journal.list_all() - assert tx_id not in all_ids - - async def test_journal_cleaned_after_rollback(self, agfs_client, tx_manager, test_dir): - """After rollback, the journal entry is also cleaned up.""" - journal = TransactionJournal(agfs_client) - - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "journal_rb", [test_dir], lock_mode="point" - ) as tx: - tx_id = tx.record.id - raise RuntimeError("force rollback") - - all_ids = journal.list_all() - assert tx_id not in all_ids - - -class TestE2EMvRollback: - async def test_mv_rollback_moves_file_back(self, agfs_client, tx_manager, test_dir): - """mv commit 前失败 → 文件被移回原位。""" - src = f"{test_dir}/mv-rb-src-{uuid.uuid4().hex}" - dst_parent = f"{test_dir}/mv-rb-dst-{uuid.uuid4().hex}" - 
agfs_client.mkdir(src) - agfs_client.mkdir(dst_parent) - - # Write a file inside src - agfs_client.write(f"{src}/data.txt", b"important") - - dst = f"{dst_parent}/moved" - - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst_parent - ) as tx: - seq = tx.record_undo("fs_mv", {"src": src, "dst": dst}) - agfs_client.mv(src, dst) - tx.mark_completed(seq) - - raise RuntimeError("abort after mv") - - # src should be restored (mv reversed: dst → src) - content = agfs_client.cat(f"{src}/data.txt") - assert content == b"important" - - # dst should no longer exist - try: - agfs_client.stat(dst) - raise AssertionError("dst should not exist after rollback") - except Exception: - pass - - async def test_mv_commit_persists(self, agfs_client, tx_manager, test_dir): - """mv commit 成功 → 文件在新位置,旧位置不存在。""" - src = f"{test_dir}/mv-ok-src-{uuid.uuid4().hex}" - dst_parent = f"{test_dir}/mv-ok-dst-{uuid.uuid4().hex}" - agfs_client.mkdir(src) - agfs_client.mkdir(dst_parent) - agfs_client.write(f"{src}/data.txt", b"moved-data") - - dst = f"{dst_parent}/moved" - - async with TransactionContext( - tx_manager, "mv_op", [src], lock_mode="mv", mv_dst_path=dst_parent - ) as tx: - seq = tx.record_undo("fs_mv", {"src": src, "dst": dst}) - agfs_client.mv(src, dst) - tx.mark_completed(seq) - await tx.commit() - - # File at new location - content = agfs_client.cat(f"{dst}/data.txt") - assert content == b"moved-data" - - # Old location gone - try: - agfs_client.stat(src) - raise AssertionError("src should not exist after committed mv") - except Exception: - pass - - -class TestE2EMultiStepRollback: - async def test_multi_step_rollback_reverses_all(self, agfs_client, tx_manager, test_dir): - """多步操作(mkdir + write + mkdir),中间失败 → 全部反序回滚。 - - 执行顺序:seq0 mkdir /a → seq1 write /a/f.txt → seq2 mkdir /a/sub - 在 seq2 完成后抛异常。 - 回滚顺序:seq2 rm /a/sub → seq1 rm /a/f.txt → seq0 rm /a - """ - dir_a = f"{test_dir}/multi-a-{uuid.uuid4().hex}" 
- file_f = f"{dir_a}/f.txt" - dir_sub = f"{dir_a}/sub" + async with LockContext(lock_manager, [target], lock_mode="subtree"): + token = agfs_client.cat(f"{target}/{LOCK_FILE_NAME}") + token_str = token.decode("utf-8") if isinstance(token, bytes) else token + assert ":S" in token_str - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "multi_step", [test_dir], lock_mode="point" - ) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) - agfs_client.mkdir(dir_a) - tx.mark_completed(s0) - - s1 = tx.record_undo("fs_write_new", {"uri": file_f}) - agfs_client.write(file_f, b"content") - tx.mark_completed(s1) - - s2 = tx.record_undo("fs_mkdir", {"uri": dir_sub}) - agfs_client.mkdir(dir_sub) - tx.mark_completed(s2) - - raise RuntimeError("abort after all steps") - - # Everything should be cleaned up in reverse order - for path in [dir_sub, file_f, dir_a]: - try: - agfs_client.stat(path) - raise AssertionError(f"{path} should not exist after rollback") - except Exception: - pass + assert _lock_file_gone(agfs_client, f"{target}/{LOCK_FILE_NAME}") - async def test_partial_step_rollback(self, agfs_client, tx_manager, test_dir): - """两步操作,第二步执行到一半崩溃(未 mark_completed)→ 只回滚第一步。 - - seq0 mkdir (completed=True) → seq1 write (completed=False,异常在 mark 前抛出) - 回滚只处理 seq0。 - """ - dir_a = f"{test_dir}/partial-{uuid.uuid4().hex}" - file_f = f"{dir_a}/f.txt" - - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "partial", [test_dir], lock_mode="point" - ) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) - agfs_client.mkdir(dir_a) - tx.mark_completed(s0) - - _s1 = tx.record_undo("fs_write_new", {"uri": file_f}) - agfs_client.write(file_f, b"half-done") - # NOT calling tx.mark_completed(s1) — simulates crash mid-operation - raise RuntimeError("crash before marking s1 completed") - - # dir_a (seq0, completed) should be rolled back - try: - agfs_client.stat(dir_a) - raise AssertionError("dir_a should be rolled back") - 
except Exception: - pass - - # file_f was written but undo entry not marked completed → not rolled back by normal mode - # However, file_f is inside dir_a which was removed, so it's gone too - - async def test_rollback_order_matters_nested_dirs(self, agfs_client, tx_manager, test_dir): - """嵌套目录回滚顺序:必须先删子目录再删父目录。 - - seq0 mkdir /parent → seq1 mkdir /parent/child - 回滚必须 seq1 (rm child) → seq0 (rm parent),否则 parent 非空删除失败。 - """ - parent = f"{test_dir}/nested-parent-{uuid.uuid4().hex}" - child = f"{parent}/child" - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "nested", [test_dir], lock_mode="point" - ) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": parent}) - agfs_client.mkdir(parent) - tx.mark_completed(s0) - - s1 = tx.record_undo("fs_mkdir", {"uri": child}) - agfs_client.mkdir(child) - tx.mark_completed(s1) - - raise RuntimeError("abort nested") - - # Both gone (child first, then parent) - for path in [child, parent]: - try: - agfs_client.stat(path) - raise AssertionError(f"{path} should not exist") - except Exception: - pass - - async def test_rollback_failure_best_effort_continues(self, agfs_client, tx_manager, test_dir): - """回滚中某步失败,后续步骤仍然执行(best-effort)。 - - seq0 mkdir /a → seq1 mkdir /b - 手动删除 /b(模拟回滚 seq1 时目标已不存在),seq0 的回滚仍应执行。 - """ - dir_a = f"{test_dir}/be-a-{uuid.uuid4().hex}" - dir_b = f"{test_dir}/be-b-{uuid.uuid4().hex}" - - with pytest.raises(RuntimeError): - async with TransactionContext( - tx_manager, "best_effort", [test_dir], lock_mode="point" - ) as tx: - s0 = tx.record_undo("fs_mkdir", {"uri": dir_a}) - agfs_client.mkdir(dir_a) - tx.mark_completed(s0) - - s1 = tx.record_undo("fs_mkdir", {"uri": dir_b}) - agfs_client.mkdir(dir_b) - tx.mark_completed(s1) - - # Manually remove dir_b before rollback — simulates external interference - agfs_client.rm(dir_b) - - raise RuntimeError("abort") - - # dir_b removal during rollback "fails" (already gone), but dir_a should still be rolled back - try: - 
agfs_client.stat(dir_a) - raise AssertionError("dir_a should be rolled back despite dir_b failure") - except Exception: - pass - - -class TestE2ESequentialTransactions: - async def test_sequential_transactions_on_same_path(self, agfs_client, tx_manager, test_dir): - """Two sequential transactions on the same path both succeed.""" +class TestSequentialLocks: + async def test_sequential_locks_on_same_path(self, agfs_client, lock_manager, test_dir): + """Multiple sequential lock contexts on the same path succeed.""" for i in range(3): - async with TransactionContext( - tx_manager, f"seq_{i}", [test_dir], lock_mode="point" - ) as tx: - seq = tx.record_undo("fs_write_new", {"uri": f"{test_dir}/f{i}.txt"}) + async with LockContext(lock_manager, [test_dir], lock_mode="point"): agfs_client.write(f"{test_dir}/f{i}.txt", f"data-{i}".encode()) - tx.mark_completed(seq) - await tx.commit() - # All files should exist for i in range(3): content = agfs_client.cat(f"{test_dir}/f{i}.txt") assert content == f"data-{i}".encode() - assert tx_manager.get_transaction_count() == 0 + async def test_lock_acquisition_failure(self, agfs_client, lock_manager, test_dir): + """LockContext raises LockAcquisitionError for nonexistent path.""" + nonexistent = f"{test_dir}/nonexistent-{uuid.uuid4().hex}" + with pytest.raises(LockAcquisitionError): + async with LockContext(lock_manager, [nonexistent], lock_mode="point"): + pass diff --git a/tests/transaction/test_journal.py b/tests/transaction/test_journal.py deleted file mode 100644 index 57f1e483..00000000 --- a/tests/transaction/test_journal.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -"""Tests for transaction journal.""" - -import json -import uuid -from unittest.mock import MagicMock - -from openviking.storage.transaction.journal import TransactionJournal - - -class TestTransactionJournal: - def _make_journal(self) -> tuple: - agfs = MagicMock() - journal = TransactionJournal(agfs) - return journal, agfs - - def test_write_calls_agfs_write_with_correct_data(self): - journal, agfs = self._make_journal() - data = {"id": "tx-1", "status": "INIT", "locks": []} - - journal.write(data) - - # Should call agfs.write with the journal path and serialized data - agfs.write.assert_called_once() - path, payload = agfs.write.call_args[0] - assert "tx-1" in path - assert path.endswith("journal.json") - parsed = json.loads(payload.decode("utf-8")) - assert parsed["id"] == "tx-1" - assert parsed["status"] == "INIT" - - def test_write_ensures_directories_exist(self): - journal, agfs = self._make_journal() - data = {"id": "tx-1", "status": "INIT", "locks": []} - - journal.write(data) - - # Should call mkdir at least once (for parent dirs) - assert agfs.mkdir.called - - def test_update_overwrites(self): - journal, agfs = self._make_journal() - data = {"id": "tx-2", "status": "EXEC", "locks": []} - - journal.update(data) - - agfs.write.assert_called_once() - path, payload = agfs.write.call_args[0] - assert json.loads(payload.decode("utf-8"))["status"] == "EXEC" - - def test_read_parses_json(self): - journal, agfs = self._make_journal() - agfs.cat.return_value = json.dumps({"id": "tx-3", "status": "EXEC"}).encode("utf-8") - - result = journal.read("tx-3") - assert result["id"] == "tx-3" - assert result["status"] == "EXEC" - - def test_read_handles_string_response(self): - """Some AGFS backends may return str instead of bytes.""" - journal, agfs = self._make_journal() - agfs.cat.return_value = json.dumps({"id": "tx-str", "status": "INIT"}) - - result = journal.read("tx-str") - assert result["id"] == "tx-str" - - def 
test_delete_removes_directory(self): - journal, agfs = self._make_journal() - journal.delete("tx-4") - agfs.rm.assert_called_once() - path = agfs.rm.call_args[0][0] - assert "tx-4" in path - - def test_list_all_returns_tx_ids(self): - journal, agfs = self._make_journal() - agfs.ls.return_value = [ - {"name": "tx-a", "isDir": True}, - {"name": "tx-b", "isDir": True}, - {"name": ".", "isDir": True}, - ] - - result = journal.list_all() - assert "tx-a" in result - assert "tx-b" in result - assert "." not in result - - def test_list_all_filters_dotdot(self): - journal, agfs = self._make_journal() - agfs.ls.return_value = [ - {"name": "..", "isDir": True}, - {"name": "tx-real", "isDir": True}, - ] - - result = journal.list_all() - assert ".." not in result - assert "tx-real" in result - - def test_list_all_empty_on_error(self): - journal, agfs = self._make_journal() - agfs.ls.side_effect = Exception("not found") - - result = journal.list_all() - assert result == [] - - def test_delete_tolerates_missing(self): - journal, agfs = self._make_journal() - agfs.rm.side_effect = Exception("not found") - # Should not raise - journal.delete("tx-missing") - - def test_write_with_post_actions(self): - journal, agfs = self._make_journal() - data = { - "id": "tx-5", - "status": "COMMIT", - "locks": [], - "post_actions": [ - {"type": "enqueue_semantic", "params": {"uri": "viking://test"}}, - ], - } - journal.write(data) - path, payload = agfs.write.call_args[0] - parsed = json.loads(payload.decode("utf-8")) - assert len(parsed["post_actions"]) == 1 - assert parsed["post_actions"][0]["type"] == "enqueue_semantic" - - def test_write_with_undo_log(self): - journal, agfs = self._make_journal() - data = { - "id": "tx-6", - "status": "EXEC", - "locks": [], - "undo_log": [ - { - "sequence": 0, - "op_type": "fs_mv", - "params": {"src": "/a", "dst": "/b"}, - "completed": True, - }, - ], - } - journal.write(data) - _, payload = agfs.write.call_args[0] - parsed = 
json.loads(payload.decode("utf-8")) - assert len(parsed["undo_log"]) == 1 - assert parsed["undo_log"][0]["op_type"] == "fs_mv" - - -class TestTransactionJournalIntegration: - """Integration tests using real AGFS backend to verify persistence behavior.""" - - def test_write_read_roundtrip(self, agfs_client): - journal = TransactionJournal(agfs_client) - tx_id = f"tx-int-{uuid.uuid4().hex}" - data = {"id": tx_id, "status": "INIT", "locks": [], "undo_log": []} - - journal.write(data) - result = journal.read(tx_id) - - assert result["id"] == tx_id - assert result["status"] == "INIT" - - journal.delete(tx_id) - - def test_update_overwrites(self, agfs_client): - journal = TransactionJournal(agfs_client) - tx_id = f"tx-int-{uuid.uuid4().hex}" - - journal.write({"id": tx_id, "status": "INIT", "locks": []}) - journal.update({"id": tx_id, "status": "EXEC", "locks": []}) - - result = journal.read(tx_id) - assert result["status"] == "EXEC" - - journal.delete(tx_id) - - def test_delete_removes_journal(self, agfs_client): - journal = TransactionJournal(agfs_client) - tx_id = f"tx-int-{uuid.uuid4().hex}" - - journal.write({"id": tx_id, "status": "INIT", "locks": []}) - journal.delete(tx_id) - - try: - journal.read(tx_id) - raise AssertionError("Should have raised after deletion") - except Exception: - pass # Expected - - def test_list_all_returns_written_ids(self, agfs_client): - journal = TransactionJournal(agfs_client) - tx_id_a = f"tx-int-{uuid.uuid4().hex}" - tx_id_b = f"tx-int-{uuid.uuid4().hex}" - - journal.write({"id": tx_id_a, "status": "INIT", "locks": []}) - journal.write({"id": tx_id_b, "status": "INIT", "locks": []}) - - result = journal.list_all() - assert tx_id_a in result - assert tx_id_b in result - - journal.delete(tx_id_a) - journal.delete(tx_id_b) - - def test_list_all_empty_when_none(self, agfs_client): - """After cleanup, list_all should not include previously deleted entries.""" - journal = TransactionJournal(agfs_client) - tx_id = 
f"tx-int-{uuid.uuid4().hex}" - - journal.write({"id": tx_id, "status": "INIT", "locks": []}) - journal.delete(tx_id) - - result = journal.list_all() - assert tx_id not in result diff --git a/tests/transaction/test_lock_context.py b/tests/transaction/test_lock_context.py new file mode 100644 index 00000000..37fcb89c --- /dev/null +++ b/tests/transaction/test_lock_context.py @@ -0,0 +1,85 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for LockContext async context manager.""" + +import uuid + +import pytest + +from openviking.storage.errors import LockAcquisitionError +from openviking.storage.transaction.lock_context import LockContext +from openviking.storage.transaction.lock_manager import LockManager +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME + + +def _lock_file_gone(agfs_client, lock_path: str) -> bool: + try: + agfs_client.stat(lock_path) + return False + except Exception: + return True + + +@pytest.fixture +def lm(agfs_client): + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) + + +class TestLockContextPoint: + async def test_point_lock_lifecycle(self, agfs_client, lm, test_dir): + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + async with LockContext(lm, [test_dir], lock_mode="point") as handle: + assert handle is not None + token = agfs_client.cat(lock_path) + assert token is not None + + assert _lock_file_gone(agfs_client, lock_path) + + async def test_lock_released_on_exception(self, agfs_client, lm, test_dir): + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + + with pytest.raises(RuntimeError): + async with LockContext(lm, [test_dir], lock_mode="point"): + assert agfs_client.cat(lock_path) is not None + raise RuntimeError("fail") + + assert _lock_file_gone(agfs_client, lock_path) + + async def test_exception_propagates(self, lm, test_dir): + with pytest.raises(ValueError, match="test"): + async with LockContext(lm, [test_dir], 
lock_mode="point"): + raise ValueError("test") + + +class TestLockContextSubtree: + async def test_subtree_lock(self, agfs_client, lm, test_dir): + async with LockContext(lm, [test_dir], lock_mode="subtree"): + token = agfs_client.cat(f"{test_dir}/{LOCK_FILE_NAME}") + token_str = token.decode("utf-8") if isinstance(token, bytes) else token + assert ":S" in token_str + + +class TestLockContextMv: + async def test_mv_lock(self, agfs_client, lm, test_dir): + src = f"{test_dir}/src-{uuid.uuid4().hex}" + dst = f"{test_dir}/dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + async with LockContext(lm, [src], lock_mode="mv", mv_dst_path=dst) as handle: + assert len(handle.locks) == 2 + + +class TestLockContextFailure: + async def test_nonexistent_path_raises(self, lm): + with pytest.raises(LockAcquisitionError): + async with LockContext(lm, ["/local/nonexistent-xyz"], lock_mode="point"): + pass + + async def test_handle_cleaned_up_on_failure(self, lm): + with pytest.raises(LockAcquisitionError): + async with LockContext(lm, ["/local/nonexistent-xyz"], lock_mode="point"): + pass + + assert len(lm.get_active_handles()) == 0 diff --git a/tests/transaction/test_lock_manager.py b/tests/transaction/test_lock_manager.py new file mode 100644 index 00000000..e30f724b --- /dev/null +++ b/tests/transaction/test_lock_manager.py @@ -0,0 +1,88 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for LockManager.""" + +import uuid + +import pytest + +from openviking.storage.transaction.lock_manager import LockManager +from openviking.storage.transaction.path_lock import LOCK_FILE_NAME + + +def _lock_file_gone(agfs_client, lock_path: str) -> bool: + try: + agfs_client.stat(lock_path) + return False + except Exception: + return True + + +@pytest.fixture +def lm(agfs_client): + return LockManager(agfs=agfs_client, lock_timeout=1.0, lock_expire=1.0) + + +class TestLockManagerBasic: + async def test_create_handle_and_acquire_point(self, agfs_client, lm, test_dir): + handle = lm.create_handle() + ok = await lm.acquire_point(handle, test_dir) + assert ok is True + + lock_path = f"{test_dir}/{LOCK_FILE_NAME}" + content = agfs_client.cat(lock_path) + assert content is not None + + await lm.release(handle) + assert _lock_file_gone(agfs_client, lock_path) + + async def test_acquire_subtree(self, agfs_client, lm, test_dir): + handle = lm.create_handle() + ok = await lm.acquire_subtree(handle, test_dir) + assert ok is True + + token = agfs_client.cat(f"{test_dir}/{LOCK_FILE_NAME}") + token_str = token.decode("utf-8") if isinstance(token, bytes) else token + assert ":S" in token_str + + await lm.release(handle) + + async def test_acquire_mv(self, agfs_client, lm, test_dir): + src = f"{test_dir}/mv-src-{uuid.uuid4().hex}" + dst = f"{test_dir}/mv-dst-{uuid.uuid4().hex}" + agfs_client.mkdir(src) + agfs_client.mkdir(dst) + + handle = lm.create_handle() + ok = await lm.acquire_mv(handle, src, dst) + assert ok is True + assert len(handle.locks) == 2 + + await lm.release(handle) + assert handle.id not in lm.get_active_handles() + + async def test_release_removes_from_active(self, lm, test_dir): + handle = lm.create_handle() + assert handle.id in lm.get_active_handles() + + await lm.acquire_point(handle, test_dir) + await lm.release(handle) + + assert handle.id not in lm.get_active_handles() + + async def 
test_stop_releases_all(self, agfs_client, lm, test_dir): + h1 = lm.create_handle() + h2 = lm.create_handle() + await lm.acquire_point(h1, test_dir) + + sub = f"{test_dir}/sub-{uuid.uuid4().hex}" + agfs_client.mkdir(sub) + await lm.acquire_point(h2, sub) + + await lm.stop() + assert len(lm.get_active_handles()) == 0 + + async def test_nonexistent_path_fails(self, lm): + handle = lm.create_handle() + ok = await lm.acquire_point(handle, "/local/nonexistent-xyz") + assert ok is False diff --git a/tests/transaction/test_path_lock.py b/tests/transaction/test_path_lock.py index 2f3b6afc..0b721e07 100644 --- a/tests/transaction/test_path_lock.py +++ b/tests/transaction/test_path_lock.py @@ -5,6 +5,7 @@ import time from unittest.mock import MagicMock +from openviking.storage.transaction.lock_handle import LockHandle from openviking.storage.transaction.path_lock import ( LOCK_FILE_NAME, LOCK_TYPE_POINT, @@ -13,7 +14,6 @@ _make_fencing_token, _parse_fencing_token, ) -from openviking.storage.transaction.transaction_record import TransactionRecord class TestFencingToken: @@ -55,20 +55,20 @@ def test_tokens_are_unique(self): class TestPathLockStale: def test_is_lock_stale_no_file(self): agfs = MagicMock() - agfs.cat.side_effect = Exception("not found") + agfs.read.side_effect = Exception("not found") lock = PathLock(agfs) assert lock.is_lock_stale("/test/.path.ovlock") is True def test_is_lock_stale_legacy_token(self): agfs = MagicMock() - agfs.cat.return_value = b"tx-old-format" + agfs.read.return_value = b"tx-old-format" lock = PathLock(agfs) assert lock.is_lock_stale("/test/.path.ovlock") is True def test_is_lock_stale_recent_token(self): agfs = MagicMock() token = _make_fencing_token("tx-1") - agfs.cat.return_value = token.encode("utf-8") + agfs.read.return_value = token.encode("utf-8") lock = PathLock(agfs) assert lock.is_lock_stale("/test/.path.ovlock", expire_seconds=300.0) is False @@ -78,7 +78,7 @@ class TestPathLockBehavior: async def 
test_acquire_point_creates_lock_file(self, agfs_client, test_dir): lock = PathLock(agfs_client) - tx = TransactionRecord(id="tx-point-1") + tx = LockHandle(id="tx-point-1") ok = await lock.acquire_point(test_dir, tx, timeout=3.0) assert ok is True @@ -93,7 +93,7 @@ async def test_acquire_point_creates_lock_file(self, agfs_client, test_dir): async def test_acquire_subtree_creates_lock_file(self, agfs_client, test_dir): lock = PathLock(agfs_client) - tx = TransactionRecord(id="tx-subtree-1") + tx = LockHandle(id="tx-subtree-1") ok = await lock.acquire_subtree(test_dir, tx, timeout=3.0) assert ok is True @@ -108,7 +108,7 @@ async def test_acquire_subtree_creates_lock_file(self, agfs_client, test_dir): async def test_acquire_point_dir_not_found(self, agfs_client): lock = PathLock(agfs_client) - tx = TransactionRecord(id="tx-no-dir") + tx = LockHandle(id="tx-no-dir") ok = await lock.acquire_point("/local/nonexistent-path-xyz", tx, timeout=0.5) assert ok is False @@ -116,30 +116,32 @@ async def test_acquire_point_dir_not_found(self, agfs_client): async def test_release_removes_lock_file(self, agfs_client, test_dir): lock = PathLock(agfs_client) - tx = TransactionRecord(id="tx-release-1") + tx = LockHandle(id="tx-release-1") await lock.acquire_point(test_dir, tx, timeout=3.0) lock_path = f"{test_dir}/{LOCK_FILE_NAME}" await lock.release(tx) - # Lock file should be gone + # Lock file should be gone (use stat, not cat — cat returns b'' for deleted files) try: - agfs_client.cat(lock_path) + agfs_client.stat(lock_path) raise AssertionError("Lock file should have been removed") + except AssertionError: + raise except Exception: pass # Expected: file not found async def test_sequential_acquire_works(self, agfs_client, test_dir): lock = PathLock(agfs_client) - tx1 = TransactionRecord(id="tx-seq-1") + tx1 = LockHandle(id="tx-seq-1") ok1 = await lock.acquire_point(test_dir, tx1, timeout=3.0) assert ok1 is True await lock.release(tx1) - tx2 = TransactionRecord(id="tx-seq-2") + tx2 
= LockHandle(id="tx-seq-2") ok2 = await lock.acquire_point(test_dir, tx2, timeout=3.0) assert ok2 is True @@ -153,11 +155,11 @@ async def test_point_blocked_by_ancestor_subtree(self, agfs_client, test_dir): agfs_client.mkdir(child) lock = PathLock(agfs_client) - tx_parent = TransactionRecord(id="tx-parent-subtree") + tx_parent = LockHandle(id="tx-parent-subtree") ok = await lock.acquire_subtree(test_dir, tx_parent, timeout=3.0) assert ok is True - tx_child = TransactionRecord(id="tx-child-point") + tx_child = LockHandle(id="tx-child-point") blocked = await lock.acquire_point(child, tx_child, timeout=0.5) assert blocked is False @@ -171,11 +173,11 @@ async def test_subtree_blocked_by_descendant_point(self, agfs_client, test_dir): agfs_client.mkdir(child) lock = PathLock(agfs_client) - tx_child = TransactionRecord(id="tx-desc-point") + tx_child = LockHandle(id="tx-desc-point") ok = await lock.acquire_point(child, tx_child, timeout=3.0) assert ok is True - tx_parent = TransactionRecord(id="tx-parent-sub") + tx_parent = LockHandle(id="tx-parent-sub") blocked = await lock.acquire_subtree(test_dir, tx_parent, timeout=0.5) assert blocked is False @@ -191,7 +193,7 @@ async def test_acquire_mv_creates_subtree_locks(self, agfs_client, test_dir): agfs_client.mkdir(dst) lock = PathLock(agfs_client) - tx = TransactionRecord(id="tx-mv-1") + tx = LockHandle(id="tx-mv-1") ok = await lock.acquire_mv(src, dst, tx, timeout=3.0) assert ok is True @@ -223,8 +225,8 @@ async def test_point_does_not_block_sibling_point(self, agfs_client, test_dir): agfs_client.mkdir(dir_b) lock = PathLock(agfs_client) - tx_a = TransactionRecord(id="tx-sib-a") - tx_b = TransactionRecord(id="tx-sib-b") + tx_a = LockHandle(id="tx-sib-a") + tx_b = LockHandle(id="tx-sib-b") ok_a = await lock.acquire_point(dir_a, tx_a, timeout=3.0) ok_b = await lock.acquire_point(dir_b, tx_b, timeout=3.0) @@ -251,7 +253,7 @@ async def test_stale_lock_auto_removed_on_acquire(self, agfs_client, test_dir): # New transaction should 
succeed by auto-removing the stale lock lock = PathLock(agfs_client, lock_expire=300.0) - tx = TransactionRecord(id="tx-new-owner") + tx = LockHandle(id="tx-new-owner") ok = await lock.acquire_point(target, tx, timeout=2.0) assert ok is True @@ -276,7 +278,7 @@ async def test_stale_subtree_ancestor_auto_removed(self, agfs_client, test_dir): agfs_client.write(parent_lock, stale_token.encode("utf-8")) lock = PathLock(agfs_client, lock_expire=300.0) - tx = TransactionRecord(id="tx-child-new") + tx = LockHandle(id="tx-child-new") ok = await lock.acquire_point(child, tx, timeout=2.0) assert ok is True @@ -295,12 +297,12 @@ async def test_point_same_path_no_wait_fails_immediately(self, agfs_client, test agfs_client.mkdir(target) lock = PathLock(agfs_client) - tx1 = TransactionRecord(id="tx-hold") + tx1 = LockHandle(id="tx-hold") ok1 = await lock.acquire_point(target, tx1, timeout=3.0) assert ok1 is True # Second acquire with timeout=0 should fail immediately - tx2 = TransactionRecord(id="tx-blocked") + tx2 = LockHandle(id="tx-blocked") t0 = time.monotonic() ok2 = await lock.acquire_point(target, tx2, timeout=0.0) elapsed = time.monotonic() - t0 @@ -318,11 +320,11 @@ async def test_subtree_same_path_mutual_exclusion(self, agfs_client, test_dir): agfs_client.mkdir(target) lock = PathLock(agfs_client) - tx1 = TransactionRecord(id="tx-sub1") + tx1 = LockHandle(id="tx-sub1") ok1 = await lock.acquire_subtree(target, tx1, timeout=3.0) assert ok1 is True - tx2 = TransactionRecord(id="tx-sub2") + tx2 = LockHandle(id="tx-sub2") ok2 = await lock.acquire_subtree(target, tx2, timeout=0.5) assert ok2 is False diff --git a/tests/transaction/test_post_actions.py b/tests/transaction/test_post_actions.py deleted file mode 100644 index 2ae3c12b..00000000 --- a/tests/transaction/test_post_actions.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -"""Tests for post_actions execution and replay.""" - -from unittest.mock import AsyncMock, MagicMock, patch - -from openviking.storage.transaction.transaction_manager import TransactionManager - - -class TestPostActions: - def _make_manager(self): - agfs = MagicMock() - manager = TransactionManager(agfs_client=agfs, timeout=3600) - manager._journal = MagicMock() - return manager, agfs - - async def test_execute_enqueue_semantic(self): - manager, _ = self._make_manager() - - mock_queue = AsyncMock() - mock_queue_manager = MagicMock() - mock_queue_manager.get_queue.return_value = mock_queue - - with patch( - "openviking.storage.queuefs.get_queue_manager", - return_value=mock_queue_manager, - ): - await manager._execute_post_actions( - [ - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://resources/test", - "context_type": "resource", - "account_id": "acc-1", - }, - } - ] - ) - - mock_queue.enqueue.assert_called_once() - msg = mock_queue.enqueue.call_args[0][0] - assert msg.uri == "viking://resources/test" - assert msg.context_type == "resource" - assert msg.account_id == "acc-1" - - async def test_execute_unknown_action_logged(self): - manager, _ = self._make_manager() - # Should not raise, just log - await manager._execute_post_actions( - [ - {"type": "unknown_action", "params": {}}, - ] - ) - - async def test_execute_multiple_actions(self): - manager, _ = self._make_manager() - - mock_queue = AsyncMock() - mock_queue_manager = MagicMock() - mock_queue_manager.get_queue.return_value = mock_queue - - with patch( - "openviking.storage.queuefs.get_queue_manager", - return_value=mock_queue_manager, - ): - await manager._execute_post_actions( - [ - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://a", - "context_type": "resource", - "account_id": "acc-1", - }, - }, - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://b", - "context_type": "memory", - "account_id": "acc-2", - }, - }, - ] - 
) - - assert mock_queue.enqueue.call_count == 2 - - async def test_post_action_failure_does_not_crash(self): - manager, _ = self._make_manager() - - mock_queue_manager = MagicMock() - mock_queue_manager.get_queue.side_effect = Exception("queue not available") - - with patch( - "openviking.storage.queuefs.get_queue_manager", - return_value=mock_queue_manager, - ): - # Should not raise - await manager._execute_post_actions( - [ - { - "type": "enqueue_semantic", - "params": { - "uri": "viking://test", - "context_type": "resource", - "account_id": "", - }, - }, - ] - ) diff --git a/tests/transaction/test_redo_log.py b/tests/transaction/test_redo_log.py new file mode 100644 index 00000000..8a0def2c --- /dev/null +++ b/tests/transaction/test_redo_log.py @@ -0,0 +1,78 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for RedoLog crash recovery.""" + +import uuid + +import pytest + +from openviking.storage.transaction.redo_log import RedoLog + + +@pytest.fixture +def redo(agfs_client): + return RedoLog(agfs_client) + + +class TestRedoLogBasic: + def test_write_and_read(self, redo): + task_id = uuid.uuid4().hex + info = {"archive_uri": "viking://test/archive", "session_uri": "viking://test/session"} + redo.write_pending(task_id, info) + + result = redo.read(task_id) + assert result["archive_uri"] == "viking://test/archive" + assert result["session_uri"] == "viking://test/session" + + redo.mark_done(task_id) + + def test_list_pending(self, redo): + t1 = uuid.uuid4().hex + t2 = uuid.uuid4().hex + redo.write_pending(t1, {"key": "v1"}) + redo.write_pending(t2, {"key": "v2"}) + + pending = redo.list_pending() + assert t1 in pending + assert t2 in pending + + redo.mark_done(t1) + pending_after = redo.list_pending() + assert t1 not in pending_after + assert t2 in pending_after + + redo.mark_done(t2) + + def test_mark_done_removes_task(self, redo): + task_id = uuid.uuid4().hex + redo.write_pending(task_id, {"x": 
1}) + redo.mark_done(task_id) + + pending = redo.list_pending() + assert task_id not in pending + + def test_read_nonexistent_returns_empty(self, redo): + result = redo.read("nonexistent-task-id") + assert result == {} + + def test_list_pending_empty(self, redo): + # Should not crash even if _REDO_ROOT doesn't exist yet + pending = redo.list_pending() + assert isinstance(pending, list) + + def test_mark_done_idempotent(self, redo): + task_id = uuid.uuid4().hex + redo.write_pending(task_id, {"x": 1}) + redo.mark_done(task_id) + # Second mark_done should not raise + redo.mark_done(task_id) + + def test_overwrite_pending(self, redo): + task_id = uuid.uuid4().hex + redo.write_pending(task_id, {"version": 1}) + redo.write_pending(task_id, {"version": 2}) + + result = redo.read(task_id) + assert result["version"] == 2 + + redo.mark_done(task_id) diff --git a/tests/transaction/test_rm_rollback.py b/tests/transaction/test_rm_rollback.py deleted file mode 100644 index 604b5f50..00000000 --- a/tests/transaction/test_rm_rollback.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -"""Integration tests: multi-step rollback covering FS + VectorDB coordination.""" - -import uuid - -from openviking.storage.transaction.undo import UndoEntry, execute_rollback - -from .conftest import VECTOR_DIM, _mkdir_ok, file_exists - - -class TestRmRollback: - async def test_fs_rm_not_reversible(self, agfs_client, test_dir): - """fs_rm is intentionally irreversible: even completed=True is a no-op.""" - path = f"{test_dir}/rm-target" - _mkdir_ok(agfs_client, path) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), - ] - await execute_rollback(undo_log, agfs_client) - - # Directory still exists — fs_rm rollback does nothing - assert file_exists(agfs_client, path) - - -class TestMvRollback: - async def test_mv_reversed_on_rollback(self, agfs_client, test_dir): - """Real mv → rollback → content back at original location.""" - src = f"{test_dir}/mv-src" - dst = f"{test_dir}/mv-dst" - _mkdir_ok(agfs_client, src) - agfs_client.write(f"{src}/payload.txt", b"important data") - - # Forward mv - agfs_client.mv(src, dst) - assert not file_exists(agfs_client, src) - content = agfs_client.cat(f"{dst}/payload.txt") - assert content == b"important data" - - undo_log = [ - UndoEntry( - sequence=0, - op_type="fs_mv", - params={"src": src, "dst": dst}, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client) - - assert file_exists(agfs_client, src) - restored = agfs_client.cat(f"{src}/payload.txt") - assert restored == b"important data" - - -class TestRecoverAll: - async def test_recover_all_reverses_incomplete(self, agfs_client, test_dir): - """recover_all=True also reverses entries with completed=False.""" - new_dir = f"{test_dir}/recover-all-dir" - _mkdir_ok(agfs_client, new_dir) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), - ] - await execute_rollback(undo_log, agfs_client, recover_all=True) - - assert not 
file_exists(agfs_client, new_dir) - - async def test_recover_all_false_skips_incomplete(self, agfs_client, test_dir): - """recover_all=False skips entries with completed=False.""" - new_dir = f"{test_dir}/skip-incomplete" - _mkdir_ok(agfs_client, new_dir) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), - ] - await execute_rollback(undo_log, agfs_client, recover_all=False) - - assert file_exists(agfs_client, new_dir) - - -class TestMultiStepRollback: - async def test_reverse_order_nested_dirs(self, agfs_client, test_dir): - """parent + child → rollback reverses in reverse sequence order.""" - parent = f"{test_dir}/multi-parent" - child = f"{test_dir}/multi-parent/child" - _mkdir_ok(agfs_client, parent) - _mkdir_ok(agfs_client, child) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), - UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), - ] - await execute_rollback(undo_log, agfs_client) - - assert not file_exists(agfs_client, child) - assert not file_exists(agfs_client, parent) - - async def test_write_new_rollback(self, agfs_client, test_dir): - """New file → rollback → file deleted.""" - file_path = f"{test_dir}/new-file.txt" - agfs_client.write(file_path, b"new content") - assert file_exists(agfs_client, file_path) - - undo_log = [ - UndoEntry( - sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True - ), - ] - await execute_rollback(undo_log, agfs_client) - - assert not file_exists(agfs_client, file_path) - - async def test_best_effort_continues(self, agfs_client, test_dir): - """If one step fails, subsequent steps still execute.""" - real_dir = f"{test_dir}/best-effort-real" - _mkdir_ok(agfs_client, real_dir) - - undo_log = [ - # seq=0: mkdir rollback on real dir → should succeed - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": real_dir}, completed=True), - # seq=1: mkdir rollback on 
nonexistent dir → fails silently - UndoEntry( - sequence=1, - op_type="fs_mkdir", - params={"uri": f"{test_dir}/no-such-dir-{uuid.uuid4().hex}"}, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client) - - # seq=0 still executed despite seq=1 failure (reversed order: 1 runs first, then 0) - assert not file_exists(agfs_client, real_dir) - - async def test_unknown_op_type_no_crash(self, agfs_client, test_dir): - """Unknown op_type is logged but doesn't raise.""" - undo_log = [ - UndoEntry( - sequence=0, - op_type="some_future_op", - params={"foo": "bar"}, - completed=True, - ), - ] - # Should not raise - await execute_rollback(undo_log, agfs_client) - - -class TestVectorDBRollback: - async def test_vectordb_delete_rollback_restores(self, agfs_client, vector_store, request_ctx): - """upsert → delete → rollback(vectordb_delete) → record restored.""" - record_id = str(uuid.uuid4()) - record = { - "id": record_id, - "uri": f"viking://resources/del-restore-{record_id}.md", - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.3] * VECTOR_DIM, - "name": "del-restore", - "description": "test", - "abstract": "test", - } - await vector_store.upsert(record, ctx=request_ctx) - - # Snapshot before delete - snapshot = await vector_store.get([record_id], ctx=request_ctx) - assert len(snapshot) == 1 - - # Forward: delete - await vector_store.delete([record_id], ctx=request_ctx) - assert len(await vector_store.get([record_id], ctx=request_ctx)) == 0 - - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_delete", - params={ - "uris": [record["uri"]], - "records_snapshot": snapshot, - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - results = await vector_store.get([record_id], ctx=request_ctx) - assert len(results) == 1 - - async def 
test_vectordb_delete_multi_record(self, agfs_client, vector_store, request_ctx): - """3 records in snapshot → rollback → all restored.""" - records = [] - for i in range(3): - rid = str(uuid.uuid4()) - rec = { - "id": rid, - "uri": f"viking://resources/multi-{rid}.md", - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.1 * (i + 1)] * VECTOR_DIM, - "name": f"multi-{i}", - "description": "test", - "abstract": "test", - } - await vector_store.upsert(rec, ctx=request_ctx) - records.append(rec) - - ids = [r["id"] for r in records] - snapshot = await vector_store.get(ids, ctx=request_ctx) - assert len(snapshot) == 3 - - # Delete all - await vector_store.delete(ids, ctx=request_ctx) - assert len(await vector_store.get(ids, ctx=request_ctx)) == 0 - - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_delete", - params={ - "uris": [r["uri"] for r in records], - "records_snapshot": snapshot, - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - results = await vector_store.get(ids, ctx=request_ctx) - assert len(results) == 3 - - async def test_vectordb_delete_empty_snapshot(self, agfs_client, vector_store, request_ctx): - """Empty snapshot → no-op, no error.""" - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_delete", - params={ - "uris": [], - "records_snapshot": [], - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - # Should not raise - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - async def test_vectordb_upsert_rollback_deletes(self, agfs_client, vector_store, request_ctx): - """upsert → rollback(vectordb_upsert) → record deleted.""" - record_id = str(uuid.uuid4()) - record = { - "id": record_id, - "uri": 
f"viking://resources/upsert-del-{record_id}.md", - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.4] * VECTOR_DIM, - "name": "upsert-del", - "description": "test", - "abstract": "test", - } - await vector_store.upsert(record, ctx=request_ctx) - assert len(await vector_store.get([record_id], ctx=request_ctx)) == 1 - - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_upsert", - params={ - "record_id": record_id, - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - results = await vector_store.get([record_id], ctx=request_ctx) - assert len(results) == 0 diff --git a/tests/transaction/test_transaction_manager.py b/tests/transaction/test_transaction_manager.py deleted file mode 100644 index ef0f0b3e..00000000 --- a/tests/transaction/test_transaction_manager.py +++ /dev/null @@ -1,323 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
-# SPDX-License-Identifier: Apache-2.0 -"""Tests for TransactionManager: CRUD, lifecycle, commit/rollback flows, timeout cleanup.""" - -import time -from unittest.mock import AsyncMock, MagicMock, patch - -from openviking.storage.transaction.transaction_manager import TransactionManager -from openviking.storage.transaction.transaction_record import TransactionRecord, TransactionStatus - - -def _make_manager(**kwargs): - """Create a TransactionManager with mocked AGFS and journal.""" - agfs = MagicMock() - defaults = {"agfs_client": agfs, "timeout": 3600, "lock_timeout": 0.0, "lock_expire": 300.0} - defaults.update(kwargs) - manager = TransactionManager(**defaults) - manager._journal = MagicMock() - manager._journal.list_all.return_value = [] - return manager, agfs - - -class TestCreateAndGet: - def test_create_transaction_returns_record(self): - manager, _ = _make_manager() - tx = manager.create_transaction(init_info={"operation": "rm"}) - assert isinstance(tx, TransactionRecord) - assert tx.status == TransactionStatus.INIT - assert tx.init_info == {"operation": "rm"} - - def test_create_assigns_unique_ids(self): - manager, _ = _make_manager() - tx1 = manager.create_transaction() - tx2 = manager.create_transaction() - assert tx1.id != tx2.id - - def test_get_transaction_found(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - assert manager.get_transaction(tx.id) is tx - - def test_get_transaction_not_found(self): - manager, _ = _make_manager() - assert manager.get_transaction("nonexistent") is None - - def test_get_transaction_count(self): - manager, _ = _make_manager() - assert manager.get_transaction_count() == 0 - manager.create_transaction() - assert manager.get_transaction_count() == 1 - manager.create_transaction() - assert manager.get_transaction_count() == 2 - - def test_get_active_transactions(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - active = manager.get_active_transactions() - assert tx.id in 
active - # Returned copy, not the internal dict - active.pop(tx.id) - assert manager.get_transaction(tx.id) is tx - - -class TestBegin: - async def test_begin_updates_status(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - ok = await manager.begin(tx.id) - assert ok is True - assert tx.status == TransactionStatus.ACQUIRE - - async def test_begin_unknown_tx(self): - manager, _ = _make_manager() - ok = await manager.begin("unknown-tx") - assert ok is False - - -class TestCommitFlow: - async def test_commit_full_lifecycle(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - - # Simulate lock acquisition - tx.update_status(TransactionStatus.EXEC) - tx.add_lock("/test/.path.ovlock") - - ok = await manager.commit(tx.id) - assert ok is True - assert tx.status == TransactionStatus.RELEASED - # Removed from active transactions - assert manager.get_transaction(tx.id) is None - # Journal cleaned up - manager._journal.delete.assert_called_once_with(tx.id) - - async def test_commit_persists_journal_before_release(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - - call_order = [] - original_update = manager._journal.update - - def track_update(data): - call_order.append(("journal_update", data.get("status"))) - return original_update(data) - - manager._journal.update = track_update - manager._journal.delete = MagicMock( - side_effect=lambda _: call_order.append(("journal_delete",)) - ) - - await manager.commit(tx.id) - # Journal update (COMMIT) happens before delete - assert call_order[0] == ("journal_update", "COMMIT") - - async def test_commit_executes_post_actions(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - tx.post_actions.append({"type": "enqueue_semantic", "params": {"uri": "viking://x"}}) - - with patch.object(manager, "_execute_post_actions", new_callable=AsyncMock) as mock_post: 
- await manager.commit(tx.id) - mock_post.assert_called_once() - - async def test_commit_unknown_tx(self): - manager, _ = _make_manager() - ok = await manager.commit("nonexistent") - assert ok is False - - async def test_commit_releases_locks(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - tx.add_lock("/a/.path.ovlock") - tx.add_lock("/b/.path.ovlock") - - with patch.object(manager._path_lock, "release", new_callable=AsyncMock) as mock_release: - await manager.commit(tx.id) - mock_release.assert_called_once() - - -class TestRollbackFlow: - async def test_rollback_executes_undo_log(self): - manager, agfs = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - - from openviking.storage.transaction.undo import UndoEntry - - tx.undo_log.append( - UndoEntry( - sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True - ) - ) - - ok = await manager.rollback(tx.id) - assert ok is True - assert tx.status == TransactionStatus.RELEASED - agfs.mv.assert_called_once_with("/b", "/a") - - async def test_rollback_removes_from_active(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - - await manager.rollback(tx.id) - assert manager.get_transaction(tx.id) is None - - async def test_rollback_cleans_journal(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - - await manager.rollback(tx.id) - manager._journal.delete.assert_called_once_with(tx.id) - - async def test_rollback_unknown_tx(self): - manager, _ = _make_manager() - ok = await manager.rollback("nonexistent") - assert ok is False - - async def test_rollback_undo_failure_does_not_prevent_cleanup(self): - """Undo failure is best-effort; lock release and journal cleanup still happen.""" - manager, agfs = _make_manager() - tx = manager.create_transaction() - 
tx.update_status(TransactionStatus.EXEC) - - from openviking.storage.transaction.undo import UndoEntry - - tx.undo_log.append( - UndoEntry( - sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}, completed=True - ) - ) - agfs.mv.side_effect = Exception("disk error") - - ok = await manager.rollback(tx.id) - assert ok is True - manager._journal.delete.assert_called_once() - - -class TestLockAcquisitionWrappers: - async def test_acquire_lock_point_success_transitions_to_exec(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_point", new_callable=AsyncMock, return_value=True - ): - ok = await manager.acquire_lock_point(tx.id, "/test") - assert ok is True - assert tx.status == TransactionStatus.EXEC - - async def test_acquire_lock_point_failure_transitions_to_fail(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_point", new_callable=AsyncMock, return_value=False - ): - ok = await manager.acquire_lock_point(tx.id, "/test") - assert ok is False - assert tx.status == TransactionStatus.FAIL - - async def test_acquire_lock_subtree_success(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True - ): - ok = await manager.acquire_lock_subtree(tx.id, "/test") - assert ok is True - assert tx.status == TransactionStatus.EXEC - - async def test_acquire_lock_subtree_uses_config_timeout(self): - manager, _ = _make_manager(lock_timeout=5.0) - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True - ) as mock_acquire: - await manager.acquire_lock_subtree(tx.id, "/test") - mock_acquire.assert_called_once_with("/test", tx, timeout=5.0) - - async def test_acquire_lock_subtree_override_timeout(self): - manager, _ = 
_make_manager(lock_timeout=5.0) - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_subtree", new_callable=AsyncMock, return_value=True - ) as mock_acquire: - await manager.acquire_lock_subtree(tx.id, "/test", timeout=10.0) - mock_acquire.assert_called_once_with("/test", tx, timeout=10.0) - - async def test_acquire_lock_mv_success(self): - manager, _ = _make_manager() - tx = manager.create_transaction() - - with patch.object( - manager._path_lock, "acquire_mv", new_callable=AsyncMock, return_value=True - ): - ok = await manager.acquire_lock_mv(tx.id, "/src", "/dst") - assert ok is True - assert tx.status == TransactionStatus.EXEC - - async def test_acquire_lock_unknown_tx(self): - manager, _ = _make_manager() - ok = await manager.acquire_lock_point("nonexistent", "/test") - assert ok is False - - -class TestLifecycle: - async def test_start_sets_running(self): - manager, _ = _make_manager() - await manager.start() - assert manager._running is True - manager.stop() - - async def test_start_idempotent(self): - manager, _ = _make_manager() - await manager.start() - await manager.start() # Should not error - assert manager._running is True - await manager.stop() - - async def test_stop_clears_state(self): - manager, _ = _make_manager() - await manager.start() - manager.create_transaction() - await manager.stop() - assert manager._running is False - assert manager.get_transaction_count() == 0 - - async def test_stop_idempotent(self): - manager, _ = _make_manager() - await manager.stop() - await manager.stop() # Should not error - - -class TestTimeoutCleanup: - async def test_cleanup_timed_out_rolls_back(self): - manager, _ = _make_manager(timeout=1) - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - # Simulate old updated_at - tx.updated_at = time.time() - 10 - - with patch.object( - manager, "rollback", new_callable=AsyncMock, return_value=True - ) as mock_rb: - await manager._cleanup_timed_out() - 
mock_rb.assert_called_once_with(tx.id) - - async def test_cleanup_skips_fresh_transactions(self): - manager, _ = _make_manager(timeout=3600) - tx = manager.create_transaction() - tx.update_status(TransactionStatus.EXEC) - - with patch.object(manager, "rollback", new_callable=AsyncMock) as mock_rb: - await manager._cleanup_timed_out() - mock_rb.assert_not_called() diff --git a/tests/transaction/test_undo.py b/tests/transaction/test_undo.py deleted file mode 100644 index aff57887..00000000 --- a/tests/transaction/test_undo.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -# SPDX-License-Identifier: Apache-2.0 -"""Tests for undo log and rollback executor.""" - -import uuid - -from openviking.storage.transaction.undo import UndoEntry, execute_rollback - -from .conftest import VECTOR_DIM, _mkdir_ok, file_exists - - -class TestUndoEntry: - def test_to_dict(self): - entry = UndoEntry(sequence=0, op_type="fs_mv", params={"src": "/a", "dst": "/b"}) - d = entry.to_dict() - assert d["sequence"] == 0 - assert d["op_type"] == "fs_mv" - assert d["params"] == {"src": "/a", "dst": "/b"} - assert d["completed"] is False - - def test_from_dict(self): - data = {"sequence": 1, "op_type": "fs_rm", "params": {"uri": "/x"}, "completed": True} - entry = UndoEntry.from_dict(data) - assert entry.sequence == 1 - assert entry.op_type == "fs_rm" - assert entry.completed is True - - def test_roundtrip(self): - entry = UndoEntry( - sequence=5, op_type="vectordb_upsert", params={"record_id": "r1"}, completed=True - ) - restored = UndoEntry.from_dict(entry.to_dict()) - assert restored.sequence == entry.sequence - assert restored.op_type == entry.op_type - assert restored.params == entry.params - assert restored.completed == entry.completed - - -class TestExecuteRollback: - """Integration tests for execute_rollback using real AGFS and VectorDB backends.""" - - async def test_rollback_fs_mv(self, agfs_client, test_dir): - src = f"{test_dir}/src" - 
dst = f"{test_dir}/dst" - _mkdir_ok(agfs_client, src) - agfs_client.write(f"{src}/data.txt", b"hello") - - # Forward: mv src → dst - agfs_client.mv(src, dst) - assert not file_exists(agfs_client, src) - assert file_exists(agfs_client, dst) - - undo_log = [ - UndoEntry( - sequence=0, - op_type="fs_mv", - params={"src": src, "dst": dst}, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client) - - # src restored, dst gone - assert file_exists(agfs_client, src) - assert not file_exists(agfs_client, dst) - - async def test_rollback_fs_rm_skipped(self, agfs_client, test_dir): - path = f"{test_dir}/will-not-delete" - _mkdir_ok(agfs_client, path) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_rm", params={"uri": path}, completed=True), - ] - await execute_rollback(undo_log, agfs_client) - - # fs_rm rollback is a no-op; directory still exists - assert file_exists(agfs_client, path) - - async def test_rollback_fs_mkdir(self, agfs_client, test_dir): - new_dir = f"{test_dir}/created" - _mkdir_ok(agfs_client, new_dir) - assert file_exists(agfs_client, new_dir) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=True), - ] - await execute_rollback(undo_log, agfs_client) - - assert not file_exists(agfs_client, new_dir) - - async def test_rollback_fs_write_new(self, agfs_client, test_dir): - file_path = f"{test_dir}/new-file.txt" - agfs_client.write(file_path, b"content") - assert file_exists(agfs_client, file_path) - - undo_log = [ - UndoEntry( - sequence=0, op_type="fs_write_new", params={"uri": file_path}, completed=True - ), - ] - await execute_rollback(undo_log, agfs_client) - - assert not file_exists(agfs_client, file_path) - - async def test_rollback_reverse_order(self, agfs_client, test_dir): - """mkdir parent + child → rollback → both removed in reverse order.""" - parent = f"{test_dir}/parent" - child = f"{test_dir}/parent/child" - _mkdir_ok(agfs_client, parent) - _mkdir_ok(agfs_client, child) - - 
undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": parent}, completed=True), - UndoEntry(sequence=1, op_type="fs_mkdir", params={"uri": child}, completed=True), - ] - await execute_rollback(undo_log, agfs_client) - - # child removed first (seq=1), then parent (seq=0) - assert not file_exists(agfs_client, child) - assert not file_exists(agfs_client, parent) - - async def test_rollback_skips_incomplete(self, agfs_client, test_dir): - new_dir = f"{test_dir}/incomplete" - _mkdir_ok(agfs_client, new_dir) - - undo_log = [ - UndoEntry(sequence=0, op_type="fs_mkdir", params={"uri": new_dir}, completed=False), - ] - await execute_rollback(undo_log, agfs_client) - - # completed=False → not rolled back - assert file_exists(agfs_client, new_dir) - - async def test_rollback_best_effort(self, agfs_client, test_dir): - """A failing rollback entry should not prevent others from running.""" - real_dir = f"{test_dir}/real-dir" - _mkdir_ok(agfs_client, real_dir) - - src = f"{test_dir}/be-src" - dst = f"{test_dir}/be-dst" - _mkdir_ok(agfs_client, dst) - - undo_log = [ - # seq=0: fs_mv rollback will succeed - UndoEntry(sequence=0, op_type="fs_mv", params={"src": src, "dst": dst}, completed=True), - # seq=1: fs_mkdir rollback will fail (rm on non-empty or non-existent path) - UndoEntry( - sequence=1, - op_type="fs_mkdir", - params={"uri": f"{test_dir}/nonexistent-dir-xyz"}, - completed=True, - ), - ] - # Should not raise - await execute_rollback(undo_log, agfs_client) - - # seq=0 mv rollback should have executed (dst → src) - assert file_exists(agfs_client, src) - - async def test_rollback_vectordb_upsert(self, agfs_client, vector_store, request_ctx): - """Real upsert → rollback → record deleted.""" - record_id = str(uuid.uuid4()) - record = { - "id": record_id, - "uri": f"viking://resources/test-upsert-{record_id}.md", - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.1] * VECTOR_DIM, - 
"name": "test", - "description": "test record", - "abstract": "test", - } - await vector_store.upsert(record, ctx=request_ctx) - - # Confirm it exists - results = await vector_store.get([record_id], ctx=request_ctx) - assert len(results) == 1 - - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_upsert", - params={ - "record_id": record_id, - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - results = await vector_store.get([record_id], ctx=request_ctx) - assert len(results) == 0 - - async def test_rollback_vectordb_update_uri(self, agfs_client, vector_store, request_ctx): - """Real upsert → update_uri_mapping → rollback → URI restored.""" - record_id = str(uuid.uuid4()) - old_uri = f"viking://resources/old-{record_id}.md" - new_uri = f"viking://resources/new-{record_id}.md" - record = { - "id": record_id, - "uri": old_uri, - "parent_uri": "viking://resources/", - "account_id": "default", - "context_type": "resource", - "level": 2, - "vector": [0.2] * VECTOR_DIM, - "name": "test", - "description": "test", - "abstract": "test", - } - await vector_store.upsert(record, ctx=request_ctx) - - # Forward: update URI mapping - await vector_store.update_uri_mapping( - ctx=request_ctx, - uri=old_uri, - new_uri=new_uri, - new_parent_uri="viking://resources/", - ) - - # Verify forward operation - result = await vector_store.fetch_by_uri(new_uri, ctx=request_ctx) - assert result is not None - - undo_log = [ - UndoEntry( - sequence=0, - op_type="vectordb_update_uri", - params={ - "old_uri": old_uri, - "new_uri": new_uri, - "old_parent_uri": "viking://resources/", - "_ctx_account_id": "default", - "_ctx_user_id": "test_user", - "_ctx_role": "root", - }, - completed=True, - ), - ] - await execute_rollback(undo_log, agfs_client, vector_store=vector_store) - - # URI should be restored to old_uri - result = await 
vector_store.fetch_by_uri(old_uri, ctx=request_ctx) - assert result is not None From fe33516144494bff5770e320709cc3d90d1ee1e5 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Tue, 17 Mar 2026 20:57:37 +0800 Subject: [PATCH 13/18] fix(transaction): remove checkpoint dead code, fix TOCTOU race, clarify mv lock param - Remove unused _write_checkpoint/_write_checkpoint_async/_read_checkpoint from Session (superseded by redo-log) - Re-resolve URI inside lock in resource_processor Phase 3.5 to prevent concurrent add_resource calls from resolving to the same final_uri - Rename acquire_mv dst_path to dst_parent_path with docstring to clarify that callers pass the destination parent directory --- docs/en/concepts/09-transaction.md | 2 +- docs/zh/concepts/09-transaction.md | 2 +- openviking/core/building_tree.py | 1 + openviking/parse/tree_builder.py | 2 + openviking/session/session.py | 50 ------------------- .../storage/transaction/lock_context.py | 10 ++-- .../storage/transaction/lock_manager.py | 4 +- openviking/storage/transaction/path_lock.py | 29 ++++++++--- openviking/storage/viking_fs.py | 2 +- openviking/utils/resource_processor.py | 8 +++ tests/transaction/test_e2e.py | 2 +- tests/transaction/test_lock_context.py | 2 +- 12 files changed, 46 insertions(+), 68 deletions(-) diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 1ada00dc..0e95bdf1 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -205,7 +205,7 @@ async with LockContext(lock_manager, [path], lock_mode="subtree"): pass # MV lock (move operations) -async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): +async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=dst): # Perform operations... 
pass ``` diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 31d09c54..81d27e02 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -204,7 +204,7 @@ async with LockContext(lock_manager, [path], lock_mode="subtree"): pass # MV 锁(移动操作) -async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): +async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=dst): # 执行操作... pass ``` diff --git a/openviking/core/building_tree.py b/openviking/core/building_tree.py index 9685a56d..43207e29 100644 --- a/openviking/core/building_tree.py +++ b/openviking/core/building_tree.py @@ -28,6 +28,7 @@ def __init__( self._contexts: List["Context"] = [] self._uri_map: Dict[str, "Context"] = {} self._root_uri: Optional[str] = None + self._candidate_uri: Optional[str] = None def add_context(self, context: "Context") -> None: """Add a context to the tree.""" diff --git a/openviking/parse/tree_builder.py b/openviking/parse/tree_builder.py index 5f070cee..deea5efc 100644 --- a/openviking/parse/tree_builder.py +++ b/openviking/parse/tree_builder.py @@ -177,6 +177,8 @@ async def finalize_from_temp( source_format=source_format, ) tree._root_uri = final_uri + if not to_uri: + tree._candidate_uri = candidate_uri # Create a minimal Context object for the root so that tree.root is not None root_context = Context(uri=final_uri, temp_uri=temp_doc_uri) diff --git a/openviking/session/session.py b/openviking/session/session.py index c0f87bd9..bdb6500b 100644 --- a/openviking/session/session.py +++ b/openviking/session/session.py @@ -731,56 +731,6 @@ def _write_relations(self) -> None: except Exception as e: logger.warning(f"Failed to create relation to {usage.uri}: {e}") - def _write_checkpoint(self, data: Dict[str, Any]) -> None: - """Write a commit checkpoint file for crash recovery.""" - if not self._viking_fs: - return - - checkpoint = { - **data, - "session_id": self.session_id, - 
"compression_index": self._compression.compression_index, - "timestamp": get_current_timestamp(), - } - run_async( - self._viking_fs.write_file( - f"{self._session_uri}/.commit_checkpoint.json", - json.dumps(checkpoint, ensure_ascii=False), - ctx=self.ctx, - ) - ) - - async def _write_checkpoint_async(self, data: Dict[str, Any]) -> None: - """Write a commit checkpoint file for crash recovery (async).""" - if not self._viking_fs: - return - - checkpoint = { - **data, - "session_id": self.session_id, - "compression_index": self._compression.compression_index, - "timestamp": get_current_timestamp(), - } - await self._viking_fs.write_file( - f"{self._session_uri}/.commit_checkpoint.json", - json.dumps(checkpoint, ensure_ascii=False), - ctx=self.ctx, - ) - - def _read_checkpoint(self) -> Optional[Dict[str, Any]]: - """Read commit checkpoint file if it exists.""" - if not self._viking_fs: - return None - try: - content = run_async( - self._viking_fs.read_file( - f"{self._session_uri}/.commit_checkpoint.json", ctx=self.ctx - ) - ) - return json.loads(content) - except Exception: - return None - async def _write_relations_async(self) -> None: """Create relations to used contexts/tools (async).""" if not self._viking_fs: diff --git a/openviking/storage/transaction/lock_context.py b/openviking/storage/transaction/lock_context.py index 62fc15ba..4d1d8443 100644 --- a/openviking/storage/transaction/lock_context.py +++ b/openviking/storage/transaction/lock_context.py @@ -21,13 +21,13 @@ def __init__( lock_manager: LockManager, paths: list[str], lock_mode: str = "point", - mv_dst_path: Optional[str] = None, + mv_dst_parent_path: Optional[str] = None, src_is_dir: bool = True, ): self._manager = lock_manager self._paths = paths self._lock_mode = lock_mode - self._mv_dst_path = mv_dst_path + self._mv_dst_parent_path = mv_dst_parent_path self._src_is_dir = src_is_dir self._handle: Optional[LockHandle] = None @@ -41,12 +41,12 @@ async def __aenter__(self) -> LockHandle: if not 
success: break elif self._lock_mode == "mv": - if self._mv_dst_path is None: - raise LockAcquisitionError("mv lock mode requires mv_dst_path") + if self._mv_dst_parent_path is None: + raise LockAcquisitionError("mv lock mode requires mv_dst_parent_path") success = await self._manager.acquire_mv( self._handle, self._paths[0], - self._mv_dst_path, + self._mv_dst_parent_path, src_is_dir=self._src_is_dir, ) else: # "point" diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py index 5e2e5076..7e5184ab 100644 --- a/openviking/storage/transaction/lock_manager.py +++ b/openviking/storage/transaction/lock_manager.py @@ -82,13 +82,13 @@ async def acquire_mv( self, handle: LockHandle, src: str, - dst: str, + dst_parent: str, src_is_dir: bool = True, timeout: Optional[float] = None, ) -> bool: return await self._path_lock.acquire_mv( src, - dst, + dst_parent, handle, timeout=timeout if timeout is not None else self._lock_timeout, src_is_dir=src_is_dir, diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index d9212b3b..345f2661 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -328,17 +328,32 @@ async def acquire_subtree(self, path: str, owner: LockOwner, timeout: float = 0. async def acquire_mv( self, src_path: str, - dst_path: str, + dst_parent_path: str, owner: LockOwner, timeout: float = 0.0, src_is_dir: bool = True, ) -> bool: + """Acquire locks for a move operation. + + Args: + src_path: Source path to lock. + dst_parent_path: Parent directory of the destination to lock. + Callers typically pass the destination's parent so that the + lock covers sibling-level conflicts without requiring the + target to exist yet. + owner: Lock owner handle. + timeout: Maximum seconds to wait for each lock. + src_is_dir: Whether the source is a directory (SUBTREE lock) + or a file (POINT lock on parent). 
+ """ if src_is_dir: if not await self.acquire_subtree(src_path, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire SUBTREE lock on source: {src_path}") return False - if not await self.acquire_subtree(dst_path, owner, timeout=timeout): - logger.warning(f"[MV] Failed to acquire SUBTREE lock on destination: {dst_path}") + if not await self.acquire_subtree(dst_parent_path, owner, timeout=timeout): + logger.warning( + f"[MV] Failed to acquire SUBTREE lock on destination parent: {dst_parent_path}" + ) await self.release(owner) return False else: @@ -346,12 +361,14 @@ async def acquire_mv( if not await self.acquire_point(src_parent, owner, timeout=timeout): logger.warning(f"[MV] Failed to acquire POINT lock on source parent: {src_parent}") return False - if not await self.acquire_point(dst_path, owner, timeout=timeout): - logger.warning(f"[MV] Failed to acquire POINT lock on destination: {dst_path}") + if not await self.acquire_point(dst_parent_path, owner, timeout=timeout): + logger.warning( + f"[MV] Failed to acquire POINT lock on destination parent: {dst_parent_path}" + ) await self.release(owner) return False - logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_path}") + logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_parent_path}") return True async def release(self, owner: LockOwner) -> None: diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 72475573..d9119c11 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -357,7 +357,7 @@ async def mv( get_lock_manager(), [old_path], lock_mode="mv", - mv_dst_path=dst_parent, + mv_dst_parent_path=dst_parent, src_is_dir=is_dir, ): uris_to_move = await self._collect_uris(old_path, recursive=True, ctx=ctx) diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index 42ca8752..73bca2f6 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -213,6 +213,7 @@ 
async def process_resource( # ============ Phase 3.5: 首次添加立即落盘 ============ root_uri = result.get("root_uri") temp_uri = result.get("temp_uri") # temp_doc_uri + candidate_uri = getattr(context_tree, "_candidate_uri", None) if context_tree else None if root_uri and temp_uri: viking_fs = get_viking_fs() @@ -230,6 +231,13 @@ async def process_resource( await viking_fs.mkdir(parent_uri, exist_ok=True, ctx=ctx) async with LockContext(get_lock_manager(), [parent_path], lock_mode="point"): + # Re-resolve URI inside lock to prevent TOCTOU race where + # concurrent add_resource calls resolve to the same final_uri. + if candidate_uri: + root_uri = await self.tree_builder._resolve_unique_uri(candidate_uri) + result["root_uri"] = root_uri + dst_path = viking_fs._uri_to_path(root_uri, ctx=ctx) + src_path = viking_fs._uri_to_path(temp_uri, ctx=ctx) await asyncio.to_thread(viking_fs.agfs.mv, src_path, dst_path) diff --git a/tests/transaction/test_e2e.py b/tests/transaction/test_e2e.py index 1c79414d..2f284f53 100644 --- a/tests/transaction/test_e2e.py +++ b/tests/transaction/test_e2e.py @@ -80,7 +80,7 @@ async def test_mv_lock_acquires_both_paths(self, agfs_client, lock_manager, test agfs_client.mkdir(src) agfs_client.mkdir(dst) - async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_path=dst): + async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=dst): src_token = agfs_client.cat(f"{src}/{LOCK_FILE_NAME}") dst_token = agfs_client.cat(f"{dst}/{LOCK_FILE_NAME}") src_token_str = src_token.decode("utf-8") if isinstance(src_token, bytes) else src_token diff --git a/tests/transaction/test_lock_context.py b/tests/transaction/test_lock_context.py index 37fcb89c..131fb48e 100644 --- a/tests/transaction/test_lock_context.py +++ b/tests/transaction/test_lock_context.py @@ -67,7 +67,7 @@ async def test_mv_lock(self, agfs_client, lm, test_dir): agfs_client.mkdir(src) agfs_client.mkdir(dst) - async with LockContext(lm, [src], lock_mode="mv", 
mv_dst_path=dst) as handle: + async with LockContext(lm, [src], lock_mode="mv", mv_dst_parent_path=dst) as handle: assert len(handle.locks) == 2 From 1010bd4e4f4df88be003dcb32ba23e8fa410f771 Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Tue, 17 Mar 2026 21:29:02 +0800 Subject: [PATCH 14/18] fix: path --- .../storage/transaction/lock_manager.py | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py index 7e5184ab..2d8ca809 100644 --- a/openviking/storage/transaction/lock_manager.py +++ b/openviking/storage/transaction/lock_manager.py @@ -125,10 +125,15 @@ async def _recover_pending_redo(self) -> None: logger.error(f"Redo recovery failed for {task_id}: {e}", exc_info=True) async def _redo_session_memory(self, info: Dict[str, Any]) -> None: - """Re-extract memories from archive.""" + """Re-extract memories from archive. + + Lets exceptions from _enqueue_semantic propagate so the caller + can decide whether to mark the redo task as done. + """ from openviking.message import Message from openviking.server.identity import RequestContext, Role from openviking.session.compressor import SessionCompressor + from openviking.storage.viking_fs import get_viking_fs from openviking_cli.session.user_id import UserIdentifier archive_uri = info.get("archive_uri") @@ -139,51 +144,46 @@ async def _redo_session_memory(self, info: Dict[str, Any]) -> None: role_str = info.get("role", "root") if not archive_uri or not session_uri: - logger.warning("Cannot redo session_memory: missing archive_uri or session_uri") - return + raise ValueError("Cannot redo session_memory: missing archive_uri or session_uri") + + # 1. Build request context (needed for path conversion below) + user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) + ctx = RequestContext(user=user, role=Role(role_str)) - # 1. 
Read archived messages - messages_path = f"{archive_uri}/messages.jsonl" + # 2. Read archived messages + messages_uri = f"{archive_uri}/messages.jsonl" + viking_fs = get_viking_fs() + agfs_path = viking_fs._uri_to_path(messages_uri, ctx=ctx) + messages = [] try: - agfs_path = messages_path.replace("viking://", "") content = self._agfs.cat(agfs_path) if isinstance(content, bytes): content = content.decode("utf-8") + for line in content.strip().split("\n"): + if line.strip(): + try: + messages.append(Message.from_dict(json.loads(line))) + except Exception: + pass except Exception as e: - logger.warning(f"Cannot read archive for redo: {messages_path}: {e}") - return + logger.warning(f"Cannot read archive for redo: {agfs_path}: {e}") - messages = [] - for line in content.strip().split("\n"): - if line.strip(): - try: - messages.append(Message.from_dict(json.loads(line))) - except Exception: - pass - - if not messages: - logger.warning(f"No messages found in archive for redo: {archive_uri}") - return - - # 2. Build request context - user = UserIdentifier(account_id=account_id, user_id=user_id, agent_id=agent_id) - ctx = RequestContext(user=user, role=Role(role_str)) - - # 3. Re-extract memories (best-effort: skip if compressor not available) - session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] - try: - compressor = SessionCompressor(vikingdb=None) - memories = await compressor.extract_long_term_memories( - messages=messages, - user=user, - session_id=session_id, - ctx=ctx, - ) - logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") - except Exception as e: - logger.warning(f"Redo: memory extraction skipped ({e}), will retry via queue") + # 3. 
Re-extract memories (best-effort, only if archive was readable) + if messages: + session_id = session_uri.rstrip("/").rsplit("/", 1)[-1] + try: + compressor = SessionCompressor(vikingdb=None) + memories = await compressor.extract_long_term_memories( + messages=messages, + user=user, + session_id=session_id, + ctx=ctx, + ) + logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}") + except Exception as e: + logger.warning(f"Redo: memory extraction failed ({e}), falling back to queue") - # 4. Enqueue semantic processing + # 4. Always enqueue semantic processing as fallback await self._enqueue_semantic( uri=session_uri, context_type="memory", From 715739da41983bddf5ec6146a96db43ca4d10b1f Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Tue, 17 Mar 2026 23:53:24 +0800 Subject: [PATCH 15/18] fix: resource lock --- openviking/storage/errors.py | 4 + openviking/storage/queuefs/semantic_dag.py | 91 +++++++++++++---- openviking/storage/queuefs/semantic_msg.py | 4 + .../storage/queuefs/semantic_processor.py | 98 ++++++++----------- .../storage/transaction/lock_manager.py | 6 ++ openviking/storage/transaction/path_lock.py | 13 +++ openviking/storage/viking_fs.py | 18 ++-- openviking/utils/resource_processor.py | 64 +++++++----- openviking/utils/summarizer.py | 2 + 9 files changed, 192 insertions(+), 108 deletions(-) diff --git a/openviking/storage/errors.py b/openviking/storage/errors.py index 010200e7..def786be 100644 --- a/openviking/storage/errors.py +++ b/openviking/storage/errors.py @@ -37,3 +37,7 @@ class LockError(VikingDBException): class LockAcquisitionError(LockError): """Raised when lock acquisition fails.""" + + +class ResourceBusyError(LockError): + """Raised when a resource is locked by an ongoing operation (e.g. 
semantic processing).""" diff --git a/openviking/storage/queuefs/semantic_dag.py b/openviking/storage/queuefs/semantic_dag.py index 4ee10a93..d626fa82 100644 --- a/openviking/storage/queuefs/semantic_dag.py +++ b/openviking/storage/queuefs/semantic_dag.py @@ -75,6 +75,7 @@ def __init__( target_uri: Optional[str] = None, semantic_msg_id: Optional[str] = None, recursive: bool = True, + lifecycle_lock_handle_id: str = "", ): self._processor = processor self._context_type = context_type @@ -84,6 +85,7 @@ def __init__( self._target_uri = target_uri self._semantic_msg_id = semantic_msg_id self._recursive = recursive + self._lifecycle_lock_handle_id = lifecycle_lock_handle_id self._llm_sem = asyncio.Semaphore(max_concurrent_llm) self._viking_fs = get_viking_fs() self._nodes: Dict[str, DirNode] = {} @@ -98,6 +100,7 @@ def __init__( self._dir_change_status: Dict[str, bool] = {} self._overview_cache: Dict[str, Dict[str, str]] = {} self._overview_cache_lock = asyncio.Lock() + self._refresh_task: Optional[asyncio.Task] = None def _create_on_complete_callback(self) -> Optional[Callable[[], Awaitable[None]]]: """Create on_complete callback for incremental update or full update.""" @@ -160,10 +163,27 @@ async def run(self, root_uri: str) -> None: """Run DAG execution starting from root_uri.""" self._root_uri = root_uri self._root_done = asyncio.Event() - await self._dispatch_dir(root_uri, parent_uri=None) - await self._root_done.wait() - on_complete = self._create_on_complete_callback() + # Start lifecycle lock refresh loop if we hold a lock + if self._lifecycle_lock_handle_id: + self._refresh_task = asyncio.create_task(self._lock_refresh_loop()) + + try: + await self._dispatch_dir(root_uri, parent_uri=None) + await self._root_done.wait() + except Exception: + await self._release_lifecycle_lock() + raise + + original_on_complete = self._create_on_complete_callback() + + # Wrap on_complete to release lifecycle lock after all processing + async def wrapped_on_complete() -> None: + 
try: + if original_on_complete: + await original_on_complete() + finally: + await self._release_lifecycle_lock() async with self._vectorize_lock: task_count = self._vectorize_task_count @@ -176,7 +196,7 @@ async def run(self, root_uri: str) -> None: await tracker.register( semantic_msg_id=self._semantic_msg_id, total_count=task_count, - on_complete=on_complete, + on_complete=wrapped_on_complete, metadata={"uri": root_uri}, ) @@ -203,9 +223,10 @@ async def run(self, root_uri: str) -> None: semantic_msg_id=task.semantic_msg_id, ) ) - elif on_complete: + else: + # No vectorize tasks — release lock immediately (via wrapped callback) try: - await on_complete() + await wrapped_on_complete() except Exception as e: logger.error(f"Error in on_complete callback: {e}", exc_info=True) @@ -505,9 +526,6 @@ def _finalize_children_abstracts(self, node: DirNode) -> List[Dict[str, str]]: return results async def _overview_task(self, dir_uri: str) -> None: - from openviking.storage.errors import LockAcquisitionError - from openviking.storage.transaction import LockContext, get_lock_manager - node = self._nodes.get(dir_uri) if not node: return @@ -536,17 +554,12 @@ async def _overview_task(self, dir_uri: str) -> None: abstract = self._processor._extract_abstract_from_overview(overview) overview, abstract = self._processor._enforce_size_limits(overview, abstract) - dir_path = self._viking_fs._uri_to_path(dir_uri, ctx=self._ctx) + # Write directly — protected by the outer lifecycle SUBTREE lock try: - async with LockContext(get_lock_manager(), [dir_path], lock_mode="point"): - await self._viking_fs.write_file( - f"{dir_uri}/.overview.md", overview, ctx=self._ctx - ) - await self._viking_fs.write_file( - f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx - ) - except LockAcquisitionError: - logger.info(f"[SemanticDag] {dir_uri} does not exist or is locked, skipping") + await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx) + await 
self._viking_fs.write_file(f"{dir_uri}/.abstract.md", abstract, ctx=self._ctx) + except Exception: + logger.info(f"[SemanticDag] {dir_uri} write failed, skipping") try: if need_vectorize: @@ -588,6 +601,46 @@ async def _add_vectorize_task(self, task: VectorizeTask) -> None: else: # directory self._vectorize_task_count += 2 + async def _lock_refresh_loop(self) -> None: + """Periodically refresh lifecycle lock to prevent stale expiry.""" + from openviking.storage.transaction import get_lock_manager + + try: + interval = get_lock_manager()._path_lock._lock_expire / 2 + except Exception: + interval = 150.0 + + while True: + try: + await asyncio.sleep(interval) + handle = get_lock_manager().get_handle(self._lifecycle_lock_handle_id) + if handle: + await get_lock_manager().refresh_lock(handle) + else: + break + except asyncio.CancelledError: + break + except Exception as e: + logger.warning(f"[SemanticDag] Lock refresh failed: {e}") + + async def _release_lifecycle_lock(self) -> None: + """Stop refresh loop and release lifecycle lock.""" + if self._refresh_task and not self._refresh_task.done(): + self._refresh_task.cancel() + self._refresh_task = None + if not self._lifecycle_lock_handle_id: + return + handle_id = self._lifecycle_lock_handle_id + self._lifecycle_lock_handle_id = "" + try: + from openviking.storage.transaction import get_lock_manager + + handle = get_lock_manager().get_handle(handle_id) + if handle: + await get_lock_manager().release(handle) + except Exception as e: + logger.warning(f"[SemanticDag] Failed to release lifecycle lock {handle_id}: {e}") + def get_stats(self) -> DagStats: return DagStats( total_nodes=self._stats.total_nodes, diff --git a/openviking/storage/queuefs/semantic_msg.py b/openviking/storage/queuefs/semantic_msg.py index f6acdaf4..720948e8 100644 --- a/openviking/storage/queuefs/semantic_msg.py +++ b/openviking/storage/queuefs/semantic_msg.py @@ -39,6 +39,7 @@ class SemanticMsg: skip_vectorization: bool = False telemetry_id: str = "" 
target_uri: str = "" + lifecycle_lock_handle_id: str = "" changes: Optional[Dict[str, List[str]]] = ( None # {"added": [...], "modified": [...], "deleted": [...]} ) @@ -55,6 +56,7 @@ def __init__( skip_vectorization: bool = False, telemetry_id: str = "", target_uri: str = "", + lifecycle_lock_handle_id: str = "", changes: Optional[Dict[str, List[str]]] = None, ): self.id = str(uuid4()) @@ -68,6 +70,7 @@ def __init__( self.skip_vectorization = skip_vectorization self.telemetry_id = telemetry_id self.target_uri = target_uri + self.lifecycle_lock_handle_id = lifecycle_lock_handle_id self.changes = changes def to_dict(self) -> Dict[str, Any]: @@ -106,6 +109,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "SemanticMsg": skip_vectorization=data.get("skip_vectorization", False), telemetry_id=data.get("telemetry_id", ""), target_uri=data.get("target_uri", ""), + lifecycle_lock_handle_id=data.get("lifecycle_lock_handle_id", ""), changes=data.get("changes"), ) if "id" in data and data["id"]: diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 0db4019b..98d32f7a 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -239,6 +239,14 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str, f"Target URI exists, using incremental update: {msg.target_uri}" ) + # Re-acquire lifecycle lock if handle was lost (e.g. 
server restart) + if msg.lifecycle_lock_handle_id: + lock_uri = msg.target_uri or msg.uri + msg.lifecycle_lock_handle_id = await self._ensure_lifecycle_lock( + msg.lifecycle_lock_handle_id, + viking_fs._uri_to_path(lock_uri, ctx=self._current_ctx), + ) + executor = SemanticDagExecutor( processor=self, context_type=msg.context_type, @@ -248,6 +256,7 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str, target_uri=msg.target_uri, semantic_msg_id=msg.id, recursive=msg.recursive, + lifecycle_lock_handle_id=msg.lifecycle_lock_handle_id, ) self._dag_executor = executor await executor.run(msg.uri) @@ -268,6 +277,22 @@ async def on_dequeue(self, data: Optional[Dict[str, Any]]) -> Optional[Dict[str, self.report_error(str(e), data) return None finally: + # Safety net: release lifecycle lock if still held (e.g. on exception + # before the DAG executor took ownership) + if msg and msg.lifecycle_lock_handle_id: + try: + from openviking.storage.transaction import get_lock_manager + + lm = get_lock_manager() + handle = lm.get_handle(msg.lifecycle_lock_handle_id) + if handle: + await lm.release(handle) + logger.info( + f"[SemanticProcessor] Safety-net released lifecycle lock " + f"{msg.lifecycle_lock_handle_id}" + ) + except Exception: + pass self._current_msg = None self._current_ctx = None @@ -276,63 +301,24 @@ def get_dag_stats(self) -> Optional["DagStats"]: return None return self._dag_executor.get_stats() - async def _process_single_directory( - self, - uri: str, - context_type: str, - children_uris: List[str], - file_paths: List[str], - ) -> None: - """Process single directory, generate .abstract.md and .overview.md.""" - from openviking.storage.errors import LockAcquisitionError - from openviking.storage.transaction import LockContext, get_lock_manager - - viking_fs = get_viking_fs() - dir_path = viking_fs._uri_to_path(uri, ctx=self._current_ctx) + @staticmethod + async def _ensure_lifecycle_lock(handle_id: str, lock_path: str) -> str: + """If 
the handle is missing (server restart), re-acquire a SUBTREE lock. - try: - async with LockContext(get_lock_manager(), [dir_path], lock_mode="point"): - # 1. Collect .abstract.md from subdirectories - children_abstracts = await self._collect_children_abstracts(children_uris) - - # 2. Concurrently generate summaries for files in directory - tasks = [ - self._generate_single_file_summary(fp, ctx=self._current_ctx) - for fp in file_paths - ] - file_summaries = await asyncio.gather(*tasks) - - # 3. Generate .overview.md - overview = await self._generate_overview(uri, file_summaries, children_abstracts) - - # 4. Extract abstract from overview - abstract = self._extract_abstract_from_overview(overview) - - # 5. Write files - await viking_fs.write_file(f"{uri}/.overview.md", overview, ctx=self._current_ctx) - await viking_fs.write_file(f"{uri}/.abstract.md", abstract, ctx=self._current_ctx) - - logger.debug(f"Generated overview and abstract for {uri}") - - # 6. Vectorize directory and files concurrently - vectorize_tasks = [ - self._vectorize_directory_simple(uri, context_type, abstract, overview), - *( - self._vectorize_single_file( - parent_uri=uri, - context_type=context_type, - file_path=fp, - summary_dict=summary, - ) - for fp, summary in zip(file_paths, file_summaries) - ), - ] - results = await asyncio.gather(*vectorize_tasks, return_exceptions=True) - for result in results: - if isinstance(result, Exception): - logger.error(f"Vectorization failed: {result}", exc_info=True) - except LockAcquisitionError: - logger.info(f"[SemanticProcessor] {uri} does not exist or is locked, skipping") + Returns the (possibly new) handle ID, or "" on failure. 
+ """ + from openviking.storage.transaction import get_lock_manager + + lm = get_lock_manager() + if lm.get_handle(handle_id): + return handle_id + new_handle = lm.create_handle() + if await lm.acquire_subtree(new_handle, lock_path): + logger.info(f"Re-acquired lifecycle lock on {lock_path} (handle {new_handle.id})") + return new_handle.id + logger.warning(f"Failed to re-acquire lifecycle lock on {lock_path}") + await lm.release(new_handle) + return "" async def _process_memory_directory(self, msg: SemanticMsg) -> None: """Process a memory directory with special handling. diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py index 2d8ca809..56dedd79 100644 --- a/openviking/storage/transaction/lock_manager.py +++ b/openviking/storage/transaction/lock_manager.py @@ -94,6 +94,12 @@ async def acquire_mv( src_is_dir=src_is_dir, ) + def get_handle(self, handle_id: str) -> Optional[LockHandle]: + return self._handles.get(handle_id) + + async def refresh_lock(self, handle: LockHandle) -> None: + await self._path_lock.refresh(handle) + async def release(self, handle: LockHandle) -> None: await self._path_lock.release(handle) self._handles.pop(handle.id, None) diff --git a/openviking/storage/transaction/path_lock.py b/openviking/storage/transaction/path_lock.py index 345f2661..2aaaecf1 100644 --- a/openviking/storage/transaction/path_lock.py +++ b/openviking/storage/transaction/path_lock.py @@ -371,6 +371,19 @@ async def acquire_mv( logger.debug(f"[MV] Locks acquired: {src_path} -> {dst_parent_path}") return True + async def refresh(self, owner: LockOwner) -> None: + """Rewrite all lock file timestamps to prevent stale cleanup.""" + for lock_path in list(owner.locks): + token = self._read_token(lock_path) + if token: + parsed_owner_id, _, lock_type = _parse_fencing_token(token) + if parsed_owner_id == owner.id: + new_token = _make_fencing_token(owner.id, lock_type) + try: + self._agfs.write(lock_path, 
new_token.encode("utf-8")) + except Exception as e: + logger.warning(f"Failed to refresh lock {lock_path}: {e}") + async def release(self, owner: LockOwner) -> None: lock_count = len(owner.locks) for lock_path in reversed(owner.locks): diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index d9119c11..214c8c93 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -290,7 +290,10 @@ async def rm( after cleaning up any orphan index records. Acquires a path lock, deletes VectorDB records, then FS files. + Raises ResourceBusyError when the target is locked by an ongoing + operation (e.g. semantic processing). """ + from openviking.storage.errors import LockAcquisitionError, ResourceBusyError from openviking.storage.transaction import LockContext, get_lock_manager self._ensure_access(uri, ctx) @@ -317,12 +320,15 @@ async def rm( lock_paths = [parent] lock_mode = "point" - async with LockContext(get_lock_manager(), lock_paths, lock_mode=lock_mode): - uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) - uris_to_delete.append(target_uri) - await self._delete_from_vector_store(uris_to_delete, ctx=ctx) - result = self.agfs.rm(path, recursive=recursive) - return result + try: + async with LockContext(get_lock_manager(), lock_paths, lock_mode=lock_mode): + uris_to_delete = await self._collect_uris(path, recursive, ctx=ctx) + uris_to_delete.append(target_uri) + await self._delete_from_vector_store(uris_to_delete, ctx=ctx) + result = self.agfs.rm(path, recursive=recursive) + return result + except LockAcquisitionError: + raise ResourceBusyError(f"Resource is being processed: {uri}") async def mv( self, diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index 73bca2f6..adcb4245 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -210,29 +210,29 @@ async def process_resource( return result - # ============ Phase 3.5: 
首次添加立即落盘 ============ + # ============ Phase 3.5: 首次添加立即落盘 + 生命周期锁 ============ root_uri = result.get("root_uri") temp_uri = result.get("temp_uri") # temp_doc_uri candidate_uri = getattr(context_tree, "_candidate_uri", None) if context_tree else None + lifecycle_lock_handle_id = "" if root_uri and temp_uri: + from openviking.storage.transaction import LockContext, get_lock_manager + viking_fs = get_viking_fs() + lock_manager = get_lock_manager() target_exists = await viking_fs.exists(root_uri, ctx=ctx) + if not target_exists: # 第一次添加:锁保护下将 temp 移到 final - from openviking.storage.transaction import LockContext, get_lock_manager - dst_path = viking_fs._uri_to_path(root_uri, ctx=ctx) parent_path = dst_path.rsplit("/", 1)[0] if "/" in dst_path else dst_path - # 确保父目录存在 parent_uri = "/".join(root_uri.rsplit("/", 1)[:-1]) if parent_uri: await viking_fs.mkdir(parent_uri, exist_ok=True, ctx=ctx) - async with LockContext(get_lock_manager(), [parent_path], lock_mode="point"): - # Re-resolve URI inside lock to prevent TOCTOU race where - # concurrent add_resource calls resolve to the same final_uri. 
+ async with LockContext(lock_manager, [parent_path], lock_mode="point"): if candidate_uri: root_uri = await self.tree_builder._resolve_unique_uri(candidate_uri) result["root_uri"] = root_uri @@ -241,48 +241,58 @@ async def process_resource( src_path = viking_fs._uri_to_path(temp_uri, ctx=ctx) await asyncio.to_thread(viking_fs.agfs.mv, src_path, dst_path) - # 清理 temp 根目录 + # 在 POINT 锁内获取 SUBTREE 锁(消除竞态窗口) + lifecycle_lock_handle_id = await self._try_acquire_lifecycle_lock( + lock_manager, dst_path + ) + try: await viking_fs.delete_temp(parse_result.temp_dir_path, ctx=ctx) except Exception: pass - # 更新 temp_uri → DAG 直接在 final 上跑 result["temp_uri"] = root_uri + else: + # 增量更新:对目标目录加 SUBTREE 锁 + resource_path = viking_fs._uri_to_path(root_uri, ctx=ctx) + lifecycle_lock_handle_id = await self._try_acquire_lifecycle_lock( + lock_manager, resource_path + ) # ============ Phase 4: Optional Steps ============ build_index = kwargs.get("build_index", True) temp_uri_for_summarize = result.get("temp_uri") or parse_result.temp_dir_path - if summarize: - # Explicit summarization request. - # If build_index is ALSO True, we want vectorization. - # If build_index is False, we skip vectorization. + should_summarize = summarize or build_index + if should_summarize: skip_vec = not build_index try: await self._get_summarizer().summarize( resource_uris=[result["root_uri"]], ctx=ctx, skip_vectorization=skip_vec, + lifecycle_lock_handle_id=lifecycle_lock_handle_id, temp_uris=[temp_uri_for_summarize], **kwargs, ) except Exception as e: logger.error(f"Summarization failed: {e}") result["warnings"] = result.get("warnings", []) + [f"Summarization failed: {e}"] + elif lifecycle_lock_handle_id: + # 无下游处理接管锁,主动释放 + from openviking.storage.transaction import get_lock_manager - elif build_index: - # Standard compatibility mode: "Just Index it" usually implies ingestion flow. - # We assume this means "Ingest and Index", which requires summarization. 
- try: - await self._get_summarizer().summarize( - resource_uris=[result["root_uri"]], - ctx=ctx, - skip_vectorization=False, - temp_uris=[temp_uri_for_summarize], - **kwargs, - ) - except Exception as e: - logger.error(f"Auto-index failed: {e}") - result["warnings"] = result.get("warnings", []) + [f"Auto-index failed: {e}"] + handle = get_lock_manager().get_handle(lifecycle_lock_handle_id) + if handle: + await get_lock_manager().release(handle) return result + + @staticmethod + async def _try_acquire_lifecycle_lock(lock_manager, path: str) -> str: + """尝试获取 SUBTREE 生命周期锁,失败时优雅降级返回空字符串。""" + handle = lock_manager.create_handle() + if await lock_manager.acquire_subtree(handle, path): + return handle.id + logger.warning(f"[ResourceProcessor] Failed to acquire lifecycle lock on {path}") + await lock_manager.release(handle) + return "" diff --git a/openviking/utils/summarizer.py b/openviking/utils/summarizer.py index 36b879e8..e9a1cb20 100644 --- a/openviking/utils/summarizer.py +++ b/openviking/utils/summarizer.py @@ -31,6 +31,7 @@ async def summarize( resource_uris: List[str], ctx: "RequestContext", skip_vectorization: bool = False, + lifecycle_lock_handle_id: str = "", **kwargs, ) -> Dict[str, Any]: """ @@ -72,6 +73,7 @@ async def summarize( skip_vectorization=skip_vectorization, telemetry_id=telemetry.telemetry_id if telemetry.enabled else "", target_uri=uri if uri != temp_uri else None, + lifecycle_lock_handle_id=lifecycle_lock_handle_id, ) await semantic_queue.enqueue(msg) enqueued_count += 1 From 5de24ab79b89c657c7ca2efcbc37a92b331d5aec Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Wed, 18 Mar 2026 00:24:53 +0800 Subject: [PATCH 16/18] fix: test --- openviking/storage/transaction/__init__.py | 2 ++ openviking/storage/transaction/lock_manager.py | 8 ++++++++ openviking/storage/viking_fs.py | 6 +++--- tests/client/test_file_operations.py | 6 +++++- tests/client/test_import_export.py | 2 ++ tests/integration/test_full_workflow.py | 2 ++ 6 files changed, 22 
insertions(+), 4 deletions(-) diff --git a/openviking/storage/transaction/__init__.py b/openviking/storage/transaction/__init__.py index 0fca8816..52e77ff7 100644 --- a/openviking/storage/transaction/__init__.py +++ b/openviking/storage/transaction/__init__.py @@ -12,6 +12,7 @@ LockManager, get_lock_manager, init_lock_manager, + release_all_locks, reset_lock_manager, ) from openviking.storage.transaction.path_lock import PathLock @@ -26,5 +27,6 @@ "RedoLog", "get_lock_manager", "init_lock_manager", + "release_all_locks", "reset_lock_manager", ] diff --git a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py index 56dedd79..d20d2ef8 100644 --- a/openviking/storage/transaction/lock_manager.py +++ b/openviking/storage/transaction/lock_manager.py @@ -251,3 +251,11 @@ def get_lock_manager() -> LockManager: def reset_lock_manager() -> None: global _lock_manager _lock_manager = None + + +async def release_all_locks() -> None: + """Release all active lock handles. **Test-only utility.**""" + if _lock_manager is None: + return + for handle in list(_lock_manager.get_active_handles().values()): + await _lock_manager.release(handle) diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 214c8c93..f8b8a356 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -1016,20 +1016,20 @@ def _uri_to_path(self, uri: str, ctx: Optional[RequestContext] = None) -> str: safe_parts = [self._shorten_component(p, self._MAX_FILENAME_BYTES) for p in parts] return f"/local/{account_id}/{'/'.join(safe_parts)}" - _INTERNAL_DIRS = {"_system"} + _INTERNAL_NAMES = {"_system", ".path.ovlock"} _ROOT_PATH = "/local" def _ls_entries(self, path: str) -> List[Dict[str, Any]]: """List directory entries, filtering out internal directories. At account root (/local/{account}), uses VALID_SCOPES whitelist. - At other levels, uses _INTERNAL_DIRS blacklist. + At other levels, uses _INTERNAL_NAMES blacklist. 
""" entries = self.agfs.ls(path) parts = [p for p in path.strip("/").split("/") if p] if len(parts) == 2 and parts[0] == "local": return [e for e in entries if e.get("name") in VikingURI.VALID_SCOPES] - return [e for e in entries if e.get("name") not in self._INTERNAL_DIRS] + return [e for e in entries if e.get("name") not in self._INTERNAL_NAMES] def _path_to_uri(self, path: str, ctx: Optional[RequestContext] = None) -> str: """/local/{account}/... -> viking://... diff --git a/tests/client/test_file_operations.py b/tests/client/test_file_operations.py index 99415f20..a402e4af 100644 --- a/tests/client/test_file_operations.py +++ b/tests/client/test_file_operations.py @@ -8,6 +8,7 @@ import pytest from openviking import AsyncOpenViking +from openviking.storage.transaction import release_all_locks class TestRm: @@ -22,6 +23,7 @@ async def test_rm_file(self, client: AsyncOpenViking, sample_markdown_file: Path reason="Test rm", ) + await release_all_locks() uris = await client.tree(result["root_uri"]) for data in uris: if not data["isDir"]: @@ -35,7 +37,8 @@ async def test_rm_directory_recursive(self, client: AsyncOpenViking, sample_dire for f in sample_directory.glob("**/*.txt"): await client.add_resource(path=str(f), reason="Test rm dir") - # Get resource directory + # Release lifecycle locks held by add_resource before rm + await release_all_locks() entries = await client.ls("viking://resources/") for data in entries: if data["isDir"]: @@ -57,6 +60,7 @@ async def test_mv_file(self, client: AsyncOpenViking, sample_markdown_file: Path ) uri = result["root_uri"] new_uri = "viking://resources/moved/" + await release_all_locks() await client.mv(uri, new_uri) # Verify original location does not exist with pytest.raises(Exception): # noqa: B017 diff --git a/tests/client/test_import_export.py b/tests/client/test_import_export.py index e4dfe3a9..2aaac8f7 100644 --- a/tests/client/test_import_export.py +++ b/tests/client/test_import_export.py @@ -10,6 +10,7 @@ import pytest 
from openviking import AsyncOpenViking +from openviking.storage.transaction import release_all_locks class TestExportOvpack: @@ -99,6 +100,7 @@ async def test_import_export_roundtrip( await client.export_ovpack(original_uri, str(export_path)) # Delete original resource + await release_all_locks() await client.rm(original_uri, recursive=True) # Import diff --git a/tests/integration/test_full_workflow.py b/tests/integration/test_full_workflow.py index 823cefd7..b48385d7 100644 --- a/tests/integration/test_full_workflow.py +++ b/tests/integration/test_full_workflow.py @@ -10,6 +10,7 @@ from openviking import AsyncOpenViking from openviking.message import TextPart +from openviking.storage.transaction import release_all_locks @pytest_asyncio.fixture(scope="function") @@ -171,6 +172,7 @@ async def test_export_import_roundtrip( assert export_path.exists() # 4. Delete original resource + await release_all_locks() await client.rm(original_uri, recursive=True) # 5. Import From 08c36731f9e975b3ad581a9cad2048a6079981ba Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Wed, 18 Mar 2026 01:44:45 +0800 Subject: [PATCH 17/18] docs: update --- docs/en/concepts/09-transaction.md | 25 ++++++++++++++++++------- docs/zh/concepts/09-transaction.md | 25 ++++++++++++++++++------- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/docs/en/concepts/09-transaction.md b/docs/en/concepts/09-transaction.md index 0e95bdf1..edbda724 100644 --- a/docs/en/concepts/09-transaction.md +++ b/docs/en/concepts/09-transaction.md @@ -131,27 +131,37 @@ Operation flow: | Problem | Solution | |---------|----------| | File moved from temp to final directory, then crash -> file exists but never searchable | Two separate paths for first-time add vs incremental update | +| Resource already on disk but rm deletes it while semantic processing / vectorization is still running -> wasted work | Lifecycle SUBTREE lock held from finalization through processing completion | **First-time add** (target does not 
exist) — handled in `ResourceProcessor.process_resource` Phase 3.5: ``` -1. Acquire lock on parent_path of final_uri (lock_mode="point") +1. Acquire POINT lock on parent of final_uri 2. agfs.mv temp directory -> final location -3. Release lock -4. Clean up temp directory -5. Enqueue SemanticMsg -> DAG runs on final +3. Acquire SUBTREE lock on final_uri (inside POINT lock, eliminating race window) +4. Release POINT lock +5. Clean up temp directory +6. Enqueue SemanticMsg(lifecycle_lock_handle_id=...) -> DAG runs on final +7. DAG starts lock refresh loop (refreshes timestamp every lock_expire/2 seconds) +8. DAG complete + all embeddings done -> release SUBTREE lock ``` +During this period, `rm` attempting to acquire a SUBTREE lock on the same path will fail with `ResourceBusyError`. + **Incremental update** (target already exists) — temp stays in place: ``` -1. Enqueue SemanticMsg(uri=temp, target_uri=final) -> DAG runs on temp -2. DAG completion triggers sync_diff_callback or move_temp_to_target_callback -3. Each VikingFS.rm / VikingFS.mv inside callbacks acquires its own lock +1. Acquire SUBTREE lock on target_uri (protect existing resource) +2. Enqueue SemanticMsg(uri=temp, target_uri=final, lifecycle_lock_handle_id=...) +3. DAG runs on temp, lock refresh loop active +4. DAG completion triggers sync_diff_callback or move_temp_to_target_callback +5. Callback completes -> release SUBTREE lock ``` Note: DAG callbacks do NOT wrap operations in an outer lock. Each `VikingFS.rm` and `VikingFS.mv` has its own lock internally. An outer lock would conflict with these inner locks causing deadlock. +**Server restart recovery**: SemanticMsg is persisted in QueueFS. On restart, `SemanticProcessor` detects that the `lifecycle_lock_handle_id` handle is missing from the in-memory LockManager and re-acquires a SUBTREE lock. 
+ ### session.commit() | Problem | Solution | @@ -316,6 +326,7 @@ Timeout (default 0 = no-wait) raises LockAcquisitionError | Failure scenario | Defense | Recovery timing | |-----------------|--------|-----------------| | Crash during operation | Lock auto-expires + stale detection | Next acquisition of same path lock | +| Crash during add_resource semantic processing | Lifecycle lock expires + SemanticProcessor re-acquires on restart | Worker restart | | Crash during session.commit Phase 2 | RedoLog marker + redo | On restart | | Crash after enqueue, before worker | QueueFS SQLite persistence | Worker restart | | Orphan index | L2 on-demand load cleanup | When user accesses | diff --git a/docs/zh/concepts/09-transaction.md b/docs/zh/concepts/09-transaction.md index 81d27e02..45a10d63 100644 --- a/docs/zh/concepts/09-transaction.md +++ b/docs/zh/concepts/09-transaction.md @@ -130,27 +130,37 @@ VectorDB 删除失败 -> 直接抛异常,锁自动释放,文件和索引都 | 问题 | 方案 | |------|------| | 文件从临时目录移到正式目录后崩溃 -> 文件存在但永远搜不到 | 首次添加与增量更新分离为两条独立路径 | +| 资源已落盘但语义处理/向量化还在跑时被 rm 删除 -> 处理白跑 | 生命周期 SUBTREE 锁,从落盘持续到处理完成 | **首次添加**(target 不存在)— 在 `ResourceProcessor.process_resource` Phase 3.5 中处理: ``` -1. 获取锁,锁 final_uri 的父目录(lock_mode="point") +1. 获取 POINT 锁,锁 final_uri 的父目录 2. agfs.mv 临时目录 -> 正式位置 -3. 释放锁 -4. 清理临时目录 -5. 入队 SemanticMsg -> DAG 在 final 上跑 +3. 获取 SUBTREE 锁,锁 final_uri(在 POINT 锁内,消除竞态窗口) +4. 释放 POINT 锁 +5. 清理临时目录 +6. 入队 SemanticMsg(lifecycle_lock_handle_id=...) -> DAG 在 final 上跑 +7. DAG 启动锁刷新循环(每 lock_expire/2 秒刷新时间戳) +8. DAG 完成 + 所有 embedding 完成 -> 释放 SUBTREE 锁 ``` +此期间 `rm` 尝试获取同路径 SUBTREE 锁会失败,抛出 `ResourceBusyError`。 + **增量更新**(target 已存在)— temp 保持不动: ``` -1. 入队 SemanticMsg(uri=temp, target_uri=final) -> DAG 在 temp 上跑 -2. DAG 完成后触发 sync_diff_callback 或 move_temp_to_target_callback -3. callback 内的每个 VikingFS.rm / VikingFS.mv 各自独立加锁 +1. 获取 SUBTREE 锁,锁 target_uri(保护已有资源) +2. 入队 SemanticMsg(uri=temp, target_uri=final, lifecycle_lock_handle_id=...) +3. DAG 在 temp 上跑,启动锁刷新循环 +4. 
DAG 完成后触发 sync_diff_callback 或 move_temp_to_target_callback +5. callback 执行完毕 -> 释放 SUBTREE 锁 ``` 注意:DAG callback 不在外层加锁。每个 `VikingFS.rm` 和 `VikingFS.mv` 内部各自有独立锁保护。外层锁会与内部锁冲突导致死锁。 +**服务重启恢复**:SemanticMsg 持久化在 QueueFS 中。重启后 `SemanticProcessor` 发现 `lifecycle_lock_handle_id` 对应的 handle 不在内存中,会重新获取 SUBTREE 锁。 + ### session.commit() | 问题 | 方案 | @@ -315,6 +325,7 @@ async with LockContext(lock_manager, [src], lock_mode="mv", mv_dst_parent_path=d | 异常场景 | 防线 | 恢复时机 | |---------|------|---------| | 操作中途崩溃 | 锁自动过期 + stale 检测 | 下次获取同路径锁时 | +| add_resource 语义处理中途崩溃 | 生命周期锁过期 + SemanticProcessor 重启时重新获取 | worker 重启后 | | session.commit Phase 2 崩溃 | RedoLog 标记 + 重做 | 重启时 | | enqueue 后 worker 处理前崩溃 | QueueFS SQLite 持久化 | worker 重启后 | | 孤儿索引 | L2 按需加载时清理 | 用户访问时 | From cdd222b262378e39cfab0daa43f3e12e92ddfc5b Mon Sep 17 00:00:00 2001 From: qin-ctx Date: Wed, 18 Mar 2026 14:23:42 +0800 Subject: [PATCH 18/18] fix: tests --- openviking/server/app.py | 5 +++-- openviking/storage/transaction/lock_manager.py | 4 +++- tests/misc/test_vikingdb_observer.py | 16 +++++++++++----- .../test_hierarchical_retriever_rerank.py | 2 +- tests/server/conftest.py | 6 +++--- tests/server/test_api_search.py | 2 +- tests/session/test_memory_dedup_actions.py | 2 +- 7 files changed, 23 insertions(+), 14 deletions(-) diff --git a/openviking/server/app.py b/openviking/server/app.py index c22794e1..c553c6ff 100644 --- a/openviking/server/app.py +++ b/openviking/server/app.py @@ -59,7 +59,8 @@ def create_app( async def lifespan(app: FastAPI): """Application lifespan handler.""" nonlocal service - if service is None: + owns_service = service is None + if owns_service: service = OpenVikingService() await service.initialize() logger.info("OpenVikingService initialized") @@ -93,7 +94,7 @@ async def lifespan(app: FastAPI): # Cleanup task_tracker.stop_cleanup_loop() - if service: + if owns_service and service: await service.close() logger.info("OpenVikingService closed") diff --git 
a/openviking/storage/transaction/lock_manager.py b/openviking/storage/transaction/lock_manager.py index d20d2ef8..2fec7e42 100644 --- a/openviking/storage/transaction/lock_manager.py +++ b/openviking/storage/transaction/lock_manager.py @@ -52,9 +52,11 @@ async def stop(self) -> None: if self._cleanup_task: self._cleanup_task.cancel() try: - await self._cleanup_task + if self._cleanup_task.get_loop() is asyncio.get_running_loop(): + await self._cleanup_task except asyncio.CancelledError: pass + self._cleanup_task = None for handle in list(self._handles.values()): await self._path_lock.release(handle) self._handles.clear() diff --git a/tests/misc/test_vikingdb_observer.py b/tests/misc/test_vikingdb_observer.py index 310d01b6..3dc3cfaf 100644 --- a/tests/misc/test_vikingdb_observer.py +++ b/tests/misc/test_vikingdb_observer.py @@ -8,13 +8,16 @@ import asyncio import openviking as ov +from openviking.async_client import AsyncOpenViking async def test_vikingdb_observer(): """Test VikingDBObserver functionality""" print("=== Test VikingDBObserver ===") - # Create client + # Reset singleton to ensure clean state from previous tests + await AsyncOpenViking.reset() + client = ov.AsyncOpenViking(path="./test_data/test_vikingdb_observer") try: @@ -72,15 +75,17 @@ async def test_vikingdb_observer(): traceback.print_exc() finally: - # Close client - await client.close() + await AsyncOpenViking.reset() print("Client closed") -def test_sync_client(): +async def test_sync_client(): """Test sync client""" print("\n=== Test sync client ===") + # Reset singleton to ensure clean state from previous tests + await AsyncOpenViking.reset() + client = ov.OpenViking(path="./test_data/test_vikingdb_observer") try: @@ -109,6 +114,7 @@ def test_sync_client(): finally: client.close() + await AsyncOpenViking.reset() print("Sync client closed") @@ -117,4 +123,4 @@ def test_sync_client(): asyncio.run(test_vikingdb_observer()) # Run sync test - test_sync_client() + asyncio.run(test_sync_client()) 
diff --git a/tests/retrieve/test_hierarchical_retriever_rerank.py b/tests/retrieve/test_hierarchical_retriever_rerank.py index f72682b3..a7ead7bc 100644 --- a/tests/retrieve/test_hierarchical_retriever_rerank.py +++ b/tests/retrieve/test_hierarchical_retriever_rerank.py @@ -19,7 +19,7 @@ def __init__(self) -> None: class DummyEmbedder: - def embed(self, _query: str) -> DummyEmbedResult: + def embed(self, _query: str, is_query: bool = False) -> DummyEmbedResult: return DummyEmbedResult() diff --git a/tests/server/conftest.py b/tests/server/conftest.py index 98cf606f..3bc0e40f 100644 --- a/tests/server/conftest.py +++ b/tests/server/conftest.py @@ -56,11 +56,11 @@ class FakeEmbedder(DenseEmbedderBase): def __init__(self): super().__init__(model_name="test-fake-embedder") - def embed(self, text: str) -> EmbedResult: + def embed(self, text: str, is_query: bool = False) -> EmbedResult: return EmbedResult(dense_vector=[0.1] * dimension) - def embed_batch(self, texts: list[str]) -> list[EmbedResult]: - return [self.embed(text) for text in texts] + def embed_batch(self, texts: list[str], is_query: bool = False) -> list[EmbedResult]: + return [self.embed(text, is_query=is_query) for text in texts] def get_dimension(self) -> int: return dimension diff --git a/tests/server/test_api_search.py b/tests/server/test_api_search.py index 05d313fb..ce33773c 100644 --- a/tests/server/test_api_search.py +++ b/tests/server/test_api_search.py @@ -12,7 +12,7 @@ @pytest.fixture(autouse=True) def fake_query_embedder(service): class FakeEmbedder: - def embed(self, text: str) -> EmbedResult: + def embed(self, text: str, is_query: bool = False) -> EmbedResult: return EmbedResult(dense_vector=[0.1, 0.2, 0.3]) service.viking_fs.query_embedder = FakeEmbedder() diff --git a/tests/session/test_memory_dedup_actions.py b/tests/session/test_memory_dedup_actions.py index 52c445cc..ac273965 100644 --- a/tests/session/test_memory_dedup_actions.py +++ b/tests/session/test_memory_dedup_actions.py @@ -42,7 
+42,7 @@ def __init__(self, dense_vector): class _DummyEmbedder: - def embed(self, _text): + def embed(self, _text, is_query: bool = False): return _DummyEmbedResult([0.1, 0.2, 0.3])