diff --git a/dataclaw/cli.py b/dataclaw/cli.py index a7957c5..e304306 100644 --- a/dataclaw/cli.py +++ b/dataclaw/cli.py @@ -12,7 +12,7 @@ from .anonymizer import Anonymizer from .config import CONFIG_FILE, DataClawConfig, load_config, save_config -from .parser import CLAUDE_DIR, CODEX_DIR, CUSTOM_DIR, GEMINI_DIR, KIMI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions +from .parser import CLAUDE_DIR, CODEX_DIR, CURSOR_DB, CUSTOM_DIR, GEMINI_DIR, KIMI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions from .secrets import _has_mixed_char_types, _shannon_entropy, redact_session HF_TAG = "dataclaw" @@ -58,8 +58,8 @@ "Step 6/6: After explicit user approval, publish: dataclaw export --publish-attestation \"User explicitly approved publishing to Hugging Face.\"", ] -EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "custom", "gemini", "kimi", "opencode", "openclaw", "all", "both"} -SOURCE_CHOICES = ["auto", "claude", "codex", "custom", "gemini", "kimi", "opencode", "openclaw", "all"] +EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "cursor", "custom", "gemini", "kimi", "opencode", "openclaw", "all", "both"} +SOURCE_CHOICES = ["auto", "claude", "codex", "cursor", "custom", "gemini", "kimi", "opencode", "openclaw", "all"] def _mask_secret(s: str) -> str: @@ -91,9 +91,11 @@ def _source_label(source_filter: str) -> str: return "OpenClaw" if source_filter == "kimi": return "Kimi CLI" + if source_filter == "cursor": + return "Cursor" if source_filter == "custom": return "Custom" - return "Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, or Custom" + return "Claude Code, Codex, Cursor, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, or Custom" def _normalize_source_filter(source_filter: str) -> str: @@ -139,9 +141,11 @@ def _has_session_sources(source_filter: str = "auto") -> bool: return OPENCLAW_DIR.exists() if source_filter == "kimi": return KIMI_DIR.exists() + if source_filter == "cursor": + return CURSOR_DB.exists() if source_filter == "custom": return CUSTOM_DIR.exists() - return CLAUDE_DIR.exists() or CODEX_DIR.exists() or CUSTOM_DIR.exists() or GEMINI_DIR.exists() or KIMI_DIR.exists() or OPENCODE_DIR.exists() or OPENCLAW_DIR.exists() + return CLAUDE_DIR.exists() or CODEX_DIR.exists() or CURSOR_DB.exists() or CUSTOM_DIR.exists() or GEMINI_DIR.exists() or KIMI_DIR.exists() or OPENCODE_DIR.exists() or OPENCLAW_DIR.exists() def _filter_projects_by_source(projects: list[dict], source_filter: str) -> list[dict]: diff --git a/dataclaw/parser.py b/dataclaw/parser.py index 50cc39e..e38fee8 100644 --- a/dataclaw/parser.py +++ b/dataclaw/parser.py @@ -1,4 +1,4 @@ -"""Parse Claude Code, Codex, Gemini CLI, OpenCode, and OpenClaw session data into conversations.""" +"""Parse Claude Code, Codex, Cursor, Gemini CLI, OpenCode, and OpenClaw session data into conversations.""" import dataclasses import hashlib @@ -45,6 +45,10 @@ KIMI_CONFIG_PATH = KIMI_DIR / "kimi.json" UNKNOWN_KIMI_CWD = "" +from .parsers import cursor as _cursor_mod +CURSOR_SOURCE = _cursor_mod.CURSOR_SOURCE +CURSOR_DB = _cursor_mod.CURSOR_DB + CUSTOM_DIR = Path.home() / ".dataclaw" / "custom" _CODEX_PROJECT_INDEX: dict[str, list[Path]] = {} @@ -140,6 +144,7 @@ def discover_projects() -> list[dict]: projects.extend(_discover_opencode_projects()) projects.extend(_discover_openclaw_projects()) projects.extend(_discover_kimi_projects()) + projects.extend(_cursor_mod.discover_projects()) projects.extend(_discover_custom_projects()) return sorted(projects, key=lambda p: (p["display_name"], p["source"])) @@ -494,6 +499,32 @@ def parse_project_sessions( sessions.append(parsed) return sessions + if source == CURSOR_SOURCE: + index = _cursor_mod.get_project_index() + composer_ids = index.get(project_dir_name, []) + if not composer_ids: + return [] + sessions = [] + try: + with sqlite3.connect(f"file:{CURSOR_DB}?mode=ro", uri=True) as conn: + for cid in composer_ids: + parsed = _cursor_mod.parse_session( + cid, conn, anonymizer, include_thinking, + _make_stats=_make_stats, + _make_session_result=_make_session_result, + _parse_tool_input=_parse_tool_input, + _update_time_bounds=_update_time_bounds, + _normalize_timestamp=_normalize_timestamp, + _safe_int=_safe_int, + ) + if parsed and parsed["messages"]: + parsed["project"] = _cursor_mod.build_project_name(project_dir_name) + parsed["source"] = CURSOR_SOURCE + sessions.append(parsed) + except sqlite3.Error: + pass + return sessions + if source == CODEX_SOURCE: index = _get_codex_project_index() session_files = index.get(project_dir_name, []) @@ -1988,9 +2019,63 @@ def _parse_tool_input(tool_name: str | None, input_data: Any, anonymizer: Anonym "plan": [anonymizer.text(str(p)) if isinstance(p, str) else p for p in plan], } + # Cursor tools + if name in ("read_file", "read_file_v2"): + return {"file_path": anonymizer.path(input_data.get("targetFile", input_data.get("file_path", "")))} + if name in ("edit_file", "edit_file_v2"): + result: dict[str, Any] = {"file_path": anonymizer.path( + input_data.get("relativeWorkspacePath", input_data.get("targetFile", "")) + )} + content = input_data.get("streamingContent") or input_data.get("content") + if content: + result["content"] = anonymizer.text(str(content)) + return result + if name == "search_replace": + return { + "file_path": anonymizer.path(input_data.get("targetFile", input_data.get("file_path", ""))), + "old_string": anonymizer.text(input_data.get("old_string", "")), + "new_string": anonymizer.text(input_data.get("new_string", "")), + } + if name in ("run_terminal_cmd", "run_terminal_command_v2"): + cmd, _ = redact_text(input_data.get("command", "")) + result = {"command": anonymizer.text(cmd)} + cwd = input_data.get("cwd") + if cwd: + result["cwd"] = anonymizer.path(cwd) + return result + if name == "codebase_search": + result = {"query": anonymizer.text(input_data.get("query", ""))} + inc = input_data.get("includePattern") + if inc: + result["include"] = inc + return result + if name == "grep_search": + pi = input_data.get("patternInfo", {}) + pattern = pi.get("pattern", "") if isinstance(pi, dict) else str(pi) + return {"pattern": anonymizer.text(pattern)} + if name in ("list_dir", "list_dir_v2"): + return {"dir_path": anonymizer.path(input_data.get("targetDirectory", ""))} + if name == "ripgrep_raw_search": + return { + "query": anonymizer.text(input_data.get("query", "")), + "dir": anonymizer.path(input_data.get("rootDir", "")), + } + if name == "glob_file_search": + return { + "pattern": input_data.get("pattern", ""), + "dir": anonymizer.path(input_data.get("rootDir", "")), + } + if name == "semantic_search_full": + return {"query": anonymizer.text(input_data.get("query", ""))} + if name == "web_search": + return {"query": anonymizer.text(input_data.get("query", input_data.get("search_term", "")))} + if name == "web_fetch": + return {"url": anonymizer.text(input_data.get("url", ""))} + # Fallback: anonymize all string values return {k: anonymizer.text(str(v)) if isinstance(v, str) else v for k, v in input_data.items()} + def _normalize_timestamp(value) -> str | None: if value is None: return None diff --git a/dataclaw/parsers/__init__.py b/dataclaw/parsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataclaw/parsers/cursor.py b/dataclaw/parsers/cursor.py new file mode 100644 index 0000000..8bd1bbe --- /dev/null +++ b/dataclaw/parsers/cursor.py @@ -0,0 +1,333 @@ +import json +import platform +import sqlite3 +from pathlib import Path +from typing import Any + +from ..anonymizer import Anonymizer +from ..secrets import redact_text + +CURSOR_SOURCE = "cursor" +_SYS = platform.system() +if _SYS == "Darwin": + CURSOR_DB = Path.home() / "Library" / "Application Support" / "Cursor" / "User" / "globalStorage" / "state.vscdb" +elif _SYS == "Windows": + CURSOR_DB = Path.home() / "AppData" / "Roaming" / "Cursor" / "User" / "globalStorage" / "state.vscdb" +else: + CURSOR_DB = Path.home() / ".config" / "Cursor" / "User" / "globalStorage" / "state.vscdb" +UNKNOWN_CURSOR_CWD = "" + +_PROJECT_INDEX: dict[str, list[str]] = {} + + +def _try_parse_json(s: Any) -> Any: + if not isinstance(s, str): + return s + try: + return _try_parse_json(json.loads(s)) + except (json.JSONDecodeError, TypeError): + return s + + +def _strip_mcp_prefix(name: str) -> str: + if not name or not name.startswith("mcp"): + return name + if name.startswith("mcp_"): + parts = name.split("_", 2) + return parts[2] if len(parts) >= 3 else name + if name.startswith("mcp-"): + underscore_pos = name.find("_", 4) + if underscore_pos > 0: + dash_pos = name.rfind("-", 0, underscore_pos) + if dash_pos > 3: + return name[dash_pos + 1:] + rest = name[4:] + for length in range(1, len(rest) // 2 + 1): + server = rest[:length] + after = rest[length:] + for sep in ("-", "-user-"): + if after.startswith(sep + server + "-"): + return after[len(sep) + len(server) + 1:] + return name + + +def get_project_index(refresh: bool = False) -> dict[str, list[str]]: + global _PROJECT_INDEX + if refresh or not _PROJECT_INDEX: + _PROJECT_INDEX = _build_project_index() + return _PROJECT_INDEX + + +def _build_project_index() -> dict[str, list[str]]: + if not CURSOR_DB.exists(): + return {} + index: dict[str, list[str]] = {} + try: + with sqlite3.connect(f"file:{CURSOR_DB}?mode=ro", uri=True) as conn: + rows = conn.execute( + "SELECT key, value FROM cursorDiskKV WHERE key LIKE 'composerData:%'" + ).fetchall() + + cid_to_first_bid: dict[str, str] = {} + for key, value in rows: + cid = key.replace("composerData:", "") + try: + data = json.loads(value) if isinstance(value, (str, bytes)) else {} + except (json.JSONDecodeError, TypeError): + continue + headers = data.get("fullConversationHeadersOnly") or data.get("conversation", []) + if len(headers) < 2: + continue + bid = headers[0].get("bubbleId", "") + if bid: + cid_to_first_bid[cid] = bid + + if not cid_to_first_bid: + return index + + bubble_keys = [f"bubbleId:{cid}:{bid}" for cid, bid in cid_to_first_bid.items()] + conn.execute("CREATE TEMP TABLE _dc_keys(k TEXT)") + conn.executemany("INSERT INTO _dc_keys VALUES(?)", [(k,) for k in bubble_keys]) + bubble_rows = conn.execute( + "SELECT nk.k, kv.value FROM _dc_keys nk JOIN cursorDiskKV kv ON nk.k = kv.key" + ).fetchall() + conn.execute("DROP TABLE _dc_keys") + + found_cids: set[str] = set() + for key, val in bubble_rows: + parts = key.split(":") + cid = parts[1] if len(parts) >= 3 else "" + found_cids.add(cid) + try: + bubble = json.loads(val) if isinstance(val, (str, bytes)) else {} + except (json.JSONDecodeError, TypeError): + index.setdefault(UNKNOWN_CURSOR_CWD, []).append(cid) + continue + wuris = bubble.get("workspaceUris", []) + if wuris and isinstance(wuris, list) and wuris[0]: + uri = wuris[0] + if uri.startswith("file://"): + uri = uri[7:] + index.setdefault(uri, []).append(cid) + else: + index.setdefault(UNKNOWN_CURSOR_CWD, []).append(cid) + + for cid in cid_to_first_bid: + if cid not in found_cids: + index.setdefault(UNKNOWN_CURSOR_CWD, []).append(cid) + + except sqlite3.Error: + return {} + return index + + +def build_project_name(cwd: str) -> str: + if cwd == UNKNOWN_CURSOR_CWD: + return "cursor:unknown" + return f"cursor:{Path(cwd).name or cwd}" + + +def discover_projects() -> list[dict]: + index = get_project_index(refresh=True) + if not index: + return [] + db_size = CURSOR_DB.stat().st_size if CURSOR_DB.exists() else 0 + total_sessions = sum(len(cids) for cids in index.values()) + projects = [] + for cwd, cids in sorted(index.items()): + if not cids: + continue + estimated_size = int(db_size * (len(cids) / total_sessions)) if total_sessions else 0 + projects.append({ + "dir_name": cwd, + "display_name": build_project_name(cwd), + "session_count": len(cids), + "total_size_bytes": estimated_size, + "source": CURSOR_SOURCE, + }) + return projects + + +def parse_session( + composer_id: str, + conn: sqlite3.Connection, + anonymizer: Anonymizer, + include_thinking: bool, + _make_stats, + _make_session_result, + _parse_tool_input, + _update_time_bounds, + _normalize_timestamp, + _safe_int, +) -> dict | None: + row = conn.execute( + "SELECT value FROM cursorDiskKV WHERE key = ?", + (f"composerData:{composer_id}",), + ).fetchone() + if not row: + return None + + try: + composer = json.loads(row[0]) if isinstance(row[0], (str, bytes)) else {} + except (json.JSONDecodeError, TypeError): + return None + + headers = composer.get("fullConversationHeadersOnly") or [] + if not headers: + conv = composer.get("conversation", []) + headers = [{"bubbleId": b["bubbleId"], "type": b.get("type")} for b in conv if "bubbleId" in b] + + if not headers: + return None + + bubble_map: dict[str, dict] = {} + cursor = conn.execute( + "SELECT key, value FROM cursorDiskKV WHERE key LIKE ?", + (f"bubbleId:{composer_id}:%",), + ) + for key, val in cursor: + bid = key.split(":")[-1] + try: + bubble_map[bid] = json.loads(val) if isinstance(val, (str, bytes)) else {} + except (json.JSONDecodeError, TypeError): + pass + + metadata: dict[str, Any] = { + "session_id": composer_id, + "cwd": None, + "git_branch": None, + "model": None, + "start_time": None, + "end_time": None, + } + messages: list[dict[str, Any]] = [] + stats = _make_stats() + + for h in headers: + bubble = bubble_map.get(h.get("bubbleId", "")) + if not bubble: + continue + + timestamp = bubble.get("createdAt") + if isinstance(timestamp, (int, float)): + timestamp = _normalize_timestamp(timestamp) + + if metadata["cwd"] is None: + wuris = bubble.get("workspaceUris", []) + if wuris and isinstance(wuris, list) and wuris[0]: + uri = wuris[0] + if uri.startswith("file://"): + uri = uri[7:] + metadata["cwd"] = anonymizer.path(uri) + + model_info = bubble.get("modelInfo") + if isinstance(model_info, dict) and metadata["model"] is None: + model_name = model_info.get("modelName") + if isinstance(model_name, str) and model_name.strip(): + metadata["model"] = model_name + + bubble_type = bubble.get("type") + + if bubble_type == 1: + text = (bubble.get("text") or "").strip() + if not text: + continue + redacted, _ = redact_text(text) + messages.append({ + "role": "user", + "content": anonymizer.text(redacted), + "timestamp": timestamp, + }) + stats["user_messages"] += 1 + _update_time_bounds(metadata, timestamp) + + elif bubble_type == 2: + tfd = bubble.get("toolFormerData") + tool_name_raw = tfd.get("name", "") if isinstance(tfd, dict) else "" + + if tool_name_raw: + tool_name = _strip_mcp_prefix(tool_name_raw) + params_raw = _try_parse_json(tfd.get("params")) + if isinstance(params_raw, dict) and "tools" in params_raw: + tools = params_raw["tools"] + if isinstance(tools, list) and len(tools) == 1: + inner = _try_parse_json(tools[0].get("parameters", "{}")) + if isinstance(inner, dict): + params_raw = inner + + tool_input = _parse_tool_input(tool_name, params_raw if isinstance(params_raw, dict) else {}, anonymizer) + + result_raw = _try_parse_json(tfd.get("result")) + tool_output: dict[str, Any] = {} + if isinstance(result_raw, str) and result_raw.strip(): + redacted_out, _ = redact_text(result_raw) + tool_output = {"text": anonymizer.text(redacted_out)} + elif isinstance(result_raw, dict): + tool_output = {k: anonymizer.text(str(v)) if isinstance(v, str) else v for k, v in result_raw.items()} + elif result_raw is not None: + tool_output = {"text": anonymizer.text(str(result_raw))} + + status_val = tfd.get("status", "unknown") + if isinstance(status_val, dict): + status_val = status_val.get("status", "unknown") + + tool_entry: dict[str, Any] = { + "tool": tool_name, + "input": tool_input, + } + if tool_output: + tool_entry["output"] = tool_output + if isinstance(status_val, str): + tool_entry["status"] = status_val + + msg: dict[str, Any] = { + "role": "assistant", + "tool_uses": [tool_entry], + "timestamp": timestamp, + } + + thinking = bubble.get("thinking") + if include_thinking and isinstance(thinking, dict): + think_text = (thinking.get("text") or "").strip() + if think_text: + msg["thinking"] = anonymizer.text(think_text) + + text = (bubble.get("text") or "").strip() + if text: + redacted, _ = redact_text(text) + msg["content"] = anonymizer.text(redacted) + + messages.append(msg) + stats["assistant_messages"] += 1 + stats["tool_uses"] += 1 + _update_time_bounds(metadata, timestamp) + else: + text = (bubble.get("text") or "").strip() + thinking = bubble.get("thinking") + think_text = "" + if include_thinking and isinstance(thinking, dict): + think_text = (thinking.get("text") or "").strip() + + if not text and not think_text: + continue + + msg = {"role": "assistant", "timestamp": timestamp} + if text: + redacted, _ = redact_text(text) + msg["content"] = anonymizer.text(redacted) + if think_text: + msg["thinking"] = anonymizer.text(think_text) + + messages.append(msg) + stats["assistant_messages"] += 1 + _update_time_bounds(metadata, timestamp) + + tc = bubble.get("tokenCount") + if isinstance(tc, dict): + stats["input_tokens"] += _safe_int(tc.get("inputTokens")) + stats["output_tokens"] += _safe_int(tc.get("outputTokens")) + + if metadata["model"] is None: + metadata["model"] = "cursor-unknown" + + return _make_session_result(metadata, messages, stats) diff --git a/pyproject.toml b/pyproject.toml index 2fa3716..cde0412 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "dataclaw" version = "0.3.2" description = "Export your coding agent conversations to Hugging Face as structured data" requires-python = ">=3.10" -license = "MIT" +license = {text = "MIT"} readme = "README.md" authors = [ {name = "Banodoco"}, diff --git a/tests/test_cli.py b/tests/test_cli.py index f8212f7..d84f0a0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -400,7 +400,7 @@ def test_no_projects(self, monkeypatch, capsys): monkeypatch.setattr("dataclaw.cli.discover_projects", lambda: []) list_projects() captured = capsys.readouterr() - assert "No Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, or Custom sessions" in captured.out + assert "No Claude Code, Codex, Cursor, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, or Custom sessions" in captured.out def test_source_filter_codex(self, monkeypatch, capsys): monkeypatch.setattr( @@ -626,7 +626,7 @@ def test_export_requires_explicit_source_selection(self, monkeypatch, capsys): assert payload["error"] == "Source scope is not confirmed yet." assert payload["blocked_on_step"] == "Step 2/6" assert len(payload["process_steps"]) == 6 - assert payload["allowed_sources"] == ["all", "both", "claude", "codex", "custom", "gemini", "kimi", "openclaw", "opencode"] + assert payload["allowed_sources"] == ["all", "both", "claude", "codex", "cursor", "custom", "gemini", "kimi", "openclaw", "opencode"] assert payload["next_command"] == "dataclaw config --source all" def test_configure_next_steps_require_full_folder_presentation(self): diff --git a/tests/test_parser.py b/tests/test_parser.py index e5ffdad..676227a 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -439,6 +439,8 @@ def _disable_codex(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom") + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", tmp_path / "no-cursor.vscdb") + monkeypatch.setattr("dataclaw.parsers.cursor._PROJECT_INDEX", {}) def _write_opencode_db(self, db_path): conn = sqlite3.connect(db_path) @@ -991,6 +993,8 @@ def _disable_codex(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom") + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", tmp_path / "no-cursor.vscdb") + monkeypatch.setattr("dataclaw.parsers.cursor._PROJECT_INDEX", {}) def test_discover_includes_subagent_sessions(self, tmp_path, monkeypatch, mock_anonymizer): self._disable_codex(tmp_path, monkeypatch) @@ -1591,6 +1595,8 @@ def _disable_others(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom") + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", tmp_path / "no-cursor.vscdb") + monkeypatch.setattr("dataclaw.parsers.cursor._PROJECT_INDEX", {}) def test_discover_openclaw_projects(self, tmp_path, monkeypatch, mock_anonymizer): self._disable_others(tmp_path, monkeypatch) @@ -1677,6 +1683,8 @@ def _disable_others(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents") monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", tmp_path / "no-cursor.vscdb") + monkeypatch.setattr("dataclaw.parsers.cursor._PROJECT_INDEX", {}) def _make_valid_session(self, session_id="s1", model="gpt-4", content="hello"): return json.dumps({ @@ -1778,3 +1786,295 @@ def test_parse_nonexistent_project(self, tmp_path, monkeypatch, mock_anonymizer) monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", custom_dir) sessions = parse_project_sessions("nope", mock_anonymizer, source="custom") assert sessions == [] + + +def _write_cursor_db(db_path): + conn = sqlite3.connect(db_path) + conn.execute("CREATE TABLE cursorDiskKV(key TEXT PRIMARY KEY, value TEXT)") + conn.commit() + return conn + + +def _insert_cursor_conversation(conn, composer_id, bubbles): + headers = [{"bubbleId": b["id"], "type": b["type"]} for b in bubbles] + conn.execute( + "INSERT INTO cursorDiskKV VALUES(?, ?)", + (f"composerData:{composer_id}", json.dumps({"fullConversationHeadersOnly": headers})), + ) + for b in bubbles: + data = dict(b) + data.pop("id") + conn.execute( + "INSERT INTO cursorDiskKV VALUES(?, ?)", + (f"bubbleId:{composer_id}:{b['id']}", json.dumps(data)), + ) + + +class TestCursorDiscoverProjects: + def _disable_others(self, tmp_path, monkeypatch): + monkeypatch.setattr("dataclaw.parser.PROJECTS_DIR", tmp_path / "no-claude") + monkeypatch.setattr("dataclaw.parser.CODEX_SESSIONS_DIR", tmp_path / "no-codex") + monkeypatch.setattr("dataclaw.parser.CODEX_ARCHIVED_DIR", tmp_path / "no-codex-archived") + monkeypatch.setattr("dataclaw.parser._CODEX_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.GEMINI_DIR", tmp_path / "no-gemini") + monkeypatch.setattr("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db") + monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw") + monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi") + monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom") + monkeypatch.setattr("dataclaw.parsers.cursor._PROJECT_INDEX", {}) + + def test_discover_cursor_projects(self, tmp_path, monkeypatch): + self._disable_others(tmp_path, monkeypatch) + db_path = tmp_path / "state.vscdb" + conn = _write_cursor_db(db_path) + _insert_cursor_conversation(conn, "conv-1", [ + {"id": "b1", "type": 1, "text": "Hello", "createdAt": 1706000000000, + "workspaceUris": ["file:///Users/testuser/work/repo"]}, + {"id": "b2", "type": 2, "text": "Hi there!", "createdAt": 1706000001000}, + ]) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + projects = discover_projects() + assert len(projects) == 1 + assert projects[0]["source"] == "cursor" + assert projects[0]["display_name"] == "cursor:repo" + assert projects[0]["session_count"] == 1 + + def test_discover_groups_by_workspace(self, tmp_path, monkeypatch): + self._disable_others(tmp_path, monkeypatch) + db_path = tmp_path / "state.vscdb" + conn = _write_cursor_db(db_path) + for cid, uri in [("c1", "file:///Users/alice/proj-a"), ("c2", "file:///Users/alice/proj-a"), ("c3", "file:///Users/alice/proj-b")]: + _insert_cursor_conversation(conn, cid, [ + {"id": "b1", "type": 1, "text": "msg", "createdAt": 1706000000000, "workspaceUris": [uri]}, + {"id": "b2", "type": 2, "text": "reply", "createdAt": 1706000001000}, + ]) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + projects = discover_projects() + assert len(projects) == 2 + names = {p["display_name"] for p in projects} + assert names == {"cursor:proj-a", "cursor:proj-b"} + counts = {p["display_name"]: p["session_count"] for p in projects} + assert counts["cursor:proj-a"] == 2 + assert counts["cursor:proj-b"] == 1 + + def test_discover_no_db(self, tmp_path, monkeypatch): + self._disable_others(tmp_path, monkeypatch) + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", tmp_path / "nonexistent.vscdb") + projects = discover_projects() + assert projects == [] + + def test_discover_skips_single_bubble_conversations(self, tmp_path, monkeypatch): + self._disable_others(tmp_path, monkeypatch) + db_path = tmp_path / "state.vscdb" + conn = _write_cursor_db(db_path) + conn.execute( + "INSERT INTO cursorDiskKV VALUES(?, ?)", + ("composerData:lonely", json.dumps({"fullConversationHeadersOnly": [{"bubbleId": "b1", "type": 1}]})), + ) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + projects = discover_projects() + assert projects == [] + + +class TestCursorParseSessions: + def _disable_others(self, tmp_path, monkeypatch): + monkeypatch.setattr("dataclaw.parser.PROJECTS_DIR", tmp_path / "no-claude") + monkeypatch.setattr("dataclaw.parser.CODEX_SESSIONS_DIR", tmp_path / "no-codex") + monkeypatch.setattr("dataclaw.parser.CODEX_ARCHIVED_DIR", tmp_path / "no-codex-archived") + monkeypatch.setattr("dataclaw.parser._CODEX_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.GEMINI_DIR", tmp_path / "no-gemini") + monkeypatch.setattr("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db") + monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw") + monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi") + monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom") + monkeypatch.setattr("dataclaw.parsers.cursor._PROJECT_INDEX", {}) + + def test_basic_conversation(self, tmp_path, monkeypatch, mock_anonymizer): + self._disable_others(tmp_path, monkeypatch) + cwd = "/Users/testuser/work/myapp" + db_path = tmp_path / "state.vscdb" + conn = _write_cursor_db(db_path) + _insert_cursor_conversation(conn, "conv-1", [ + {"id": "b1", "type": 1, "text": "Fix the bug", "createdAt": 1706000000000, + "workspaceUris": [f"file://{cwd}"]}, + {"id": "b2", "type": 2, "text": "I'll fix it now.", "createdAt": 1706000001000, + "modelInfo": {"modelName": "claude-sonnet-4-20250514"}, + "tokenCount": {"inputTokens": 100, "outputTokens": 30}}, + ]) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + monkeypatch.setattr("dataclaw.parser.CURSOR_DB", db_path) + sessions = parse_project_sessions(cwd, mock_anonymizer, source="cursor") + assert len(sessions) == 1 + s = sessions[0] + assert s["session_id"] == "conv-1" + assert s["source"] == "cursor" + assert s["project"] == "cursor:myapp" + assert s["model"] == "claude-sonnet-4-20250514" + assert len(s["messages"]) == 2 + assert s["messages"][0]["role"] == "user" + assert "Fix the bug" in s["messages"][0]["content"] + assert s["messages"][1]["role"] == "assistant" + assert "fix it" in s["messages"][1]["content"] + assert s["stats"]["user_messages"] == 1 + assert s["stats"]["assistant_messages"] == 1 + assert s["stats"]["input_tokens"] == 100 + assert s["stats"]["output_tokens"] == 30 + + def test_tool_call(self, tmp_path, monkeypatch, mock_anonymizer): + self._disable_others(tmp_path, monkeypatch) + cwd = "/Users/testuser/work/myapp" + db_path = tmp_path / "state.vscdb" + conn = _write_cursor_db(db_path) + _insert_cursor_conversation(conn, "conv-2", [ + {"id": "b1", "type": 1, "text": "Read the file", "createdAt": 1706000000000, + "workspaceUris": [f"file://{cwd}"]}, + {"id": "b2", "type": 2, "text": "", "createdAt": 1706000001000, + "toolFormerData": { + "name": "Read", + "params": json.dumps({"file_path": "/tmp/test.py"}), + "result": json.dumps("print('hello')"), + "status": "completed", + }, + "modelInfo": {"modelName": "claude-sonnet-4-20250514"}, + "tokenCount": {"inputTokens": 50, "outputTokens": 10}}, + ]) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + monkeypatch.setattr("dataclaw.parser.CURSOR_DB", db_path) + sessions = parse_project_sessions(cwd, mock_anonymizer, source="cursor") + assert len(sessions) == 1 + s = sessions[0] + assert len(s["messages"]) == 2 + tool_msg = s["messages"][1] + assert tool_msg["role"] == "assistant" + assert len(tool_msg["tool_uses"]) == 1 + tu = tool_msg["tool_uses"][0] + assert tu["tool"] == "Read" + assert tu["status"] == "completed" + assert "hello" in tu["output"]["text"] + assert s["stats"]["tool_uses"] == 1 + + def test_mcp_tool_prefix_stripped(self, tmp_path, monkeypatch, mock_anonymizer): + self._disable_others(tmp_path, monkeypatch) + cwd = "/Users/testuser/work/myapp" + db_path = tmp_path / "state.vscdb" + conn = _write_cursor_db(db_path) + _insert_cursor_conversation(conn, "conv-3", [ + {"id": "b1", "type": 1, "text": "search", "createdAt": 1706000000000, + "workspaceUris": [f"file://{cwd}"]}, + {"id": "b2", "type": 2, "text": "", "createdAt": 1706000001000, + "toolFormerData": { + "name": "mcp_server_toolname", + "params": "{}", + "result": json.dumps("ok"), + "status": "completed", + }}, + ]) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + monkeypatch.setattr("dataclaw.parser.CURSOR_DB", db_path) + sessions = parse_project_sessions(cwd, mock_anonymizer, source="cursor") + tu = sessions[0]["messages"][1]["tool_uses"][0] + assert tu["tool"] == "toolname" + + def test_thinking_included(self, tmp_path, monkeypatch, mock_anonymizer): + self._disable_others(tmp_path, monkeypatch) + cwd = "/Users/testuser/work/myapp" + db_path = tmp_path / "state.vscdb" + conn = _write_cursor_db(db_path) + _insert_cursor_conversation(conn, "conv-4", [ + {"id": "b1", "type": 1, "text": "Explain X", "createdAt": 1706000000000, + "workspaceUris": [f"file://{cwd}"]}, + {"id": "b2", "type": 2, "text": "Here's the answer.", "createdAt": 1706000001000, + "thinking": {"text": "Let me reason about X..."}, + "modelInfo": {"modelName": "claude-sonnet-4-20250514"}}, + ]) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + monkeypatch.setattr("dataclaw.parser.CURSOR_DB", db_path) + sessions = parse_project_sessions(cwd, mock_anonymizer, source="cursor", include_thinking=True) + msg = sessions[0]["messages"][1] + assert "thinking" in msg + assert "reason about X" in msg["thinking"] + + sessions_no = parse_project_sessions(cwd, mock_anonymizer, source="cursor", include_thinking=False) + msg_no = sessions_no[0]["messages"][1] + assert "thinking" not in msg_no + + def test_unknown_workspace_grouped(self, tmp_path, monkeypatch, mock_anonymizer): + self._disable_others(tmp_path, monkeypatch) + db_path = tmp_path / "state.vscdb" + conn = _write_cursor_db(db_path) + _insert_cursor_conversation(conn, "conv-5", [ + {"id": "b1", "type": 1, "text": "Hello", "createdAt": 1706000000000}, + {"id": "b2", "type": 2, "text": "Hi", "createdAt": 1706000001000}, + ]) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + monkeypatch.setattr("dataclaw.parser.CURSOR_DB", db_path) + sessions = parse_project_sessions("", mock_anonymizer, source="cursor") + assert len(sessions) == 1 + assert sessions[0]["project"] == "cursor:unknown" + + def test_parse_nonexistent_project(self, tmp_path, monkeypatch, mock_anonymizer): + self._disable_others(tmp_path, monkeypatch) + db_path = tmp_path / "state.vscdb" + conn = _write_cursor_db(db_path) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + monkeypatch.setattr("dataclaw.parser.CURSOR_DB", db_path) + sessions = parse_project_sessions("/no/such/path", mock_anonymizer, source="cursor") + assert sessions == [] + + def test_nested_json_params_unwrapped(self, tmp_path, monkeypatch, mock_anonymizer): + self._disable_others(tmp_path, monkeypatch) + cwd = "/Users/testuser/work/myapp" + db_path = tmp_path / "state.vscdb" + conn = _write_cursor_db(db_path) + inner_params = {"file_path": "/tmp/foo.py"} + wrapped_params = {"tools": [{"parameters": json.dumps(inner_params)}]} + _insert_cursor_conversation(conn, "conv-6", [ + {"id": "b1", "type": 1, "text": "read", "createdAt": 1706000000000, + "workspaceUris": [f"file://{cwd}"]}, + {"id": "b2", "type": 2, "text": "", "createdAt": 1706000001000, + "toolFormerData": { + "name": "Read", + "params": json.dumps(wrapped_params), + "result": json.dumps("contents"), + "status": "completed", + }}, + ]) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + monkeypatch.setattr("dataclaw.parser.CURSOR_DB", db_path) + sessions = parse_project_sessions(cwd, mock_anonymizer, source="cursor") + tu = sessions[0]["messages"][1]["tool_uses"][0] + assert "file_path" in tu["input"]