Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions dataclaw/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from .anonymizer import Anonymizer
from .config import CONFIG_FILE, DataClawConfig, load_config, save_config
from .parser import CLAUDE_DIR, CODEX_DIR, CUSTOM_DIR, GEMINI_DIR, KIMI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions
from .parser import CLAUDE_DIR, CODEX_DIR, CURSOR_DB, CUSTOM_DIR, GEMINI_DIR, KIMI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions
from .secrets import _has_mixed_char_types, _shannon_entropy, redact_session

HF_TAG = "dataclaw"
Expand Down Expand Up @@ -58,8 +58,8 @@
"Step 6/6: After explicit user approval, publish: dataclaw export --publish-attestation \"User explicitly approved publishing to Hugging Face.\"",
]

EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "custom", "gemini", "kimi", "opencode", "openclaw", "all", "both"}
SOURCE_CHOICES = ["auto", "claude", "codex", "custom", "gemini", "kimi", "opencode", "openclaw", "all"]
EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "cursor", "custom", "gemini", "kimi", "opencode", "openclaw", "all", "both"}
SOURCE_CHOICES = ["auto", "claude", "codex", "cursor", "custom", "gemini", "kimi", "opencode", "openclaw", "all"]


def _mask_secret(s: str) -> str:
Expand Down Expand Up @@ -91,9 +91,11 @@ def _source_label(source_filter: str) -> str:
return "OpenClaw"
if source_filter == "kimi":
return "Kimi CLI"
if source_filter == "cursor":
return "Cursor"
if source_filter == "custom":
return "Custom"
return "Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, or Custom"
return "Claude Code, Codex, Cursor, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, or Custom"


def _normalize_source_filter(source_filter: str) -> str:
Expand Down Expand Up @@ -139,9 +141,11 @@ def _has_session_sources(source_filter: str = "auto") -> bool:
return OPENCLAW_DIR.exists()
if source_filter == "kimi":
return KIMI_DIR.exists()
if source_filter == "cursor":
return CURSOR_DB.exists()
if source_filter == "custom":
return CUSTOM_DIR.exists()
return CLAUDE_DIR.exists() or CODEX_DIR.exists() or CUSTOM_DIR.exists() or GEMINI_DIR.exists() or KIMI_DIR.exists() or OPENCODE_DIR.exists() or OPENCLAW_DIR.exists()
return CLAUDE_DIR.exists() or CODEX_DIR.exists() or CURSOR_DB.exists() or CUSTOM_DIR.exists() or GEMINI_DIR.exists() or KIMI_DIR.exists() or OPENCODE_DIR.exists() or OPENCLAW_DIR.exists()


def _filter_projects_by_source(projects: list[dict], source_filter: str) -> list[dict]:
Expand Down
87 changes: 86 additions & 1 deletion dataclaw/parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Parse Claude Code, Codex, Gemini CLI, OpenCode, and OpenClaw session data into conversations."""
"""Parse Claude Code, Codex, Cursor, Gemini CLI, Kimi CLI, OpenCode, OpenClaw, and custom session data into conversations."""

import dataclasses
import hashlib
Expand Down Expand Up @@ -45,6 +45,10 @@
KIMI_CONFIG_PATH = KIMI_DIR / "kimi.json"
UNKNOWN_KIMI_CWD = "<unknown-cwd>"

from .parsers import cursor as _cursor_mod
CURSOR_SOURCE = _cursor_mod.CURSOR_SOURCE
CURSOR_DB = _cursor_mod.CURSOR_DB

CUSTOM_DIR = Path.home() / ".dataclaw" / "custom"

_CODEX_PROJECT_INDEX: dict[str, list[Path]] = {}
Expand Down Expand Up @@ -140,6 +144,7 @@ def discover_projects() -> list[dict]:
projects.extend(_discover_opencode_projects())
projects.extend(_discover_openclaw_projects())
projects.extend(_discover_kimi_projects())
projects.extend(_cursor_mod.discover_projects())
projects.extend(_discover_custom_projects())
return sorted(projects, key=lambda p: (p["display_name"], p["source"]))

Expand Down Expand Up @@ -494,6 +499,32 @@ def parse_project_sessions(
sessions.append(parsed)
return sessions

if source == CURSOR_SOURCE:
index = _cursor_mod.get_project_index()
composer_ids = index.get(project_dir_name, [])
if not composer_ids:
return []
sessions = []
try:
with sqlite3.connect(f"file:{CURSOR_DB}?mode=ro", uri=True) as conn:
for cid in composer_ids:
parsed = _cursor_mod.parse_session(
cid, conn, anonymizer, include_thinking,
_make_stats=_make_stats,
_make_session_result=_make_session_result,
_parse_tool_input=_parse_tool_input,
_update_time_bounds=_update_time_bounds,
_normalize_timestamp=_normalize_timestamp,
_safe_int=_safe_int,
)
if parsed and parsed["messages"]:
parsed["project"] = _cursor_mod.build_project_name(project_dir_name)
parsed["source"] = CURSOR_SOURCE
sessions.append(parsed)
except sqlite3.Error:
pass
return sessions

if source == CODEX_SOURCE:
index = _get_codex_project_index()
session_files = index.get(project_dir_name, [])
Expand Down Expand Up @@ -1988,9 +2019,63 @@ def _parse_tool_input(tool_name: str | None, input_data: Any, anonymizer: Anonym
"plan": [anonymizer.text(str(p)) if isinstance(p, str) else p for p in plan],
}

# Cursor tools
if name in ("read_file", "read_file_v2"):
return {"file_path": anonymizer.path(input_data.get("targetFile", input_data.get("file_path", "")))}
if name in ("edit_file", "edit_file_v2"):
result: dict[str, Any] = {"file_path": anonymizer.path(
input_data.get("relativeWorkspacePath", input_data.get("targetFile", ""))
)}
content = input_data.get("streamingContent") or input_data.get("content")
if content:
result["content"] = anonymizer.text(str(content))
return result
if name == "search_replace":
return {
"file_path": anonymizer.path(input_data.get("targetFile", input_data.get("file_path", ""))),
"old_string": anonymizer.text(input_data.get("old_string", "")),
"new_string": anonymizer.text(input_data.get("new_string", "")),
}
if name in ("run_terminal_cmd", "run_terminal_command_v2"):
cmd, _ = redact_text(input_data.get("command", ""))
result = {"command": anonymizer.text(cmd)}
cwd = input_data.get("cwd")
if cwd:
result["cwd"] = anonymizer.path(cwd)
return result
if name == "codebase_search":
result = {"query": anonymizer.text(input_data.get("query", ""))}
inc = input_data.get("includePattern")
if inc:
result["include"] = inc
return result
if name == "grep_search":
pi = input_data.get("patternInfo", {})
pattern = pi.get("pattern", "") if isinstance(pi, dict) else str(pi)
return {"pattern": anonymizer.text(pattern)}
if name in ("list_dir", "list_dir_v2"):
return {"dir_path": anonymizer.path(input_data.get("targetDirectory", ""))}
if name == "ripgrep_raw_search":
return {
"query": anonymizer.text(input_data.get("query", "")),
"dir": anonymizer.path(input_data.get("rootDir", "")),
}
if name == "glob_file_search":
return {
"pattern": input_data.get("pattern", ""),
"dir": anonymizer.path(input_data.get("rootDir", "")),
}
if name == "semantic_search_full":
return {"query": anonymizer.text(input_data.get("query", ""))}
if name == "web_search":
return {"query": anonymizer.text(input_data.get("query", input_data.get("search_term", "")))}
if name == "web_fetch":
return {"url": anonymizer.text(input_data.get("url", ""))}

# Fallback: anonymize top-level string values only; non-string values (including nested dicts/lists) pass through unchanged
return {k: anonymizer.text(str(v)) if isinstance(v, str) else v for k, v in input_data.items()}


def _normalize_timestamp(value) -> str | None:
if value is None:
return None
Expand Down
Empty file added dataclaw/parsers/__init__.py
Empty file.
Loading