Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions dataclaw/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from .anonymizer import Anonymizer
from .config import CONFIG_FILE, DataClawConfig, load_config, save_config
from .parser import CLAUDE_DIR, CODEX_DIR, CUSTOM_DIR, GEMINI_DIR, KIMI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions
from .parser import CLAUDE_DIR, CODEX_DIR, CURSOR_DB, CUSTOM_DIR, GEMINI_DIR, KIMI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions
from .secrets import _has_mixed_char_types, _shannon_entropy, redact_session

HF_TAG = "dataclaw"
Expand Down Expand Up @@ -58,8 +58,8 @@
"Step 6/6: After explicit user approval, publish: dataclaw export --publish-attestation \"User explicitly approved publishing to Hugging Face.\"",
]

EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "custom", "gemini", "kimi", "opencode", "openclaw", "all", "both"}
SOURCE_CHOICES = ["auto", "claude", "codex", "custom", "gemini", "kimi", "opencode", "openclaw", "all"]
EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "cursor", "custom", "gemini", "kimi", "opencode", "openclaw", "all", "both"}
SOURCE_CHOICES = ["auto", "claude", "codex", "cursor", "custom", "gemini", "kimi", "opencode", "openclaw", "all"]


def _mask_secret(s: str) -> str:
Expand Down Expand Up @@ -91,9 +91,11 @@ def _source_label(source_filter: str) -> str:
return "OpenClaw"
if source_filter == "kimi":
return "Kimi CLI"
if source_filter == "cursor":
return "Cursor"
if source_filter == "custom":
return "Custom"
return "Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, or Custom"
return "Claude Code, Codex, Cursor, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, or Custom"


def _normalize_source_filter(source_filter: str) -> str:
Expand Down Expand Up @@ -139,9 +141,11 @@ def _has_session_sources(source_filter: str = "auto") -> bool:
return OPENCLAW_DIR.exists()
if source_filter == "kimi":
return KIMI_DIR.exists()
if source_filter == "cursor":
return CURSOR_DB.exists()
if source_filter == "custom":
return CUSTOM_DIR.exists()
return CLAUDE_DIR.exists() or CODEX_DIR.exists() or CUSTOM_DIR.exists() or GEMINI_DIR.exists() or KIMI_DIR.exists() or OPENCODE_DIR.exists() or OPENCLAW_DIR.exists()
return CLAUDE_DIR.exists() or CODEX_DIR.exists() or CURSOR_DB.exists() or CUSTOM_DIR.exists() or GEMINI_DIR.exists() or KIMI_DIR.exists() or OPENCODE_DIR.exists() or OPENCLAW_DIR.exists()


def _filter_projects_by_source(projects: list[dict], source_filter: str) -> list[dict]:
Expand Down
121 changes: 65 additions & 56 deletions dataclaw/parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Parse Claude Code, Codex, Gemini CLI, OpenCode, and OpenClaw session data into conversations."""
"""Parse Claude Code, Codex, Cursor, Gemini CLI, OpenCode, and OpenClaw session data into conversations."""

import dataclasses
import hashlib
Expand Down Expand Up @@ -45,6 +45,10 @@
KIMI_CONFIG_PATH = KIMI_DIR / "kimi.json"
UNKNOWN_KIMI_CWD = "<unknown-cwd>"

from .parsers import cursor as _cursor_mod
CURSOR_SOURCE = _cursor_mod.CURSOR_SOURCE
CURSOR_DB = _cursor_mod.CURSOR_DB

CUSTOM_DIR = Path.home() / ".dataclaw" / "custom"

_CODEX_PROJECT_INDEX: dict[str, list[Path]] = {}
Expand Down Expand Up @@ -140,6 +144,7 @@ def discover_projects() -> list[dict]:
projects.extend(_discover_opencode_projects())
projects.extend(_discover_openclaw_projects())
projects.extend(_discover_kimi_projects())
projects.extend(_cursor_mod.discover_projects())
projects.extend(_discover_custom_projects())
return sorted(projects, key=lambda p: (p["display_name"], p["source"]))

Expand Down Expand Up @@ -494,6 +499,32 @@ def parse_project_sessions(
sessions.append(parsed)
return sessions

if source == CURSOR_SOURCE:
index = _cursor_mod.get_project_index()
composer_ids = index.get(project_dir_name, [])
if not composer_ids:
return []
sessions = []
try:
with sqlite3.connect(f"file:{CURSOR_DB}?mode=ro", uri=True) as conn:
for cid in composer_ids:
parsed = _cursor_mod.parse_session(
cid, conn, anonymizer, include_thinking,
_make_stats=_make_stats,
_make_session_result=_make_session_result,
_parse_tool_input=_parse_tool_input,
_update_time_bounds=_update_time_bounds,
_normalize_timestamp=_normalize_timestamp,
_safe_int=_safe_int,
)
if parsed and parsed["messages"]:
parsed["project"] = _cursor_mod.build_project_name(project_dir_name)
parsed["source"] = CURSOR_SOURCE
sessions.append(parsed)
except sqlite3.Error:
pass
return sessions

if source == CODEX_SOURCE:
index = _get_codex_project_index()
session_files = index.get(project_dir_name, [])
Expand Down Expand Up @@ -1930,66 +1961,44 @@ def _extract_assistant_content(
return msg


# Keys whose string values are filesystem locations; they are passed to
# ``anonymizer.path``.
_PATH_KEYS = frozenset({
    "file_path", "path", "dir", "dir_path", "cwd", "workdir",
    "targetFile", "targetDirectory", "relativeWorkspacePath", "rootDir",
})
# Keys whose string values are shell commands; they are run through
# ``redact_text`` before ``anonymizer.text``.
_CMD_KEYS = frozenset({"command", "cmd"})
# Keys that carry free text. ``_anonymize_value`` does not look this set up:
# every string that is neither a path key nor a command key already goes
# through ``anonymizer.text``.
_TEXT_KEYS = frozenset({
    "content", "text", "prompt", "query", "url", "pattern",
    "old_string", "new_string", "patch", "patchText", "chars",
    "explanation", "search_term", "streamingContent",
})


def _anonymize_value(key: str, value: Any, anonymizer: Anonymizer) -> Any:
    """Recursively anonymize one tool-input value according to its key.

    Args:
        key: The dict key the value was stored under. List items inherit the
            key of the list that contains them.
        value: The value to process.
        anonymizer: Object providing ``path(str)`` and ``text(str)``.

    Returns:
        For a string: ``anonymizer.path(value)`` when ``key`` is in
        ``_PATH_KEYS``; the ``redact_text`` output passed through
        ``anonymizer.text`` when ``key`` is in ``_CMD_KEYS``; otherwise
        ``anonymizer.text(value)``. For a dict: a new dict with every entry
        processed under its own key. For a list: a new list with every item
        processed under ``key``. Any other type is returned unchanged.
    """
    if isinstance(value, str):
        if key in _PATH_KEYS:
            return anonymizer.path(value)
        if key in _CMD_KEYS:
            # Only the redacted text is used; the second element returned by
            # redact_text is discarded.
            redacted, _ = redact_text(value)
            return anonymizer.text(redacted)
        return anonymizer.text(value)
    if isinstance(value, dict):
        return {k: _anonymize_value(k, v, anonymizer) for k, v in value.items()}
    if isinstance(value, list):
        return [_anonymize_value(key, item, anonymizer) for item in value]
    return value


def _parse_tool_input(tool_name: str | None, input_data: Any, anonymizer: Anonymizer) -> dict:
    """Return a structured dict for a tool's input args, with paths/content anonymized.

    Preserves all original fields; applies path anonymization to known path keys,
    secret redaction to known command keys, and text anonymization to all other strings.

    Args:
        tool_name: Name of the tool. Accepted so existing callers keep working;
            it is not consulted, because every tool is handled generically.
        input_data: The tool's raw input. Anything that is not a dict is
            stringified and anonymized as text.
        anonymizer: Object providing ``path(str)`` and ``text(str)``.

    Returns:
        ``{"raw": <anonymized str(input_data)>}`` for non-dict input; otherwise
        a dict with the same keys as ``input_data`` whose values have been
        processed by ``_anonymize_value``.
    """
    if not isinstance(input_data, dict):
        return {"raw": anonymizer.text(str(input_data))}

    return {k: _anonymize_value(k, v, anonymizer) for k, v in input_data.items()}


def _normalize_timestamp(value) -> str | None:
if value is None:
Expand Down
Empty file added dataclaw/parsers/__init__.py
Empty file.
Loading