diff --git a/README.md b/README.md index b231d84..f0810f1 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ > **This is a performance art project.** Anthropic built their models on the world's freely shared information, then introduced increasingly [dystopian data policies](https://www.anthropic.com/news/detecting-and-preventing-distillation-attacks) to stop anyone else from doing the same with their data — pulling up the ladder behind them. DataClaw lets you throw the ladder back down. The dataset it produces is yours to share. -Turn your Claude Code and Codex conversation history into structured data and publish it to Hugging Face with a single command. DataClaw parses session logs, redacts secrets and PII, and uploads the result as a ready-to-use dataset. +Turn your Claude Code, Codex, and Gemini CLI conversation history into structured data and publish it to Hugging Face with a single command. DataClaw parses session logs, redacts secrets and PII, and uploads the result as a ready-to-use dataset. ![DataClaw](dataclaw.jpeg) @@ -13,7 +13,7 @@ Every export is tagged **`dataclaw`** on Hugging Face. Together, they may someda Paste this into Claude Code, Codex, or any coding agent: ``` -Help me export my Claude Code and Codex conversation history to Hugging Face using DataClaw. +Help me export my Claude Code, Codex, and Gemini CLI conversation history to Hugging Face using DataClaw. Install it, set up the skill, then walk me through the process. STEP 1 — INSTALL @@ -29,12 +29,12 @@ STEP 3 — START Every dataclaw command outputs next_steps in its JSON — follow them through the entire flow. STEP 3A — CHOOSE SOURCE SCOPE (REQUIRED BEFORE EXPORT) - Ask the user explicitly: Claude Code, Codex, or both? - dataclaw config --source "claude|codex|both" + Ask the user explicitly: Claude Code, Codex, Gemini CLI, or all? + dataclaw config --source "claude|codex|gemini|all" Do not export until source scope is explicitly confirmed. STEP 3B — PRESENT ALL FOLDERS (REQUIRED BEFORE EXPORT) - dataclaw list --source "claude|codex|both" + dataclaw list --source "claude|codex|gemini|all" Send the FULL project/folder list to the user in a message (name, source, sessions, size, excluded). Ask which projects to exclude. dataclaw config --exclude "project1,project2" OR dataclaw config --confirm-projects @@ -69,8 +69,8 @@ huggingface-cli login --token YOUR_TOKEN # See your projects dataclaw prep -dataclaw config --source both # REQUIRED: choose claude, codex, or both -dataclaw list --source both # Present full list and confirm folder scope before export +dataclaw config --source all # REQUIRED: choose claude, codex, gemini, or all +dataclaw list --source all # Present full list and confirm folder scope before export # Configure dataclaw config --repo username/my-personal-codex-data @@ -105,23 +105,25 @@ dataclaw export --publish-attestation "User explicitly approved publishing to Hu |---------|-------------| | `dataclaw status` | Show current stage and next steps (JSON) | | `dataclaw prep` | Discover projects, check HF auth, output JSON | -| `dataclaw prep --source both` | Prep with both Claude + Codex explicitly selected | +| `dataclaw prep --source all` | Prep with Claude, Codex, and Gemini explicitly selected | +| `dataclaw prep --source gemini` | Prep using only Gemini CLI sessions | | `dataclaw prep --source codex` | Prep using only Codex sessions | | `dataclaw prep --source claude` | Prep using only Claude Code sessions | | `dataclaw list` | List all projects with exclusion status | -| `dataclaw list --source both` | List both Claude and Codex projects | +| `dataclaw list --source all` | List Claude, Codex, and Gemini projects | | `dataclaw list --source codex` | List only Codex projects | | `dataclaw config` | Show current config | | `dataclaw config --repo user/my-personal-codex-data` | Set HF repo | -| `dataclaw config --source both` | REQUIRED source scope selection (`claude`, `codex`, or `both`) | +| `dataclaw config --source all` | REQUIRED source scope selection (`claude`, `codex`, `gemini`, or `all`) | | `dataclaw config --exclude "a,b"` | Add excluded projects (appends) | | `dataclaw config --redact "str1,str2"` | Add strings to always redact (appends) | | `dataclaw config --redact-usernames "u1,u2"` | Add usernames to anonymize (appends) | | `dataclaw config --confirm-projects` | Mark project selection as confirmed | | `dataclaw export --no-push` | Export locally only (always do this first) | -| `dataclaw export --source both --no-push` | Export Claude + Codex sessions locally | +| `dataclaw export --source all --no-push` | Export Claude, Codex, and Gemini sessions locally | | `dataclaw export --source codex --no-push` | Export only Codex sessions locally | | `dataclaw export --source claude --no-push` | Export only Claude Code sessions locally | +| `dataclaw export --source gemini --no-push` | Export only Gemini CLI sessions locally | | `dataclaw confirm --full-name "NAME" --attest-full-name "..." --attest-sensitive "..." --attest-manual-scan "..."` | Scan for PII, run exact-name privacy check, verify review attestations, unlock pushing | | `dataclaw confirm --skip-full-name-scan --attest-full-name "..." --attest-sensitive "..." --attest-manual-scan "..."` | Skip exact-name scan when user declines sharing full name (requires skip attestation) | | `dataclaw export --publish-attestation "..."` | Export and push (requires `dataclaw confirm` first) | diff --git a/dataclaw/cli.py b/dataclaw/cli.py index e8298b1..ece23df 100644 --- a/dataclaw/cli.py +++ b/dataclaw/cli.py @@ -10,7 +10,7 @@ from .anonymizer import Anonymizer from .config import CONFIG_FILE, DataClawConfig, load_config, save_config -from .parser import CLAUDE_DIR, CODEX_DIR, discover_projects, parse_project_sessions +from .parser import CLAUDE_DIR, CODEX_DIR, GEMINI_DIR, discover_projects, parse_project_sessions from .secrets import _has_mixed_char_types, _shannon_entropy, redact_session HF_TAG = "dataclaw" @@ -49,15 +49,15 @@ SETUP_TO_PUBLISH_STEPS = [ "Step 1/6: Run prep/list to review project scope: dataclaw prep && dataclaw list", - "Step 2/6: Explicitly choose source scope: dataclaw config --source ", + "Step 2/6: Explicitly choose source scope: dataclaw config --source ", "Step 3/6: Configure exclusions/redactions and confirm projects: dataclaw config ...", "Step 4/6: Export locally only: dataclaw export --no-push --output /tmp/dataclaw_export.jsonl", "Step 5/6: Review and confirm: dataclaw confirm ...", "Step 6/6: After explicit user approval, publish: dataclaw export --publish-attestation \"User explicitly approved publishing to Hugging Face.\"", ] -EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "both"} -SOURCE_CHOICES = ["auto", "claude", "codex", "both"] +EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "gemini", "all", "both"} +SOURCE_CHOICES = ["auto", "claude", "codex", "gemini", "all"] def _mask_secret(s: str) -> str: @@ -81,11 +81,13 @@ def _source_label(source_filter: str) -> str: return "Claude Code" if source_filter == "codex": return "Codex" - return "Claude Code or Codex" + if source_filter == "gemini": + return "Gemini CLI" + return "Claude Code, Codex, or Gemini CLI" def _normalize_source_filter(source_filter: str) -> str: - if source_filter == "both": + if source_filter in ("all", "both"): return "auto" return source_filter @@ -102,7 +104,7 @@ def _resolve_source_choice( Returns: (source_choice, explicit) where source_choice is one of - "claude" | "codex" | "both" | "auto". + "claude" | "codex" | "gemini" | "all" | "auto". """ if _is_explicit_source_choice(requested_source): return requested_source, True @@ -119,7 +121,9 @@ def _has_session_sources(source_filter: str = "auto") -> bool: return CLAUDE_DIR.exists() if source_filter == "codex": return CODEX_DIR.exists() - return CLAUDE_DIR.exists() or CODEX_DIR.exists() + if source_filter == "gemini": + return GEMINI_DIR.exists() + return CLAUDE_DIR.exists() or CODEX_DIR.exists() or GEMINI_DIR.exists() def _filter_projects_by_source(projects: list[dict], source_filter: str) -> list[dict]: @@ -204,14 +208,14 @@ def _build_status_next_steps( steps = [] if not source_confirmed: steps.append( - "Ask the user to explicitly choose export source scope: Claude Code, Codex, or both. " - "Then set it: dataclaw config --source . " + "Ask the user to explicitly choose export source scope: Claude Code, Codex, Gemini, or all. " + "Then set it: dataclaw config --source . " "Do not run export until source scope is explicitly confirmed." ) else: steps.append( f"Source scope is currently set to '{configured_source}'. " - "If the user wants a different scope, run: dataclaw config --source ." + "If the user wants a different scope, run: dataclaw config --source ." ) if not projects_confirmed: steps.append( @@ -456,6 +460,7 @@ def _build_dataset_card(repo_id: str, meta: dict) -> str: - dataclaw - claude-code - codex-cli + - gemini-cli - conversations - coding-assistant - tool-use @@ -1091,8 +1096,11 @@ def prep(source_filter: str = "auto") -> None: err = "~/.claude was not found." elif effective_source_filter == "codex": err = "~/.codex was not found." + elif effective_source_filter == "gemini": + from .parser import GEMINI_DIR + err = f"{GEMINI_DIR} was not found." else: - err = "Neither ~/.claude nor ~/.codex was found." + err = "None of ~/.claude, ~/.codex, or ~/.gemini/tmp were found." print(json.dumps({"error": err})) sys.exit(1) @@ -1181,7 +1189,7 @@ def main() -> None: cfg = sub.add_parser("config", help="View or set config") cfg.add_argument("--repo", type=str, help="Set HF repo") cfg.add_argument("--source", choices=sorted(EXPLICIT_SOURCE_CHOICES), - help="Set export source scope explicitly: claude, codex, or both") + help="Set export source scope explicitly: claude, codex, gemini, or all") cfg.add_argument("--exclude", type=str, help="Comma-separated projects to exclude") cfg.add_argument("--redact", type=str, help="Comma-separated strings to always redact (API keys, usernames, domains)") @@ -1302,17 +1310,17 @@ def _run_export(args) -> None: "error": "Source scope is not confirmed yet.", "hint": ( "Explicitly choose one source scope before exporting: " - "`claude`, `codex`, or `both`." + "`claude`, `codex`, `gemini`, or `all`." ), "required_action": ( - "Ask the user whether to export Claude Code, Codex, or both. " - "Then run `dataclaw config --source ` " - "or pass `--source ` on the export command." + "Ask the user whether to export Claude Code, Codex, Gemini, or all. " + "Then run `dataclaw config --source ` " + "or pass `--source ` on the export command." ), "allowed_sources": sorted(EXPLICIT_SOURCE_CHOICES), "blocked_on_step": "Step 2/6", "process_steps": SETUP_TO_PUBLISH_STEPS, - "next_command": "dataclaw config --source both", + "next_command": "dataclaw config --source all", }, indent=2)) sys.exit(1) @@ -1396,8 +1404,11 @@ def _run_export(args) -> None: print(f"Error: {CLAUDE_DIR} not found.", file=sys.stderr) elif source_filter == "codex": print(f"Error: {CODEX_DIR} not found.", file=sys.stderr) + elif source_filter == "gemini": + from .parser import GEMINI_DIR + print(f"Error: {GEMINI_DIR} not found.", file=sys.stderr) else: - print("Error: neither ~/.claude nor ~/.codex was found.", file=sys.stderr) + print("Error: none of ~/.claude, ~/.codex, or ~/.gemini/tmp were found.", file=sys.stderr) sys.exit(1) projects = _filter_projects_by_source(discover_projects(), source_filter) diff --git a/dataclaw/config.py b/dataclaw/config.py index 2dd5779..6d0d43c 100644 --- a/dataclaw/config.py +++ b/dataclaw/config.py @@ -13,7 +13,7 @@ class DataClawConfig(TypedDict, total=False): """Expected shape of the config dict.""" repo: str | None - source: str | None # "claude" | "codex" | "both" + source: str | None # "claude" | "codex" | "gemini" | "all" excluded_projects: list[str] redact_strings: list[str] redact_usernames: list[str] diff --git a/dataclaw/parser.py b/dataclaw/parser.py index baabedb..7df8207 100644 --- a/dataclaw/parser.py +++ b/dataclaw/parser.py @@ -1,6 +1,7 @@ """Parse Claude Code and Codex session JSONL files into structured conversations.""" import dataclasses +import hashlib import json import logging from datetime import datetime, timezone @@ -14,6 +15,7 @@ CLAUDE_SOURCE = "claude" CODEX_SOURCE = "codex" +GEMINI_SOURCE = "gemini" CLAUDE_DIR = Path.home() / ".claude" PROJECTS_DIR = CLAUDE_DIR / "projects" @@ -23,7 +25,75 @@ CODEX_ARCHIVED_DIR = CODEX_DIR / "archived_sessions" UNKNOWN_CODEX_CWD = "" +GEMINI_DIR = Path.home() / ".gemini" / "tmp" + _CODEX_PROJECT_INDEX: dict[str, list[Path]] = {} +_GEMINI_HASH_MAP: dict[str, str] = {} + + +def _build_gemini_hash_map() -> dict[str, str]: + """Build a mapping from SHA-256 hash prefix to directory path. + + Gemini CLI names project dirs by hashing the absolute working directory path. + We scan first-level dirs under $HOME to reverse this mapping. + """ + result: dict[str, str] = {} + home = Path.home() + try: + for entry in home.iterdir(): + if entry.is_dir() and not entry.name.startswith("."): + h = hashlib.sha256(str(entry).encode()).hexdigest() + result[h] = str(entry) + except OSError: + pass + return result + + +def _extract_project_path_from_sessions(project_hash: str) -> str | None: + """Try to extract the project working directory from session tool call file paths.""" + chats_dir = GEMINI_DIR / project_hash / "chats" + if not chats_dir.exists(): + return None + for session_file in sorted(chats_dir.glob("session-*.json"), reverse=True): + try: + data = json.loads(session_file.read_text()) + except (json.JSONDecodeError, OSError): + continue + for msg in data.get("messages", []): + for tc in msg.get("toolCalls", []): + fp = tc.get("args", {}).get("file_path") or tc.get("args", {}).get("path", "") + if fp.startswith("/"): + # Extract the shallowest directory and verify its hash matches + parts = Path(fp).parts # e.g. ('/', 'home', 'wd', 'project', ...) + for depth in range(3, len(parts)): + candidate = str(Path(*parts[:depth + 1])) + if hashlib.sha256(candidate.encode()).hexdigest() == project_hash: + return candidate + # Only check the most recent session file with tool calls + break + return None + + +def _resolve_gemini_hash(project_hash: str) -> str: + """Resolve a Gemini project hash to a readable directory name. + + Strategy: + 1. Check hash map built from first-level dirs under $HOME. + 2. Fallback: extract path from session file tool call args. + 3. Last resort: return first 8 chars of the hash. + """ + global _GEMINI_HASH_MAP + if not _GEMINI_HASH_MAP: + _GEMINI_HASH_MAP = _build_gemini_hash_map() + full_path = _GEMINI_HASH_MAP.get(project_hash) + if full_path: + return Path(full_path).name + # Fallback: try extracting from session files + extracted = _extract_project_path_from_sessions(project_hash) + if extracted: + _GEMINI_HASH_MAP[project_hash] = extracted # cache it + return Path(extracted).name + return project_hash[:8] def _iter_jsonl(filepath: Path): @@ -40,9 +110,10 @@ def _iter_jsonl(filepath: Path): def discover_projects() -> list[dict]: - """Discover Claude Code and Codex projects with session counts.""" + """Discover Claude Code, Codex, and Gemini CLI projects with session counts.""" projects = _discover_claude_projects() projects.extend(_discover_codex_projects()) + projects.extend(_discover_gemini_projects()) return sorted(projects, key=lambda p: (p["display_name"], p["source"])) @@ -93,6 +164,32 @@ def _discover_codex_projects() -> list[dict]: return projects +def _discover_gemini_projects() -> list[dict]: + if not GEMINI_DIR.exists(): + return [] + + projects = [] + for project_dir in sorted(GEMINI_DIR.iterdir()): + if not project_dir.is_dir() or project_dir.name == "bin": + continue + chats_dir = project_dir / "chats" + if not chats_dir.exists(): + continue + sessions = list(chats_dir.glob("session-*.json")) + if not sessions: + continue + projects.append( + { + "dir_name": project_dir.name, + "display_name": f"gemini:{_resolve_gemini_hash(project_dir.name)}", + "session_count": len(sessions), + "total_size_bytes": sum(f.stat().st_size for f in sessions), + "source": GEMINI_SOURCE, + } + ) + return projects + + def parse_project_sessions( project_dir_name: str, anonymizer: Anonymizer, @@ -100,6 +197,19 @@ def parse_project_sessions( source: str = CLAUDE_SOURCE, ) -> list[dict]: """Parse all sessions for a project into structured dicts.""" + if source == GEMINI_SOURCE: + project_path = GEMINI_DIR / project_dir_name / "chats" + if not project_path.exists(): + return [] + sessions = [] + for session_file in sorted(project_path.glob("session-*.json")): + parsed = _parse_gemini_session_file(session_file, anonymizer, include_thinking) + if parsed and parsed["messages"]: + parsed["project"] = f"gemini:{_resolve_gemini_hash(project_dir_name)}" + parsed["source"] = GEMINI_SOURCE + sessions.append(parsed) + return sessions + if source == CODEX_SOURCE: index = _get_codex_project_index() session_files = index.get(project_dir_name, []) @@ -258,6 +368,97 @@ def _parse_subagent_session( return _make_session_result(metadata, messages, stats) +def _parse_gemini_session_file( + filepath: Path, anonymizer: Anonymizer, include_thinking: bool = True +) -> dict | None: + try: + with open(filepath) as f: + data = json.load(f) + except (OSError, json.JSONDecodeError): + return None + + messages = [] + metadata = { + "session_id": data.get("sessionId", filepath.stem), + "cwd": None, + "git_branch": None, + "model": None, + "start_time": data.get("startTime"), + "end_time": data.get("lastUpdated"), + } + stats = _make_stats() + + for msg_data in data.get("messages", []): + msg_type = msg_data.get("type") + timestamp = msg_data.get("timestamp") + + if msg_type == "user": + content = msg_data.get("content") + if isinstance(content, list): + text_parts = [part.get("text", "") for part in content if isinstance(part, dict) and "text" in part] + text = "\n".join(text_parts) + elif isinstance(content, str): + text = content + else: + continue + if not text.strip(): + continue + messages.append({ + "role": "user", + "content": anonymizer.text(text.strip()), + "timestamp": timestamp, + }) + stats["user_messages"] += 1 + _update_time_bounds(metadata, timestamp) + + elif msg_type == "gemini": + if metadata["model"] is None: + metadata["model"] = msg_data.get("model") + + tokens = msg_data.get("tokens", {}) + if tokens: + stats["input_tokens"] += tokens.get("input", 0) + tokens.get("cached", 0) + stats["output_tokens"] += tokens.get("output", 0) + + msg = {"role": "assistant"} + if timestamp: + msg["timestamp"] = timestamp + + content = msg_data.get("content") + if isinstance(content, str) and content.strip(): + msg["content"] = anonymizer.text(content.strip()) + + if include_thinking: + thoughts = msg_data.get("thoughts", []) + if thoughts: + thought_texts = [] + for t in thoughts: + if "description" in t and isinstance(t["description"], str): + thought_texts.append(t["description"].strip()) + if thought_texts: + msg["thinking"] = anonymizer.text("\n\n".join(thought_texts)) + + tool_uses = [] + for tc in msg_data.get("toolCalls", []): + tool_name = tc.get("name") + args_data = tc.get("args", {}) + tool_uses.append({ + "tool": tool_name, + "input": _summarize_tool_input(tool_name, args_data, anonymizer) + }) + + if tool_uses: + msg["tool_uses"] = tool_uses + stats["tool_uses"] += len(tool_uses) + + if "content" in msg or "thinking" in msg or "tool_uses" in msg: + messages.append(msg) + stats["assistant_messages"] += 1 + _update_time_bounds(metadata, timestamp) + + return _make_session_result(metadata, messages, stats) + + @dataclasses.dataclass class _CodexParseState: messages: list[dict[str, Any]] = dataclasses.field(default_factory=list) diff --git a/tests/test_cli.py b/tests/test_cli.py index 84b6098..97afca4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -400,7 +400,7 @@ def test_no_projects(self, monkeypatch, capsys): monkeypatch.setattr("dataclaw.cli.discover_projects", lambda: []) list_projects() captured = capsys.readouterr() - assert "No Claude Code or Codex sessions" in captured.out + assert "No Claude Code, Codex, or Gemini CLI sessions" in captured.out def test_source_filter_codex(self, monkeypatch, capsys): monkeypatch.setattr( @@ -579,7 +579,7 @@ def test_confirm_skip_full_name_scan_succeeds(self, tmp_path, monkeypatch, capsy assert payload["full_name_scan"]["skipped"] is True def test_push_before_confirm_shows_step_process(self, monkeypatch, capsys): - monkeypatch.setattr("dataclaw.cli.load_config", lambda: {"stage": "review", "source": "both"}) + monkeypatch.setattr("dataclaw.cli.load_config", lambda: {"stage": "review", "source": "all"}) monkeypatch.setattr("sys.argv", ["dataclaw", "export"]) with pytest.raises(SystemExit): main() @@ -602,7 +602,7 @@ def test_export_requires_project_confirmation_with_full_flow(self, monkeypatch, } ], ) - monkeypatch.setattr("dataclaw.cli.load_config", lambda: {"source": "both"}) + monkeypatch.setattr("dataclaw.cli.load_config", lambda: {"source": "all"}) monkeypatch.setattr("sys.argv", ["dataclaw", "export", "--no-push"]) with pytest.raises(SystemExit): main() @@ -626,8 +626,8 @@ def test_export_requires_explicit_source_selection(self, monkeypatch, capsys): assert payload["error"] == "Source scope is not confirmed yet." assert payload["blocked_on_step"] == "Step 2/6" assert len(payload["process_steps"]) == 6 - assert payload["allowed_sources"] == ["both", "claude", "codex"] - assert payload["next_command"] == "dataclaw config --source both" + assert payload["allowed_sources"] == ["all", "both", "claude", "codex", "gemini"] + assert payload["next_command"] == "dataclaw config --source all" def test_configure_next_steps_require_full_folder_presentation(self): steps, _next = _build_status_next_steps(