diff --git a/README.md b/README.md index e928c5e..ce32707 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,48 @@ dataclaw export --publish-attestation "User explicitly approved publishing to Hu | `dataclaw export --all-projects` | Include everything (ignore exclusions) | | `dataclaw export --no-thinking` | Exclude extended thinking blocks | | `dataclaw update-skill claude` | Install/update the dataclaw skill for Claude Code | +| `dataclaw backup-sessions` | Copy session files to a safe location to prevent tool auto-deletion | +| `dataclaw backup-sessions --source claude` | Back up only Claude Code sessions | +| `dataclaw backup-sessions --source gemini` | Back up only Gemini CLI sessions | + + +
+## Preventing session deletion
+
+Claude Code and Gemini CLI automatically delete sessions older than 30 days. To avoid losing data before you export:
+
+### Option 1 — Run `dataclaw backup-sessions` periodically (recommended)
+
+```bash
+dataclaw backup-sessions
+```
+
+This copies all session files to `~/.dataclaw/session_backup/` (see the layout example at the end of this section). DataClaw automatically reads from this backup when the original files have been deleted by a tool's cleanup routine.
+
+**Set up automatic hourly backups with cron:**
+
+```bash
+# Open your crontab
+crontab -e
+
+# Add this line to run every hour
+0 * * * * dataclaw backup-sessions
+```
+
+If cron cannot find `dataclaw` (cron jobs run with a minimal `PATH`), use the absolute path to the executable; `which dataclaw` prints it.
+
+### Option 2 — Configure tools to retain sessions longer
+
+**Claude Code** — add to `~/.claude/settings.json`:
+
+```json
+{
+  "cleanupPeriodDays": 0
+}
+```
+
+Setting `cleanupPeriodDays` to `0` disables automatic cleanup entirely.
+
+**Gemini CLI** — some versions let you configure the session retention period in `~/.gemini/settings.json`. Check your Gemini CLI documentation for the specific setting.
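+
+The backup created by `dataclaw backup-sessions` mirrors the layout of each tool's own session directory. Exact project and session names will differ; the paths below are placeholders:
+
+```
+~/.dataclaw/session_backup/
+  claude/projects/<project-dir>/<session-id>.jsonl
+  claude/projects/<project-dir>/<session-dir>/subagents/agent-<id>.jsonl
+  gemini/<project-hash>/chats/session-<timestamp>.json
+```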
diff --git a/dataclaw/cli.py b/dataclaw/cli.py index a7957c5..c0e030c 100644 --- a/dataclaw/cli.py +++ b/dataclaw/cli.py @@ -3,6 +3,7 @@ import argparse import json import re +import shutil import sys import urllib.error import urllib.request @@ -12,7 +13,21 @@ from .anonymizer import Anonymizer from .config import CONFIG_FILE, DataClawConfig, load_config, save_config -from .parser import CLAUDE_DIR, CODEX_DIR, CUSTOM_DIR, GEMINI_DIR, KIMI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions +from .parser import ( + BACKUP_BASE_DIR, + CLAUDE_BACKUP_PROJECTS_DIR, + CLAUDE_DIR, + CODEX_DIR, + CUSTOM_DIR, + GEMINI_BACKUP_DIR, + GEMINI_DIR, + KIMI_DIR, + OPENCODE_DIR, + OPENCLAW_DIR, + PROJECTS_DIR, + discover_projects, + parse_project_sessions, +) from .secrets import _has_mixed_char_types, _shannon_entropy, redact_session HF_TAG = "dataclaw" @@ -609,6 +624,85 @@ def update_skill(target: str) -> None: }, indent=2)) +def backup_sessions(source_filter: str = "auto") -> None: + """Copy session files to ~/.dataclaw/session_backup/ to prevent tool auto-deletion. + + Claude Code and Gemini CLI automatically delete sessions older than 30 days. + This command copies all session files to a safe location that dataclaw also + reads from, so older sessions are preserved even after tool cleanup. + + Run periodically (e.g. via cron) to keep the backup current. + """ + source_filter = _normalize_source_filter(source_filter) + backed_up = 0 + already_current = 0 + errors: list[str] = [] + + def _copy_file(src: Path, dest: Path) -> None: + nonlocal backed_up, already_current + dest.parent.mkdir(parents=True, exist_ok=True) + if not dest.exists() or dest.stat().st_mtime < src.stat().st_mtime: + try: + shutil.copy2(src, dest) + backed_up += 1 + except OSError as e: + errors.append(str(e)) + else: + already_current += 1 + + # Back up Claude Code sessions + if source_filter in ("auto", "claude") and PROJECTS_DIR.exists(): + for project_dir in PROJECTS_DIR.iterdir(): + if not project_dir.is_dir(): + continue + for session_file in project_dir.glob("*.jsonl"): + dest = CLAUDE_BACKUP_PROJECTS_DIR / project_dir.name / session_file.name + _copy_file(session_file, dest) + # Back up subagent sessions + for subagent_dir in project_dir.iterdir(): + if not subagent_dir.is_dir(): + continue + sa_dir = subagent_dir / "subagents" + if not sa_dir.is_dir(): + continue + for sa_file in sa_dir.glob("agent-*.jsonl"): + dest = ( + CLAUDE_BACKUP_PROJECTS_DIR + / project_dir.name + / subagent_dir.name + / "subagents" + / sa_file.name + ) + _copy_file(sa_file, dest) + + # Back up Gemini CLI sessions + if source_filter in ("auto", "gemini") and GEMINI_DIR.exists(): + for project_dir in GEMINI_DIR.iterdir(): + if not project_dir.is_dir() or project_dir.name == "bin": + continue + chats_dir = project_dir / "chats" + if not chats_dir.exists(): + continue + for session_file in chats_dir.glob("session-*.json"): + dest = GEMINI_BACKUP_DIR / project_dir.name / "chats" / session_file.name + _copy_file(session_file, dest) + + result: dict[str, Any] = { + "backed_up": backed_up, + "already_current": already_current, + "backup_dir": str(BACKUP_BASE_DIR), + "next_steps": [ + "Run periodically to keep sessions safe: dataclaw backup-sessions", + "Automate with cron (runs every hour): add to crontab via `crontab -e`:", + " 0 * * * * dataclaw backup-sessions", + ], + } + if errors: + result["warning"] = f"{len(errors)} file(s) could not be backed up." 
+ result["errors"] = errors[:5] + print(json.dumps(result, indent=2)) + + def status() -> None: """Show current stage and next steps (JSON). Read-only — does not modify config.""" config = load_config() @@ -1213,6 +1307,17 @@ def main() -> None: us = sub.add_parser("update-skill", help="Install/update the dataclaw skill for a coding agent") us.add_argument("target", choices=["claude"], help="Agent to install skill for") + bs = sub.add_parser( + "backup-sessions", + help="Copy session files to ~/.dataclaw/session_backup/ to prevent tool auto-deletion", + ) + bs.add_argument( + "--source", + choices=SOURCE_CHOICES, + default="auto", + help="Which source(s) to back up (default: all detected)", + ) + cfg = sub.add_parser("config", help="View or set config") cfg.add_argument("--repo", type=str, help="Set HF repo") cfg.add_argument("--source", choices=sorted(EXPLICIT_SOURCE_CHOICES), @@ -1284,6 +1389,10 @@ def main() -> None: update_skill(args.target) return + if command == "backup-sessions": + backup_sessions(source_filter=args.source) + return + if command == "list": config = load_config() resolved_source_choice, _ = _resolve_source_choice(args.source, config) diff --git a/dataclaw/parser.py b/dataclaw/parser.py index 50cc39e..aa606bd 100644 --- a/dataclaw/parser.py +++ b/dataclaw/parser.py @@ -47,6 +47,12 @@ CUSTOM_DIR = Path.home() / ".dataclaw" / "custom" +# Backup directory — session files are copied here by `dataclaw backup-sessions` +# to survive automatic cleanup by coding tools (Claude Code, Gemini CLI, etc.) +BACKUP_BASE_DIR = Path.home() / ".dataclaw" / "session_backup" +CLAUDE_BACKUP_PROJECTS_DIR = BACKUP_BASE_DIR / "claude" / "projects" +GEMINI_BACKUP_DIR = BACKUP_BASE_DIR / "gemini" + _CODEX_PROJECT_INDEX: dict[str, list[Path]] = {} _GEMINI_HASH_MAP: dict[str, str] = {} _OPENCODE_PROJECT_INDEX: dict[str, list[str]] = {} @@ -145,26 +151,42 @@ def discover_projects() -> list[dict]: def _discover_claude_projects() -> list[dict]: - if not PROJECTS_DIR.exists(): + base_dirs = [d for d in [PROJECTS_DIR, CLAUDE_BACKUP_PROJECTS_DIR] if d.exists()] + if not base_dirs: return [] + # Collect all project dirs from both original and backup, merging by dir name + all_project_dirs: dict[str, list[Path]] = {} + for base_dir in base_dirs: + for project_dir in sorted(base_dir.iterdir()): + if not project_dir.is_dir(): + continue + all_project_dirs.setdefault(project_dir.name, []).append(project_dir) + projects = [] - for project_dir in sorted(PROJECTS_DIR.iterdir()): - if not project_dir.is_dir(): - continue - root_sessions = list(project_dir.glob("*.jsonl")) - subagent_sessions = _find_subagent_only_sessions(project_dir) - total_count = len(root_sessions) + len(subagent_sessions) + for dir_name, dirs in sorted(all_project_dirs.items()): + # Merge session files across dirs, deduplicating by filename + seen_files: dict[str, Path] = {} + for d in dirs: + for f in d.glob("*.jsonl"): + seen_files.setdefault(f.name, f) + + # Collect subagent-only session dirs from all locations + subagent_sessions: list[Path] = [] + for d in dirs: + subagent_sessions.extend(_find_subagent_only_sessions(d)) + + total_count = len(seen_files) + len(subagent_sessions) if total_count == 0: continue - total_size = sum(f.stat().st_size for f in root_sessions) + total_size = sum(f.stat().st_size for f in seen_files.values()) for session_dir in subagent_sessions: for sa_file in (session_dir / "subagents").glob("agent-*.jsonl"): total_size += sa_file.stat().st_size projects.append( { - "dir_name": project_dir.name, - 
"display_name": _build_project_name(project_dir.name), + "dir_name": dir_name, + "display_name": _build_project_name(dir_name), "session_count": total_count, "total_size_bytes": total_size, "source": CLAUDE_SOURCE, @@ -192,25 +214,36 @@ def _discover_codex_projects() -> list[dict]: def _discover_gemini_projects() -> list[dict]: - if not GEMINI_DIR.exists(): + base_dirs = [d for d in [GEMINI_DIR, GEMINI_BACKUP_DIR] if d.exists()] + if not base_dirs: return [] + # Collect all project dirs from both original and backup, merging by hash name + all_project_dirs: dict[str, list[Path]] = {} + for base_dir in base_dirs: + for project_dir in sorted(base_dir.iterdir()): + if not project_dir.is_dir() or project_dir.name == "bin": + continue + chats_dir = project_dir / "chats" + if not chats_dir.exists(): + continue + all_project_dirs.setdefault(project_dir.name, []).append(project_dir) + projects = [] - for project_dir in sorted(GEMINI_DIR.iterdir()): - if not project_dir.is_dir() or project_dir.name == "bin": - continue - chats_dir = project_dir / "chats" - if not chats_dir.exists(): - continue - sessions = list(chats_dir.glob("session-*.json")) - if not sessions: + for project_hash, dirs in all_project_dirs.items(): + # Merge session files across dirs, deduplicating by filename + seen_files: dict[str, Path] = {} + for d in dirs: + for f in (d / "chats").glob("session-*.json"): + seen_files.setdefault(f.name, f) + if not seen_files: continue projects.append( { - "dir_name": project_dir.name, - "display_name": f"gemini:{_resolve_gemini_hash(project_dir.name)}", - "session_count": len(sessions), - "total_size_bytes": sum(f.stat().st_size for f in sessions), + "dir_name": project_hash, + "display_name": f"gemini:{_resolve_gemini_hash(project_hash)}", + "session_count": len(seen_files), + "total_size_bytes": sum(f.stat().st_size for f in seen_files.values()), "source": GEMINI_SOURCE, } ) @@ -465,11 +498,17 @@ def parse_project_sessions( return sessions if source == GEMINI_SOURCE: - project_path = GEMINI_DIR / project_dir_name / "chats" - if not project_path.exists(): + # Merge session files from original dir and backup, deduplicating by filename + session_file_map: dict[str, Path] = {} + for base_dir in [GEMINI_DIR, GEMINI_BACKUP_DIR]: + chats_dir = base_dir / project_dir_name / "chats" + if chats_dir.exists(): + for f in chats_dir.glob("session-*.json"): + session_file_map.setdefault(f.name, f) + if not session_file_map: return [] sessions = [] - for session_file in sorted(project_path.glob("session-*.json")): + for session_file in sorted(session_file_map.values()): parsed = _parse_gemini_session_file(session_file, anonymizer, include_thinking) if parsed and parsed["messages"]: parsed["project"] = f"gemini:{_resolve_gemini_hash(project_dir_name)}" @@ -511,24 +550,45 @@ def parse_project_sessions( sessions.append(parsed) return sessions - project_path = PROJECTS_DIR / project_dir_name - if not project_path.exists(): + # Merge session files from original dir and backup, deduplicating by filename + session_file_map: dict[str, Path] = {} + project_dirs = [] + for base_dir in [PROJECTS_DIR, CLAUDE_BACKUP_PROJECTS_DIR]: + project_path = base_dir / project_dir_name + if project_path.exists(): + project_dirs.append(project_path) + for f in project_path.glob("*.jsonl"): + session_file_map.setdefault(f.name, f) + + if not project_dirs: return [] sessions = [] - for session_file in sorted(project_path.glob("*.jsonl")): + seen_session_ids: set[str] = set() + for session_file in sorted(session_file_map.values()): 
parsed = _parse_claude_session_file(session_file, anonymizer, include_thinking) if parsed and parsed["messages"]: + session_id = parsed.get("session_id", "") + if session_id and session_id in seen_session_ids: + continue + if session_id: + seen_session_ids.add(session_id) parsed["project"] = _build_project_name(project_dir_name) parsed["source"] = CLAUDE_SOURCE sessions.append(parsed) - for session_dir in _find_subagent_only_sessions(project_path): - parsed = _parse_subagent_session(session_dir, anonymizer, include_thinking) - if parsed and parsed["messages"]: - parsed["project"] = _build_project_name(project_dir_name) - parsed["source"] = CLAUDE_SOURCE - sessions.append(parsed) + for project_path in project_dirs: + for session_dir in _find_subagent_only_sessions(project_path): + parsed = _parse_subagent_session(session_dir, anonymizer, include_thinking) + if parsed and parsed["messages"]: + session_id = parsed.get("session_id", "") + if session_id and session_id in seen_session_ids: + continue + if session_id: + seen_session_ids.add(session_id) + parsed["project"] = _build_project_name(project_dir_name) + parsed["source"] = CLAUDE_SOURCE + sessions.append(parsed) return sessions diff --git a/tests/test_parser.py b/tests/test_parser.py index e5ffdad..4bde389 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -429,10 +429,12 @@ def test_blank_lines_skipped(self, tmp_path, mock_anonymizer): class TestDiscoverProjects: def _disable_codex(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser.PROJECTS_DIR", tmp_path / "no-claude-projects") + monkeypatch.setattr("dataclaw.parser.CLAUDE_BACKUP_PROJECTS_DIR", tmp_path / "no-claude-backup") monkeypatch.setattr("dataclaw.parser.CODEX_SESSIONS_DIR", tmp_path / "no-codex-sessions") monkeypatch.setattr("dataclaw.parser.CODEX_ARCHIVED_DIR", tmp_path / "no-codex-archived") monkeypatch.setattr("dataclaw.parser._CODEX_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.GEMINI_DIR", tmp_path / "no-gemini") + monkeypatch.setattr("dataclaw.parser.GEMINI_BACKUP_DIR", tmp_path / "no-gemini-backup") monkeypatch.setattr("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db") monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents") @@ -981,10 +983,12 @@ class TestDiscoverSubagentProjects: """Verify discover_projects and parse_project_sessions include subagent-only sessions.""" def _disable_codex(self, tmp_path, monkeypatch): + monkeypatch.setattr("dataclaw.parser.CLAUDE_BACKUP_PROJECTS_DIR", tmp_path / "no-claude-backup") monkeypatch.setattr("dataclaw.parser.CODEX_SESSIONS_DIR", tmp_path / "no-codex-sessions") monkeypatch.setattr("dataclaw.parser.CODEX_ARCHIVED_DIR", tmp_path / "no-codex-archived") monkeypatch.setattr("dataclaw.parser._CODEX_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.GEMINI_DIR", tmp_path / "no-gemini") + monkeypatch.setattr("dataclaw.parser.GEMINI_BACKUP_DIR", tmp_path / "no-gemini-backup") monkeypatch.setattr("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db") monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents") @@ -1582,10 +1586,12 @@ def test_cache_read_tokens(self, mock_anonymizer): class TestDiscoverOpenclawProjects: def _disable_others(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser.PROJECTS_DIR", tmp_path / "no-claude") + 
monkeypatch.setattr("dataclaw.parser.CLAUDE_BACKUP_PROJECTS_DIR", tmp_path / "no-claude-backup") monkeypatch.setattr("dataclaw.parser.CODEX_SESSIONS_DIR", tmp_path / "no-codex-sessions") monkeypatch.setattr("dataclaw.parser.CODEX_ARCHIVED_DIR", tmp_path / "no-codex-archived") monkeypatch.setattr("dataclaw.parser._CODEX_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.GEMINI_DIR", tmp_path / "no-gemini") + monkeypatch.setattr("dataclaw.parser.GEMINI_BACKUP_DIR", tmp_path / "no-gemini-backup") monkeypatch.setattr("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db") monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) @@ -1668,10 +1674,12 @@ def test_multiple_agents_same_cwd(self, tmp_path, monkeypatch, mock_anonymizer): class TestDiscoverCustomProjects: def _disable_others(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser.PROJECTS_DIR", tmp_path / "no-claude") + monkeypatch.setattr("dataclaw.parser.CLAUDE_BACKUP_PROJECTS_DIR", tmp_path / "no-claude-backup") monkeypatch.setattr("dataclaw.parser.CODEX_SESSIONS_DIR", tmp_path / "no-codex-sessions") monkeypatch.setattr("dataclaw.parser.CODEX_ARCHIVED_DIR", tmp_path / "no-codex-archived") monkeypatch.setattr("dataclaw.parser._CODEX_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.GEMINI_DIR", tmp_path / "no-gemini") + monkeypatch.setattr("dataclaw.parser.GEMINI_BACKUP_DIR", tmp_path / "no-gemini-backup") monkeypatch.setattr("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db") monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents")