Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,48 @@ dataclaw export --publish-attestation "User explicitly approved publishing to Hu
| `dataclaw export --all-projects` | Include everything (ignore exclusions) |
| `dataclaw export --no-thinking` | Exclude extended thinking blocks |
| `dataclaw update-skill claude` | Install/update the dataclaw skill for Claude Code |
| `dataclaw backup-sessions` | Copy session files to a safe location to prevent tool auto-deletion |
| `dataclaw backup-sessions --source claude` | Back up only Claude Code sessions |
| `dataclaw backup-sessions --source gemini` | Back up only Gemini CLI sessions |

</details>

<details>
<summary><b>Preventing session deletion</b></summary>

Claude Code and Gemini CLI automatically delete sessions older than 30 days. To avoid losing data before you export:

### Option 1 — Run `dataclaw backup-sessions` periodically (recommended)

```bash
dataclaw backup-sessions
```

This copies all session files to `~/.dataclaw/session_backup/`. DataClaw automatically reads from this backup when the original files have been deleted by a tool's cleanup routine.

**Set up automatic hourly backups with cron:**

```bash
# Open your crontab
crontab -e

# Add this line to run every hour
0 * * * * dataclaw backup-sessions
```

### Option 2 — Configure tools to retain sessions longer

**Claude Code** — add to `~/.claude/settings.json`:

```json
{
"cleanupPeriodDays": 0
}
```

Setting `cleanupPeriodDays` to `0` disables automatic cleanup entirely.

**Gemini CLI** — if your version supports it, you can disable automatic session cleanup by setting the retention period in `~/.gemini/settings.json`. Check the Gemini CLI documentation for the exact setting name.

</details>

Expand Down
111 changes: 110 additions & 1 deletion dataclaw/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import argparse
import json
import re
import shutil
import sys
import urllib.error
import urllib.request
Expand All @@ -12,7 +13,21 @@

from .anonymizer import Anonymizer
from .config import CONFIG_FILE, DataClawConfig, load_config, save_config
from .parser import CLAUDE_DIR, CODEX_DIR, CUSTOM_DIR, GEMINI_DIR, KIMI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions
from .parser import (
BACKUP_BASE_DIR,
CLAUDE_BACKUP_PROJECTS_DIR,
CLAUDE_DIR,
CODEX_DIR,
CUSTOM_DIR,
GEMINI_BACKUP_DIR,
GEMINI_DIR,
KIMI_DIR,
OPENCODE_DIR,
OPENCLAW_DIR,
PROJECTS_DIR,
discover_projects,
parse_project_sessions,
)
from .secrets import _has_mixed_char_types, _shannon_entropy, redact_session

HF_TAG = "dataclaw"
Expand Down Expand Up @@ -609,6 +624,85 @@ def update_skill(target: str) -> None:
}, indent=2))


def backup_sessions(source_filter: str = "auto") -> None:
    """Copy session files to ~/.dataclaw/session_backup/ to prevent tool auto-deletion.

    Claude Code and Gemini CLI automatically delete sessions older than 30 days.
    This command copies all session files to a safe location that dataclaw also
    reads from, so older sessions are preserved even after tool cleanup.

    Run periodically (e.g. via cron) to keep the backup current.

    Args:
        source_filter: Which source(s) to back up — "auto" (all detected),
            "claude", or "gemini"; normalized by _normalize_source_filter.

    Prints a JSON summary (counts, backup dir, next steps) to stdout.
    """
    source_filter = _normalize_source_filter(source_filter)
    backed_up = 0
    already_current = 0
    errors: list[str] = []

    def _copy_file(src: Path, dest: Path) -> None:
        """Copy src to dest if dest is missing or stale; tally the outcome."""
        nonlocal backed_up, already_current
        # Guard the entire operation, not just the copy: src can vanish
        # between glob and stat (the tools' own cleanup may run concurrently
        # — the very race this command exists to mitigate), and mkdir/stat
        # can fail on permissions. One bad file must not abort the whole run.
        try:
            dest.parent.mkdir(parents=True, exist_ok=True)
            if dest.exists() and dest.stat().st_mtime >= src.stat().st_mtime:
                already_current += 1
                return
            shutil.copy2(src, dest)  # copy2 preserves mtime for the staleness check above
            backed_up += 1
        except OSError as e:
            errors.append(str(e))

    # Back up Claude Code sessions (<project>/*.jsonl under PROJECTS_DIR),
    # mirroring the directory layout into CLAUDE_BACKUP_PROJECTS_DIR.
    if source_filter in ("auto", "claude") and PROJECTS_DIR.exists():
        for project_dir in PROJECTS_DIR.iterdir():
            if not project_dir.is_dir():
                continue
            for session_file in project_dir.glob("*.jsonl"):
                dest = CLAUDE_BACKUP_PROJECTS_DIR / project_dir.name / session_file.name
                _copy_file(session_file, dest)
            # Back up subagent sessions, which live at
            # <project>/<session-dir>/subagents/agent-*.jsonl.
            for subagent_dir in project_dir.iterdir():
                if not subagent_dir.is_dir():
                    continue
                sa_dir = subagent_dir / "subagents"
                if not sa_dir.is_dir():
                    continue
                for sa_file in sa_dir.glob("agent-*.jsonl"):
                    dest = (
                        CLAUDE_BACKUP_PROJECTS_DIR
                        / project_dir.name
                        / subagent_dir.name
                        / "subagents"
                        / sa_file.name
                    )
                    _copy_file(sa_file, dest)

    # Back up Gemini CLI sessions (<hash>/chats/session-*.json under
    # GEMINI_DIR), mirroring the layout into GEMINI_BACKUP_DIR.
    # The "bin" entry is Gemini's tooling directory, not a project.
    if source_filter in ("auto", "gemini") and GEMINI_DIR.exists():
        for project_dir in GEMINI_DIR.iterdir():
            if not project_dir.is_dir() or project_dir.name == "bin":
                continue
            chats_dir = project_dir / "chats"
            if not chats_dir.exists():
                continue
            for session_file in chats_dir.glob("session-*.json"):
                dest = GEMINI_BACKUP_DIR / project_dir.name / "chats" / session_file.name
                _copy_file(session_file, dest)

    result: dict[str, Any] = {
        "backed_up": backed_up,
        "already_current": already_current,
        "backup_dir": str(BACKUP_BASE_DIR),
        "next_steps": [
            "Run periodically to keep sessions safe: dataclaw backup-sessions",
            "Automate with cron (runs every hour): add to crontab via `crontab -e`:",
            "  0 * * * * dataclaw backup-sessions",
        ],
    }
    if errors:
        # Cap the error list so the JSON summary stays readable.
        result["warning"] = f"{len(errors)} file(s) could not be backed up."
        result["errors"] = errors[:5]
    print(json.dumps(result, indent=2))


def status() -> None:
"""Show current stage and next steps (JSON). Read-only — does not modify config."""
config = load_config()
Expand Down Expand Up @@ -1213,6 +1307,17 @@ def main() -> None:
us = sub.add_parser("update-skill", help="Install/update the dataclaw skill for a coding agent")
us.add_argument("target", choices=["claude"], help="Agent to install skill for")

bs = sub.add_parser(
"backup-sessions",
help="Copy session files to ~/.dataclaw/session_backup/ to prevent tool auto-deletion",
)
bs.add_argument(
"--source",
choices=SOURCE_CHOICES,
default="auto",
help="Which source(s) to back up (default: all detected)",
)

cfg = sub.add_parser("config", help="View or set config")
cfg.add_argument("--repo", type=str, help="Set HF repo")
cfg.add_argument("--source", choices=sorted(EXPLICIT_SOURCE_CHOICES),
Expand Down Expand Up @@ -1284,6 +1389,10 @@ def main() -> None:
update_skill(args.target)
return

if command == "backup-sessions":
backup_sessions(source_filter=args.source)
return

if command == "list":
config = load_config()
resolved_source_choice, _ = _resolve_source_choice(args.source, config)
Expand Down
130 changes: 95 additions & 35 deletions dataclaw/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@

CUSTOM_DIR = Path.home() / ".dataclaw" / "custom"

# Backup directory — session files are copied here by `dataclaw backup-sessions`
# to survive automatic cleanup by coding tools (Claude Code, Gemini CLI, etc.)
BACKUP_BASE_DIR = Path.home() / ".dataclaw" / "session_backup"
# Mirrors the ~/.claude/projects/ layout, so backed-up sessions parse identically.
CLAUDE_BACKUP_PROJECTS_DIR = BACKUP_BASE_DIR / "claude" / "projects"
# Mirrors the Gemini CLI <project-hash>/chats/ layout.
GEMINI_BACKUP_DIR = BACKUP_BASE_DIR / "gemini"

_CODEX_PROJECT_INDEX: dict[str, list[Path]] = {}
_GEMINI_HASH_MAP: dict[str, str] = {}
_OPENCODE_PROJECT_INDEX: dict[str, list[str]] = {}
Expand Down Expand Up @@ -145,26 +151,42 @@ def discover_projects() -> list[dict]:


def _discover_claude_projects() -> list[dict]:
if not PROJECTS_DIR.exists():
base_dirs = [d for d in [PROJECTS_DIR, CLAUDE_BACKUP_PROJECTS_DIR] if d.exists()]
if not base_dirs:
return []

# Collect all project dirs from both original and backup, merging by dir name
all_project_dirs: dict[str, list[Path]] = {}
for base_dir in base_dirs:
for project_dir in sorted(base_dir.iterdir()):
if not project_dir.is_dir():
continue
all_project_dirs.setdefault(project_dir.name, []).append(project_dir)

projects = []
for project_dir in sorted(PROJECTS_DIR.iterdir()):
if not project_dir.is_dir():
continue
root_sessions = list(project_dir.glob("*.jsonl"))
subagent_sessions = _find_subagent_only_sessions(project_dir)
total_count = len(root_sessions) + len(subagent_sessions)
for dir_name, dirs in sorted(all_project_dirs.items()):
# Merge session files across dirs, deduplicating by filename
seen_files: dict[str, Path] = {}
for d in dirs:
for f in d.glob("*.jsonl"):
seen_files.setdefault(f.name, f)

# Collect subagent-only session dirs from all locations
subagent_sessions: list[Path] = []
for d in dirs:
subagent_sessions.extend(_find_subagent_only_sessions(d))

total_count = len(seen_files) + len(subagent_sessions)
if total_count == 0:
continue
total_size = sum(f.stat().st_size for f in root_sessions)
total_size = sum(f.stat().st_size for f in seen_files.values())
for session_dir in subagent_sessions:
for sa_file in (session_dir / "subagents").glob("agent-*.jsonl"):
total_size += sa_file.stat().st_size
projects.append(
{
"dir_name": project_dir.name,
"display_name": _build_project_name(project_dir.name),
"dir_name": dir_name,
"display_name": _build_project_name(dir_name),
"session_count": total_count,
"total_size_bytes": total_size,
"source": CLAUDE_SOURCE,
Expand Down Expand Up @@ -192,25 +214,36 @@ def _discover_codex_projects() -> list[dict]:


def _discover_gemini_projects() -> list[dict]:
if not GEMINI_DIR.exists():
base_dirs = [d for d in [GEMINI_DIR, GEMINI_BACKUP_DIR] if d.exists()]
if not base_dirs:
return []

# Collect all project dirs from both original and backup, merging by hash name
all_project_dirs: dict[str, list[Path]] = {}
for base_dir in base_dirs:
for project_dir in sorted(base_dir.iterdir()):
if not project_dir.is_dir() or project_dir.name == "bin":
continue
chats_dir = project_dir / "chats"
if not chats_dir.exists():
continue
all_project_dirs.setdefault(project_dir.name, []).append(project_dir)

projects = []
for project_dir in sorted(GEMINI_DIR.iterdir()):
if not project_dir.is_dir() or project_dir.name == "bin":
continue
chats_dir = project_dir / "chats"
if not chats_dir.exists():
continue
sessions = list(chats_dir.glob("session-*.json"))
if not sessions:
for project_hash, dirs in all_project_dirs.items():
# Merge session files across dirs, deduplicating by filename
seen_files: dict[str, Path] = {}
for d in dirs:
for f in (d / "chats").glob("session-*.json"):
seen_files.setdefault(f.name, f)
if not seen_files:
continue
projects.append(
{
"dir_name": project_dir.name,
"display_name": f"gemini:{_resolve_gemini_hash(project_dir.name)}",
"session_count": len(sessions),
"total_size_bytes": sum(f.stat().st_size for f in sessions),
"dir_name": project_hash,
"display_name": f"gemini:{_resolve_gemini_hash(project_hash)}",
"session_count": len(seen_files),
"total_size_bytes": sum(f.stat().st_size for f in seen_files.values()),
"source": GEMINI_SOURCE,
}
)
Expand Down Expand Up @@ -465,11 +498,17 @@ def parse_project_sessions(
return sessions

if source == GEMINI_SOURCE:
project_path = GEMINI_DIR / project_dir_name / "chats"
if not project_path.exists():
# Merge session files from original dir and backup, deduplicating by filename
session_file_map: dict[str, Path] = {}
for base_dir in [GEMINI_DIR, GEMINI_BACKUP_DIR]:
chats_dir = base_dir / project_dir_name / "chats"
if chats_dir.exists():
for f in chats_dir.glob("session-*.json"):
session_file_map.setdefault(f.name, f)
if not session_file_map:
return []
sessions = []
for session_file in sorted(project_path.glob("session-*.json")):
for session_file in sorted(session_file_map.values()):
parsed = _parse_gemini_session_file(session_file, anonymizer, include_thinking)
if parsed and parsed["messages"]:
parsed["project"] = f"gemini:{_resolve_gemini_hash(project_dir_name)}"
Expand Down Expand Up @@ -511,24 +550,45 @@ def parse_project_sessions(
sessions.append(parsed)
return sessions

project_path = PROJECTS_DIR / project_dir_name
if not project_path.exists():
# Merge session files from original dir and backup, deduplicating by filename
session_file_map: dict[str, Path] = {}
project_dirs = []
for base_dir in [PROJECTS_DIR, CLAUDE_BACKUP_PROJECTS_DIR]:
project_path = base_dir / project_dir_name
if project_path.exists():
project_dirs.append(project_path)
for f in project_path.glob("*.jsonl"):
session_file_map.setdefault(f.name, f)

if not project_dirs:
return []

sessions = []
for session_file in sorted(project_path.glob("*.jsonl")):
seen_session_ids: set[str] = set()
for session_file in sorted(session_file_map.values()):
parsed = _parse_claude_session_file(session_file, anonymizer, include_thinking)
if parsed and parsed["messages"]:
session_id = parsed.get("session_id", "")
if session_id and session_id in seen_session_ids:
continue
if session_id:
seen_session_ids.add(session_id)
parsed["project"] = _build_project_name(project_dir_name)
parsed["source"] = CLAUDE_SOURCE
sessions.append(parsed)

for session_dir in _find_subagent_only_sessions(project_path):
parsed = _parse_subagent_session(session_dir, anonymizer, include_thinking)
if parsed and parsed["messages"]:
parsed["project"] = _build_project_name(project_dir_name)
parsed["source"] = CLAUDE_SOURCE
sessions.append(parsed)
for project_path in project_dirs:
for session_dir in _find_subagent_only_sessions(project_path):
parsed = _parse_subagent_session(session_dir, anonymizer, include_thinking)
if parsed and parsed["messages"]:
session_id = parsed.get("session_id", "")
if session_id and session_id in seen_session_ids:
continue
if session_id:
seen_session_ids.add(session_id)
parsed["project"] = _build_project_name(project_dir_name)
parsed["source"] = CLAUDE_SOURCE
sessions.append(parsed)

return sessions

Expand Down
Loading