From 3bf4697b33e2734713f8f1264e81ba2a530a5304 Mon Sep 17 00:00:00 2001 From: lyx Date: Thu, 26 Feb 2026 11:57:22 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AF=B9=20Kimi=20CLI=20?= =?UTF-8?q?=E7=9A=84=E6=94=AF=E6=8C=81=EF=BC=8C=E5=8C=85=E6=8B=AC=E6=BA=90?= =?UTF-8?q?=E7=9B=AE=E5=BD=95=E3=80=81=E9=A1=B9=E7=9B=AE=E5=8F=91=E7=8E=B0?= =?UTF-8?q?=E5=92=8C=E4=BC=9A=E8=AF=9D=E8=A7=A3=E6=9E=90=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dataclaw/cli.py | 37 ++++--- dataclaw/config.py | 3 +- dataclaw/parser.py | 227 ++++++++++++++++++++++++++++++++++++++++++- tests/test_cli.py | 4 +- tests/test_parser.py | 2 + 5 files changed, 255 insertions(+), 18 deletions(-) diff --git a/dataclaw/cli.py b/dataclaw/cli.py index e9adffc..eb7af4f 100644 --- a/dataclaw/cli.py +++ b/dataclaw/cli.py @@ -12,7 +12,7 @@ from .anonymizer import Anonymizer from .config import CONFIG_FILE, DataClawConfig, load_config, save_config -from .parser import CLAUDE_DIR, CODEX_DIR, GEMINI_DIR, OPENCODE_DIR, discover_projects, parse_project_sessions +from .parser import CLAUDE_DIR, CODEX_DIR, GEMINI_DIR, OPENCODE_DIR, KIMI_DIR, discover_projects, parse_project_sessions from .secrets import _has_mixed_char_types, _shannon_entropy, redact_session HF_TAG = "dataclaw" @@ -58,8 +58,8 @@ "Step 6/6: After explicit user approval, publish: dataclaw export --publish-attestation \"User explicitly approved publishing to Hugging Face.\"", ] -EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "gemini", "opencode", "all", "both"} -SOURCE_CHOICES = ["auto", "claude", "codex", "gemini", "opencode", "all"] +EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "gemini", "opencode", "kimi", "all", "both"} +SOURCE_CHOICES = ["auto", "claude", "codex", "gemini", "opencode", "kimi", "all"] def _mask_secret(s: str) -> str: @@ -87,7 +87,9 @@ def _source_label(source_filter: str) -> str: return "Gemini CLI" if source_filter == "opencode": return "OpenCode" - return "Claude Code, Codex, Gemini CLI, or OpenCode" + if source_filter == "kimi": + return "Kimi CLI" + return "Claude Code, Codex, Gemini CLI, OpenCode, or Kimi CLI" def _normalize_source_filter(source_filter: str) -> str: @@ -129,7 +131,9 @@ def _has_session_sources(source_filter: str = "auto") -> bool: return GEMINI_DIR.exists() if source_filter == "opencode": return OPENCODE_DIR.exists() - return CLAUDE_DIR.exists() or CODEX_DIR.exists() or GEMINI_DIR.exists() or OPENCODE_DIR.exists() + if source_filter == "kimi": + return KIMI_DIR.exists() + return CLAUDE_DIR.exists() or CODEX_DIR.exists() or GEMINI_DIR.exists() or OPENCODE_DIR.exists() or KIMI_DIR.exists() def _filter_projects_by_source(projects: list[dict], source_filter: str) -> list[dict]: @@ -215,14 +219,14 @@ def _build_status_next_steps( steps = [] if not source_confirmed: steps.append( - "Ask the user to explicitly choose export source scope: Claude Code, Codex, Gemini, or all. " - "Then set it: dataclaw config --source . " + "Ask the user to explicitly choose export source scope: Claude Code, Codex, Gemini, OpenCode, Kimi, or all. " + "Then set it: dataclaw config --source . " "Do not run export until source scope is explicitly confirmed." ) else: steps.append( f"Source scope is currently set to '{configured_source}'. " - "If the user wants a different scope, run: dataclaw config --source ." + "If the user wants a different scope, run: dataclaw config --source ." ) if not projects_confirmed: steps.append( @@ -1196,7 +1200,7 @@ def main() -> None: cfg = sub.add_parser("config", help="View or set config") cfg.add_argument("--repo", type=str, help="Set HF repo") cfg.add_argument("--source", choices=sorted(EXPLICIT_SOURCE_CHOICES), - help="Set export source scope explicitly: claude, codex, gemini, or all") + help="Set export source scope explicitly: claude, codex, gemini, opencode, kimi, or all") cfg.add_argument("--exclude", type=str, help="Comma-separated projects to exclude") cfg.add_argument("--redact", type=str, help="Comma-separated strings to always redact (API keys, usernames, domains)") @@ -1317,12 +1321,12 @@ def _run_export(args) -> None: "error": "Source scope is not confirmed yet.", "hint": ( "Explicitly choose one source scope before exporting: " - "`claude`, `codex`, `gemini`, or `all`." + "`claude`, `codex`, `gemini`, `opencode`, `kimi`, or `all`." ), "required_action": ( - "Ask the user whether to export Claude Code, Codex, Gemini, or all. " - "Then run `dataclaw config --source ` " - "or pass `--source ` on the export command." + "Ask the user whether to export Claude Code, Codex, Gemini, OpenCode, Kimi, or all. " + "Then run `dataclaw config --source ` " + "or pass `--source ` on the export command." ), "allowed_sources": sorted(EXPLICIT_SOURCE_CHOICES), "blocked_on_step": "Step 2/6", @@ -1414,8 +1418,13 @@ def _run_export(args) -> None: elif source_filter == "gemini": from .parser import GEMINI_DIR print(f"Error: {GEMINI_DIR} not found.", file=sys.stderr) + elif source_filter == "opencode": + from .parser import OPENCODE_DIR + print(f"Error: {OPENCODE_DIR} not found.", file=sys.stderr) + elif source_filter == "kimi": + print(f"Error: {KIMI_DIR} not found.", file=sys.stderr) else: - print("Error: none of ~/.claude, ~/.codex, or ~/.gemini/tmp were found.", file=sys.stderr) + print("Error: none of ~/.claude, ~/.codex, ~/.gemini/tmp, ~/.local/share/opencode, or ~/.kimi were found.", file=sys.stderr) sys.exit(1) projects = _filter_projects_by_source(discover_projects(), source_filter) diff --git a/dataclaw/config.py b/dataclaw/config.py index 45add4c..21fed8e 100644 --- a/dataclaw/config.py +++ b/dataclaw/config.py @@ -13,7 +13,7 @@ class DataClawConfig(TypedDict, total=False): """Expected shape of the config dict.""" repo: str | None - source: str | None # "claude" | "codex" | "gemini" | "all" + source: str | None # "claude" | "codex" | "gemini" | "opencode" | "kimi" | "all" excluded_projects: list[str] redact_strings: list[str] redact_usernames: list[str] @@ -31,6 +31,7 @@ class DataClawConfig(TypedDict, total=False): "source": None, "excluded_projects": [], "redact_strings": [], + "redact_usernames": [], } diff --git a/dataclaw/parser.py b/dataclaw/parser.py index 12f3889..4004c31 100644 --- a/dataclaw/parser.py +++ b/dataclaw/parser.py @@ -18,6 +18,7 @@ CODEX_SOURCE = "codex" GEMINI_SOURCE = "gemini" OPENCODE_SOURCE = "opencode" +KIMI_SOURCE = "kimi" CLAUDE_DIR = Path.home() / ".claude" PROJECTS_DIR = CLAUDE_DIR / "projects" @@ -33,9 +34,15 @@ OPENCODE_DB_PATH = OPENCODE_DIR / "opencode.db" UNKNOWN_OPENCODE_CWD = "" +KIMI_DIR = Path.home() / ".kimi" +KIMI_SESSIONS_DIR = KIMI_DIR / "sessions" +KIMI_CONFIG_PATH = KIMI_DIR / "kimi.json" +UNKNOWN_KIMI_CWD = "" + _CODEX_PROJECT_INDEX: dict[str, list[Path]] = {} _GEMINI_HASH_MAP: dict[str, str] = {} _OPENCODE_PROJECT_INDEX: dict[str, list[str]] = {} +_KIMI_PROJECT_INDEX: dict[str, list[Path]] = {} def _build_gemini_hash_map() -> dict[str, str]: @@ -117,11 +124,12 @@ def _iter_jsonl(filepath: Path): def discover_projects() -> list[dict]: - """Discover Claude Code, Codex, and Gemini CLI projects with session counts.""" + """Discover Claude Code, Codex, Gemini CLI, OpenCode, and Kimi CLI projects with session counts.""" projects = _discover_claude_projects() projects.extend(_discover_codex_projects()) projects.extend(_discover_gemini_projects()) projects.extend(_discover_opencode_projects()) + projects.extend(_discover_kimi_projects()) return sorted(projects, key=lambda p: (p["display_name"], p["source"])) @@ -220,6 +228,89 @@ def _discover_opencode_projects() -> list[dict]: return projects +def _load_kimi_work_dirs() -> dict[str, str]: + """Load Kimi work directory mapping from config file. + + Returns a mapping from project_hash -> work_dir_path. + """ + if not KIMI_CONFIG_PATH.exists(): + return {} + try: + data = json.loads(KIMI_CONFIG_PATH.read_text()) + work_dirs = data.get("work_dirs", []) + return { + entry.get("path", ""): entry.get("path", "") + for entry in work_dirs + if entry.get("path") + } + except (json.JSONDecodeError, OSError): + return {} + + +def _get_kimi_project_hash(cwd: str) -> str: + """Generate Kimi project hash from working directory path. + + Kimi uses MD5 hash of the absolute path as project directory name. + """ + return hashlib.md5(cwd.encode()).hexdigest() + + +def _discover_kimi_projects() -> list[dict]: + """Discover Kimi CLI projects with session counts.""" + if not KIMI_SESSIONS_DIR.exists(): + return [] + + # 加载工作目录配置以获取路径映射 + work_dirs = _load_kimi_work_dirs() + # 建立路径到哈希的反向映射 + path_to_hash = {path: _get_kimi_project_hash(path) for path in work_dirs} + hash_to_path = {h: p for p, h in path_to_hash.items()} + + projects = [] + for project_dir in sorted(KIMI_SESSIONS_DIR.iterdir()): + if not project_dir.is_dir(): + continue + + project_hash = project_dir.name + # 查找所有会话子目录 + session_dirs = [d for d in project_dir.iterdir() if d.is_dir()] + if not session_dirs: + continue + + # 计算总会话数和总大小 + total_sessions = 0 + total_size = 0 + for session_dir in session_dirs: + context_file = session_dir / "context.jsonl" + if context_file.exists(): + total_sessions += 1 + total_size += context_file.stat().st_size + + if total_sessions == 0: + continue + + # 尝试解析项目路径 + project_path = hash_to_path.get(project_hash) + if project_path: + display_name = f"kimi:{Path(project_path).name}" + dir_name = project_path + else: + # 无法解析时使用哈希前8位 + display_name = f"kimi:{project_hash[:8]}" + dir_name = project_hash + + projects.append( + { + "dir_name": dir_name, + "display_name": display_name, + "session_count": total_sessions, + "total_size_bytes": total_size, + "source": KIMI_SOURCE, + } + ) + return projects + + def parse_project_sessions( project_dir_name: str, anonymizer: Anonymizer, @@ -274,6 +365,34 @@ def parse_project_sessions( sessions.append(parsed) return sessions + if source == KIMI_SOURCE: + # project_dir_name 是工作目录路径 + project_hash = _get_kimi_project_hash(project_dir_name) + project_path = KIMI_SESSIONS_DIR / project_hash + if not project_path.exists(): + return [] + + sessions = [] + for session_dir in sorted(project_path.iterdir()): + if not session_dir.is_dir(): + continue + context_file = session_dir / "context.jsonl" + if not context_file.exists(): + continue + parsed = _parse_kimi_session_file( + context_file, + anonymizer=anonymizer, + include_thinking=include_thinking, + ) + if parsed and parsed["messages"]: + parsed["project"] = _build_kimi_project_name(project_dir_name) + parsed["source"] = KIMI_SOURCE + # 如果模型未设置,使用默认模型名 + if not parsed.get("model"): + parsed["model"] = "kimi-k2" + sessions.append(parsed) + return sessions + project_path = PROJECTS_DIR / project_dir_name if not project_path.exists(): return [] @@ -973,6 +1092,12 @@ def _build_opencode_project_name(cwd: str) -> str: return f"opencode:{Path(cwd).name or cwd}" +def _build_kimi_project_name(cwd: str) -> str: + if cwd == UNKNOWN_KIMI_CWD: + return "kimi:unknown" + return f"kimi:{Path(cwd).name or cwd}" + + def _get_opencode_project_index(refresh: bool = False) -> dict[str, list[str]]: global _OPENCODE_PROJECT_INDEX if refresh or not _OPENCODE_PROJECT_INDEX: @@ -980,6 +1105,106 @@ def _get_opencode_project_index(refresh: bool = False) -> dict[str, list[str]]: return _OPENCODE_PROJECT_INDEX +def _parse_kimi_session_file( + filepath: Path, + anonymizer: Anonymizer, + include_thinking: bool = True, +) -> dict | None: + """Parse a Kimi CLI context.jsonl file into structured session data.""" + messages: list[dict[str, Any]] = [] + metadata: dict[str, Any] = { + "session_id": filepath.parent.name, # session_id from directory name + "cwd": None, + "git_branch": None, + "model": None, + "start_time": None, + "end_time": None, + } + stats = _make_stats() + + try: + for entry in _iter_jsonl(filepath): + role = entry.get("role") + + if role == "user": + content = entry.get("content") + if isinstance(content, str) and content.strip(): + messages.append({ + "role": "user", + "content": anonymizer.text(content.strip()), + "timestamp": None, + }) + stats["user_messages"] += 1 + + elif role == "assistant": + msg: dict[str, Any] = {"role": "assistant"} + + # 提取内容(可能包含 think 和 text) + content = entry.get("content") + text_parts = [] + thinking_parts = [] + + if isinstance(content, list): + for block in content: + if not isinstance(block, dict): + continue + block_type = block.get("type") + if block_type == "text": + text = block.get("text", "").strip() + if text: + text_parts.append(anonymizer.text(text)) + elif block_type == "think" and include_thinking: + think = block.get("think", "").strip() + if think: + thinking_parts.append(anonymizer.text(think)) + + if text_parts: + msg["content"] = "\n\n".join(text_parts) + if thinking_parts: + msg["thinking"] = "\n\n".join(thinking_parts) + + # 提取工具调用 + tool_calls = entry.get("tool_calls", []) + tool_uses = [] + if isinstance(tool_calls, list): + for tc in tool_calls: + if not isinstance(tc, dict): + continue + func = tc.get("function", {}) + if isinstance(func, dict): + tool_name = func.get("name") + args_str = func.get("arguments", "") + try: + args = json.loads(args_str) if isinstance(args_str, str) else args_str + except json.JSONDecodeError: + args = args_str + tool_uses.append({ + "tool": tool_name, + "input": _summarize_tool_input(tool_name, args, anonymizer), + }) + + if tool_uses: + msg["tool_uses"] = tool_uses + stats["tool_uses"] += len(tool_uses) + + # 只添加有内容的助手消息 + if text_parts or thinking_parts or tool_uses: + messages.append(msg) + stats["assistant_messages"] += 1 + + elif role == "_usage": + # 提取 token 使用量 + token_count = entry.get("token_count") + if isinstance(token_count, int): + # Kimi 的 token_count 是累积值,我们取最大值作为输出 + stats["output_tokens"] = max(stats["output_tokens"], token_count) + + except OSError: + return None + + return _make_session_result(metadata, messages, stats) + + def _build_opencode_project_index() -> dict[str, list[str]]: if not OPENCODE_DB_PATH.exists(): return {} diff --git a/tests/test_cli.py b/tests/test_cli.py index adfd118..d51995f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -400,7 +400,7 @@ def test_no_projects(self, monkeypatch, capsys): monkeypatch.setattr("dataclaw.cli.discover_projects", lambda: []) list_projects() captured = capsys.readouterr() - assert "No Claude Code, Codex, Gemini CLI, or OpenCode sessions" in captured.out + assert "No Claude Code, Codex, Gemini CLI, OpenCode, or Kimi CLI sessions" in captured.out def test_source_filter_codex(self, monkeypatch, capsys): monkeypatch.setattr( @@ -626,7 +626,7 @@ def test_export_requires_explicit_source_selection(self, monkeypatch, capsys): assert payload["error"] == "Source scope is not confirmed yet." assert payload["blocked_on_step"] == "Step 2/6" assert len(payload["process_steps"]) == 6 - assert payload["allowed_sources"] == ["all", "both", "claude", "codex", "gemini", "opencode"] + assert payload["allowed_sources"] == ["all", "both", "claude", "codex", "gemini", "kimi", "opencode"] assert payload["next_command"] == "dataclaw config --source all" def test_configure_next_steps_require_full_folder_presentation(self): diff --git a/tests/test_parser.py b/tests/test_parser.py index 0f0e506..22ae9f6 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -400,6 +400,7 @@ def _disable_codex(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser.GEMINI_DIR", tmp_path / "no-gemini") monkeypatch.setattr("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db") monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") def _write_opencode_db(self, db_path): conn = sqlite3.connect(db_path) @@ -948,6 +949,7 @@ def _disable_codex(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser.GEMINI_DIR", tmp_path / "no-gemini") monkeypatch.setattr("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db") monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") def test_discover_includes_subagent_sessions(self, tmp_path, monkeypatch, mock_anonymizer): self._disable_codex(tmp_path, monkeypatch) From 902e285fa6187b7e252f8aa272f9c722c86a3564 Mon Sep 17 00:00:00 2001 From: lyx Date: Thu, 26 Feb 2026 11:59:54 +0800 Subject: [PATCH 2/2] update comments --- dataclaw/parser.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dataclaw/parser.py b/dataclaw/parser.py index 4004c31..d2969a6 100644 --- a/dataclaw/parser.py +++ b/dataclaw/parser.py @@ -260,9 +260,9 @@ def _discover_kimi_projects() -> list[dict]: if not KIMI_SESSIONS_DIR.exists(): return [] - # 加载工作目录配置以获取路径映射 + # Load work directory config to get path mapping work_dirs = _load_kimi_work_dirs() - # 建立路径到哈希的反向映射 + # Build reverse mapping from path to hash path_to_hash = {path: _get_kimi_project_hash(path) for path in work_dirs} hash_to_path = {h: p for p, h in path_to_hash.items()} @@ -272,12 +272,12 @@ def _discover_kimi_projects() -> list[dict]: continue project_hash = project_dir.name - # 查找所有会话子目录 + # Find all session subdirectories session_dirs = [d for d in project_dir.iterdir() if d.is_dir()] if not session_dirs: continue - # 计算总会话数和总大小 + # Calculate total sessions and size total_sessions = 0 total_size = 0 for session_dir in session_dirs: @@ -289,13 +289,13 @@ def _discover_kimi_projects() -> list[dict]: if total_sessions == 0: continue - # 尝试解析项目路径 + # Try to resolve project path project_path = hash_to_path.get(project_hash) if project_path: display_name = f"kimi:{Path(project_path).name}" dir_name = project_path else: - # 无法解析时使用哈希前8位 + # Use first 8 chars of hash if unresolved display_name = f"kimi:{project_hash[:8]}" dir_name = project_hash @@ -366,7 +366,7 @@ def parse_project_sessions( return sessions if source == KIMI_SOURCE: - # project_dir_name 是工作目录路径 + # project_dir_name is the working directory path project_hash = _get_kimi_project_hash(project_dir_name) project_path = KIMI_SESSIONS_DIR / project_hash if not project_path.exists(): @@ -387,7 +387,7 @@ def parse_project_sessions( if parsed and parsed["messages"]: parsed["project"] = _build_kimi_project_name(project_dir_name) parsed["source"] = KIMI_SOURCE - # 如果模型未设置,使用默认模型名 + # Use default model name if not set if not parsed.get("model"): parsed["model"] = "kimi-k2" sessions.append(parsed) @@ -1139,7 +1139,7 @@ def _parse_kimi_session_file( elif role == "assistant": msg: dict[str, Any] = {"role": "assistant"} - # 提取内容(可能包含 think 和 text) + # Extract content (may include think and text) content = entry.get("content") text_parts = [] thinking_parts = [] @@ -1163,7 +1163,7 @@ def _parse_kimi_session_file( if thinking_parts: msg["thinking"] = "\n\n".join(thinking_parts) - # 提取工具调用 + # Extract tool calls tool_calls = entry.get("tool_calls", []) tool_uses = [] if isinstance(tool_calls, list): @@ -1187,16 +1187,16 @@ def _parse_kimi_session_file( msg["tool_uses"] = tool_uses stats["tool_uses"] += len(tool_uses) - # 只添加有内容的助手消息 + # Only add assistant messages with content if text_parts or thinking_parts or tool_uses: messages.append(msg) stats["assistant_messages"] += 1 elif role == "_usage": - # 提取 token 使用量 + # Extract token usage token_count = entry.get("token_count") if isinstance(token_count, int): - # Kimi 的 token_count 是累积值,我们取最大值作为输出 + # Kimi's token_count is cumulative, use max as output stats["output_tokens"] = max(stats["output_tokens"], token_count) except OSError: