From c2f7bd0eb6a44db9de38d88996b9676353e8e182 Mon Sep 17 00:00:00 2001
From: keyliang <keyliang@txcombo.com>
Date: Mon, 2 Mar 2026 22:52:40 +0800
Subject: [PATCH] Fix: Explicit UTF-8 encoding for cross-platform file reading

On Windows (especially under Chinese and other East Asian locales) and on
some other systems, Python's default text encoding follows the locale
rather than UTF-8. This causes a UnicodeDecodeError when reading session
files that contain non-ASCII characters.

This fix explicitly specifies encoding="utf-8" in all open() and
read_text() calls that read JSON or JSONL files.

Changes:
- Line 82: Add encoding="utf-8" to read_text() in _extract_project_path_from_sessions()
- Line 124: Add encoding="utf-8" to open() in _iter_jsonl()
- Line 266: Add encoding="utf-8" to read_text() in _load_kimi_work_dirs()
---
 dataclaw/parser.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dataclaw/parser.py b/dataclaw/parser.py
index 50cc39e..b7f8977 100644
--- a/dataclaw/parser.py
+++ b/dataclaw/parser.py
@@ -79,7 +79,7 @@ def _extract_project_path_from_sessions(project_hash: str) -> str | None:
         return None
     for session_file in sorted(chats_dir.glob("session-*.json"), reverse=True):
         try:
-            data = json.loads(session_file.read_text())
+            data = json.loads(session_file.read_text(encoding="utf-8"))
         except (json.JSONDecodeError, OSError):
             continue
         for msg in data.get("messages", []):
@@ -121,7 +121,7 @@ def _resolve_gemini_hash(project_hash: str) -> str:
 
 def _iter_jsonl(filepath: Path):
     """Yield parsed JSON objects from a JSONL file, skipping blank/malformed lines."""
-    with open(filepath) as f:
+    with open(filepath, encoding="utf-8") as f:
         for line in f:
             line = line.strip()
             if not line:
@@ -263,7 +263,7 @@ def _load_kimi_work_dirs() -> dict[str, str]:
     if not KIMI_CONFIG_PATH.exists():
         return {}
     try:
-        data = json.loads(KIMI_CONFIG_PATH.read_text())
+        data = json.loads(KIMI_CONFIG_PATH.read_text(encoding="utf-8"))
         work_dirs = data.get("work_dirs", [])
         return {
             entry.get("path", ""): entry.get("path", "")