From 9cb0e6ab8263944689354baba8a0d671168c3963 Mon Sep 17 00:00:00 2001 From: woctordho Date: Fri, 27 Feb 2026 10:21:06 +0800 Subject: [PATCH 1/4] Adjust anonymizer logic and add Windows support --- dataclaw/anonymizer.py | 83 +++++++++++++++++----------------------- tests/test_anonymizer.py | 65 +++++++++---------------------- 2 files changed, 54 insertions(+), 94 deletions(-) diff --git a/dataclaw/anonymizer.py b/dataclaw/anonymizer.py index 045fe01..5258312 100644 --- a/dataclaw/anonymizer.py +++ b/dataclaw/anonymizer.py @@ -15,58 +15,49 @@ def _detect_home_dir() -> tuple[str, str]: return home, username -def anonymize_path(path: str, username: str, username_hash: str, home: str | None = None) -> str: - """Strip a path to project-relative and hash the username.""" - if not path: - return path - - if home is None: - home = os.path.expanduser("~") - prefixes = set() - for base in (f"/Users/{username}", f"/home/{username}", home): - for subdir in ("Documents", "Downloads", "Desktop"): - prefixes.add(f"{base}/{subdir}/") - prefixes.add(f"{base}/") +def anonymize_text(text: str, username: str, username_hash: str, home: str | None = None) -> str: + if not text or not username: + return text - # Try longest prefixes first (subdirectory matches before bare home) - home_patterns = sorted(prefixes, key=len, reverse=True) + escaped = re.escape(username) - for prefix in home_patterns: - if path.startswith(prefix): - rest = path[len(prefix):] - if "/Documents/" in prefix or "/Downloads/" in prefix or "/Desktop/" in prefix: - return rest - return f"{username_hash}/{rest}" + # Replace bare username in contexts (ls output, prose, etc.) + # Only if username is >= 4 chars to avoid false positives + # \b does not match word boundaries around underscore, but we need to match them + if len(username) >= 4: + return re.sub(rf"(? , \Users\ , \\Users\\ + # Ignore case for Windows-like pattern + text = re.sub(rf"([/\\]+Users[/\\]+){escaped}(?=[^a-zA-Z0-9]|$)", rf"\g<1>{username_hash}", text, flags=re.IGNORECASE) + # Replace /home/ + text = re.sub(rf"/home/{escaped}(?=[^a-zA-Z0-9]|$)", f"/home/{username_hash}", text) -def anonymize_text(text: str, username: str, username_hash: str) -> str: - if not text or not username: - return text + # If home is not conventional, replace it with more specific pattern + if home and not home.startswith(("/Users/", "/home/", "C:\\Users\\")): + # Escape home and replace / or \ with `r"[/\\]+"` + home_escaped = home.replace("\\", "/") + home_escaped = re.escape(home_escaped) + home_escaped = home_escaped.replace("/", r"[/\\]+") - escaped = re.escape(username) + def f(match): + # match.group(0) is a non-escaped string + return re.sub(rf"(? and /home/ - text = re.sub(rf"/Users/{escaped}(?=/|[^a-zA-Z0-9_-]|$)", f"/{username_hash}", text) - text = re.sub(rf"/home/{escaped}(?=/|[^a-zA-Z0-9_-]|$)", f"/{username_hash}", text) + text = re.sub(home_escaped, f, text, flags=re.IGNORECASE) - # Catch hyphen-encoded paths: -Users-peteromalley- or -Users-peteromalley/ - text = re.sub(rf"-Users-{escaped}(?=-|/|$)", f"-Users-{username_hash}", text) - text = re.sub(rf"-home-{escaped}(?=-|/|$)", f"-home-{username_hash}", text) + # Catch hyphen-encoded paths: -Users-username- or -Users-username/ + text = re.sub(rf"-Users-{escaped}(?=[^a-zA-Z0-9]|$)", f"-Users-{username_hash}", text, flags=re.IGNORECASE) + text = re.sub(rf"-home-{escaped}(?=[^a-zA-Z0-9]|$)", f"-home-{username_hash}", text) - # Catch temp paths like /private/tmp/claude-501/-Users-peteromalley/ - text = re.sub(rf"claude-\d+/-Users-{escaped}", f"claude-XXX/-Users-{username_hash}", text) + return text - # Final pass: replace bare username in remaining contexts (ls output, prose, etc.) - # Only if username is >= 4 chars to avoid false positives - if len(username) >= 4: - text = re.sub(rf"\b{escaped}\b", username_hash, text) - return text +# Backward compatibility +anonymize_path = anonymize_text class Anonymizer: @@ -84,22 +75,18 @@ def __init__(self, extra_usernames: list[str] | None = None): self._extra.append((name, _hash_username(name))) def path(self, file_path: str) -> str: - result = anonymize_path(file_path, self.username, self.username_hash, self.home) - result = anonymize_text(result, self.username, self.username_hash) - for name, hashed in self._extra: - result = _replace_username(result, name, hashed) - return result + return self.text(file_path) def text(self, content: str) -> str: - result = anonymize_text(content, self.username, self.username_hash) + result = anonymize_text(content, self.username, self.username_hash, self.home) for name, hashed in self._extra: result = _replace_username(result, name, hashed) return result def _replace_username(text: str, username: str, username_hash: str) -> str: - if not text or not username or len(username) < 3: + if not text or not username or len(username) < 4: return text escaped = re.escape(username) - text = re.sub(escaped, username_hash, text, flags=re.IGNORECASE) + text = re.sub(rf"(?= 4 chars, username is hashed using global replace result = anonymize_path( - "/Users/alice/Documents/myproject/src/main.py", - "alice", "user_abc12345", home="/Users/alice", + "/Users/alice/something", + "alice", "user_abc12345" ) - assert result == "myproject/src/main.py" - - def test_downloads_prefix_stripped(self): - result = anonymize_path( - "/Users/alice/Downloads/file.zip", - "alice", "user_abc12345", home="/Users/alice", - ) - assert result == "file.zip" - - def test_desktop_prefix_stripped(self): - result = anonymize_path( - "/Users/alice/Desktop/notes.txt", - "alice", "user_abc12345", home="/Users/alice", - ) - assert result == "notes.txt" + assert result == "/Users/user_abc12345/something" def test_bare_home_hashed(self): result = anonymize_path( - "/Users/alice/somedir/file.py", - "alice", "user_abc12345", home="/Users/alice", + "/Users/s/somedir/file.py", + "s", "user_abc12345", home="/Users/s", ) - assert result == "user_abc12345/somedir/file.py" + assert result == "/Users/user_abc12345/somedir/file.py" def test_linux_home_path(self): result = anonymize_path( - "/home/alice/Documents/project/file.py", - "alice", "user_abc12345", home="/home/alice", + "/home/s/Documents/project/file.py", + "s", "user_abc12345", home="/home/s", ) - assert result == "project/file.py" + assert result == "/home/user_abc12345/Documents/project/file.py" def test_path_not_under_home(self): result = anonymize_path( "/var/log/syslog", - "alice", "user_abc12345", home="/Users/alice", + "s", "user_abc12345", home="/Users/s", ) assert result == "/var/log/syslog" - def test_fallback_users_replacement(self): - # Path with username not matching the prefix set - result = anonymize_path( - "/tmp/Users/alice/something", - "alice", "user_abc12345", home="/Users/alice", - ) - # Falls through prefix matching, hits the fallback .replace - assert "user_abc12345" in result or "/tmp/" in result - # --- anonymize_text --- @@ -109,40 +87,35 @@ def test_users_path_replaced(self): "File at /Users/alice/project/main.py", "alice", "user_abc12345", ) - assert "/user_abc12345/project/main.py" in result + assert result == "File at /Users/user_abc12345/project/main.py" def test_home_path_replaced(self): result = anonymize_text( "File at /home/alice/project/main.py", "alice", "user_abc12345", ) - assert "/user_abc12345/project/main.py" in result + assert result == "File at /home/user_abc12345/project/main.py" def test_hyphen_encoded_path(self): result = anonymize_text( "-Users-alice-Documents-myproject", "alice", "user_abc12345", ) - assert "-Users-user_abc12345" in result + assert result == "-Users-user_abc12345-Documents-myproject" def test_temp_path(self): - # The hyphen-encoded path regex runs before the temp path regex, - # so the username gets replaced but claude-XXX may not trigger. - # The important thing is the username is anonymized. result = anonymize_text( "/private/tmp/claude-501/-Users-alice-Documents-proj/foo", "alice", "user_abc12345", ) - assert "alice" not in result - assert "user_abc12345" in result + assert result == "/private/tmp/claude-501/-Users-user_abc12345-Documents-proj/foo" def test_bare_username_replaced(self): result = anonymize_text( "Hello alice, welcome back", "alice", "user_abc12345", ) - assert "alice" not in result - assert "user_abc12345" in result + assert result == "Hello user_abc12345, welcome back" def test_short_username_not_replaced_bare(self): # Usernames < 4 chars should NOT be replaced as bare words @@ -150,7 +123,7 @@ def test_short_username_not_replaced_bare(self): "Hello bob, welcome back", "bob", "user_abc12345", ) - assert "bob" in result # bare replacement skipped for short username + assert result == "Hello bob, welcome back" def test_short_username_path_still_replaced(self): # Even short usernames should be replaced in path contexts @@ -158,7 +131,7 @@ def test_short_username_path_still_replaced(self): "File at /Users/bob/project", "bob", "user_abc12345", ) - assert "/user_abc12345/project" in result + assert result == "File at /Users/user_abc12345/project" # --- Anonymizer class --- From d83987afd9d8ec91eb05e74746b793638870ab9a Mon Sep 17 00:00:00 2001 From: woctordho Date: Fri, 27 Feb 2026 10:53:27 +0800 Subject: [PATCH 2/4] Simplify handling hyphen-encoded path --- dataclaw/anonymizer.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/dataclaw/anonymizer.py b/dataclaw/anonymizer.py index 5258312..4940fe4 100644 --- a/dataclaw/anonymizer.py +++ b/dataclaw/anonymizer.py @@ -29,19 +29,19 @@ def anonymize_text(text: str, username: str, username_hash: str, home: str | Non # When username is < 4 chars, replace with more specific patterns - # Replace /Users/ , \Users\ , \\Users\\ + # Match /Users/username , \Users\username , \\Users\\username , -Users-username (hyphen-encoded path like Claude Code and HuggingFace cache) + # Match conventional indicators of home dir: 'Users' and 'home' # Ignore case for Windows-like pattern - text = re.sub(rf"([/\\]+Users[/\\]+){escaped}(?=[^a-zA-Z0-9]|$)", rf"\g<1>{username_hash}", text, flags=re.IGNORECASE) - - # Replace /home/ - text = re.sub(rf"/home/{escaped}(?=[^a-zA-Z0-9]|$)", f"/home/{username_hash}", text) + text = re.sub(rf"([/\\-]+(Users|home)[/\\-]+){escaped}(?=[^a-zA-Z0-9]|$)", rf"\g<1>{username_hash}", text, flags=re.IGNORECASE) # If home is not conventional, replace it with more specific pattern if home and not home.startswith(("/Users/", "/home/", "C:\\Users\\")): - # Escape home and replace / or \ with `r"[/\\]+"` + # Escape home and replace / or \ with `r"[/\\-]+"` home_escaped = home.replace("\\", "/") home_escaped = re.escape(home_escaped) - home_escaped = home_escaped.replace("/", r"[/\\]+") + home_escaped = home_escaped.replace("/", r"[/\\-]+") + # In WSL and MSYS2, C:\ may be represented by /c/ + home_escaped = home_escaped.replace(":", ":?") def f(match): # match.group(0) is a non-escaped string @@ -49,10 +49,6 @@ def f(match): text = re.sub(home_escaped, f, text, flags=re.IGNORECASE) - # Catch hyphen-encoded paths: -Users-username- or -Users-username/ - text = re.sub(rf"-Users-{escaped}(?=[^a-zA-Z0-9]|$)", f"-Users-{username_hash}", text, flags=re.IGNORECASE) - text = re.sub(rf"-home-{escaped}(?=[^a-zA-Z0-9]|$)", f"-home-{username_hash}", text) - return text From e4a62d7740acee70bb3a399604a11a9138a23d94 Mon Sep 17 00:00:00 2001 From: woctordho Date: Fri, 27 Feb 2026 11:17:13 +0800 Subject: [PATCH 3/4] Add tests --- tests/test_anonymizer.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py index c9d6069..81d8111 100644 --- a/tests/test_anonymizer.py +++ b/tests/test_anonymizer.py @@ -68,6 +68,34 @@ def test_path_not_under_home(self): ) assert result == "/var/log/syslog" + def test_windows_users_path(self): + result = anonymize_path( + r"C:\Users\bob\Documents\file.txt", + "bob", "user_abc12345", + ) + assert result == r"C:\Users\user_abc12345\Documents\file.txt" + + def test_windows_users_path_double_backslashes(self): + result = anonymize_path( + r"\\Users\\bob\\Documents\\file.txt", + "bob", "user_abc12345", + ) + assert result == r"\\Users\\user_abc12345\\Documents\\file.txt" + + def test_windows_custom_home_path(self): + result = anonymize_path( + "C:\\custom_home\\bob\\project\\file.py", + "bob", "user_abc12345", home=r"C:\custom_home\bob", + ) + assert result == "C:\\custom_home\\user_abc12345\\project\\file.py" + + def test_msys2_custom_home_path(self): + result = anonymize_path( + "/c/custom_home/bob/project/file.py", + "bob", "user_abc12345", home=r"C:\custom_home\bob", + ) + assert result == "/c/custom_home/user_abc12345/project/file.py" + # --- anonymize_text --- From a46fa0780313da78779a3cc23659b4a290180dd8 Mon Sep 17 00:00:00 2001 From: woctordho Date: Fri, 27 Feb 2026 11:43:07 +0800 Subject: [PATCH 4/4] Optimize speed --- dataclaw/anonymizer.py | 95 +++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 29 deletions(-) diff --git a/dataclaw/anonymizer.py b/dataclaw/anonymizer.py index 4940fe4..3f04aa8 100644 --- a/dataclaw/anonymizer.py +++ b/dataclaw/anonymizer.py @@ -1,5 +1,6 @@ """Anonymize PII in Claude Code log data.""" +import functools import hashlib import os import re @@ -15,39 +16,62 @@ def _detect_home_dir() -> tuple[str, str]: return home, username +@functools.lru_cache(maxsize=32) +def _get_username_pattern(username: str) -> re.Pattern: + escaped = re.escape(username) + # \b does not match word boundaries around underscore, but we need to match them + return re.compile(rf"(? re.Pattern: + escaped = re.escape(username) + # Match /Users/username , \Users\username , \\Users\\username , -Users-username (hyphen-encoded path like Claude Code and HuggingFace cache) + # Match conventional indicators of home dir: 'Users' and 'home' + # Ignore case for Windows-like path + return re.compile(rf"([/\\-]+(Users|home)[/\\-]+){escaped}(?=[^a-zA-Z0-9]|$)", flags=re.IGNORECASE) + + +@functools.lru_cache(maxsize=32) +def _get_custom_home_pattern(home: str) -> re.Pattern | None: + if home.startswith(("/Users/", "/home/", "C:\\Users\\")): + return None + + # If home is not conventional, replace with more specific pattern + + # Escape home and replace / or \ with `r"[/\\-]+"` + home_escaped = home.replace("\\", "/") + home_escaped = re.escape(home_escaped) + home_escaped = home_escaped.replace("/", r"[/\\-]+") + # In WSL and MSYS2, C:\ may be represented by /c/ + home_escaped = home_escaped.replace(":", ":?") + return re.compile(home_escaped, flags=re.IGNORECASE) + + def anonymize_text(text: str, username: str, username_hash: str, home: str | None = None) -> str: if not text or not username: return text - escaped = re.escape(username) + if username.lower() not in text.lower(): + return text # Replace bare username in contexts (ls output, prose, etc.) # Only if username is >= 4 chars to avoid false positives - # \b does not match word boundaries around underscore, but we need to match them if len(username) >= 4: - return re.sub(rf"(?{username_hash}", text, flags=re.IGNORECASE) + text = _get_home_pattern(username).sub(rf"\g<1>{username_hash}", text) - # If home is not conventional, replace it with more specific pattern - if home and not home.startswith(("/Users/", "/home/", "C:\\Users\\")): - # Escape home and replace / or \ with `r"[/\\-]+"` - home_escaped = home.replace("\\", "/") - home_escaped = re.escape(home_escaped) - home_escaped = home_escaped.replace("/", r"[/\\-]+") - # In WSL and MSYS2, C:\ may be represented by /c/ - home_escaped = home_escaped.replace(":", ":?") - - def f(match): - # match.group(0) is a non-escaped string - return re.sub(rf"(?= 4: + self._extra_dict[name.lower()] = _hash_username(name) + + self._extra = list(self._extra_dict.keys()) + + if self._extra_dict: + escaped_names = [re.escape(k) for k in sorted(self._extra_dict.keys(), key=len, reverse=True)] + self._extra_pattern = re.compile(rf"(? str: return self.text(file_path) def text(self, content: str) -> str: result = anonymize_text(content, self.username, self.username_hash, self.home) - for name, hashed in self._extra: - result = _replace_username(result, name, hashed) + if self._extra_pattern: + def f(match): + return self._extra_dict[match.group(1).lower()] + result = self._extra_pattern.sub(f, result) return result def _replace_username(text: str, username: str, username_hash: str) -> str: if not text or not username or len(username) < 4: return text - escaped = re.escape(username) - text = re.sub(rf"(?