From 78cebcbbdc463730f2adaeb290345dfa4c4d8de7 Mon Sep 17 00:00:00 2001
From: Leo
Date: Mon, 1 Dec 2025 02:41:54 -0500
Subject: [PATCH] fix: prioritize utf-8 over system locale to prevent crashes on Windows

---
 src/gitingest/utils/file_utils.py |  2 +-
 tests/test_windows_encoding.py    | 48 +++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_windows_encoding.py

diff --git a/src/gitingest/utils/file_utils.py b/src/gitingest/utils/file_utils.py
index 2c6ef74d..066195bc 100644
--- a/src/gitingest/utils/file_utils.py
+++ b/src/gitingest/utils/file_utils.py
@@ -27,7 +27,7 @@ def _get_preferred_encodings() -> list[str]:
         platform's default encoding followed by common fallback encodings.
 
     """
-    encodings = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"]
+    encodings = ["utf-8", locale.getpreferredencoding(), "utf-16", "utf-16le", "utf-8-sig", "latin"]
     if platform.system() == "Windows":
         encodings += ["cp1252", "iso-8859-1"]
     return list(dict.fromkeys(encodings))
diff --git a/tests/test_windows_encoding.py b/tests/test_windows_encoding.py
new file mode 100644
index 00000000..65706bf3
--- /dev/null
+++ b/tests/test_windows_encoding.py
@@ -0,0 +1,48 @@
+"""Tests for Windows encoding handling."""
+
+from pathlib import Path
+from unittest.mock import patch
+
+from gitingest.schemas.filesystem import FileSystemNode, FileSystemNodeType
+from gitingest.utils.file_utils import _CHUNK_SIZE
+
+
+def test_utf8_priority_on_windows(tmp_path: Path) -> None:
+    """Ensure UTF-8 files are read correctly even on Windows systems defaulting to cp1252.
+
+    This test reproduces a specific crash scenario where:
+    1. A file is valid UTF-8.
+    2. The first ``_CHUNK_SIZE`` bytes are safe ASCII (passing the initial CP1252 check).
+    3. Subsequent bytes contain characters undefined in CP1252 (e.g., smart quotes),
+       causing a UnicodeDecodeError if CP1252 is preferred over UTF-8.
+    """
+    file_path = tmp_path / "test_encoding_crash.md"
+
+    # The right double quotation mark (”) is:
+    # - UTF-8: 0xE2 0x80 0x9D
+    # - CP1252: Byte 0x9D is UNDEFINED and causes a crash during full read.
+    poison_char = "”"
+
+    # Pad with ASCII past the first _CHUNK_SIZE bytes so the initial chunk read cannot see the poison character
+    content = ("a" * (_CHUNK_SIZE + 50)) + poison_char + "\nEnd."
+
+    file_path.write_text(content, encoding="utf-8")
+
+    node = FileSystemNode(
+        name=file_path.name,
+        type=FileSystemNodeType.FILE,
+        path_str=str(file_path),
+        path=file_path,
+        size=file_path.stat().st_size,
+    )
+
+    # Mock the environment to simulate Windows with CP1252 locale
+    with patch("locale.getpreferredencoding", return_value="cp1252"), patch(
+        "platform.system",
+        return_value="Windows",
+    ):
+        read_content = node.content
+
+    assert "Error reading file" not in read_content, "Failed to read valid UTF-8 file on Windows/CP1252 simulation"
+    assert poison_char in read_content, "Failed to correctly decode the special character"
+    assert read_content.endswith("End."), "Content appears truncated or malformed"