Skip to content

Commit 1b57725

Browse files
takeruhukushima, pre-commit-ci-lite[bot], jamesbraza
authored
refactor: Improve maybe_is_text for multilingual document support (#1179)
Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Co-authored-by: James Braza <jamesbraza@gmail.com>
1 parent 1169f63 commit 1b57725

File tree

5 files changed

+4103
-380
lines changed

5 files changed

+4103
-380
lines changed

src/paperqa/utils.py

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,8 @@
66
import math
77
import os
88
import re
9-
import string
109
import unicodedata
10+
from collections import Counter
1111
from collections.abc import Awaitable, Callable, Collection, Iterable, Iterator, Mapping
1212
from datetime import datetime
1313
from functools import reduce
@@ -32,6 +32,8 @@
3232

3333
logger = logging.getLogger(__name__)
3434

35+
MAX_TEXT_ENTROPY = 8.0
36+
3537
T = TypeVar("T")
3638

3739

@@ -57,15 +59,19 @@ def maybe_is_text(s: str, thresh: float = 2.5) -> bool:
5759
if not s:
5860
return False
5961

60-
entropy = 0.0
6162
s_wo_spaces = s.replace(" ", "")
62-
for c in string.printable:
63-
p = s_wo_spaces.count(c) / len(s_wo_spaces)
64-
if p > 0:
65-
entropy += -p * math.log2(p)
63+
if not s_wo_spaces:
64+
return False
65+
66+
counts = Counter(s_wo_spaces)
67+
entropy = 0.0
68+
length = len(s_wo_spaces)
69+
for count in counts.values():
70+
p = count / length
71+
entropy += -p * math.log2(p)
6672

6773
# Check if the entropy is within a reasonable range for text
68-
return entropy > thresh
74+
return MAX_TEXT_ENTROPY > entropy > thresh
6975

7076

7177
def maybe_is_pdf(file: BinaryIO) -> bool:

0 commit comments

Comments
 (0)