Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
319 changes: 311 additions & 8 deletions lib/charset_normalizer/api.py

Large diffs are not rendered by default.

59 changes: 46 additions & 13 deletions lib/charset_normalizer/cd.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ def encoding_unicode_range(iana_name: str) -> list[str]:
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
raise OSError("Function not supported on multi-byte code page")
raise OSError( # Defensive:
"Function not supported on multi-byte code page"
)

decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

Expand Down Expand Up @@ -179,7 +181,7 @@ def characters_popularity_compare(
Beware that this function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
"""
if language not in FREQUENCIES:
raise ValueError(f"{language} not available")
raise ValueError(f"{language} not available") # Defensive:

character_approved_count: int = 0
frequencies_language_set: frozenset[str] = _FREQUENCIES_SET[language]
Expand All @@ -205,6 +207,13 @@ def characters_popularity_compare(
(lr, ordered_rank[c]) for c, lr in lang_rank.items() if c in ordered_rank
]

# Pre-extract lr and orr arrays for faster iteration in the inner loop.
# Plain integer loops with local arrays are much faster under mypyc than
# generator expression sums over a list of tuples.
common_count: int = len(common_chars)
common_lr: list[int] = [p[0] for p in common_chars]
common_orr: list[int] = [p[1] for p in common_chars]

for character, character_rank in zip(
ordered_characters, range(0, ordered_characters_count)
):
Expand All @@ -230,18 +239,21 @@ def characters_popularity_compare(

# Count how many characters appear "before" in both orderings,
# and how many appear "at or after" in both orderings.
before_match_count: int = sum(
1
for lr, orr in common_chars
if lr < character_rank_in_language and orr < character_rank
)
# Single pass over pre-extracted arrays — much faster under mypyc
# than two generator expression sums.
before_match_count: int = 0
after_match_count: int = 0
for i in range(common_count):
lr_i: int = common_lr[i]
orr_i: int = common_orr[i]
if lr_i < character_rank_in_language:
if orr_i < character_rank:
before_match_count += 1
else:
if orr_i >= character_rank:
after_match_count += 1

after_len: int = target_language_characters_count - character_rank_in_language
after_match_count: int = sum(
1
for lr, orr in common_chars
if lr >= character_rank_in_language and orr >= character_rank
)

if character_rank_in_language == 0 and before_match_count <= 4:
character_approved_count += 1
Expand Down Expand Up @@ -273,15 +285,32 @@ def alpha_unicode_split(decoded_sequence: str) -> list[str]:
single_layer_key: str | None = None
multi_layer: bool = False

# Cache the last character_range and its resolved layer to avoid repeated
# is_suspiciously_successive_range calls for consecutive same-range chars.
prev_character_range: str | None = None
prev_layer_target: str | None = None

for character in decoded_sequence:
if character.isalpha() is False:
continue

character_range: str | None = unicode_range(character)
# ASCII fast-path: a-z and A-Z are always "Basic Latin".
# Avoids unicode_range() function call overhead for the most common case.
character_ord: int = ord(character)
if character_ord < 128:
character_range: str | None = "Basic Latin"
else:
character_range = unicode_range(character)

if character_range is None:
continue

# Fast path: same range as previous character → reuse cached layer target.
if character_range == prev_character_range:
if prev_layer_target is not None:
layers[prev_layer_target].append(character)
continue

layer_target_range: str | None = None

if multi_layer:
Expand Down Expand Up @@ -311,6 +340,10 @@ def alpha_unicode_split(decoded_sequence: str) -> list[str]:

layers[layer_target_range].append(character)

# Cache for next iteration
prev_character_range = character_range
prev_layer_target = layer_target_range

return ["".join(chars).lower() for chars in layers.values()]


Expand Down
45 changes: 22 additions & 23 deletions lib/charset_normalizer/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from charset_normalizer.version import __version__


def query_yes_no(question: str, default: str = "yes") -> bool:
def query_yes_no(question: str, default: str = "yes") -> bool: # Defensive:
"""Ask a yes/no question via input() and return the answer as a bool."""
prompt = " [Y/n] " if default == "yes" else " [y/N] "

Expand Down Expand Up @@ -244,25 +244,24 @@ def cli_detect(argv: list[str] | None = None) -> int:
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
cli_result = CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
x_.append(cli_result)

if len(matches) > 1 and args.alternatives:
for el in matches:
Expand Down Expand Up @@ -323,11 +322,11 @@ def cli_detect(argv: list[str] | None = None) -> int:
continue

try:
x_[0].unicode_path = join(dir_path, ".".join(o_))
cli_result.unicode_path = join(dir_path, ".".join(o_))

with open(x_[0].unicode_path, "wb") as fp:
with open(cli_result.unicode_path, "wb") as fp:
fp.write(best_guess.output())
except OSError as e:
except OSError as e: # Defensive:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
Expand Down Expand Up @@ -359,5 +358,5 @@ def cli_detect(argv: list[str] | None = None) -> int:
return 0


if __name__ == "__main__":
if __name__ == "__main__": # Defensive:
cli_detect()
23 changes: 21 additions & 2 deletions lib/charset_normalizer/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

UTF8_MAXIMAL_ALLOCATION: int = 1_112_064

# Up-to-date Unicode ucd/15.0.0
# Up-to-date Unicode ucd/17.0.0
UNICODE_RANGES_COMBINED: dict[str, range] = {
"Control character": range(32),
"Basic Latin": range(32, 128),
Expand Down Expand Up @@ -213,6 +213,7 @@
"Elbasan": range(66816, 66864),
"Caucasian Albanian": range(66864, 66928),
"Vithkuqi": range(66928, 67008),
"Todhri": range(67008, 67072),
"Linear A": range(67072, 67456),
"Latin Extended-F": range(67456, 67520),
"Cypriot Syllabary": range(67584, 67648),
Expand All @@ -222,6 +223,7 @@
"Hatran": range(67808, 67840),
"Phoenician": range(67840, 67872),
"Lydian": range(67872, 67904),
"Sidetic": range(67904, 67936),
"Meroitic Hieroglyphs": range(67968, 68000),
"Meroitic Cursive": range(68000, 68096),
"Kharoshthi": range(68096, 68192),
Expand All @@ -235,6 +237,7 @@
"Old Turkic": range(68608, 68688),
"Old Hungarian": range(68736, 68864),
"Hanifi Rohingya": range(68864, 68928),
"Garay": range(68928, 69008),
"Rumi Numeral Symbols": range(69216, 69248),
"Yezidi": range(69248, 69312),
"Arabic Extended-C": range(69312, 69376),
Expand All @@ -254,12 +257,14 @@
"Multani": range(70272, 70320),
"Khudawadi": range(70320, 70400),
"Grantha": range(70400, 70528),
"Tulu-Tigalari": range(70528, 70656),
"Newa": range(70656, 70784),
"Tirhuta": range(70784, 70880),
"Siddham": range(71040, 71168),
"Modi": range(71168, 71264),
"Mongolian Supplement": range(71264, 71296),
"Takri": range(71296, 71376),
"Myanmar Extended-C": range(71376, 71424),
"Ahom": range(71424, 71504),
"Dogra": range(71680, 71760),
"Warang Citi": range(71840, 71936),
Expand All @@ -270,10 +275,13 @@
"Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
"Pau Cin Hau": range(72384, 72448),
"Devanagari Extended-A": range(72448, 72544),
"Sharada Supplement": range(72544, 72576),
"Sunuwar": range(72640, 72704),
"Bhaiksuki": range(72704, 72816),
"Marchen": range(72816, 72896),
"Masaram Gondi": range(72960, 73056),
"Gunjala Gondi": range(73056, 73136),
"Tolong Siki": range(73136, 73200),
"Makasar": range(73440, 73472),
"Kawi": range(73472, 73568),
"Lisu Supplement": range(73648, 73664),
Expand All @@ -284,26 +292,33 @@
"Cypro-Minoan": range(77712, 77824),
"Egyptian Hieroglyphs": range(77824, 78896),
"Egyptian Hieroglyph Format Controls": range(78896, 78944),
"Egyptian Hieroglyphs Extended-A": range(78944, 82944),
"Anatolian Hieroglyphs": range(82944, 83584),
"Gurung Khema": range(90368, 90432),
"Bamum Supplement": range(92160, 92736),
"Mro": range(92736, 92784),
"Tangsa": range(92784, 92880),
"Bassa Vah": range(92880, 92928),
"Pahawh Hmong": range(92928, 93072),
"Kirat Rai": range(93504, 93568),
"Medefaidrin": range(93760, 93856),
"Beria Erfe": range(93856, 93920),
"Miao": range(93952, 94112),
"Ideographic Symbols and Punctuation": range(94176, 94208),
"Tangut": range(94208, 100352),
"Tangut Components": range(100352, 101120),
"Khitan Small Script": range(101120, 101632),
"Tangut Supplement": range(101632, 101760),
"Tangut Components Supplement": range(101760, 101888),
"Kana Extended-B": range(110576, 110592),
"Kana Supplement": range(110592, 110848),
"Kana Extended-A": range(110848, 110896),
"Small Kana Extension": range(110896, 110960),
"Nushu": range(110960, 111360),
"Duployan": range(113664, 113824),
"Shorthand Format Controls": range(113824, 113840),
"Symbols for Legacy Computing Supplement": range(117760, 118464),
"Miscellaneous Symbols Supplement": range(118464, 118528),
"Znamenny Musical Notation": range(118528, 118736),
"Byzantine Musical Symbols": range(118784, 119040),
"Musical Symbols": range(119040, 119296),
Expand All @@ -321,6 +336,8 @@
"Toto": range(123536, 123584),
"Wancho": range(123584, 123648),
"Nag Mundari": range(124112, 124160),
"Ol Onal": range(124368, 124416),
"Tai Yo": range(124608, 124672),
"Ethiopic Extended-B": range(124896, 124928),
"Mende Kikakui": range(124928, 125152),
"Adlam": range(125184, 125280),
Expand All @@ -333,7 +350,7 @@
"Enclosed Alphanumeric Supplement": range(127232, 127488),
"Enclosed Ideographic Supplement": range(127488, 127744),
"Miscellaneous Symbols and Pictographs": range(127744, 128512),
"Emoticons range(Emoji)": range(128512, 128592),
"Emoticons": range(128512, 128592),
"Ornamental Dingbats": range(128592, 128640),
"Transport and Map Symbols": range(128640, 128768),
"Alchemical Symbols": range(128768, 128896),
Expand All @@ -348,9 +365,11 @@
"CJK Unified Ideographs Extension D": range(177984, 178208),
"CJK Unified Ideographs Extension E": range(178208, 183984),
"CJK Unified Ideographs Extension F": range(183984, 191472),
"CJK Unified Ideographs Extension I": range(191472, 192096),
"CJK Compatibility Ideographs Supplement": range(194560, 195104),
"CJK Unified Ideographs Extension G": range(196608, 201552),
"CJK Unified Ideographs Extension H": range(201552, 205744),
"CJK Unified Ideographs Extension J": range(205744, 210048),
"Tags": range(917504, 917632),
"Variation Selectors Supplement": range(917760, 918000),
"Supplementary Private Use Area-A": range(983040, 1048576),
Expand Down
3 changes: 1 addition & 2 deletions lib/charset_normalizer/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@
from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE

# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
from typing_extensions import TypedDict
from typing import TypedDict

class ResultDict(TypedDict):
encoding: str | None
Expand Down
Loading