Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
319 changes: 311 additions & 8 deletions lib/charset_normalizer/api.py

Large diffs are not rendered by default.

59 changes: 46 additions & 13 deletions lib/charset_normalizer/cd.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ def encoding_unicode_range(iana_name: str) -> list[str]:
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
raise OSError("Function not supported on multi-byte code page")
raise OSError( # Defensive:
"Function not supported on multi-byte code page"
)

decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

Expand Down Expand Up @@ -179,7 +181,7 @@ def characters_popularity_compare(
Beware that this function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
"""
if language not in FREQUENCIES:
raise ValueError(f"{language} not available")
raise ValueError(f"{language} not available") # Defensive:

character_approved_count: int = 0
frequencies_language_set: frozenset[str] = _FREQUENCIES_SET[language]
Expand All @@ -205,6 +207,13 @@ def characters_popularity_compare(
(lr, ordered_rank[c]) for c, lr in lang_rank.items() if c in ordered_rank
]

# Pre-extract lr and orr arrays for faster iteration in the inner loop.
# Plain integer loops with local arrays are much faster under mypyc than
# generator expression sums over a list of tuples.
common_count: int = len(common_chars)
common_lr: list[int] = [p[0] for p in common_chars]
common_orr: list[int] = [p[1] for p in common_chars]

for character, character_rank in zip(
ordered_characters, range(0, ordered_characters_count)
):
Expand All @@ -230,18 +239,21 @@ def characters_popularity_compare(

# Count how many characters appear "before" in both orderings,
# and how many appear "at or after" in both orderings.
before_match_count: int = sum(
1
for lr, orr in common_chars
if lr < character_rank_in_language and orr < character_rank
)
# Single pass over pre-extracted arrays — much faster under mypyc
# than two generator expression sums.
before_match_count: int = 0
after_match_count: int = 0
for i in range(common_count):
lr_i: int = common_lr[i]
orr_i: int = common_orr[i]
if lr_i < character_rank_in_language:
if orr_i < character_rank:
before_match_count += 1
else:
if orr_i >= character_rank:
after_match_count += 1

after_len: int = target_language_characters_count - character_rank_in_language
after_match_count: int = sum(
1
for lr, orr in common_chars
if lr >= character_rank_in_language and orr >= character_rank
)

if character_rank_in_language == 0 and before_match_count <= 4:
character_approved_count += 1
Expand Down Expand Up @@ -273,15 +285,32 @@ def alpha_unicode_split(decoded_sequence: str) -> list[str]:
single_layer_key: str | None = None
multi_layer: bool = False

# Cache the last character_range and its resolved layer to avoid repeated
# is_suspiciously_successive_range calls for consecutive same-range chars.
prev_character_range: str | None = None
prev_layer_target: str | None = None

for character in decoded_sequence:
if character.isalpha() is False:
continue

character_range: str | None = unicode_range(character)
# ASCII fast-path: a-z and A-Z are always "Basic Latin".
# Avoids unicode_range() function call overhead for the most common case.
character_ord: int = ord(character)
if character_ord < 128:
character_range: str | None = "Basic Latin"
else:
character_range = unicode_range(character)

if character_range is None:
continue

# Fast path: same range as previous character → reuse cached layer target.
if character_range == prev_character_range:
if prev_layer_target is not None:
layers[prev_layer_target].append(character)
continue

layer_target_range: str | None = None

if multi_layer:
Expand Down Expand Up @@ -311,6 +340,10 @@ def alpha_unicode_split(decoded_sequence: str) -> list[str]:

layers[layer_target_range].append(character)

# Cache for next iteration
prev_character_range = character_range
prev_layer_target = layer_target_range

return ["".join(chars).lower() for chars in layers.values()]


Expand Down
45 changes: 22 additions & 23 deletions lib/charset_normalizer/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from charset_normalizer.version import __version__


def query_yes_no(question: str, default: str = "yes") -> bool:
def query_yes_no(question: str, default: str = "yes") -> bool: # Defensive:
"""Ask a yes/no question via input() and return the answer as a bool."""
prompt = " [Y/n] " if default == "yes" else " [y/N] "

Expand Down Expand Up @@ -244,25 +244,24 @@ def cli_detect(argv: list[str] | None = None) -> int:
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
cli_result = CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
x_.append(cli_result)

if len(matches) > 1 and args.alternatives:
for el in matches:
Expand Down Expand Up @@ -323,11 +322,11 @@ def cli_detect(argv: list[str] | None = None) -> int:
continue

try:
x_[0].unicode_path = join(dir_path, ".".join(o_))
cli_result.unicode_path = join(dir_path, ".".join(o_))

with open(x_[0].unicode_path, "wb") as fp:
with open(cli_result.unicode_path, "wb") as fp:
fp.write(best_guess.output())
except OSError as e:
except OSError as e: # Defensive:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
Expand Down Expand Up @@ -359,5 +358,5 @@ def cli_detect(argv: list[str] | None = None) -> int:
return 0


if __name__ == "__main__":
if __name__ == "__main__": # Defensive:
cli_detect()
23 changes: 21 additions & 2 deletions lib/charset_normalizer/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

UTF8_MAXIMAL_ALLOCATION: int = 1_112_064

# Up-to-date Unicode ucd/15.0.0
# Up-to-date Unicode ucd/17.0.0
UNICODE_RANGES_COMBINED: dict[str, range] = {
"Control character": range(32),
"Basic Latin": range(32, 128),
Expand Down Expand Up @@ -213,6 +213,7 @@
"Elbasan": range(66816, 66864),
"Caucasian Albanian": range(66864, 66928),
"Vithkuqi": range(66928, 67008),
"Todhri": range(67008, 67072),
"Linear A": range(67072, 67456),
"Latin Extended-F": range(67456, 67520),
"Cypriot Syllabary": range(67584, 67648),
Expand All @@ -222,6 +223,7 @@
"Hatran": range(67808, 67840),
"Phoenician": range(67840, 67872),
"Lydian": range(67872, 67904),
"Sidetic": range(67904, 67936),
"Meroitic Hieroglyphs": range(67968, 68000),
"Meroitic Cursive": range(68000, 68096),
"Kharoshthi": range(68096, 68192),
Expand All @@ -235,6 +237,7 @@
"Old Turkic": range(68608, 68688),
"Old Hungarian": range(68736, 68864),
"Hanifi Rohingya": range(68864, 68928),
"Garay": range(68928, 69008),
"Rumi Numeral Symbols": range(69216, 69248),
"Yezidi": range(69248, 69312),
"Arabic Extended-C": range(69312, 69376),
Expand All @@ -254,12 +257,14 @@
"Multani": range(70272, 70320),
"Khudawadi": range(70320, 70400),
"Grantha": range(70400, 70528),
"Tulu-Tigalari": range(70528, 70656),
"Newa": range(70656, 70784),
"Tirhuta": range(70784, 70880),
"Siddham": range(71040, 71168),
"Modi": range(71168, 71264),
"Mongolian Supplement": range(71264, 71296),
"Takri": range(71296, 71376),
"Myanmar Extended-C": range(71376, 71424),
"Ahom": range(71424, 71504),
"Dogra": range(71680, 71760),
"Warang Citi": range(71840, 71936),
Expand All @@ -270,10 +275,13 @@
"Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
"Pau Cin Hau": range(72384, 72448),
"Devanagari Extended-A": range(72448, 72544),
"Sharada Supplement": range(72544, 72576),
"Sunuwar": range(72640, 72704),
"Bhaiksuki": range(72704, 72816),
"Marchen": range(72816, 72896),
"Masaram Gondi": range(72960, 73056),
"Gunjala Gondi": range(73056, 73136),
"Tolong Siki": range(73136, 73200),
"Makasar": range(73440, 73472),
"Kawi": range(73472, 73568),
"Lisu Supplement": range(73648, 73664),
Expand All @@ -284,26 +292,33 @@
"Cypro-Minoan": range(77712, 77824),
"Egyptian Hieroglyphs": range(77824, 78896),
"Egyptian Hieroglyph Format Controls": range(78896, 78944),
"Egyptian Hieroglyphs Extended-A": range(78944, 82944),
"Anatolian Hieroglyphs": range(82944, 83584),
"Gurung Khema": range(90368, 90432),
"Bamum Supplement": range(92160, 92736),
"Mro": range(92736, 92784),
"Tangsa": range(92784, 92880),
"Bassa Vah": range(92880, 92928),
"Pahawh Hmong": range(92928, 93072),
"Kirat Rai": range(93504, 93568),
"Medefaidrin": range(93760, 93856),
"Beria Erfe": range(93856, 93920),
"Miao": range(93952, 94112),
"Ideographic Symbols and Punctuation": range(94176, 94208),
"Tangut": range(94208, 100352),
"Tangut Components": range(100352, 101120),
"Khitan Small Script": range(101120, 101632),
"Tangut Supplement": range(101632, 101760),
"Tangut Components Supplement": range(101760, 101888),
"Kana Extended-B": range(110576, 110592),
"Kana Supplement": range(110592, 110848),
"Kana Extended-A": range(110848, 110896),
"Small Kana Extension": range(110896, 110960),
"Nushu": range(110960, 111360),
"Duployan": range(113664, 113824),
"Shorthand Format Controls": range(113824, 113840),
"Symbols for Legacy Computing Supplement": range(117760, 118464),
"Miscellaneous Symbols Supplement": range(118464, 118528),
"Znamenny Musical Notation": range(118528, 118736),
"Byzantine Musical Symbols": range(118784, 119040),
"Musical Symbols": range(119040, 119296),
Expand All @@ -321,6 +336,8 @@
"Toto": range(123536, 123584),
"Wancho": range(123584, 123648),
"Nag Mundari": range(124112, 124160),
"Ol Onal": range(124368, 124416),
"Tai Yo": range(124608, 124672),
"Ethiopic Extended-B": range(124896, 124928),
"Mende Kikakui": range(124928, 125152),
"Adlam": range(125184, 125280),
Expand All @@ -333,7 +350,7 @@
"Enclosed Alphanumeric Supplement": range(127232, 127488),
"Enclosed Ideographic Supplement": range(127488, 127744),
"Miscellaneous Symbols and Pictographs": range(127744, 128512),
"Emoticons range(Emoji)": range(128512, 128592),
"Emoticons": range(128512, 128592),
"Ornamental Dingbats": range(128592, 128640),
"Transport and Map Symbols": range(128640, 128768),
"Alchemical Symbols": range(128768, 128896),
Expand All @@ -348,9 +365,11 @@
"CJK Unified Ideographs Extension D": range(177984, 178208),
"CJK Unified Ideographs Extension E": range(178208, 183984),
"CJK Unified Ideographs Extension F": range(183984, 191472),
"CJK Unified Ideographs Extension I": range(191472, 192096),
"CJK Compatibility Ideographs Supplement": range(194560, 195104),
"CJK Unified Ideographs Extension G": range(196608, 201552),
"CJK Unified Ideographs Extension H": range(201552, 205744),
"CJK Unified Ideographs Extension J": range(205744, 210048),
"Tags": range(917504, 917632),
"Variation Selectors Supplement": range(917760, 918000),
"Supplementary Private Use Area-A": range(983040, 1048576),
Expand Down
3 changes: 1 addition & 2 deletions lib/charset_normalizer/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@
from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE

# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
from typing_extensions import TypedDict
from typing import TypedDict

class ResultDict(TypedDict):
encoding: str | None
Expand Down
Loading