Refactor dictionary into a new Spellchecker class

nthykier · nthykier · commit aa7792fee731 · 2024-05-25T08:27:40.000Z
This is as close to a 1:1 conversion as possible. It might change
when we get to designing the API. The callers have been refactored to
only perform the lookup once. This was mostly to keep the code more
readable.

The performance cost does seem noticeable, which is unsurprising. This
method has a higher cost for non-matches, which are the most common
case.  This commit causes the performance to drop roughly 10% on its
own, and we are now slower than the goal.
diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
@@ -26,10 +26,6 @@
 import sys
 import textwrap
 from ctypes import wintypes
-from .spellchecker import (
-    build_dict,
-    Misspelling,
-)
 from typing import (
     Any,
     Dict,
@@ -49,6 +45,10 @@
 from ._version import (  # type: ignore[import-not-found]
     __version__ as VERSION,  # noqa: N812
 )
+from .spellchecker import (
+    Misspelling,
+    Spellchecker,
+)
 
 word_regex_def = r"[\w\-'’]+"  # noqa: RUF001
 # While we want to treat characters like ( or " as okay for a starting break,
@@ -837,7 +837,7 @@ def parse_file(
     filename: str,
     colors: TermColors,
     summary: Optional[Summary],
-    misspellings: Dict[str, Misspelling],
+    spellchecker: Spellchecker,
     ignore_words_cased: Set[str],
     exclude_lines: Set[str],
     file_opener: FileOpener,
@@ -862,10 +862,11 @@ def parse_file(
                 if word in ignore_words_cased:
                     continue
                 lword = word.lower()
-                if lword not in misspellings:
+                misspelling = spellchecker.check_lower_cased_word(lword)
+                if misspelling is None:
                     continue
-                fix = misspellings[lword].fix
-                candidates = fix_case(word, misspellings[lword].candidates)
+                fix = misspelling.fix
+                candidates = fix_case(word, misspelling.candidates)
 
                 if summary and fix:
                     summary.update(lword)
@@ -874,7 +875,7 @@ def parse_file(
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
                 crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
-                reason = misspellings[lword].reason
+                reason = misspelling.reason
                 if reason:
                     if options.quiet_level & QuietLevels.DISABLED_FIXES:
                         continue
@@ -946,7 +947,8 @@ def parse_file(
             if word in ignore_words_cased:
                 continue
             lword = word.lower()
-            if lword in misspellings and lword not in extra_words_to_ignore:
+            misspelling = spellchecker.check_lower_cased_word(lword)
+            if misspelling is not None and lword not in extra_words_to_ignore:
                 # Sometimes we find a 'misspelling' which is actually a valid word
                 # preceded by a string escape sequence.  Ignore such cases as
                 # they're usually false alarms; see issue #17 among others.
@@ -956,13 +958,13 @@ def parse_file(
                     and line[char_before_idx] == "\\"
                     # bell, backspace, formfeed, newline, carriage-return, tab, vtab.
                     and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
-                    and lword[1:] not in misspellings
+                    and spellchecker.check_lower_cased_word(lword[1:]) is None
                 ):
                     continue
 
                 context_shown = False
-                fix = misspellings[lword].fix
-                candidates = fix_case(word, misspellings[lword].candidates)
+                fix = misspelling.fix
+                candidates = fix_case(word, misspelling.candidates)
 
                 if options.interactive and lword not in asked_for:
                     if context is not None:
@@ -971,7 +973,7 @@ def parse_file(
                     fix, candidates = ask_for_word_fix(
                         lines[i],
                         match,
-                        misspellings[lword],
+                        misspelling,
                         options.interactive,
                         colors=colors,
                     )
@@ -993,7 +995,7 @@ def parse_file(
                 if (
                     options.interactive & 2
                     and not fix
-                    and not misspellings[lword].reason
+                    and not misspelling.reason
                 ):
                     continue
 
@@ -1002,7 +1004,7 @@ def parse_file(
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
                 crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
-                reason = misspellings[lword].reason
+                reason = misspelling.reason
                 if reason:
                     if options.quiet_level & QuietLevels.DISABLED_FIXES:
                         continue
@@ -1174,9 +1176,9 @@ def main(*args: str) -> int:
                 parser.print_help()
                 return EX_USAGE
             use_dictionaries.append(dictionary)
-    misspellings: Dict[str, Misspelling] = {}
+    spellchecker = Spellchecker()
     for dictionary in use_dictionaries:
-        build_dict(dictionary, misspellings, ignore_words)
+        spellchecker.add_from_file(dictionary, ignore_words=ignore_words)
     colors = TermColors()
     if not options.colors:
         colors.disable()
@@ -1251,7 +1253,7 @@ def main(*args: str) -> int:
                         fname,
                         colors,
                         summary,
-                        misspellings,
+                        spellchecker,
                         ignore_words_cased,
                         exclude_lines,
                         file_opener,
@@ -1276,7 +1278,7 @@ def main(*args: str) -> int:
                 filename,
                 colors,
                 summary,
-                misspellings,
+                spellchecker,
                 ignore_words_cased,
                 exclude_lines,
                 file_opener,
diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py
@@ -24,6 +24,6 @@ def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]:
         return tuple(c.capitalize() for c in candidates)
     if word == word.upper():
         return tuple(c.upper() for c in candidates)
-    # they are both lower case
+    # they are both lower-case
     # or we don't have any idea
     return candidates
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
@@ -17,9 +17,10 @@
 """
 
 from typing import (
+    Container,
     Dict,
+    Optional,
     Sequence,
-    Set,
 )
 
 # Pass all misspellings through this translation table to generate
@@ -34,7 +35,49 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
         self.reason = reason
 
 
-def add_misspelling(
+class Spellchecker:
+    def __init__(self) -> None:
+        self._misspellings: Dict[str, Misspelling] = {}
+
+    def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
+        """Check a given word against the loaded dictionaries
+
+        :param word: The word to check. This should be all lower-case.
+        """
+        return self._misspellings.get(word)
+
+    def add_from_file(
+        self,
+        filename: str,
+        *,
+        ignore_words: Container[str] = frozenset(),
+    ) -> None:
+        """Parse a codespell dictionary
+
+        :param filename: The codespell dictionary file to parse
+        :param ignore_words: Words to ignore from this dictionary.
+        """
+        misspellings = self._misspellings
+        with open(filename, encoding="utf-8") as f:
+            translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
+            for line in f:
+                [key, data] = line.split("->")
+                # TODO: For now, convert both to lower.
+                #       Someday we can maybe add support for fixing caps.
+                key = key.lower()
+                data = data.lower()
+                if key not in ignore_words:
+                    _add_misspelling(key, data, misspellings)
+                # generate alternative misspellings/fixes
+                for x, table in translate_tables:
+                    if x in key:
+                        alt_key = key.translate(table)
+                        alt_data = data.translate(table)
+                        if alt_key not in ignore_words:
+                            _add_misspelling(alt_key, alt_data, misspellings)
+
+
+def _add_misspelling(
     key: str,
     data: str,
     misspellings: Dict[str, Misspelling],
@@ -54,27 +97,3 @@ def add_misspelling(
         fix,
         reason,
     )
-
-
-def build_dict(
-    filename: str,
-    misspellings: Dict[str, Misspelling],
-    ignore_words: Set[str],
-) -> None:
-    with open(filename, encoding="utf-8") as f:
-        translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
-        for line in f:
-            [key, data] = line.split("->")
-            # TODO: For now, convert both to lower.
-            #       Someday we can maybe add support for fixing caps.
-            key = key.lower()
-            data = data.lower()
-            if key not in ignore_words:
-                add_misspelling(key, data, misspellings)
-            # generate alternative misspellings/fixes
-            for x, table in translate_tables:
-                if x in key:
-                    alt_key = key.translate(table)
-                    alt_data = data.translate(table)
-                    if alt_key not in ignore_words:
-                        add_misspelling(alt_key, alt_data, misspellings)