Rewrite line spellchecking and move most of it into the Spellchecker

nthykier · nthykier · commit 1bffc2d40ecb · 2024-05-25T08:16:42.000Z
With this rewrite, performance improved slightly and is now down to 7%
slower than the baseline (6s vs. 5.6s).

An over-indentation is deliberately left in this commit, since that
makes the commit easier to review (without having to ignore whitespace
changes).
diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
@@ -27,7 +27,8 @@
 import textwrap
 from ctypes import wintypes
 from .spellchecker import (
-    Misspelling,
+    DetectedMisspelling,
+    LineTokenizer,
     Spellchecker,
 )
 from typing import (
@@ -41,7 +42,6 @@
     Sequence,
     Set,
     Tuple,
-    Callable,
 )
 
 from ._text_util import fix_case
@@ -716,15 +716,17 @@ def is_text_file(filename: str) -> bool:
 
 def ask_for_word_fix(
     line: str,
-    match: Match[str],
-    misspelling: Misspelling,
+    issue: DetectedMisspelling,
     interactivity: int,
     colors: TermColors,
 ) -> Tuple[bool, Sequence[str]]:
-    wrongword = match.group()
+    wrongword = issue.word
+    misspelling = issue.misspelling
     if interactivity <= 0:
         return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
+    match = issue.re_match
+
     line_ui = (
         f"{line[:match.start()]}"
         f"{colors.WWORD}{wrongword}{colors.DISABLE}"
@@ -839,7 +841,7 @@ def line_tokenizer_factory(
     uri_regex: Pattern[str],
     word_regex: Pattern[str],
     ignore_word_regex: Optional[Pattern[str]],
-) -> Callable[[str], Iterable[re.Match[str]]]:
+) -> LineTokenizer:
     def line_tokenizer(line: str) -> Iterable[Match[str]]:
         # If all URI spelling errors will be ignored, erase any URI before
         # extracting words. Otherwise, apply ignores after extracting words.
@@ -867,7 +869,6 @@ def parse_file(
     colors: TermColors,
     summary: Optional[Summary],
     spellchecker: Spellchecker,
-    ignore_words_cased: Set[str],
     exclude_lines: Set[str],
     file_opener: FileOpener,
     word_regex: Pattern[str],
@@ -888,7 +889,7 @@ def parse_file(
     else:
         if options.check_filenames:
             for word in extract_words(filename, word_regex, ignore_word_regex):
-                if word in ignore_words_cased:
+                if word in spellchecker.ignore_words_cased:
                     continue
                 lword = word.lower()
                 misspelling = spellchecker.check_lower_cased_word(lword)
@@ -962,25 +963,12 @@ def parse_file(
         fixed_words = set()
         asked_for = set()
 
-        for match in line_tokenizer(line):
-            word = match.group()
-            if word in ignore_words_cased:
-                continue
-            lword = word.lower()
-            misspelling = spellchecker.check_lower_cased_word(lword)
-            if misspelling is not None and lword not in extra_words_to_ignore:
-                # Sometimes we find a 'misspelling' which is actually a valid word
-                # preceded by a string escape sequence.  Ignore such cases as
-                # they're usually false alarms; see issue #17 among others.
-                char_before_idx = match.start() - 1
-                if (
-                    char_before_idx >= 0
-                    and line[char_before_idx] == "\\"
-                    # bell, backspace, formfeed, newline, carriage-return, tab, vtab.
-                    and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
-                    and spellchecker.check_lower_cased_word(lword[1:]) is None
-                ):
-                    continue
+        issues = spellchecker.spellcheck_line(line, line_tokenizer, extra_words_to_ignore=extra_words_to_ignore)
+        for issue in issues:
+                # TODO: De-indent in next commit
+                misspelling = issue.misspelling
+                word = issue.word
+                lword = issue.lword
 
                 context_shown = False
                 fix = misspelling.fix
@@ -992,8 +980,7 @@ def parse_file(
                         print_context(lines, i, context)
                     fix, candidates = ask_for_word_fix(
                         lines[i],
-                        match,
-                        misspelling,
+                        issue,
                         options.interactive,
                         colors=colors,
                     )
@@ -1197,6 +1184,7 @@ def main(*args: str) -> int:
                 return EX_USAGE
             use_dictionaries.append(dictionary)
     spellchecker = Spellchecker()
+    spellchecker.ignore_words_cased = ignore_words_cased
     for dictionary in use_dictionaries:
         spellchecker.add_from_file(dictionary, ignore_words=ignore_words)
     colors = TermColors()
@@ -1274,7 +1262,6 @@ def main(*args: str) -> int:
                         colors,
                         summary,
                         spellchecker,
-                        ignore_words_cased,
                         exclude_lines,
                         file_opener,
                         word_regex,
@@ -1299,7 +1286,6 @@ def main(*args: str) -> int:
                 colors,
                 summary,
                 spellchecker,
-                ignore_words_cased,
                 exclude_lines,
                 file_opener,
                 word_regex,
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
@@ -17,8 +17,11 @@
 """
 
 from typing import (
+    Callable,
     Container,
     Dict,
+    Iterable,
+    Match,
     Optional,
     Sequence,
 )
@@ -28,16 +31,59 @@
 alt_chars = (("'", "’"),)  # noqa: RUF001
 
 
+LineTokenizer = Callable[[str], Iterable[Match[str]]]
+
+
 class Misspelling:
+    """One dictionary entry: the known corrections for a misspelled word."""
+
     def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
+        # Replacement words offered for the misspelling.
         self.candidates = candidates
+        # Boolean flag stored with the entry -- presumably whether the
+        # correction may be applied automatically; confirm against the
+        # dictionary loader.
         self.fix = fix
+        # presumably a free-text explanation attached to the entry -- confirm.
         self.reason = reason
 
 
+class DetectedMisspelling:
+    """A single misspelled word found in a line, plus where it was found.
+
+    Instances are yielded by ``Spellchecker.spellcheck_line``.
+    """
+
+    def __init__(self, word: str, lword: str, misspelling: Misspelling, match: Match[str]) -> None:
+        # The token exactly as it was matched in the line.
+        self.word = word
+        # Lower-cased form of ``word``; the key used for the dictionary lookup.
+        self.lword = lword
+        # The dictionary entry that flagged the word.
+        self.misspelling = misspelling
+        # The regex match for the token; carries its position within the line.
+        self.re_match = match
+
+
 class Spellchecker:
     def __init__(self) -> None:
+        # Known misspellings, looked up by lower-cased word.
         self._misspellings: Dict[str, Misspelling] = {}
+        # Words that are skipped when they appear with exactly this casing.
+        # Empty by default; callers assign their own container.
+        self.ignore_words_cased: Container[str] = frozenset()
+
+    def spellcheck_line(
+        self,
+        line: str,
+        tokenizer: Callable[[str], Iterable[re.Match[str]]],
+        *,
+        extra_words_to_ignore: Container[str] = frozenset()
+    ) -> Iterable[DetectedMisspelling]:
+        misspellings = self._misspellings
+        ignore_words_cased = self.ignore_words_cased
+        for match in tokenizer(line):
+            word = match.group()
+            if word in ignore_words_cased:
+                continue
+            lword = word.lower()
+            misspelling = misspellings.get(lword)
+            if misspelling is not None and lword not in extra_words_to_ignore:
+                # Sometimes we find a 'misspelling' which is actually a valid word
+                # preceded by a string escape sequence.  Ignore such cases as
+                # they're usually false alarms; see issue #17 among others.
+                char_before_idx = match.start() - 1
+                if (
+                    char_before_idx >= 0
+                    and line[char_before_idx] == "\\"
+                    # bell, backspace, formfeed, newline, carriage-return, tab, vtab.
+                    and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
+                    and lword[1:] not in misspellings
+                ):
+                    continue
+                yield DetectedMisspelling(word, lword, misspelling, match)
 
     def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
         """Check a given word against the loaded dictionaries