Skip to content

Commit 1bffc2d

Browse files
committed
Rewrite line spellchecking and move most of it into the Spellchecker
With this rewrite, performance improved slightly and is now down to 7% slower than the baseline (6s vs. 5.6s). An over-indentation is deliberately left in this commit, since that makes the commit easier to review (without having to ignore whitespace changes).
1 parent 259c0d1 commit 1bffc2d

File tree

2 files changed

+63
-31
lines changed

2 files changed

+63
-31
lines changed

codespell_lib/_codespell.py

Lines changed: 17 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
import textwrap
2828
from ctypes import wintypes
2929
from .spellchecker import (
30-
Misspelling,
30+
DetectedMisspelling,
31+
LineTokenizer,
3132
Spellchecker,
3233
)
3334
from typing import (
@@ -41,7 +42,6 @@
4142
Sequence,
4243
Set,
4344
Tuple,
44-
Callable,
4545
)
4646

4747
from ._text_util import fix_case
@@ -716,15 +716,17 @@ def is_text_file(filename: str) -> bool:
716716

717717
def ask_for_word_fix(
718718
line: str,
719-
match: Match[str],
720-
misspelling: Misspelling,
719+
issue: DetectedMisspelling,
721720
interactivity: int,
722721
colors: TermColors,
723722
) -> Tuple[bool, Sequence[str]]:
724-
wrongword = match.group()
723+
wrongword = issue.word
724+
misspelling = issue.misspelling
725725
if interactivity <= 0:
726726
return misspelling.fix, fix_case(wrongword, misspelling.candidates)
727727

728+
match = issue.re_match
729+
728730
line_ui = (
729731
f"{line[:match.start()]}"
730732
f"{colors.WWORD}{wrongword}{colors.DISABLE}"
@@ -839,7 +841,7 @@ def line_tokenizer_factory(
839841
uri_regex: Pattern[str],
840842
word_regex: Pattern[str],
841843
ignore_word_regex: Optional[Pattern[str]],
842-
) -> Callable[[str], Iterable[re.Match[str]]]:
844+
) -> LineTokenizer:
843845
def line_tokenizer(line: str) -> Iterable[Match[str]]:
844846
# If all URI spelling errors will be ignored, erase any URI before
845847
# extracting words. Otherwise, apply ignores after extracting words.
@@ -867,7 +869,6 @@ def parse_file(
867869
colors: TermColors,
868870
summary: Optional[Summary],
869871
spellchecker: Spellchecker,
870-
ignore_words_cased: Set[str],
871872
exclude_lines: Set[str],
872873
file_opener: FileOpener,
873874
word_regex: Pattern[str],
@@ -888,7 +889,7 @@ def parse_file(
888889
else:
889890
if options.check_filenames:
890891
for word in extract_words(filename, word_regex, ignore_word_regex):
891-
if word in ignore_words_cased:
892+
if word in spellchecker.ignore_words_cased:
892893
continue
893894
lword = word.lower()
894895
misspelling = spellchecker.check_lower_cased_word(lword)
@@ -962,25 +963,12 @@ def parse_file(
962963
fixed_words = set()
963964
asked_for = set()
964965

965-
for match in line_tokenizer(line):
966-
word = match.group()
967-
if word in ignore_words_cased:
968-
continue
969-
lword = word.lower()
970-
misspelling = spellchecker.check_lower_cased_word(lword)
971-
if misspelling is not None and lword not in extra_words_to_ignore:
972-
# Sometimes we find a 'misspelling' which is actually a valid word
973-
# preceded by a string escape sequence. Ignore such cases as
974-
# they're usually false alarms; see issue #17 among others.
975-
char_before_idx = match.start() - 1
976-
if (
977-
char_before_idx >= 0
978-
and line[char_before_idx] == "\\"
979-
# bell, backspace, formfeed, newline, carriage-return, tab, vtab.
980-
and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
981-
and spellchecker.check_lower_cased_word(lword[1:]) is None
982-
):
983-
continue
966+
issues = spellchecker.spellcheck_line(line, line_tokenizer, extra_words_to_ignore=extra_words_to_ignore)
967+
for issue in issues:
968+
# TODO: De-indent in next commit
969+
misspelling = issue.misspelling
970+
word = issue.word
971+
lword = issue.lword
984972

985973
context_shown = False
986974
fix = misspelling.fix
@@ -992,8 +980,7 @@ def parse_file(
992980
print_context(lines, i, context)
993981
fix, candidates = ask_for_word_fix(
994982
lines[i],
995-
match,
996-
misspelling,
983+
issue,
997984
options.interactive,
998985
colors=colors,
999986
)
@@ -1197,6 +1184,7 @@ def main(*args: str) -> int:
11971184
return EX_USAGE
11981185
use_dictionaries.append(dictionary)
11991186
spellchecker = Spellchecker()
1187+
spellchecker.ignore_words_cased = ignore_words_cased
12001188
for dictionary in use_dictionaries:
12011189
spellchecker.add_from_file(dictionary, ignore_words=ignore_words)
12021190
colors = TermColors()
@@ -1274,7 +1262,6 @@ def main(*args: str) -> int:
12741262
colors,
12751263
summary,
12761264
spellchecker,
1277-
ignore_words_cased,
12781265
exclude_lines,
12791266
file_opener,
12801267
word_regex,
@@ -1299,7 +1286,6 @@ def main(*args: str) -> int:
12991286
colors,
13001287
summary,
13011288
spellchecker,
1302-
ignore_words_cased,
13031289
exclude_lines,
13041290
file_opener,
13051291
word_regex,

codespell_lib/spellchecker.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,11 @@
1717
"""
1818

1919
from typing import (
20+
Callable,
2021
Container,
2122
Dict,
23+
Iterable,
24+
Match,
2225
Optional,
2326
Sequence,
2427
)
@@ -28,16 +31,59 @@
2831
alt_chars = (("'", "’"),) # noqa: RUF001
2932

3033

34+
LineTokenizer = Callable[[str], Iterable[Match[str]]]
35+
36+
3137
class Misspelling:
3238
def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
3339
self.candidates = candidates
3440
self.fix = fix
3541
self.reason = reason
3642

3743

44+
class DetectedMisspelling:
45+
46+
def __init__(self, word: str, lword: str, misspelling: Misspelling, match: Match[str]) -> None:
47+
self.word = word
48+
self.lword = lword
49+
self.misspelling = misspelling
50+
self.re_match = match
51+
52+
3853
class Spellchecker:
3954
def __init__(self) -> None:
4055
self._misspellings: Dict[str, Misspelling] = {}
56+
self.ignore_words_cased: Container[str] = frozenset()
57+
58+
def spellcheck_line(
59+
self,
60+
line: str,
61+
tokenizer: Callable[[str], Iterable[re.Match[str]]],
62+
*,
63+
extra_words_to_ignore: Container[str] = frozenset()
64+
) -> Iterable[DetectedMisspelling]:
65+
misspellings = self._misspellings
66+
ignore_words_cased = self.ignore_words_cased
67+
for match in tokenizer(line):
68+
word = match.group()
69+
if word in ignore_words_cased:
70+
continue
71+
lword = word.lower()
72+
misspelling = misspellings.get(lword)
73+
if misspelling is not None and lword not in extra_words_to_ignore:
74+
# Sometimes we find a 'misspelling' which is actually a valid word
75+
# preceded by a string escape sequence. Ignore such cases as
76+
# they're usually false alarms; see issue #17 among others.
77+
char_before_idx = match.start() - 1
78+
if (
79+
char_before_idx >= 0
80+
and line[char_before_idx] == "\\"
81+
# bell, backspace, formfeed, newline, carriage-return, tab, vtab.
82+
and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
83+
and lword[1:] not in misspellings
84+
):
85+
continue
86+
yield DetectedMisspelling(word, lword, misspelling, match)
4187

4288
def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
4389
"""Check a given word against the loaded dictionaries

0 commit comments

Comments
 (0)