Skip to content

Commit aa7792f

Browse files
committed
Refactor dictionary into a new Spellchecker class
This is as close to a 1:1 conversion as possible. It might change when we get to designing the API. The callers have been refactored to only perform the lookup once. This was mostly to keep the code more readable. The performance cost does seem noticeable, which is unsurprising. This method has a higher cost towards non-matches, which is the most common case. This commit causes the performance to drop roughly 10% on its own and we are now slower than the goal.
1 parent 824bd7c commit aa7792f

File tree

3 files changed

+68
-47
lines changed

3 files changed

+68
-47
lines changed

codespell_lib/_codespell.py

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,6 @@
2626
import sys
2727
import textwrap
2828
from ctypes import wintypes
29-
from .spellchecker import (
30-
build_dict,
31-
Misspelling,
32-
)
3329
from typing import (
3430
Any,
3531
Dict,
@@ -49,6 +45,10 @@
4945
from ._version import ( # type: ignore[import-not-found]
5046
__version__ as VERSION, # noqa: N812
5147
)
48+
from .spellchecker import (
49+
Misspelling,
50+
Spellchecker,
51+
)
5252

5353
word_regex_def = r"[\w\-'’]+" # noqa: RUF001
5454
# While we want to treat characters like ( or " as okay for a starting break,
@@ -837,7 +837,7 @@ def parse_file(
837837
filename: str,
838838
colors: TermColors,
839839
summary: Optional[Summary],
840-
misspellings: Dict[str, Misspelling],
840+
spellchecker: Spellchecker,
841841
ignore_words_cased: Set[str],
842842
exclude_lines: Set[str],
843843
file_opener: FileOpener,
@@ -862,10 +862,11 @@ def parse_file(
862862
if word in ignore_words_cased:
863863
continue
864864
lword = word.lower()
865-
if lword not in misspellings:
865+
misspelling = spellchecker.check_lower_cased_word(lword)
866+
if misspelling is None:
866867
continue
867-
fix = misspellings[lword].fix
868-
candidates = fix_case(word, misspellings[lword].candidates)
868+
fix = misspelling.fix
869+
candidates = fix_case(word, misspelling.candidates)
869870

870871
if summary and fix:
871872
summary.update(lword)
@@ -874,7 +875,7 @@ def parse_file(
874875
cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
875876
crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
876877

877-
reason = misspellings[lword].reason
878+
reason = misspelling.reason
878879
if reason:
879880
if options.quiet_level & QuietLevels.DISABLED_FIXES:
880881
continue
@@ -946,7 +947,8 @@ def parse_file(
946947
if word in ignore_words_cased:
947948
continue
948949
lword = word.lower()
949-
if lword in misspellings and lword not in extra_words_to_ignore:
950+
misspelling = spellchecker.check_lower_cased_word(lword)
951+
if misspelling is not None and lword not in extra_words_to_ignore:
950952
# Sometimes we find a 'misspelling' which is actually a valid word
951953
# preceded by a string escape sequence. Ignore such cases as
952954
# they're usually false alarms; see issue #17 among others.
@@ -956,13 +958,13 @@ def parse_file(
956958
and line[char_before_idx] == "\\"
957959
# bell, backspace, formfeed, newline, carriage-return, tab, vtab.
958960
and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
959-
and lword[1:] not in misspellings
961+
and spellchecker.check_lower_cased_word(lword[1:]) is None
960962
):
961963
continue
962964

963965
context_shown = False
964-
fix = misspellings[lword].fix
965-
candidates = fix_case(word, misspellings[lword].candidates)
966+
fix = misspelling.fix
967+
candidates = fix_case(word, misspelling.candidates)
966968

967969
if options.interactive and lword not in asked_for:
968970
if context is not None:
@@ -971,7 +973,7 @@ def parse_file(
971973
fix, candidates = ask_for_word_fix(
972974
lines[i],
973975
match,
974-
misspellings[lword],
976+
misspelling,
975977
options.interactive,
976978
colors=colors,
977979
)
@@ -993,7 +995,7 @@ def parse_file(
993995
if (
994996
options.interactive & 2
995997
and not fix
996-
and not misspellings[lword].reason
998+
and not misspelling.reason
997999
):
9981000
continue
9991001

@@ -1002,7 +1004,7 @@ def parse_file(
10021004
cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
10031005
crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
10041006

1005-
reason = misspellings[lword].reason
1007+
reason = misspelling.reason
10061008
if reason:
10071009
if options.quiet_level & QuietLevels.DISABLED_FIXES:
10081010
continue
@@ -1174,9 +1176,9 @@ def main(*args: str) -> int:
11741176
parser.print_help()
11751177
return EX_USAGE
11761178
use_dictionaries.append(dictionary)
1177-
misspellings: Dict[str, Misspelling] = {}
1179+
spellchecker = Spellchecker()
11781180
for dictionary in use_dictionaries:
1179-
build_dict(dictionary, misspellings, ignore_words)
1181+
spellchecker.add_from_file(dictionary, ignore_words=ignore_words)
11801182
colors = TermColors()
11811183
if not options.colors:
11821184
colors.disable()
@@ -1251,7 +1253,7 @@ def main(*args: str) -> int:
12511253
fname,
12521254
colors,
12531255
summary,
1254-
misspellings,
1256+
spellchecker,
12551257
ignore_words_cased,
12561258
exclude_lines,
12571259
file_opener,
@@ -1276,7 +1278,7 @@ def main(*args: str) -> int:
12761278
filename,
12771279
colors,
12781280
summary,
1279-
misspellings,
1281+
spellchecker,
12801282
ignore_words_cased,
12811283
exclude_lines,
12821284
file_opener,

codespell_lib/_text_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,6 @@ def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]:
2424
return tuple(c.capitalize() for c in candidates)
2525
if word == word.upper():
2626
return tuple(c.upper() for c in candidates)
27-
# they are both lower case
27+
# they are both lower-case
2828
# or we don't have any idea
2929
return candidates

codespell_lib/spellchecker.py

Lines changed: 45 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@
1717
"""
1818

1919
from typing import (
20+
Container,
2021
Dict,
22+
Optional,
2123
Sequence,
22-
Set,
2324
)
2425

2526
# Pass all misspellings through this translation table to generate
@@ -34,7 +35,49 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
3435
self.reason = reason
3536

3637

37-
def add_misspelling(
38+
class Spellchecker:
39+
def __init__(self) -> None:
40+
self._misspellings: Dict[str, Misspelling] = {}
41+
42+
def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
43+
"""Check a given word against the loaded dictionaries
44+
45+
:param word: The word to check. This should be all lower-case.
46+
"""
47+
return self._misspellings.get(word)
48+
49+
def add_from_file(
50+
self,
51+
filename: str,
52+
*,
53+
ignore_words: Container[str] = frozenset(),
54+
) -> None:
55+
"""Parse a codespell dictionary
56+
57+
:param filename: The codespell dictionary file to parse
58+
:param ignore_words: Words to ignore from this dictionary.
59+
"""
60+
misspellings = self._misspellings
61+
with open(filename, encoding="utf-8") as f:
62+
translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
63+
for line in f:
64+
[key, data] = line.split("->")
65+
# TODO: For now, convert both to lower.
66+
# Someday we can maybe add support for fixing caps.
67+
key = key.lower()
68+
data = data.lower()
69+
if key not in ignore_words:
70+
_add_misspelling(key, data, misspellings)
71+
# generate alternative misspellings/fixes
72+
for x, table in translate_tables:
73+
if x in key:
74+
alt_key = key.translate(table)
75+
alt_data = data.translate(table)
76+
if alt_key not in ignore_words:
77+
_add_misspelling(alt_key, alt_data, misspellings)
78+
79+
80+
def _add_misspelling(
3881
key: str,
3982
data: str,
4083
misspellings: Dict[str, Misspelling],
@@ -54,27 +97,3 @@ def add_misspelling(
5497
fix,
5598
reason,
5699
)
57-
58-
59-
def build_dict(
60-
filename: str,
61-
misspellings: Dict[str, Misspelling],
62-
ignore_words: Set[str],
63-
) -> None:
64-
with open(filename, encoding="utf-8") as f:
65-
translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
66-
for line in f:
67-
[key, data] = line.split("->")
68-
# TODO: For now, convert both to lower.
69-
# Someday we can maybe add support for fixing caps.
70-
key = key.lower()
71-
data = data.lower()
72-
if key not in ignore_words:
73-
add_misspelling(key, data, misspellings)
74-
# generate alternative misspellings/fixes
75-
for x, table in translate_tables:
76-
if x in key:
77-
alt_key = key.translate(table)
78-
alt_data = data.translate(table)
79-
if alt_key not in ignore_words:
80-
add_misspelling(alt_key, alt_data, misspellings)

0 commit comments

Comments
 (0)