Skip to content

Commit 824bd7c

Browse files
committed
Replace data: str with candidates: Sequence[str]
When the spelling dictionaries are loaded, previously the correction line was just stored in memory as simple text. Throughout the code, callers would then have to deal with the `data` attribute and correctly `split()` + `strip()` it. With this change, the dictionary parsing code now encapsulates this problem. The auto-correction works from the assumption that there is only one candidate. This assumption is invariant and seems to be properly maintained in the code. Therefore, we can just pick the first candidate word when doing a correction. In the code, the following name changes are performed: * `Misspelling.data` -> `Misspelling.candidates` * `fixword` -> `candidates` when used for multiple candidates (`fixword` remains for when it is a correction) On performance: Performance-wise, this change moves computation from "checking" time to "startup" time. The performance cost does not appear to be noticeable in my baseline (#3419). Though, keep the corpus weakness on the ratio of cased vs. non-cased corrections with multiple candidates in mind. The all-lowercase typo is now slightly more expensive (it was passed through `fix_case` and fed directly into the `print` in the original code; in the new code, it will always need a `join`). There is still an overweight of lower-case-only corrections in general, so the unconditional `.join` alone is not sufficient to affect the performance noticeably.
1 parent b28a5a3 commit 824bd7c

File tree

3 files changed

+34
-23
lines changed

3 files changed

+34
-23
lines changed

codespell_lib/_codespell.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
import sys
2727
import textwrap
2828
from ctypes import wintypes
29+
from .spellchecker import (
30+
build_dict,
31+
Misspelling,
32+
)
2933
from typing import (
3034
Any,
3135
Dict,
@@ -45,7 +49,6 @@
4549
from ._version import ( # type: ignore[import-not-found]
4650
__version__ as VERSION, # noqa: N812
4751
)
48-
from .spellchecker import Misspelling, build_dict
4952

5053
word_regex_def = r"[\w\-'’]+" # noqa: RUF001
5154
# While we want to treat characters like ( or " as okay for a starting break,
@@ -716,10 +719,10 @@ def ask_for_word_fix(
716719
misspelling: Misspelling,
717720
interactivity: int,
718721
colors: TermColors,
719-
) -> Tuple[bool, str]:
722+
) -> Tuple[bool, Sequence[str]]:
720723
wrongword = match.group()
721724
if interactivity <= 0:
722-
return misspelling.fix, fix_case(wrongword, misspelling.data)
725+
return misspelling.fix, fix_case(wrongword, misspelling.candidates)
723726

724727
line_ui = (
725728
f"{line[:match.start()]}"
@@ -729,7 +732,8 @@ def ask_for_word_fix(
729732

730733
if misspelling.fix and interactivity & 1:
731734
r = ""
732-
fixword = fix_case(wrongword, misspelling.data)
735+
candidates = fix_case(wrongword, misspelling.candidates)
736+
fixword = candidates[0]
733737
while not r:
734738
print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True)
735739
r = sys.stdin.readline().strip().upper()
@@ -747,12 +751,12 @@ def ask_for_word_fix(
747751
# we ask the user which word to use
748752

749753
r = ""
750-
opt = [w.strip() for w in misspelling.data.split(",")]
754+
opt = misspelling.candidates
751755
while not r:
752756
print(f"{line_ui} Choose an option (blank for none): ", end="")
753-
for i, o in enumerate(opt):
754-
fixword = fix_case(wrongword, o)
755-
print(f" {i}) {fixword}", end="")
757+
cased_candidates = fix_case(wrongword, opt)
758+
for i, candidates in enumerate(cased_candidates):
759+
print(f" {i}) {candidates}", end="")
756760
print(": ", end="", flush=True)
757761

758762
n = sys.stdin.readline().strip()
@@ -767,9 +771,9 @@ def ask_for_word_fix(
767771

768772
if r:
769773
misspelling.fix = True
770-
misspelling.data = r
774+
misspelling.candidates = (r,)
771775

772-
return misspelling.fix, fix_case(wrongword, misspelling.data)
776+
return misspelling.fix, fix_case(wrongword, misspelling.candidates)
773777

774778

775779
def print_context(
@@ -861,14 +865,14 @@ def parse_file(
861865
if lword not in misspellings:
862866
continue
863867
fix = misspellings[lword].fix
864-
fixword = fix_case(word, misspellings[lword].data)
868+
candidates = fix_case(word, misspellings[lword].candidates)
865869

866870
if summary and fix:
867871
summary.update(lword)
868872

869873
cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
870874
cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
871-
crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
875+
crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
872876

873877
reason = misspellings[lword].reason
874878
if reason:
@@ -958,13 +962,13 @@ def parse_file(
958962

959963
context_shown = False
960964
fix = misspellings[lword].fix
961-
fixword = fix_case(word, misspellings[lword].data)
965+
candidates = fix_case(word, misspellings[lword].candidates)
962966

963967
if options.interactive and lword not in asked_for:
964968
if context is not None:
965969
context_shown = True
966970
print_context(lines, i, context)
967-
fix, fixword = ask_for_word_fix(
971+
fix, candidates = ask_for_word_fix(
968972
lines[i],
969973
match,
970974
misspellings[lword],
@@ -981,7 +985,7 @@ def parse_file(
981985

982986
if options.write_changes and fix:
983987
changed = True
984-
lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i])
988+
lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i])
985989
fixed_words.add(word)
986990
continue
987991

@@ -996,7 +1000,7 @@ def parse_file(
9961000
cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
9971001
cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
9981002
cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
999-
crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
1003+
crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
10001004

10011005
reason = misspellings[lword].reason
10021006
if reason:

codespell_lib/_text_util.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@
1616
Copyright (C) 2011 ProFUSION embedded systems
1717
"""
1818

19+
from typing import Sequence
1920

20-
def fix_case(word: str, fixword: str) -> str:
21+
22+
def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]:
2123
if word == word.capitalize():
22-
return ", ".join(w.strip().capitalize() for w in fixword.split(","))
24+
return tuple(c.capitalize() for c in candidates)
2325
if word == word.upper():
24-
return fixword.upper()
26+
return tuple(c.upper() for c in candidates)
2527
# they are both lower case
2628
# or we don't have any idea
27-
return fixword
29+
return candidates

codespell_lib/spellchecker.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from typing import (
2020
Dict,
21+
Sequence,
2122
Set,
2223
)
2324

@@ -27,8 +28,8 @@
2728

2829

2930
class Misspelling:
30-
def __init__(self, data: str, fix: bool, reason: str) -> None:
31-
self.data = data
31+
def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
32+
self.candidates = candidates
3233
self.fix = fix
3334
self.reason = reason
3435

@@ -48,7 +49,11 @@ def add_misspelling(
4849
fix = True
4950
reason = ""
5051

51-
misspellings[key] = Misspelling(data, fix, reason)
52+
misspellings[key] = Misspelling(
53+
tuple(c.strip() for c in data.split(",")),
54+
fix,
55+
reason,
56+
)
5257

5358

5459
def build_dict(

0 commit comments

Comments
 (0)