Skip to content

Commit 259c0d1

Browse files
committed
Refactor line tokenization to simplify an outer loop
The refactor is a stepping stone towards the next commit where the inner loop is moved to the `Spellchecker`.
1 parent d552f3d commit 259c0d1

File tree

1 file changed

+37
-17
lines changed

1 file changed

+37
-17
lines changed

codespell_lib/_codespell.py

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
Sequence,
4242
Set,
4343
Tuple,
44+
Callable,
4445
)
4546

4647
from ._text_util import fix_case
@@ -833,6 +834,34 @@ def apply_uri_ignore_words(
833834
return check_matches
834835

835836

837+
def line_tokenizer_factory(
    uri_ignore_words: Set[str],
    uri_regex: Pattern[str],
    word_regex: Pattern[str],
    ignore_word_regex: Optional[Pattern[str]],
) -> Callable[[str], Iterable[Match[str]]]:
    """Build a function that splits one line into word matches.

    The returned callable closes over the URI/word configuration so that the
    caller's per-line loop only has to pass the line itself.

    Args:
        uri_ignore_words: Words to ignore inside URIs. If it contains ``"*"``,
            every URI is erased from the line before words are extracted.
        uri_regex: Pattern used to locate URIs in a line.
        word_regex: Pattern passed to ``extract_words_iter`` to find words.
        ignore_word_regex: Optional pattern passed through to
            ``extract_words_iter`` and ``apply_uri_ignore_words``.

    Returns:
        A callable taking a line and returning an iterable of regex matches
        for the words to be checked.
    """
    # NOTE: the return annotation uses ``Match`` (the same name the inner
    # function uses) instead of ``re.Match[str]``: annotations are evaluated
    # when the function is defined, and ``re.Match`` only supports
    # subscripting on Python 3.9+.

    def line_tokenizer(line: str) -> Iterable[Match[str]]:
        # If all URI spelling errors will be ignored, erase any URI before
        # extracting words. Otherwise, apply ignores after extracting words.
        # This ensures that if a URI ignore word occurs both inside a URI and
        # outside, it will still be a spelling error.
        if "*" in uri_ignore_words:
            return extract_words_iter(
                uri_regex.sub(" ", line),
                word_regex,
                ignore_word_regex,
            )

        check_matches = extract_words_iter(line, word_regex, ignore_word_regex)
        return apply_uri_ignore_words(
            check_matches,
            line,
            word_regex,
            ignore_word_regex,
            uri_regex,
            uri_ignore_words,
        )

    return line_tokenizer
863+
864+
836865
def parse_file(
837866
filename: str,
838867
colors: TermColors,
@@ -910,6 +939,13 @@ def parse_file(
910939
except OSError:
911940
return bad_count
912941

942+
line_tokenizer = line_tokenizer_factory(
943+
uri_ignore_words,
944+
uri_regex,
945+
word_regex,
946+
ignore_word_regex,
947+
)
948+
913949
for i, line in enumerate(lines):
914950
if line.rstrip() in exclude_lines:
915951
continue
@@ -926,23 +962,7 @@ def parse_file(
926962
fixed_words = set()
927963
asked_for = set()
928964

929-
# If all URI spelling errors will be ignored, erase any URI before
930-
# extracting words. Otherwise, apply ignores after extracting words.
931-
# This ensures that if a URI ignore word occurs both inside a URI and
932-
# outside, it will still be a spelling error.
933-
if "*" in uri_ignore_words:
934-
line = uri_regex.sub(" ", line)
935-
check_matches = extract_words_iter(line, word_regex, ignore_word_regex)
936-
if "*" not in uri_ignore_words:
937-
check_matches = apply_uri_ignore_words(
938-
check_matches,
939-
line,
940-
word_regex,
941-
ignore_word_regex,
942-
uri_regex,
943-
uri_ignore_words,
944-
)
945-
for match in check_matches:
965+
for match in line_tokenizer(line):
946966
word = match.group()
947967
if word in ignore_words_cased:
948968
continue

0 commit comments

Comments
 (0)