Skip to content

Commit 7f24af2

Browse files
authored
Merge pull request #438 from PyThaiNLP/add-dict-input-norvigspell
Add dict and other iterables support for custom_dict input
2 parents ca1fd86 + ed3f5e9 commit 7f24af2

File tree

2 files changed

+100
-13
lines changed

2 files changed

+100
-13
lines changed

pythainlp/spell/pn.py

Lines changed: 69 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,17 @@
88
"""
99
from collections import Counter
1010
from string import digits
11-
from typing import Callable, Iterable, ItemsView, List, Optional, Set, Tuple
11+
from typing import (
12+
Callable,
13+
Dict,
14+
ItemsView,
15+
Iterable,
16+
List,
17+
Optional,
18+
Set,
19+
Tuple,
20+
Union,
21+
)
1222

1323
from pythainlp import thai_digits, thai_letters
1424
from pythainlp.corpus import tnc
@@ -70,10 +80,53 @@ def _edits2(word: str) -> Set[str]:
7080
return set(e2 for e1 in _edits1(word) for e2 in _edits1(e1))
7181

7282

83+
def _convert_custom_dict(
84+
custom_dict: Union[
85+
Dict[str, int], Iterable[str], Iterable[Tuple[str, int]]
86+
],
87+
min_freq: int,
88+
min_len: int,
89+
max_len: int,
90+
dict_filter: Optional[Callable[[str], bool]],
91+
) -> List[Tuple[str, int]]:
92+
"""
93+
Converts a custom dictionary to a list of (str, int) tuples
94+
"""
95+
if isinstance(custom_dict, dict):
96+
custom_dict = [(word, freq) for word, freq in custom_dict.items()]
97+
98+
i = iter(custom_dict)
99+
first_member = next(i)
100+
if isinstance(first_member, str):
101+
# create tuples of a word with frequency equal to 1,
102+
# and filter word list
103+
custom_dict = [
104+
(word, 1)
105+
for word in custom_dict
106+
if _keep((word, 1), 1, min_len, max_len, dict_filter)
107+
]
108+
elif isinstance(first_member, tuple):
109+
# filter word list
110+
custom_dict = [
111+
word_freq
112+
for word_freq in custom_dict
113+
if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
114+
]
115+
else:
116+
raise TypeError(
117+
"custom_dict must be either Dict[str, int], "
118+
"Iterable[Tuple[str, int]], or Iterable[str]"
119+
)
120+
121+
return custom_dict
122+
123+
73124
class NorvigSpellChecker:
74125
def __init__(
75126
self,
76-
custom_dict: List[Tuple[str, int]] = None,
127+
custom_dict: Union[
128+
Dict[str, int], Iterable[str], Iterable[Tuple[str, int]]
129+
] = None,
77130
min_freq: int = 2,
78131
min_len: int = 2,
79132
max_len: int = 40,
@@ -91,9 +144,17 @@ def __init__(
91144
Then, it selects the candidate with
92145
the highest word occurrence probability.
93146
94-
:param str custom_dict: A list of tuple (word, frequency) to create
95-
a spelling dictionary. Default is from
96-
Thai National Corpus (around 40,000 words).
147+
:param str custom_dict: A custom spelling dictionary. This can be:
148+
(1) a dictionary (`dict`), with words (`str`)
149+
as keys and frequencies (`int`) as values;
150+
(2) an iterable (list, tuple, or set) of word
151+
(`str`) and frequency (`int`) tuples:
152+
`(str, int)`; or
153+
(3) an iterable of just words (`str`), without
154+
frequencies -- in this case `1` will be
155+
assigned to every word.
156+
Default is from Thai National Corpus (around
157+
40,000 words).
97158
:param int min_freq: Minimum frequency of a word to keep (default = 2)
98159
:param int min_len: Minimum length (in characters) of a word to keep
99160
(default = 2)
@@ -110,12 +171,9 @@ def __init__(
110171
if not dict_filter:
111172
dict_filter = _no_filter
112173

113-
# filter word list
114-
custom_dict = [
115-
word_freq
116-
for word_freq in custom_dict
117-
if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
118-
]
174+
custom_dict = _convert_custom_dict(
175+
custom_dict, min_freq, min_len, max_len, dict_filter
176+
)
119177

120178
self.__WORDS = Counter(dict(custom_dict))
121179
self.__WORDS += Counter() # remove zero and negative counts

tests/test_spell.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def test_norvig_spell_checker(self):
3535
self.assertTrue(len(checker.dictionary()) > 0)
3636
self.assertGreaterEqual(checker.prob("มี"), 0)
3737

38-
custom_dict = [
38+
user_dict = [
3939
("การงาน", 31), # longer than max_len
4040
("กาม", 1), # fewer than min_freq
4141
("กาล0", 64), # has digit
@@ -45,6 +45,35 @@ def test_norvig_spell_checker(self):
4545
("การ", 42), # OK
4646
]
4747
checker = NorvigSpellChecker(
48-
custom_dict=custom_dict, min_freq=2, max_len=5
48+
custom_dict=user_dict, min_freq=2, max_len=5
4949
)
5050
self.assertEqual(len(checker.dictionary()), 1)
51+
52+
user_dict = [
53+
"เอกราช",
54+
"ปลอดภัย",
55+
"เศรษฐกิจ",
56+
"เสมอภาค",
57+
"เสรีภาพ",
58+
"การศึกษา",
59+
]
60+
checker = NorvigSpellChecker(custom_dict=user_dict)
61+
self.assertEqual(len(checker.dictionary()), len(user_dict))
62+
63+
user_dict = {
64+
"พหลโยธิน": 1,
65+
"ขีตตะสังคะ": 2,
66+
"พนมยงค์": 3,
67+
"ภมรมนตรี": 4,
68+
"มิตรภักดี": 5,
69+
"ลพานุกรม": 6,
70+
"สิงหเสนี": 7,
71+
}
72+
checker = NorvigSpellChecker(custom_dict=user_dict)
73+
# "พหลโยธิน" will be removed,
74+
# as it has frequency less than default min_freq (2)
75+
self.assertEqual(len(checker.dictionary()), len(user_dict) - 1)
76+
77+
user_dict = [24, 6, 2475]
78+
with self.assertRaises(TypeError):
79+
checker = NorvigSpellChecker(custom_dict=user_dict)

0 commit comments

Comments
 (0)