88"""
99from collections import Counter
1010from string import digits
11- from typing import Callable , Iterable , ItemsView , List , Optional , Set , Tuple
11+ from typing import (
12+ Callable ,
13+ Dict ,
14+ ItemsView ,
15+ Iterable ,
16+ List ,
17+ Optional ,
18+ Set ,
19+ Tuple ,
20+ Union ,
21+ )
1222
1323from pythainlp import thai_digits , thai_letters
1424from pythainlp .corpus import tnc
@@ -70,10 +80,53 @@ def _edits2(word: str) -> Set[str]:
7080 return set (e2 for e1 in _edits1 (word ) for e2 in _edits1 (e1 ))
7181
7282
def _convert_custom_dict(
    custom_dict: Union[
        Dict[str, int], Iterable[str], Iterable[Tuple[str, int]]
    ],
    min_freq: int,
    min_len: int,
    max_len: int,
    dict_filter: Optional[Callable[[str], bool]],
) -> List[Tuple[str, int]]:
    """
    Convert a custom dictionary to a filtered list of (word, frequency) tuples.

    The input form is decided by looking at the first member only:
    a ``str`` means "iterable of words" and a ``tuple`` means
    "iterable of (word, frequency) pairs". A ``dict`` is first turned into
    its (key, value) pairs.

    :param custom_dict: a ``dict`` mapping words to frequencies, an iterable
                        of ``(word, frequency)`` tuples, or an iterable of
                        words. One-shot iterators (e.g. generators) are
                        accepted.
    :param int min_freq: minimum frequency, passed to ``_keep``. For a
                         words-only input it is not used: every word gets
                         frequency 1 and 1 is passed as the threshold.
    :param int min_len: minimum word length, passed to ``_keep``
    :param int max_len: maximum word length, passed to ``_keep``
    :param dict_filter: filter callable, passed to ``_keep`` unchanged
    :return: list of ``(word, frequency)`` tuples for which ``_keep``
             returned a true value; an empty list if the input is empty
    :raises TypeError: if the first member is neither ``str`` nor ``tuple``
    """
    # Materialize the input exactly once. Peeking at the first member with
    # next() would consume it from a one-shot iterator (so it would be
    # missing from the result) and would raise StopIteration on empty input.
    if isinstance(custom_dict, dict):
        entries = list(custom_dict.items())
    else:
        entries = list(custom_dict)

    # Nothing to inspect or filter.
    if not entries:
        return []

    first_member = entries[0]
    if isinstance(first_member, str):
        # Words without frequencies: assign frequency 1 to each word and
        # use 1 as the frequency threshold when filtering.
        return [
            (word, 1)
            for word in entries
            if _keep((word, 1), 1, min_len, max_len, dict_filter)
        ]

    if isinstance(first_member, tuple):
        # (word, frequency) pairs: filter with the caller's thresholds.
        return [
            word_freq
            for word_freq in entries
            if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
        ]

    raise TypeError(
        "custom_dict must be either Dict[str, int], "
        "Iterable[Tuple[str, int]], or Iterable[str]"
    )
122+
123+
73124class NorvigSpellChecker :
74125 def __init__ (
75126 self ,
76- custom_dict : List [Tuple [str , int ]] = None ,
127+ custom_dict : Union [
128+ Dict [str , int ], Iterable [str ], Iterable [Tuple [str , int ]]
129+ ] = None ,
77130 min_freq : int = 2 ,
78131 min_len : int = 2 ,
79132 max_len : int = 40 ,
@@ -91,9 +144,17 @@ def __init__(
91144 Then, it selects the candidate with
92145 the highest word occurrence probability.
93146
94- :param str custom_dict: A list of tuple (word, frequency) to create
95- a spelling dictionary. Default is from
96- Thai National Corpus (around 40,000 words).
147+ :param str custom_dict: A custom spelling dictionary. This can be:
148+ (1) a dictionary (`dict`), with words (`str`)
149+ as keys and frequencies (`int`) as values;
150+ (2) an iterable (list, tuple, or set) of word
151+ (`str`) and frequency (`int`) tuples:
152+ `(str, int)`; or
153+ (3) an iterable of just words (`str`), without
154+ frequencies -- in this case `1` will be
155+ assigned to every word.
156+ Default is from Thai National Corpus (around
157+ 40,000 words).
97158 :param int min_freq: Minimum frequency of a word to keep (default = 2)
98159 :param int min_len: Minimum length (in characters) of a word to keep
99160 (default = 2)
@@ -110,12 +171,9 @@ def __init__(
110171 if not dict_filter :
111172 dict_filter = _no_filter
112173
113- # filter word list
114- custom_dict = [
115- word_freq
116- for word_freq in custom_dict
117- if _keep (word_freq , min_freq , min_len , max_len , dict_filter )
118- ]
174+ custom_dict = _convert_custom_dict (
175+ custom_dict , min_freq , min_len , max_len , dict_filter
176+ )
119177
120178 self .__WORDS = Counter (dict (custom_dict ))
121179 self .__WORDS += Counter () # remove zero and negative counts
0 commit comments