
Commit e9a300b

Merge pull request #214 from bact/dev
Reorder if checks for dict_trie and more test cases
2 parents: 2a7d85d + 89b4151

File tree: 3 files changed, +43 -17 lines


pythainlp/corpus/tnc.py

Lines changed: 6 additions & 1 deletion
@@ -18,10 +18,14 @@

 def word_freq(word: str, domain: str = "all") -> int:
     """
-    Get word frequency of a word.
+    **Not officially supported.**
+    Get word frequency of a word by domain.
     This function will make a query to the server of Thai National Corpus.
     Internet connection is required.

+    **IMPORTANT:** Currently (as of 29 April 2019) always return 0,
+    as the service URL has been changed and the code is not updated yet.
+
     :param string word: word
     :param string domain: domain
     """
@@ -39,6 +43,7 @@ def word_freq(word: str, domain: str = "all") -> int:
         "others": "0",
     }
     url = "http://www.arts.chula.ac.th/~ling/TNCII/corp.php"
+    # New URL is http://www.arts.chula.ac.th/~ling/tnc3/
     data = {"genre[]": "", "domain[]": listdomain[domain], "sortby": "perc", "p": word}

     r = requests.post(url, data=data)
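
For context, a minimal usage sketch of the function documented above (not part of the commit); the query word is illustrative.

# Minimal usage sketch, not part of the commit; the query word is illustrative.
from pythainlp.corpus import tnc

# Per the docstring note added in this diff, the call currently always
# returns 0 because the Thai National Corpus service URL has changed.
freq = tnc.word_freq("รถไฟ")
print(freq)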

pythainlp/tokenize/__init__.py

Lines changed: 4 additions & 3 deletions
@@ -186,16 +186,17 @@ def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
     """
     trie = None

-    if isinstance(dict_source, str):
+    if isinstance(dict_source, Trie):
+        trie = dict_source
+    elif isinstance(dict_source, str):
         # Receive a file path of the dict to read
         with open(dict_source, "r", encoding="utf8") as f:
             _vocabs = f.read().splitlines()
             trie = Trie(_vocabs)
     elif isinstance(dict_source, Iterable):
+        # Note: Trie and str are both Iterable, Iterable check should be here
         # Received a sequence type object of vocabs
         trie = Trie(dict_source)
-    elif isinstance(dict_source, Trie):
-        trie = dict_source
     else:
         raise TypeError(
             "Type of dict_source must be marisa_trie.Trie, or Iterable[str], or str (path to source file)"

tests/__init__.py

Lines changed: 33 additions & 13 deletions
@@ -3,14 +3,17 @@
 Unit test
 """
 import datetime
+import os
 import unittest
 from collections import Counter

 from nltk.corpus import wordnet as wn
 from pythainlp import word_vector
 from pythainlp.corpus import (
+    _CORPUS_PATH,
     conceptnet,
     countries,
+    download,
     provinces,
     remove,
     thai_negations,
@@ -20,44 +23,44 @@
     tnc,
     ttc,
     wordnet,
-    download,
 )
+from pythainlp.corpus.common import _THAI_WORDS_FILENAME
 from pythainlp.soundex import lk82, metasound, soundex, udom83
-from pythainlp.spell import correct, spell, NorvigSpellChecker
+from pythainlp.spell import NorvigSpellChecker, correct, spell
 from pythainlp.summarize import summarize
 from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram
 from pythainlp.tag.locations import tag_provinces
 from pythainlp.tag.named_entity import ThaiNameTagger
+from pythainlp.tokenize import DEFAULT_DICT_TRIE, FROZEN_DICT_TRIE, Tokenizer
+from pythainlp.tokenize import deepcut as tokenize_deepcut
 from pythainlp.tokenize import (
-    DEFAULT_DICT_TRIE,
-    FROZEN_DICT_TRIE,
-    Tokenizer,
     dict_trie,
     dict_word_tokenize,
     etcc,
     longest,
     multi_cut,
     newmm,
+)
+from pythainlp.tokenize import pyicu as tokenize_pyicu
+from pythainlp.tokenize import (
     sent_tokenize,
     subword_tokenize,
     syllable_tokenize,
     tcc,
     word_tokenize,
 )
-from pythainlp.tokenize import deepcut as tokenize_deepcut
-from pythainlp.tokenize import pyicu as tokenize_pyicu
 from pythainlp.transliterate import romanize, transliterate
 from pythainlp.transliterate.ipa import trans_list, xsampa_list
 from pythainlp.transliterate.royin import romanize as romanize_royin
 from pythainlp.util import (
     arabic_digit_to_thai_digit,
     bahttext,
     collate,
+    countthai,
     deletetone,
     digit_to_text,
     eng_to_thai,
     find_keyword,
-    countthai,
     isthai,
     isthaichar,
     normalize,
@@ -70,8 +73,8 @@
     thai_digit_to_arabic_digit,
     thai_strftime,
     thai_to_eng,
-    thaiword_to_num,
     thaicheck,
+    thaiword_to_num,
 )


@@ -139,17 +142,25 @@ def test_soundex(self):
         self.assertIsNotNone(soundex("a", engine="metasound"))
         self.assertIsNotNone(soundex("a", engine="XXX"))

+        self.assertEqual(lk82(None), "")
+        self.assertEqual(lk82(""), "")
+        self.assertEqual(lk82("เหตุ"), lk82("เหด"))
         self.assertEqual(lk82("รถ"), "ร3000")
         self.assertIsNotNone(lk82("เกาะ"))
         self.assertIsNotNone(lk82("อุยกูร์"))
         self.assertIsNotNone(lk82("หยากไย่"))
         self.assertIsNotNone(lk82("หอ"))
-        self.assertEqual(lk82(""), "")
         self.assertEqual(lk82("น์"), "")

-        self.assertEqual(udom83("รถ"), "ร800000")
         self.assertEqual(udom83(None), "")
+        self.assertEqual(udom83(""), "")
+        self.assertEqual(udom83("เหตุ"), udom83("เหด"))
+        self.assertEqual(udom83("รถ"), "ร800000")

+        self.assertEqual(metasound(None), "")
+        self.assertEqual(metasound(""), "")
+        self.assertEqual(metasound("เหตุ"), metasound("เหด"))
+        self.assertEqual(metasound("รักษ์"), metasound("รัก"))
         self.assertEqual(metasound("บูรณะ"), "บ550")
         self.assertEqual(metasound("คน"), "ค500")
         self.assertEqual(metasound("คนA"), "ค500")
@@ -161,8 +172,11 @@ def test_soundex(self):
         self.assertIsNotNone(metasound("มา"))
         self.assertIsNotNone(metasound("ยา"))
         self.assertIsNotNone(metasound("วา"))
-        self.assertEqual(metasound("รักษ์"), metasound("รัก"))
-        self.assertEqual(metasound(""), "")
+        self.assertIsNotNone(metasound("บูชา"))
+        self.assertIsNotNone(metasound("กมลา"))
+        self.assertIsNotNone(metasound("กาโวกาโว"))
+        self.assertIsNotNone(metasound("สุวรรณา"))
+        self.assertIsNotNone(metasound("ดอยบอย"))

         # ### pythainlp.spell

@@ -324,6 +338,9 @@ def test_word_tokenize(self):
         self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
         self.assertIsNotNone(dict_trie(thai_words()))
         self.assertIsNotNone(dict_trie(FROZEN_DICT_TRIE))
+        self.assertIsNotNone(
+            dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME))
+        )

         self.assertIsNotNone(word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE))
         self.assertIsNotNone(
@@ -339,6 +356,9 @@ def test_Tokenizer(self):
         t_test.set_tokenize_engine("longest")
         self.assertEqual(t_test.word_tokenize(None), [])

+        t_test = Tokenizer()
+        self.assertEqual(t_test.word_tokenize("ก"), ["ก"])
+
     def test_word_tokenize_icu(self):
         self.assertEqual(tokenize_pyicu.segment(None), [])
         self.assertEqual(tokenize_pyicu.segment(""), [])
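
For reference, a minimal sketch (not part of the commit) of the soundex behaviour the new assertions pin down; the expected values are copied from the tests above.

# Minimal sketch of the soundex behaviour exercised by the new tests;
# expected values are copied from the assertions above.
from pythainlp.soundex import lk82, metasound, udom83

print(lk82("รถ"))       # "ร3000"
print(udom83("รถ"))     # "ร800000"
print(metasound("คน"))  # "ค500"

# Homophones map to the same code ...
print(lk82("เหตุ") == lk82("เหด"))             # True
print(metasound("รักษ์") == metasound("รัก"))  # True

# ... and None or empty input returns an empty string.
print(lk82(None), udom83(""), metasound(None))  # "" "" ""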
