Commit 6e1ba03

Merge pull request #323 from PyThaiNLP/more-tests
More unit tests
2 parents: cb27a35 + 96b154c, commit 6e1ba03

File tree: 7 files changed (+84 -29 lines)

pythainlp/corpus/__init__.py

Lines changed: 4 additions & 1 deletion

@@ -44,8 +44,9 @@ def corpus_db_path() -> str:
 def get_corpus_db_detail(name: str) -> dict:
     db = TinyDB(corpus_db_path())
     query = Query()
-
     res = db.search(query.name == name)
+    db.close()
+
     if res:
         return res[0]
     else:
@@ -286,8 +287,10 @@ def remove(name: str) -> bool:
         path = get_corpus_path(name)
         os.remove(path)
         db.remove(query.name == name)
+        db.close()
         return True
 
+    db.close()
     return False
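
Note, not part of the commit: the added db.close() calls release the TinyDB handle on every exit path. A minimal sketch of the same guarantee written with try/finally (which also covers the case where the search itself raises), reusing corpus_db_path(), TinyDB and Query already used in this module:

from tinydb import Query, TinyDB

def get_corpus_db_detail(name: str) -> dict:
    """Return the local corpus-db record for `name`, or {} when unknown."""
    db = TinyDB(corpus_db_path())
    try:
        res = db.search(Query().name == name)
    finally:
        db.close()  # close the handle even if the search raises

    return res[0] if res else {}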

pythainlp/tokenize/attacut.py

Lines changed: 2 additions & 2 deletions

@@ -5,11 +5,11 @@
 """
 from typing import List
 
-import attacut
+from attacut import tokenize
 
 
 def segment(text: str) -> List[str]:
     if not text or not isinstance(text, str):
         return []
 
-    return attacut.tokenize(text)
+    return tokenize(text)
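
For context, not part of the diff: a minimal usage sketch, assuming the optional attacut package is installed. word_tokenize with engine="attacut" dispatches to the segment() wrapper above, as exercised in tests/test_tokenize.py below.

from pythainlp.tokenize import word_tokenize

# Tokenize with the AttaCut engine; returns a list of Thai word strings.
words = word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut")
print(words)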

pythainlp/tokenize/deepcut.py

Lines changed: 3 additions & 3 deletions

@@ -9,7 +9,7 @@
 
 from typing import List, Union
 
-import deepcut
+from deepcut import tokenize
 
 from .trie import Trie
 
@@ -22,6 +22,6 @@ def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[
     if isinstance(custom_dict, Trie):
         custom_dict = list(custom_dict)
 
-        return deepcut.tokenize(text, custom_dict)
+        return tokenize(text, custom_dict)
 
-    return deepcut.tokenize(text)
+    return tokenize(text)
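
For context, not part of the diff: segment() here accepts a custom dictionary as a Trie, a list, or a file path, converting a Trie to a plain list before handing it to deepcut's tokenize. A minimal sketch, assuming the optional deepcut package is installed and using dict_trie from pythainlp.tokenize as the tests do:

from pythainlp.tokenize import dict_trie
from pythainlp.tokenize.deepcut import segment

# Without a custom dictionary, deepcut's default model is used.
print(segment("รถไฟฟ้า"))

# A Trie is converted to a list of words before being passed to deepcut.
trie = dict_trie(["ไฟ"])
print(segment("รถไฟฟ้า", custom_dict=trie))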

pythainlp/tokenize/ssg.py

Lines changed: 2 additions & 2 deletions

@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 from typing import List
 
-import ssg
+from ssg import syllable_tokenize
 
 
 def segment(text: str) -> List[str]:
     if not text or not isinstance(text, str):
         return []
 
-    return ssg.syllable_tokenize(text)
+    return syllable_tokenize(text)
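
For context, not part of the diff: ssg segments text into syllables, and this wrapper is reached through the public subword_tokenize and syllable_tokenize functions with engine="ssg", as the updated tests exercise. A minimal sketch, assuming the optional ssg package is installed:

from pythainlp.tokenize import subword_tokenize, syllable_tokenize

print(subword_tokenize("สวัสดีดาวอังคาร", engine="ssg"))  # contains "ดาว" per the tests
print(syllable_tokenize("แมวกินปลา", engine="ssg"))       # ["แมว", "กิน", "ปลา"] per the tests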

tests/test_corpus.py

Lines changed: 6 additions & 0 deletions

@@ -35,10 +35,12 @@ def test_corpus(self):
         self.assertIsNotNone(thai_words())
         self.assertIsNotNone(thai_female_names())
         self.assertIsNotNone(thai_male_names())
+        self.assertEqual(get_corpus_db_detail("XXX"), {})
         self.assertIsNone(download("test"))
         self.assertIsNone(download("test", force=True))
         self.assertIsNotNone(get_corpus_db_detail("test"))
         self.assertIsNotNone(remove("test"))
+        self.assertFalse(remove("test"))
 
     def test_tnc(self):
         self.assertIsNotNone(tnc.word_freqs())
@@ -48,6 +50,7 @@ def test_ttc(self):
 
     def test_wordnet(self):
         self.assertIsNotNone(wordnet.langs())
+        self.assertTrue("tha" in wordnet.langs())
 
         self.assertEqual(
             wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"]
@@ -69,6 +72,9 @@ def test_wordnet(self):
         self.assertEqual(
             wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse)
         )
+        self.assertEqual(
+            wordnet.lch_similarity(bird, mouse), bird.lch_similarity(mouse)
+        )
 
         cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key()
         self.assertIsNotNone(wordnet.lemma_from_key(cat_key))
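
Note, not part of the commit: the new assertions pin down edge cases (an unknown corpus name yields {}, a second remove("test") is falsy) and check that the wordnet wrappers agree with NLTK's own Synset methods. A rough sketch of that wordnet equivalence; the synset names bird.n.01 and mouse.n.01 are chosen here only for illustration, the test derives its synsets earlier in test_wordnet:

from pythainlp.corpus import wordnet

bird = wordnet.synset("bird.n.01")
mouse = wordnet.synset("mouse.n.01")

# Module-level helpers should match the results of the Synset methods.
assert wordnet.wup_similarity(bird, mouse) == bird.wup_similarity(mouse)
assert wordnet.lch_similarity(bird, mouse) == bird.lch_similarity(mouse)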

tests/test_tokenize.py

Lines changed: 30 additions & 9 deletions

@@ -26,9 +26,6 @@
 
 
 class TestTokenizePackage(unittest.TestCase):
-    def test_dict_word_tokenize(self):
-        self.assertEqual(dict_word_tokenize(""), [])
-
     def test_etcc(self):
         self.assertEqual(etcc.segment(""), "")
         self.assertIsInstance(etcc.segment("คืนความสุข"), list)
@@ -61,24 +58,34 @@ def test_word_tokenize(self):
         self.assertIsNotNone(
             word_tokenize("หมอนทองตากลมหูว์MBK39", engine="deepcut")
         )
-        self.assertIsNotNone(
-            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX")
-        )
         self.assertIsNotNone(
             word_tokenize("หมอนทองตากลมหูว์MBK39", engine="attacut")
         )
+        self.assertIsNotNone(
+            word_tokenize("หมอนทองตากลมหูว์MBK39", engine="XX")
+        ) # XX engine is not existed
 
         self.assertIsNotNone(dict_trie(()))
         self.assertIsNotNone(dict_trie(("ทดสอบ", "สร้าง", "Trie")))
         self.assertIsNotNone(dict_trie(["ทดสอบ", "สร้าง", "Trie"]))
+        self.assertIsNotNone(dict_trie({"ทดสอบ", "สร้าง", "Trie"}))
         self.assertIsNotNone(dict_trie(thai_words()))
         self.assertIsNotNone(dict_trie(DEFAULT_DICT_TRIE))
         self.assertIsNotNone(
             dict_trie(os.path.join(_CORPUS_PATH, _THAI_WORDS_FILENAME))
         )
 
-        self.assertIsNotNone(
-            word_tokenize("รถไฟฟ้าBTS", custom_dict=DEFAULT_DICT_TRIE)
+        self.assertTrue(
+            "ไฟ" in word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
+        )
+
+        # Commented out until this unittest bug get fixed:
+        # https://bugs.python.org/issue29620
+        # with self.assertWarns(DeprecationWarning):
+        #     dict_word_tokenize("เลิกใช้แล้ว", custom_dict=DEFAULT_DICT_TRIE)
+        self.assertEqual(
+            word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
+            dict_word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])),
         )
 
     def test_Tokenizer(self):
@@ -224,31 +231,45 @@ def test_sent_tokenize(self):
     def test_subword_tokenize(self):
         self.assertEqual(subword_tokenize(None), [])
         self.assertEqual(subword_tokenize(""), [])
+
         self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="tcc"))
+        self.assertFalse(
+            "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tcc")
+        )
 
         self.assertEqual(subword_tokenize(None, engine="etcc"), [])
         self.assertEqual(subword_tokenize("", engine="etcc"), [])
         self.assertIsNotNone(
             subword_tokenize("สวัสดิีดาวอังคาร", engine="etcc")
         )
+        self.assertFalse(
+            "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="etcc")
+        )
         self.assertIsNotNone(subword_tokenize("เบียร์สิงห์", engine="etcc"))
 
         self.assertEqual(subword_tokenize(None, engine="ssg"), [])
         self.assertEqual(subword_tokenize("", engine="ssg"), [])
-        self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร", engine="ssg"))
+        self.assertTrue(
+            "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
+        )
+        self.assertFalse(
+            "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
+        )
 
     def test_syllable_tokenize(self):
         self.assertEqual(syllable_tokenize(None), [])
         self.assertEqual(syllable_tokenize(""), [])
         self.assertEqual(
             syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"]
         )
+        self.assertFalse("า" in syllable_tokenize("สวัสดีชาวโลก"))
 
         self.assertEqual(syllable_tokenize(None, engine="ssg"), [])
         self.assertEqual(syllable_tokenize("", engine="ssg"), [])
         self.assertEqual(
             syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
         )
+        self.assertFalse("า" in syllable_tokenize("แมวกินปลา", engine="etcc"))
 
     def test_tcc(self):
         self.assertEqual(tcc.segment(None), [])
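
Note, not part of the commit: the reworked word_tokenize tests exercise the custom-dictionary path (a Trie built with dict_trie(["ไฟ"]) forces "ไฟ" to surface as a token in "รถไฟฟ้า") and keep dict_word_tokenize only as a deprecated alias that must return the same result. A minimal sketch of that behaviour:

from pythainlp.tokenize import dict_trie, word_tokenize

# A one-word custom dictionary makes the tokenizer honour "ไฟ" as a unit.
custom = dict_trie(["ไฟ"])
tokens = word_tokenize("รถไฟฟ้า", custom_dict=custom)
assert "ไฟ" in tokens

# dict_trie() builds a Trie from several input shapes: tuple, list, set,
# an existing Trie, an iterable such as thai_words(), or a word-list file path.
for source in (("ทดสอบ", "สร้าง", "Trie"), ["ทดสอบ"], {"ทดสอบ"}):
    assert dict_trie(source) is not None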

tests/test_util.py

Lines changed: 37 additions & 12 deletions

@@ -168,10 +168,10 @@ def test_thai_strftime(self):
     # ### pythainlp.util.thai_time
 
     def test_thai_time(self):
+        self.assertEqual(thai_time("8:17"), thai_time("08:17"))
         self.assertEqual(thai_time("8:17"), "แปดนาฬิกาสิบเจ็ดนาที")
         self.assertEqual(thai_time("8:17", "6h"), "สองโมงเช้าสิบเจ็ดนาที")
         self.assertEqual(thai_time("8:17", "m6h"), "แปดโมงสิบเจ็ดนาที")
-        self.assertEqual(thai_time("18:30", "m6h"), "หกโมงครึ่ง")
         self.assertEqual(thai_time("13:30:01", "6h", "m"), "บ่ายโมงครึ่ง")
         self.assertEqual(
             thai_time(datetime.time(12, 3, 0)), "สิบสองนาฬิกาสามนาที"
@@ -181,23 +181,38 @@ def test_thai_time(self):
             "สิบสองนาฬิกาสามนาทีหนึ่งวินาที",
         )
         self.assertEqual(
-            thai_time(
-                datetime.datetime(2014, 5, 22, 12, 3, 0), precision="s"
-            ),
+            thai_time(datetime.datetime(2014, 5, 22, 12, 3, 0), precision="s"),
             "สิบสองนาฬิกาสามนาทีศูนย์วินาที",
         )
         self.assertEqual(
-            thai_time(
-                datetime.datetime(2014, 5, 22, 12, 3, 1), precision="m"
-            ),
+            thai_time(datetime.datetime(2014, 5, 22, 12, 3, 1), precision="m"),
             "สิบสองนาฬิกาสามนาที",
         )
         self.assertEqual(
-            thai_time(
-                datetime.datetime(1976, 10, 6, 12, 30, 1), "6h", "m"
-            ),
+            thai_time(datetime.datetime(1976, 10, 6, 12, 30, 1), "6h", "m"),
             "เที่ยงครึ่ง",
         )
+        self.assertEqual(thai_time("18:30"), "สิบแปดนาฬิกาสามสิบนาที")
+        self.assertEqual(thai_time("18:30:00"), "สิบแปดนาฬิกาสามสิบนาที")
+        self.assertEqual(
+            thai_time("18:30:01"), "สิบแปดนาฬิกาสามสิบนาทีหนึ่งวินาที"
+        )
+        self.assertEqual(
+            thai_time("18:30:01", precision="m"), "สิบแปดนาฬิกาสามสิบนาที"
+        )
+        self.assertEqual(
+            thai_time("18:30:01", precision="s"),
+            "สิบแปดนาฬิกาสามสิบนาทีหนึ่งวินาที",
+        )
+        self.assertEqual(
+            thai_time("18:30:01", fmt="m6h", precision="m"), "หกโมงครึ่ง"
+        )
+        self.assertEqual(
+            thai_time("18:30:01", fmt="m6h"), "หกโมงสามสิบนาทีหนึ่งวินาที"
+        )
+        self.assertEqual(
+            thai_time("18:30:01", fmt="m6h", precision="m"), "หกโมงครึ่ง"
+        )
         self.assertIsNotNone(thai_time("0:30"))
         self.assertIsNotNone(thai_time("0:30", "6h"))
         self.assertIsNotNone(thai_time("0:30", "m6h"))
@@ -228,7 +243,12 @@ def test_thai_time(self):
     def test_delete_tone(self):
         self.assertEqual(delete_tone("จิ้น"), "จิน")
         self.assertEqual(delete_tone("เก๋า"), "เกา")
-        self.assertEqual(delete_tone("จิ้น"), deletetone("จิ้น"))
+
+        # Commented out until this unittest bug get fixed:
+        # https://bugs.python.org/issue29620
+        # with self.assertWarns(DeprecationWarning):
+        #     deletetone("จิ้น")
+        self.assertEqual(deletetone("จิ้น"), delete_tone("จิ้น"))
 
     def test_normalize(self):
         self.assertEqual(normalize("เเปลก"), "แปลก")
@@ -256,7 +276,6 @@ def test_isthai(self):
         self.assertEqual(isthai("(ต.ค.)", ignore_chars=".()"), True)
 
     def test_is_native_thai(self):
-        self.assertEqual(is_native_thai("เลข"), thaicheck("เลข"))
         self.assertEqual(is_native_thai(None), False)
         self.assertEqual(is_native_thai(""), False)
         self.assertEqual(is_native_thai("116"), False)
@@ -276,3 +295,9 @@ def test_is_native_thai(self):
         self.assertEqual(is_native_thai("เลข"), False)
         self.assertEqual(is_native_thai("เทเวศน์"), False)
         self.assertEqual(is_native_thai("เทเวศร์"), False)
+
+        # Commented out until this unittest bug get fixed:
+        # https://bugs.python.org/issue29620
+        # with self.assertWarns(DeprecationWarning):
+        #     thaicheck("เลข")
+        self.assertEqual(thaicheck("เลข"), is_native_thai("เลข"))
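
Note, not part of the commit: the expanded thai_time tests document its keyword parameters, with fmt picking the clock style ("24h" default, "6h", "m6h") and precision ("m" or "s") controlling how much detail is spelled out, while delete_tone and is_native_thai replace the deprecated deletetone and thaicheck; the DeprecationWarning checks stay commented out until the linked unittest bug is fixed. A minimal usage sketch based on the expected values asserted above:

import datetime

from pythainlp.util import delete_tone, is_native_thai, thai_time

print(thai_time("18:30"))                               # สิบแปดนาฬิกาสามสิบนาที (24h default)
print(thai_time("18:30:01", fmt="m6h", precision="m"))  # หกโมงครึ่ง
print(thai_time(datetime.time(12, 3, 0)))               # สิบสองนาฬิกาสามนาที

print(delete_tone("จิ้น"))        # จิน (tone mark removed)
print(is_native_thai("เทเวศร์"))  # False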
