33Unit test
44"""
55import datetime
6+ import os
67import unittest
78from collections import Counter
89
910from nltk .corpus import wordnet as wn
1011from pythainlp import word_vector
1112from pythainlp .corpus import (
13+ _CORPUS_PATH ,
1214 conceptnet ,
1315 countries ,
16+ download ,
1417 provinces ,
1518 remove ,
1619 thai_negations ,
2023 tnc ,
2124 ttc ,
2225 wordnet ,
23- download ,
2426)
27+ from pythainlp .corpus .common import _THAI_WORDS_FILENAME
2528from pythainlp .soundex import lk82 , metasound , soundex , udom83
26- from pythainlp .spell import correct , spell , NorvigSpellChecker
29+ from pythainlp .spell import NorvigSpellChecker , correct , spell
2730from pythainlp .summarize import summarize
2831from pythainlp .tag import perceptron , pos_tag , pos_tag_sents , unigram
2932from pythainlp .tag .locations import tag_provinces
3033from pythainlp .tag .named_entity import ThaiNameTagger
34+ from pythainlp .tokenize import DEFAULT_DICT_TRIE , FROZEN_DICT_TRIE , Tokenizer
35+ from pythainlp .tokenize import deepcut as tokenize_deepcut
3136from pythainlp .tokenize import (
32- DEFAULT_DICT_TRIE ,
33- FROZEN_DICT_TRIE ,
34- Tokenizer ,
3537 dict_trie ,
3638 dict_word_tokenize ,
3739 etcc ,
3840 longest ,
3941 multi_cut ,
4042 newmm ,
43+ )
44+ from pythainlp .tokenize import pyicu as tokenize_pyicu
45+ from pythainlp .tokenize import (
4146 sent_tokenize ,
4247 subword_tokenize ,
4348 syllable_tokenize ,
4449 tcc ,
4550 word_tokenize ,
4651)
47- from pythainlp .tokenize import deepcut as tokenize_deepcut
48- from pythainlp .tokenize import pyicu as tokenize_pyicu
4952from pythainlp .transliterate import romanize , transliterate
5053from pythainlp .transliterate .ipa import trans_list , xsampa_list
5154from pythainlp .transliterate .royin import romanize as romanize_royin
5255from pythainlp .util import (
5356 arabic_digit_to_thai_digit ,
5457 bahttext ,
5558 collate ,
59+ countthai ,
5660 deletetone ,
5761 digit_to_text ,
5862 eng_to_thai ,
5963 find_keyword ,
60- countthai ,
6164 isthai ,
6265 isthaichar ,
6366 normalize ,
7073 thai_digit_to_arabic_digit ,
7174 thai_strftime ,
7275 thai_to_eng ,
73- thaiword_to_num ,
7476 thaicheck ,
77+ thaiword_to_num ,
7578)
7679
7780
@@ -139,17 +142,25 @@ def test_soundex(self):
139142 self .assertIsNotNone (soundex ("a" , engine = "metasound" ))
140143 self .assertIsNotNone (soundex ("a" , engine = "XXX" ))
141144
145+ self .assertEqual (lk82 (None ), "" )
146+ self .assertEqual (lk82 ("" ), "" )
147+ self .assertEqual (lk82 ("เหตุ" ), lk82 ("เหด" ))
142148 self .assertEqual (lk82 ("รถ" ), "ร3000" )
143149 self .assertIsNotNone (lk82 ("เกาะ" ))
144150 self .assertIsNotNone (lk82 ("อุยกูร์" ))
145151 self .assertIsNotNone (lk82 ("หยากไย่" ))
146152 self .assertIsNotNone (lk82 ("หอ" ))
147- self .assertEqual (lk82 ("" ), "" )
148153 self .assertEqual (lk82 ("น์" ), "" )
149154
150- self .assertEqual (udom83 ("รถ" ), "ร800000" )
151155 self .assertEqual (udom83 (None ), "" )
156+ self .assertEqual (udom83 ("" ), "" )
157+ self .assertEqual (udom83 ("เหตุ" ), udom83 ("เหด" ))
158+ self .assertEqual (udom83 ("รถ" ), "ร800000" )
152159
160+ self .assertEqual (metasound (None ), "" )
161+ self .assertEqual (metasound ("" ), "" )
162+ self .assertEqual (metasound ("เหตุ" ), metasound ("เหด" ))
163+ self .assertEqual (metasound ("รักษ์" ), metasound ("รัก" ))
153164 self .assertEqual (metasound ("บูรณะ" ), "บ550" )
154165 self .assertEqual (metasound ("คน" ), "ค500" )
155166 self .assertEqual (metasound ("คนA" ), "ค500" )
@@ -161,8 +172,11 @@ def test_soundex(self):
161172 self .assertIsNotNone (metasound ("มา" ))
162173 self .assertIsNotNone (metasound ("ยา" ))
163174 self .assertIsNotNone (metasound ("วา" ))
164- self .assertEqual (metasound ("รักษ์" ), metasound ("รัก" ))
165- self .assertEqual (metasound ("" ), "" )
175+ self .assertIsNotNone (metasound ("บูชา" ))
176+ self .assertIsNotNone (metasound ("กมลา" ))
177+ self .assertIsNotNone (metasound ("กาโวกาโว" ))
178+ self .assertIsNotNone (metasound ("สุวรรณา" ))
179+ self .assertIsNotNone (metasound ("ดอยบอย" ))
166180
167181 # ### pythainlp.spell
168182
@@ -324,6 +338,9 @@ def test_word_tokenize(self):
324338 self .assertIsNotNone (dict_trie (["ทดสอบ" , "สร้าง" , "Trie" ]))
325339 self .assertIsNotNone (dict_trie (thai_words ()))
326340 self .assertIsNotNone (dict_trie (FROZEN_DICT_TRIE ))
341+ self .assertIsNotNone (
342+ dict_trie (os .path .join (_CORPUS_PATH , _THAI_WORDS_FILENAME ))
343+ )
327344
328345 self .assertIsNotNone (word_tokenize ("รถไฟฟ้าBTS" , custom_dict = DEFAULT_DICT_TRIE ))
329346 self .assertIsNotNone (
@@ -339,6 +356,9 @@ def test_Tokenizer(self):
339356 t_test .set_tokenize_engine ("longest" )
340357 self .assertEqual (t_test .word_tokenize (None ), [])
341358
359+ t_test = Tokenizer ()
360+ self .assertEqual (t_test .word_tokenize ("ก" ), ["ก" ])
361+
342362 def test_word_tokenize_icu (self ):
343363 self .assertEqual (tokenize_pyicu .segment (None ), [])
344364 self .assertEqual (tokenize_pyicu .segment ("" ), [])
0 commit comments