Skip to content

Commit a2a397c

Browse files
committed
More test cases for corpus
1 parent d57b3be commit a2a397c

File tree

6 files changed

+57
-31
lines changed

6 files changed

+57
-31
lines changed

pythainlp/corpus/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,23 @@
5151

5252

5353
def corpus_path() -> str:
54+
"""
55+
Get path where corpus files are kept locally.
56+
"""
5457
return _CORPUS_PATH
5558

5659

5760
def corpus_db_url() -> str:
61+
"""
62+
Get remote URL of corpus catalog.
63+
"""
5864
return _CORPUS_DB_URL
5965

6066

6167
def corpus_db_path() -> str:
68+
"""
69+
Get local path of corpus catalog.
70+
"""
6271
return _CORPUS_DB_PATH
6372

6473

pythainlp/corpus/core.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def get_corpus(filename: str) -> frozenset:
4848
`this file
4949
<https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_
5050
51-
:param string filename: filename of the corpus to be read
51+
:param str filename: filename of the corpus to be read
5252
5353
:return: :mod:`frozenset` consisting of lines in the file
5454
:rtype: :mod:`frozenset`
@@ -81,7 +81,7 @@ def get_corpus_path(name: str) -> Union[str, None]:
8181
"""
8282
Get corpus path.
8383
84-
:param string name: corpus name
84+
:param str name: corpus name
8585
:return: path to the corpus or **None** if the corpus doesn't
8686
exist in the device
8787
:rtype: str
@@ -141,6 +141,7 @@ def _download(url: str, dst: str) -> int:
141141
pbar = None
142142
try:
143143
from tqdm import tqdm
144+
144145
pbar = tqdm(total=int(r.headers["Content-Length"]))
145146
except ImportError:
146147
pbar = None
@@ -173,15 +174,18 @@ def _check_hash(dst: str, md5: str) -> None:
173174
raise Exception("Hash does not match expected.")
174175

175176

176-
def download(name: str, force: bool = False) -> bool:
177+
def download(
178+
name: str, force: bool = False, corpus_db_url: str = None
179+
) -> bool:
177180
"""
178181
Download corpus.
179182
180183
The available corpus names can be seen in this file:
181184
https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json
182185
183-
:param string name: corpus name
186+
:param str name: corpus name
184187
:param bool force: force download
188+
:param str corpus_db_url: URL of the corpus catalog (optional)
185189
:return: **True** if the corpus is found and successfully downloaded.
186190
Otherwise, it returns **False**.
187191
:rtype: bool
@@ -201,9 +205,12 @@ def download(name: str, force: bool = False) -> bool:
201205
``$HOME/pythainlp-data/``
202206
(e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``).
203207
"""
204-
corpus_db = get_corpus_db(corpus_db_url())
208+
if not corpus_db_url:
209+
corpus_db_url = corpus_db_url()
210+
211+
corpus_db = get_corpus_db(corpus_db_url)
205212
if not corpus_db:
206-
print(f"Cannot download corpus database from: {corpus_db_url()}")
213+
print(f"Cannot download corpus database from: {corpus_db_url}")
207214
return False
208215

209216
corpus_db = corpus_db.json()
@@ -259,7 +266,7 @@ def remove(name: str) -> bool:
259266
"""
260267
Remove corpus
261268
262-
:param string name: corpus name
269+
:param str name: corpus name
263270
:return: **True** if the corpus is found and successfully removed.
264271
Otherwise, it returns **False**.
265272
:rtype: bool

pythainlp/soundex/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def soundex(text: str, engine: str = DEFAULT_SOUNDEX_ENGINE) -> str:
1717
"""
1818
This function converts Thai text into phonetic code.
1919
20-
:param string text: word
20+
:param str text: word
2121
:param str engine: soundex engine
2222
:return: Soundex code
2323
:rtype: str

pythainlp/tag/named_entity.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,10 @@ def get_ner(
8989
"""
9090
This function tags named entities from text in IOB format.
9191
92-
:param string text: text in Thai to be tagged
93-
:param boolean pos: To include POS tags in the results (`True`) or
92+
:param str text: text in Thai to be tagged
93+
:param bool pos: To include POS tags in the results (`True`) or
9494
exclude (`False`). The default value is `True`
95-
:param boolean tag: output like html tag.
95+
:param bool tag: output like html tag.
9696
:return: a list of tuple associated with tokenized word, NER tag,
9797
POS tag (if the parameter `pos` is specified as `True`),
9898
and output like html tag (if the parameter `tag` is

pythainlp/word_vector/core.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,8 @@ def similarity(word1: str, word2: str) -> float:
177177
"""
178178
This function computes cosine similarity between two words.
179179
180-
:param string word1: first word to be compared
181-
:param string word2: second word to be compared
180+
:param str word1: first word to be compared
181+
:param str word2: second word to be compared
182182
183183
:raises KeyError: if either `word1` or `word2` is not in the vocabulary
184184
of the model.
@@ -218,8 +218,8 @@ def sentence_vectorizer(text: str, use_mean: bool = True) -> ndarray:
218218
Then, word vectors are aggregated into one vector of 300 dimensions
219219
by calculating either mean, or summation of all word vectors.
220220
221-
:param string text: text input
222-
:param boolean use_mean: if `True` aggregate word vectors with mean of all
221+
:param str text: text input
222+
:param bool use_mean: if `True` aggregate word vectors with mean of all
223223
word vectors. Otherwise, aggregate with summation
224224
of all word vectors
225225

tests/test_corpus.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,21 +27,31 @@ def test_conceptnet(self):
2727
self.assertIsNotNone(conceptnet.edges("รัก"))
2828

2929
def test_corpus(self):
30-
self.assertIsNotNone(countries())
31-
self.assertIsNotNone(provinces())
32-
self.assertIsNotNone(thai_negations())
33-
self.assertIsNotNone(thai_stopwords())
34-
self.assertIsNotNone(thai_syllables())
35-
self.assertIsNotNone(thai_words())
36-
self.assertIsNotNone(thai_female_names())
37-
self.assertIsNotNone(thai_male_names())
38-
self.assertEqual(get_corpus_db_detail("XXX"), {})
39-
self.assertIsNotNone(download("test"))
40-
self.assertIsNotNone(download("test", force=True))
41-
self.assertFalse(download("XxxXXxxx817d37sf")) # not exist
42-
self.assertIsNotNone(get_corpus_db_detail("test"))
43-
self.assertIsNotNone(remove("test"))
44-
self.assertFalse(remove("test"))
30+
self.assertTrue(isinstance(thai_negations(), frozenset))
31+
self.assertTrue(isinstance(thai_stopwords(), frozenset))
32+
self.assertTrue(isinstance(thai_syllables(), frozenset))
33+
self.assertTrue(isinstance(thai_words(), frozenset))
34+
35+
self.assertTrue(isinstance(countries(), frozenset))
36+
self.assertTrue(isinstance(provinces(), frozenset))
37+
self.assertTrue(isinstance(thai_female_names(), frozenset))
38+
self.assertTrue(isinstance(thai_male_names(), frozenset))
39+
40+
self.assertEqual(
41+
get_corpus_db_detail("XXX"), {}
42+
) # corpus does not exist
43+
self.assertTrue(download("test")) # download the first time
44+
self.assertTrue(download(name="test", force=True)) # force download
45+
self.assertTrue(download(name="test")) # try download existing
46+
self.assertFalse(
47+
download(name="test", corpus_db_url="wrongurl")
48+
) # URL not exist
49+
self.assertFalse(
50+
download(name="XxxXXxxx817d37sf")
51+
) # corpus name not exist
52+
self.assertIsNotNone(get_corpus_db_detail("test")) # corpus exists
53+
self.assertTrue(remove("test")) # remove existing
54+
self.assertFalse(remove("test")) # remove non-existing
4555

4656
def test_tnc(self):
4757
self.assertIsNotNone(tnc.word_freqs())
@@ -50,7 +60,7 @@ def test_ttc(self):
5060
self.assertIsNotNone(ttc.word_freqs())
5161

5262
def test_wordnet(self):
53-
self.assertIsNotNone(wordnet.langs())
63+
self.assertTrue(isinstance(wordnet.langs(), list))
5464
self.assertTrue("tha" in wordnet.langs())
5565

5666
self.assertEqual(

0 commit comments

Comments
 (0)