Skip to content

Commit 4d3660e

Browse files
committed
rename argument name for download()
1 parent a2a397c commit 4d3660e

File tree

10 files changed

+142
-134
lines changed

10 files changed

+142
-134
lines changed

pythainlp/corpus/core.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def _check_hash(dst: str, md5: str) -> None:
175175

176176

177177
def download(
178-
name: str, force: bool = False, corpus_db_url: str = None
178+
name: str, force: bool = False, url: str = None
179179
) -> bool:
180180
"""
181181
Download corpus.
@@ -185,7 +185,7 @@ def download(
185185
186186
:param str name: corpus name
187187
:param bool force: force download
188-
:param str
188+
:param str url: URL of the corpus catalog
189189
:return: **True** if the corpus is found and successfully downloaded.
190190
Otherwise, it returns **False**.
191191
:rtype: bool
@@ -205,12 +205,12 @@ def download(
205205
``$HOME/pythainlp-data/``
206206
(e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``).
207207
"""
208-
if not corpus_db_url:
209-
corpus_db_url = corpus_db_url()
208+
if not url:
209+
url = corpus_db_url()
210210

211-
corpus_db = get_corpus_db(corpus_db_url)
211+
corpus_db = get_corpus_db(url)
212212
if not corpus_db:
213-
print(f"Cannot download corpus database from: {corpus_db_url}")
213+
print(f"Cannot download corpus catalog from: {url}")
214214
return False
215215

216216
corpus_db = corpus_db.json()

pythainlp/tools/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"get_full_data_path",
44
"get_pythainlp_data_path",
55
"get_pythainlp_path",
6-
"PYTHAINLP_DATA_DIR"
6+
"PYTHAINLP_DATA_DIR",
77
]
88

99
from pythainlp.tools.path import (

pythainlp/tools/path.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,9 @@ def get_pythainlp_data_path() -> str:
4949
get_pythainlp_data_path()
5050
# output: '/root/pythainlp-data'
5151
"""
52-
path = os.getenv('PYTHAINLP_DATA_DIR',
53-
os.path.join("~", PYTHAINLP_DATA_DIR))
52+
path = os.getenv(
53+
"PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DATA_DIR)
54+
)
5455
path = os.path.expanduser(path)
5556
os.makedirs(path, exist_ok=True)
5657
return path

tests/test_benchmarks.py

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -9,68 +9,70 @@
99

1010

1111
class TestBenchmarksPackage(unittest.TestCase):
12-
1312
def test_preprocessing(self):
14-
self.assertIsNotNone(word_tokenization.preprocessing(
15-
txt="ทดสอบ การ ทำ ความสะอาด ข้อมูล<tag>ok</tag>"
16-
))
13+
self.assertIsNotNone(
14+
word_tokenization.preprocessing(
15+
txt="ทดสอบ การ ทำ ความสะอาด ข้อมูล<tag>ok</tag>"
16+
)
17+
)
1718

1819
def test_benchmark_not_none(self):
19-
self.assertIsNotNone(word_tokenization.benchmark(
20-
["วัน", "จัน", "ทร์", "สี", "เหลือง"],
21-
["วัน", "จันทร์", "สี", "เหลือง"]
22-
))
20+
self.assertIsNotNone(
21+
word_tokenization.benchmark(
22+
["วัน", "จัน", "ทร์", "สี", "เหลือง"],
23+
["วัน", "จันทร์", "สี", "เหลือง"],
24+
)
25+
)
2326

2427
def test_binary_representation(self):
2528
sentence = "อากาศ|ร้อน|มาก|ครับ"
2629
rept = word_tokenization._binary_representation(sentence)
2730

2831
self.assertEqual(
29-
[1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
30-
rept.tolist()
32+
[1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0], rept.tolist()
3133
)
3234

3335
def test_compute_stats(self):
34-
for pair in TEST_DATA['sentences']:
35-
exp, act = pair['expected'], pair['actual']
36+
for pair in TEST_DATA["sentences"]:
37+
exp, act = pair["expected"], pair["actual"]
3638

3739
result = word_tokenization.compute_stats(
3840
word_tokenization.preprocessing(exp),
39-
word_tokenization.preprocessing(act)
40-
)
41+
word_tokenization.preprocessing(act),
42+
)
4143

4244
self.assertIsNotNone(result)
4345

4446
def test_benchmark(self):
4547
expected = []
4648
actual = []
47-
for pair in TEST_DATA['sentences']:
48-
expected.append(pair['expected'])
49-
actual.append(pair['actual'])
49+
for pair in TEST_DATA["sentences"]:
50+
expected.append(pair["expected"])
51+
actual.append(pair["actual"])
5052

5153
df = word_tokenization.benchmark(expected, actual)
5254

5355
self.assertIsNotNone(df)
5456

5557
def test_count_correctly_tokenised_words(self):
56-
for d in TEST_DATA['binary_sentences']:
57-
sample = np.array(list(d['actual'])).astype(int)
58-
ref_sample = np.array(list(d['expected'])).astype(int)
58+
for d in TEST_DATA["binary_sentences"]:
59+
sample = np.array(list(d["actual"])).astype(int)
60+
ref_sample = np.array(list(d["expected"])).astype(int)
5961

6062
sb = list(word_tokenization._find_word_boudaries(sample))
6163
rb = list(word_tokenization._find_word_boudaries(ref_sample))
6264

6365
# in binary [{0, 1}, ...]
64-
correctly_tokenized_words = word_tokenization\
65-
._find_words_correctly_tokenised(rb, sb)
66+
correctly_tokenized_words = word_tokenization._find_words_correctly_tokenised(
67+
rb, sb
68+
)
6669

6770
self.assertEqual(
68-
np.sum(correctly_tokenized_words),
69-
d['expected_count']
71+
np.sum(correctly_tokenized_words), d["expected_count"]
7072
)
7173

7274
def test_words_correctly_tokenised(self):
73-
r = [(0, 2), (2, 10), (10, 12) ]
75+
r = [(0, 2), (2, 10), (10, 12)]
7476
s = [(0, 10), (10, 12)]
7577

7678
expected = "01"
@@ -79,10 +81,7 @@ def test_words_correctly_tokenised(self):
7981
self.assertEqual(expected, "".join(np.array(labels).astype(str)))
8082

8183
def test_flatten_result(self):
82-
result = dict(
83-
key1=dict(v1=6),
84-
key2=dict(v2=7)
85-
)
84+
result = dict(key1=dict(v1=6), key2=dict(v2=7))
8685

8786
actual = word_tokenization._flatten_result(result)
88-
self.assertEqual(actual, {'key1:v1': 6, 'key2:v2': 7})
87+
self.assertEqual(actual, {"key1:v1": 6, "key2:v2": 7})

tests/test_corpus.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def test_corpus(self):
4444
self.assertTrue(download(name="test", force=True)) # force download
4545
self.assertTrue(download(name="test")) # try download existing
4646
self.assertFalse(
47-
download(name="test", corpus_db_url="wrongurl")
47+
download(name="test", url="wrongurl")
4848
) # URL not exist
4949
self.assertFalse(
5050
download(name="XxxXXxxx817d37sf")

tests/test_soundex.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77

88
class TestSoundexPackage(unittest.TestCase):
9-
109
def test_soundex(self):
1110
self.assertIsNotNone(soundex("a", engine="lk82"))
1211
self.assertIsNotNone(soundex("a", engine="udom83"))

tests/test_tag.py

Lines changed: 54 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,10 @@
55
from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram
66
from pythainlp.tag.locations import tag_provinces
77
from pythainlp.tag.named_entity import ThaiNameTagger
8-
from pythainlp.tokenize import (
9-
word_tokenize,
10-
)
8+
from pythainlp.tokenize import word_tokenize
119

1210

1311
class TestTagPackage(unittest.TestCase):
14-
1512
def test_pos_tag(self):
1613
tokens = ["ผม", "รัก", "คุณ"]
1714

@@ -81,108 +78,102 @@ def test_ner(self):
8178
"""คณะวิทยาศาสตร์ประยุกต์และวิศวกรรมศาสตร์ มหาวิทยาลัยขอนแก่น
8279
วิทยาเขตหนองคาย 112 หมู่ 7 บ้านหนองเดิ่น ตำบลหนองกอมเกาะ อำเภอเมือง
8380
จังหวัดหนองคาย 43000""",
84-
tag=True
81+
tag=True,
8582
)
8683
)
8784

8885
# argument `tag` is True
8986
self.assertEqual(
90-
ner.get_ner(
91-
"วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
92-
tag=True
93-
),
87+
ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", tag=True),
9488
"วันที่ <DATE>15 ก.ย. 61</DATE> "
95-
"ทดสอบระบบเวลา <TIME>14:49 น.</TIME>")
89+
"ทดสอบระบบเวลา <TIME>14:49 น.</TIME>",
90+
)
9691

9792
self.assertEqual(
9893
ner.get_ner(
99-
"url = https://thainlp.org/pythainlp/docs/2.0/",
100-
tag=True
94+
"url = https://thainlp.org/pythainlp/docs/2.0/", tag=True
10195
),
102-
"url = <URL>https://thainlp.org/pythainlp/docs/2.0/</URL>")
96+
"url = <URL>https://thainlp.org/pythainlp/docs/2.0/</URL>",
97+
)
10398

10499
self.assertEqual(
105-
ner.get_ner(
106-
"example@gmail.com",
107-
tag=True
108-
),
109-
"<EMAIL>example@gmail.com</EMAIL>")
100+
ner.get_ner("example@gmail.com", tag=True),
101+
"<EMAIL>example@gmail.com</EMAIL>",
102+
)
110103

111104
self.assertEqual(
112-
ner.get_ner(
113-
"รหัสไปรษณีย์ 19130",
114-
tag=True
115-
),
116-
"รหัสไปรษณีย์ <ZIP>19130</ZIP>")
105+
ner.get_ner("รหัสไปรษณีย์ 19130", tag=True),
106+
"รหัสไปรษณีย์ <ZIP>19130</ZIP>",
107+
)
117108

118109
self.assertEqual(
119-
ner.get_ner(
120-
"เบอร์โทรศัพท์ 091-123-4567",
121-
tag=True
122-
),
123-
"เบอร์โทรศัพท์ <PHONE>091-123-4567</PHONE>")
110+
ner.get_ner("เบอร์โทรศัพท์ 091-123-4567", tag=True),
111+
"เบอร์โทรศัพท์ <PHONE>091-123-4567</PHONE>",
112+
)
124113

125114
self.assertEqual(
126-
ner.get_ner(
127-
"อาจารย์เอกพล ประจำคณะวิศวกรรมศาสตร์ ",
128-
tag=True
129-
),
115+
ner.get_ner("อาจารย์เอกพล ประจำคณะวิศวกรรมศาสตร์ ", tag=True),
130116
"<PERSON>อาจารย์เอกพล</PERSON> ประจำ<ORGANIZATION>"
131-
"คณะวิศวกรรมศาสตร์</ORGANIZATION> ")
117+
"คณะวิศวกรรมศาสตร์</ORGANIZATION> ",
118+
)
132119

133120
self.assertEqual(
134121
ner.get_ner(
135122
"มาตรา 80 ปพพ ให้ใช้อัตราภาษีร้อยละ 10.0"
136123
" ในการคำนวณภาษีมูลค่าเพิ่ม",
137-
tag=True
124+
tag=True,
138125
),
139126
"<LAW>มาตรา 80 ปพพ</LAW> "
140127
"ให้ใช้อัตราภาษี<PERCENT>ร้อยละ 10.0</PERCENT>"
141-
" ในการคำนวณภาษีมูลค่าเพิ่ม")
128+
" ในการคำนวณภาษีมูลค่าเพิ่ม",
129+
)
142130

143131
self.assertEqual(
144-
ner.get_ner(
145-
"ยาว 20 เซนติเมตร",
146-
tag=True
147-
),
148-
"ยาว <LEN>20 เซนติเมตร</LEN>")
132+
ner.get_ner("ยาว 20 เซนติเมตร", tag=True),
133+
"ยาว <LEN>20 เซนติเมตร</LEN>",
134+
)
149135

150136
self.assertEqual(
151-
ner.get_ner(
152-
"1 บาท",
153-
pos=True,
154-
tag=True),
155-
"<MONEY>1 บาท</MONEY>")
137+
ner.get_ner("1 บาท", pos=True, tag=True), "<MONEY>1 บาท</MONEY>"
138+
)
156139

157140
self.assertEqual(
158-
ner.get_ner(
159-
"ไทย",
160-
pos=False,
161-
tag=True
162-
),
163-
"<LOCATION>ไทย</LOCATION>")
141+
ner.get_ner("ไทย", pos=False, tag=True), "<LOCATION>ไทย</LOCATION>"
142+
)
164143

165144
# argument `tag` is False and `pos` is True
166145
self.assertEqual(
167-
ner.get_ner(
168-
"ไทย",
169-
pos=True,
170-
tag=False
171-
),
172-
[('ไทย', 'PROPN', 'B-LOCATION')])
146+
ner.get_ner("ไทย", pos=True, tag=False),
147+
[("ไทย", "PROPN", "B-LOCATION")],
148+
)
173149

174150
# argument `tag` is False and `pos` is False
175151
self.assertEqual(
176152
ner.get_ner(
177153
"วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
178154
pos=False,
179-
tag=False
155+
tag=False,
180156
),
181-
[('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'),
182-
(' ', 'I-DATE'), ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
183-
('61', 'I-DATE'), (' ', 'O'), ('ทดสอบ', 'O'), ('ระบบ', 'O'),
184-
('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'), (':', 'I-TIME'),
185-
('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')])
157+
[
158+
("วันที่", "O"),
159+
(" ", "O"),
160+
("15", "B-DATE"),
161+
(" ", "I-DATE"),
162+
("ก.ย.", "I-DATE"),
163+
(" ", "I-DATE"),
164+
("61", "I-DATE"),
165+
(" ", "O"),
166+
("ทดสอบ", "O"),
167+
("ระบบ", "O"),
168+
("เวลา", "O"),
169+
(" ", "O"),
170+
("14", "B-TIME"),
171+
(":", "I-TIME"),
172+
("49", "I-TIME"),
173+
(" ", "I-TIME"),
174+
("น.", "I-TIME"),
175+
],
176+
)
186177

187178
# self.assertEqual(
188179
# ner.get_ner("แมวทำอะไรตอนห้าโมงเช้า"),

tests/test_tools.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,18 @@
22

33
import unittest
44

5+
from pythainlp.tools import (
6+
get_full_data_path,
7+
get_pythainlp_data_path,
8+
get_pythainlp_path,
9+
)
510

6-
class TestToolsPackage(unittest.TestCase):
711

8-
def setUp():
9-
pass
12+
class TestToolsPackage(unittest.TestCase):
13+
def test_path(self):
14+
data_filename = "ttc_freq.txt"
15+
self.assertTrue(
16+
get_full_data_path(data_filename).endswith(data_filename)
17+
)
18+
self.assertTrue(isinstance(get_pythainlp_data_path(), str))
19+
self.assertTrue(isinstance(get_pythainlp_path(), str))

0 commit comments

Comments
 (0)