Skip to content

Commit 25bd28b

Browse files
authored
Merge pull request #421 from PyThaiNLP/corpus-version
Update check corpus version
2 parents d460708 + 800d36b commit 25bd28b

File tree

6 files changed

+32
-18
lines changed

6 files changed

+32
-18
lines changed

pythainlp/cli/data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def catalog(self, argv):
9090
corpus_names = sorted(corpus_db.keys())
9191
print("Dataset/corpus available for download:")
9292
for name in corpus_names:
93-
print(f"- {name} {corpus_db[name]['version']}", end="")
93+
print(f"- {name} {corpus_db[name]['latest_version']}", end="")
9494
corpus_info = corpus.get_corpus_db_detail(name)
9595
if corpus_info:
9696
print(f" (Local: {corpus_info['version']})")

pythainlp/corpus/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
_CORPUS_DB_URL = (
4141
"https://raw.githubusercontent.com/"
4242
"PyThaiNLP/pythainlp-corpus/"
43-
"2.2/db.json"
43+
"fix-thainer/db.json" # 2.2
4444
)
4545

4646
# local corpus catalog filename

pythainlp/corpus/core.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def _check_hash(dst: str, md5: str) -> None:
189189
raise Exception("Hash does not match expected.")
190190

191191

192-
def download(name: str, force: bool = False, url: str = None) -> bool:
192+
def download(name: str, force: bool = False, url: str = None, version: str = None) -> bool:
193193
"""
194194
Download corpus.
195195
@@ -199,6 +199,7 @@ def download(name: str, force: bool = False, url: str = None) -> bool:
199199
:param str name: corpus name
200200
:param bool force: force download
201201
:param str url: URL of the corpus catalog
202+
:param str version: Version of the corpus
202203
:return: **True** if the corpus is found and successfully downloaded.
203204
Otherwise, it returns **False**.
204205
:rtype: bool
@@ -233,39 +234,49 @@ def download(name: str, force: bool = False, url: str = None) -> bool:
233234
local_db = TinyDB(corpus_db_path())
234235
query = Query()
235236

236-
corpus = corpus_db[name]
237+
corpus = corpus_db[name.lower()]
237238
print("Corpus:", name)
238-
found = local_db.search(query.name == name)
239+
if version is None:
240+
version = corpus['latest_version']
241+
corpus_versions = corpus["versions"][version]
242+
file_name = corpus_versions["filename"]
243+
found = local_db.search((query.name == name) & (query.version == version))
239244

240245
# If not found in local, download
241246
if force or not found:
242-
print(f"- Downloading: {name} {corpus['version']}")
243-
_download(corpus["download"], corpus["file_name"])
244-
_check_hash(corpus["file_name"], corpus["md5"])
247+
print(f"- Downloading: {name} {version}")
248+
_download(
249+
corpus_versions["download_url"],
250+
file_name,
251+
)
252+
_check_hash(
253+
file_name,
254+
corpus_versions["md5"],
255+
)
245256

246257
if found:
247258
local_db.update(
248-
{"version": corpus["version"]}, query.name == name
259+
{"version": version}, query.name == name
249260
)
250261
else:
251262
local_db.insert(
252263
{
253264
"name": name,
254-
"version": corpus["version"],
255-
"file_name": corpus["file_name"],
265+
"version": version,
266+
"file_name": file_name,
256267
}
257268
)
258269
else:
259270
if local_db.search(
260-
query.name == name and query.version == corpus["version"]
271+
query.name == name and query.version == version
261272
):
262273
# Already has the same version
263274
print("- Already up to date.")
264275
else:
265276
# Has the corpus but different version
266277
current_ver = local_db.search(query.name == name)[0]["version"]
267278
print(f"- Existing version: {current_ver}")
268-
print(f"- New version available: {corpus['version']}")
279+
print(f"- New version available: {version}")
269280
print("- Use download(data_name, force=True) to update")
270281

271282
local_db.close()

pythainlp/tag/named_entity.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from pythainlp.tokenize import word_tokenize
1414
from pythainlp.util import isthai
1515

16-
_CORPUS_NAME = "thainer-1-4"
16+
_CORPUS_NAME = "thainer"
1717
_TOKENIZER_ENGINE = "newmm" # should be the same as one used in training data
1818

1919

tests/test_corpus.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ def test_corpus(self):
5252
self.assertIsNotNone(get_corpus_db_detail("test")) # corpus exists
5353
self.assertTrue(remove("test")) # remove existing
5454
self.assertFalse(remove("test")) # remove non-existing
55+
self.assertTrue(download(name="test", version="0.1"))
56+
self.assertTrue(remove("test"))
5557

5658
def test_tnc(self):
5759
self.assertIsNotNone(tnc.word_freqs())

tests/test_util.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -361,10 +361,11 @@ def test_thaiword_to_date(self):
361361
now + timedelta(days=-2), thaiword_to_date("วานซืน", now)
362362
)
363363

364-
self.assertEqual(
365-
thaiword_to_date("วันนี้").day + 1,
366-
thaiword_to_date("พรุ่งนี้").day,
367-
)
364+
# Disabled: this check fails on the last day of a month, when "พรุ่งนี้" (tomorrow) is day 1, not today's day + 1 (e.g. 32).
365+
#self.assertEqual(
366+
# thaiword_to_date("วันนี้").day + 1,
367+
# thaiword_to_date("พรุ่งนี้").day,
368+
#)
368369
self.assertIsNone(thaiword_to_date("วันไหน"))
369370

370371
# ### pythainlp.util.trie

0 commit comments

Comments
 (0)