11# -*- coding: utf-8 -*-
22import hashlib
33import os
4- import queue
5- import threading
64from typing import NoReturn , Union
75from urllib .request import urlopen
86
_CORPUS_DB_FILENAME = "db.json"
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)

# Create a local corpus database if it does not already exist
if not os.path.exists(_CORPUS_DB_PATH):
    # The TinyDB instance is only needed for its creation side effect;
    # close it right away so no file handle is left open at import time.
    TinyDB(_CORPUS_DB_PATH).close()
3130
@@ -45,14 +44,12 @@ def corpus_db_path() -> str:
def get_corpus_db_detail(name: str) -> dict:
    """
    Look up a corpus record by name in the local corpus database.

    :param str name: corpus name to search for
    :return: the first record whose ``name`` field equals *name*, or an
             empty dict if the local database has no such record
    :rtype: dict
    """
    db = TinyDB(corpus_db_path())
    try:
        # search() yields a (possibly empty) list of matching records
        res = db.search(Query().name == name)
    finally:
        # Always release the database handle, even if the search raises
        db.close()

    # An unknown corpus name gives an empty dict instead of an IndexError
    return res[0] if res else {}
5653
5754
5855def get_corpus (filename : str ) -> frozenset :
@@ -71,6 +68,10 @@ def get_corpus(filename: str) -> frozenset:
7168
7269 from pythainlp.corpus import get_corpus
7370
71+ get_corpus('negations_th.txt')
72+ # output:
73+ # frozenset({'แต่', 'ไม่'})
74+
7475 get_corpus('ttc_freq.txt')
7576 # output:
7677 # frozenset({'โดยนัยนี้\\ t1',
@@ -81,12 +82,11 @@ def get_corpus(filename: str) -> frozenset:
8182 # 'เหนี่ยง\\ t3',
8283 # 'ชงฆ์\\ t3',
8384 # ...})
84-
85- get_corpus('negations_th.txt')
86- # output:
87- # frozenset({'แต่', 'ไม่'})
8885 """
89- lines = read_text_corpus (os .path .join (corpus_path (), filename ))
86+ path = os .path .join (corpus_path (), filename )
87+ lines = []
88+ with open (path , "r" , encoding = "utf-8-sig" ) as fh :
89+ lines = fh .read ().splitlines ()
9090
9191 return frozenset (lines )
9292
@@ -140,25 +140,6 @@ def get_corpus_path(name: str) -> Union[str, None]:
140140 return path
141141
142142
143- def _get_input (message , channel ):
144- response = input (message )
145- channel .put (response )
146-
147-
148- def _input_with_timeout (message , timeout , default_response ):
149- channel = queue .Queue ()
150- thread = threading .Thread (target = _get_input , args = (message , channel ))
151- thread .daemon = True
152- thread .start ()
153-
154- try :
155- response = channel .get (True , timeout )
156- return response
157- except queue .Empty :
158- pass
159- return default_response
160-
161-
162143def _download (url : str , dst : str ) -> int :
163144 """
164145 @param: url to download file
@@ -199,7 +180,7 @@ def download(name: str, force: bool = False) -> NoReturn:
199180 https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json
200181
201182 :param string name: corpus name
202- :param bool force: force install
183+ :param bool force: force download
203184
204185 :Example:
205186 ::
@@ -234,41 +215,40 @@ def download(name: str, force: bool = False) -> NoReturn:
234215 if name in list (corpus_data .keys ()):
235216 corpus = corpus_data [name ]
236217 print ("Corpus:" , name )
218+ found = local_db .search (query .name == name )
237219
238220 # Download if forced, or if the corpus is not found in the local database
239- if not local_db . search ( query . name == name ) :
221+ if force or not found :
240222 print (f"- Downloading: { name } { corpus ['version' ]} " )
241223 _download (corpus ["download" ], corpus ["file_name" ])
242224 _check_hash (corpus ["file_name" ], corpus ["md5" ])
243- local_db .insert (
244- {
245- "name" : name ,
246- "version" : corpus ["version" ],
247- "file" : corpus ["file_name" ],
248- }
249- )
225+
226+ if found :
227+ local_db .update (
228+ {"version" : corpus ["version" ]}, query .name == name
229+ )
230+ else :
231+ local_db .insert (
232+ {
233+ "name" : name ,
234+ "version" : corpus ["version" ],
235+ "file" : corpus ["file_name" ],
236+ }
237+ )
250238 else :
251239 if local_db .search (
252240 query .name == name and query .version == corpus ["version" ]
253241 ):
254242 # Already has the same version
255243 print ("- Already up to date." )
256244 else :
257- # Has the corpus but different version, update
245+ # Has the corpus but different version
258246 current_ver = local_db .search (query .name == name )[0 ]["version" ]
259- message = f"- Update from { current_ver } to { corpus ['version' ]} [y/n]?"
260- response = _input_with_timeout (message , 10 , "n" )
261- response = response .lower ()
262-
263- if force or response == "y" :
264- print (f"- Downloading: { name } { corpus ['version' ]} " )
265- _download (corpus ["download" ], corpus ["file_name" ])
266- _check_hash (corpus ["file_name" ], corpus ["md5" ])
267- local_db .update (
268- {"version" : corpus ["version" ]}, query .name == name
269- )
270- else :
271- print ("- Not update." )
247+ print (f"- Existing version: { current_ver } " )
248+ print (f"- New version available: { corpus ['version' ]} " )
249+ print ("- Use download(data_name, force=True) to update" )
250+ else :
251+ print ("Corpus not found:" , name )
272252
273253 local_db .close ()
274254
0 commit comments