1111import requests
1212from requests .exceptions import HTTPError
1313from tinydb import Query , TinyDB
14- from tqdm import tqdm
1514
1615from pythainlp .corpus import corpus_db_path , corpus_db_url , corpus_path
1716from pythainlp .tools import get_full_data_path
@@ -49,7 +48,7 @@ def get_corpus(filename: str) -> frozenset:
4948 `this file
5049 <https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_
5150
52- :param string filename: filename of the corpus to be read
51+ :param str filename: filename of the corpus to be read
5352
5453 :return: :mod:`frozenset` consist of lines in the file
5554 :rtype: :mod:`frozenset`
@@ -82,7 +81,7 @@ def get_corpus_path(name: str) -> Union[str, None]:
8281 """
8382 Get corpus path.
8483
85- :param string name: corpus name
84+ :param str name: corpus name
8685 :return: path to the corpus or **None** if the corpus doesn't
8786 exist in the device
8887 :rtype: str
@@ -134,17 +133,28 @@ def _download(url: str, dst: str) -> int:
134133 @param: url to download file
135134 @param: dst place to put the file
136135 """
137- _CHUNK_SIZE = 1024 * 64
136+ _CHUNK_SIZE = 64 * 1024 # 64 KiB
138137
139138 file_size = int (urlopen (url ).info ().get ("Content-Length" , - 1 ))
140139 r = requests .get (url , stream = True )
141140 with open (get_full_data_path (dst ), "wb" ) as f :
142- pbar = tqdm (total = int (r .headers ["Content-Length" ]))
141+ pbar = None
142+ try :
143+ from tqdm import tqdm
144+
145+ pbar = tqdm (total = int (r .headers ["Content-Length" ]))
146+ except ImportError :
147+ pbar = None
148+
143149 for chunk in r .iter_content (chunk_size = _CHUNK_SIZE ):
144150 if chunk :
145151 f .write (chunk )
146- pbar .update (len (chunk ))
147- pbar .close ()
152+ if pbar :
153+ pbar .update (len (chunk ))
154+ if pbar :
155+ pbar .close ()
156+ else :
157+ print ("Done." )
148158 return file_size
149159
150160
@@ -164,15 +174,21 @@ def _check_hash(dst: str, md5: str) -> None:
164174 raise Exception ("Hash does not match expected." )
165175
166176
167- def download (name : str , force : bool = False ) -> None :
177+ def download (
178+ name : str , force : bool = False , url : str = None
179+ ) -> bool :
168180 """
169181 Download corpus.
170182
171183 The available corpus names can be seen in this file:
172184 https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json
173185
174- :param string name: corpus name
186+ :param str name: corpus name
175187 :param bool force: force download
188+ :param str url: URL of the corpus catalog
189+ :return: **True** if the corpus is found and successfully downloaded.
190+ Otherwise, it returns **False**.
191+ :rtype: bool
176192
177193 :Example:
178194 ::
@@ -189,9 +205,12 @@ def download(name: str, force: bool = False) -> None:
189205 ``$HOME/pythainlp-data/``
190206 (e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``).
191207 """
192- corpus_db = get_corpus_db (corpus_db_url ())
208+ if not url :
209+ url = corpus_db_url ()
210+
211+ corpus_db = get_corpus_db (url )
193212 if not corpus_db :
194- print (f"Cannot download corpus database from: { corpus_db_url () } " )
213+ print (f"Cannot download corpus catalog from: { url } " )
195214 return False
196215
197216 corpus_db = corpus_db .json ()
@@ -247,7 +266,7 @@ def remove(name: str) -> bool:
247266 """
248267 Remove corpus
249268
250- :param string name: corpus name
269+ :param str name: corpus name
251270 :return: **True** if the corpus is found and successfully removed.
252271 Otherwise, it returns **False**.
253272 :rtype: bool
0 commit comments