
Commit 841a4a5

Merge pull request #409 from PyThaiNLP/rm-default-tqdm
Remove tqdm from base requirements
2 parents: 4460a35 + 5a319a1

17 files changed: +164 additions, −106 deletions

pythainlp/benchmarks/word_tokenization.py

Lines changed: 5 additions & 4 deletions
@@ -2,11 +2,11 @@
 
 import re
 import sys
+from typing import List, Tuple
 
 import numpy as np
 import pandas as pd
 
-
 SEPARATOR = "|"
 
 # regex for removing to a space surrounded by separators, i.e. | |
@@ -65,7 +65,7 @@ def _flatten_result(my_dict: dict, sep: str = ":") -> dict:
     return dict(items)
 
 
-def benchmark(ref_samples: list, samples: list):
+def benchmark(ref_samples: List[str], samples: List[str]) -> pd.DataFrame:
     """
     Performace benchmark of samples.
 
@@ -257,8 +257,9 @@ def _find_word_boudaries(bin_reps) -> list:
 
 
 def _find_words_correctly_tokenised(
-    ref_boundaries: list, predicted_boundaries: list
-) -> tuple:
+    ref_boundaries: List[Tuple[int, int]],
+    predicted_boundaries: List[Tuple[int, int]],
+) -> Tuple[int]:
     """
     Find whether each word is correctly tokenized.
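
For orientation, a minimal usage sketch of the newly annotated benchmark() function. It assumes each sample is a "|"-separated tokenization of the same underlying text (SEPARATOR = "|" as shown above); the sample strings below are illustrative only.

from pythainlp.benchmarks.word_tokenization import benchmark

# Illustrative samples: reference vs. predicted tokenization of the same text.
ref_samples = ["ผม|ไม่|ชอบ|กิน|ผัก"]
samples = ["ผม|ไม่|ชอบ|กินผัก"]

df = benchmark(ref_samples, samples)  # per-sample metrics as a pandas DataFrame
print(df.describe())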

pythainlp/corpus/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -51,14 +51,23 @@
 
 
 def corpus_path() -> str:
+    """
+    Get path where corpus files are kept locally.
+    """
     return _CORPUS_PATH
 
 
 def corpus_db_url() -> str:
+    """
+    Get remote URL of corpus catalog.
+    """
     return _CORPUS_DB_URL
 
 
 def corpus_db_path() -> str:
+    """
+    Get local path of corpus catalog.
+    """
     return _CORPUS_DB_PATH
 
 
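
The three helpers documented above can be exercised directly; a quick sketch (actual values depend on the local installation):

from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path

print(corpus_path())     # local directory where corpus files are kept
print(corpus_db_url())   # remote URL of the corpus catalog (db.json)
print(corpus_db_path())  # local path of the downloaded corpus catalog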

pythainlp/corpus/core.py

Lines changed: 31 additions & 12 deletions
@@ -11,7 +11,6 @@
 import requests
 from requests.exceptions import HTTPError
 from tinydb import Query, TinyDB
-from tqdm import tqdm
 
 from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
 from pythainlp.tools import get_full_data_path
@@ -49,7 +48,7 @@ def get_corpus(filename: str) -> frozenset:
     `this file
     <https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_
 
-    :param string filename: filename of the corpus to be read
+    :param str filename: filename of the corpus to be read
 
     :return: :mod:`frozenset` consist of lines in the file
     :rtype: :mod:`frozenset`
@@ -82,7 +81,7 @@ def get_corpus_path(name: str) -> Union[str, None]:
     """
     Get corpus path.
 
-    :param string name: corpus name
+    :param str name: corpus name
     :return: path to the corpus or **None** of the corpus doesn't
              exist in the device
    :rtype: str
@@ -134,17 +133,28 @@ def _download(url: str, dst: str) -> int:
    @param: url to download file
    @param: dst place to put the file
    """
-    _CHUNK_SIZE = 1024 * 64
+    _CHUNK_SIZE = 64 * 1024  # 64 KiB
 
     file_size = int(urlopen(url).info().get("Content-Length", -1))
     r = requests.get(url, stream=True)
     with open(get_full_data_path(dst), "wb") as f:
-        pbar = tqdm(total=int(r.headers["Content-Length"]))
+        pbar = None
+        try:
+            from tqdm import tqdm
+
+            pbar = tqdm(total=int(r.headers["Content-Length"]))
+        except ImportError:
+            pbar = None
+
         for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
             if chunk:
                 f.write(chunk)
-                pbar.update(len(chunk))
-        pbar.close()
+                if pbar:
+                    pbar.update(len(chunk))
+        if pbar:
+            pbar.close()
+        else:
+            print("Done.")
     return file_size
 
 
@@ -164,15 +174,21 @@ def _check_hash(dst: str, md5: str) -> None:
         raise Exception("Hash does not match expected.")
 
 
-def download(name: str, force: bool = False) -> None:
+def download(
+    name: str, force: bool = False, url: str = None
+) -> bool:
     """
     Download corpus.
 
     The available corpus names can be seen in this file:
    https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json
 
-    :param string name: corpus name
+    :param str name: corpus name
     :param bool force: force download
+    :param str url: URL of the corpus catalog
+    :return: **True** if the corpus is found and succesfully downloaded.
+             Otherwise, it returns **False**.
+    :rtype: bool
 
     :Example:
     ::
@@ -189,9 +205,12 @@ def download(name: str, force: bool = False) -> None:
         ``$HOME/pythainlp-data/``
         (e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``).
     """
-    corpus_db = get_corpus_db(corpus_db_url())
+    if not url:
+        url = corpus_db_url()
+
+    corpus_db = get_corpus_db(url)
     if not corpus_db:
-        print(f"Cannot download corpus database from: {corpus_db_url()}")
+        print(f"Cannot download corpus catalog from: {url}")
         return False
 
     corpus_db = corpus_db.json()
@@ -247,7 +266,7 @@ def remove(name: str) -> bool:
     """
     Remove corpus
 
-    :param string name: corpus name
+    :param str name: corpus name
     :return: **True** if the corpus is found and succesfully removed.
             Otherwise, it returns **False**.
    :rtype: bool
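
A brief hedged usage sketch of the updated download() with its new optional catalog URL; the corpus name and the alternative catalog URL below are purely illustrative:

from pythainlp.corpus import download

# Uses the default catalog from corpus_db_url() when `url` is omitted.
ok = download("thai2fit_wv")

# Illustrative alternative catalog; download() returns False when the
# catalog or the corpus entry cannot be fetched.
ok = download("thai2fit_wv", force=True, url="https://example.com/db.json")
print(ok)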

pythainlp/soundex/core.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ def soundex(text: str, engine: str = DEFAULT_SOUNDEX_ENGINE) -> str:
     """
     This function converts Thai text into phonetic code.
 
-    :param string text: word
+    :param str text: word
     :param str engine: soundex engine
     :return: Soundex code
     :rtype: str
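
For reference, a short hedged example of the documented function; the input word is illustrative and "lk82" is assumed to be one of the selectable engines:

from pythainlp.soundex import soundex

print(soundex("บูรณะ"))                  # default engine
print(soundex("บูรณะ", engine="lk82"))   # assumed alternative engine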

pythainlp/tag/named_entity.py

Lines changed: 3 additions & 3 deletions
@@ -89,10 +89,10 @@ def get_ner(
         """
         This function tags named-entitiy from text in IOB format.
 
-        :param string text: text in Thai to be tagged
-        :param boolean pos: To include POS tags in the results (`True`) or
+        :param str text: text in Thai to be tagged
+        :param bool pos: To include POS tags in the results (`True`) or
                             exclude (`False`). The defualt value is `True`
-        :param boolean tag: output like html tag.
+        :param bool tag: output like html tag.
         :return: a list of tuple associated with tokenized word, NER tag,
                  POS tag (if the parameter `pos` is specified as `True`),
                  and output like html tag (if the parameter `tag` is
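
A small usage sketch of get_ner() as documented above; it assumes the function is exposed as a method of ThaiNameTagger in pythainlp.tag.named_entity (the class itself is not shown in this hunk), and the input sentence is illustrative:

from pythainlp.tag.named_entity import ThaiNameTagger

ner = ThaiNameTagger()
# With pos=True, each item is expected to be a (word, POS tag, NER tag) tuple.
print(ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบ", pos=True, tag=False))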

pythainlp/tools/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
     "get_full_data_path",
     "get_pythainlp_data_path",
     "get_pythainlp_path",
-    "PYTHAINLP_DATA_DIR"
+    "PYTHAINLP_DATA_DIR",
 ]
 
 from pythainlp.tools.path import (

pythainlp/tools/path.py

Lines changed: 3 additions & 2 deletions
@@ -49,8 +49,9 @@ def get_pythainlp_data_path() -> str:
         get_pythainlp_data_path()
         # output: '/root/pythainlp-data'
     """
-    path = os.getenv('PYTHAINLP_DATA_DIR',
-                     os.path.join("~", PYTHAINLP_DATA_DIR))
+    path = os.getenv(
+        "PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DATA_DIR)
+    )
     path = os.path.expanduser(path)
     os.makedirs(path, exist_ok=True)
     return path
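
The reformatted lookup still honours the PYTHAINLP_DATA_DIR environment variable; a small sketch with an illustrative override path:

import os

os.environ["PYTHAINLP_DATA_DIR"] = "/tmp/pythainlp-data"  # illustrative override

from pythainlp.tools import get_pythainlp_data_path

print(get_pythainlp_data_path())  # expands, creates, and returns the directory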

pythainlp/transliterate/core.py

Lines changed: 4 additions & 1 deletion
@@ -3,6 +3,7 @@
 DEFAULT_ROMANIZE_ENGINE = "royin"
 DEFAULT_TRANSLITERATE_ENGINE = "thaig2p"
 
+
 def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
     """
     This function renders Thai words in the Latin alphabet or "romanization",
@@ -51,7 +52,9 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
     return romanize(text)
 
 
-def transliterate(text: str, engine: str = DEFAULT_TRANSLITERATE_ENGINE) -> str:
+def transliterate(
+    text: str, engine: str = DEFAULT_TRANSLITERATE_ENGINE
+) -> str:
     """
     This function transliterates Thai text.
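
As a quick reminder of the two reformatted entry points, a hedged usage sketch; outputs depend on the installed engines, and the default "thaig2p" engine may need extra dependencies:

from pythainlp.transliterate import romanize, transliterate

print(romanize("ทดสอบ"))       # default engine: "royin"
print(transliterate("ทดสอบ"))  # default engine: "thaig2p"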

pythainlp/word_vector/core.py

Lines changed: 4 additions & 4 deletions
@@ -177,8 +177,8 @@ def similarity(word1: str, word2: str) -> float:
     """
     This function computae cosine similarity between two words.
 
-    :param string word1: first word to be compared
-    :param string word2: second word to be compared
+    :param str word1: first word to be compared
+    :param str word2: second word to be compared
 
     :raises KeyError: if either `word1` or `word2` is not in the vocabulary
                       of the model.
@@ -218,8 +218,8 @@ def sentence_vectorizer(text: str, use_mean: bool = True) -> ndarray:
     Then, word vectors are aggregatesd into one vector of 300 dimension
     by calulating either mean, or summation of all word vectors.
 
-    :param string text: text input
-    :param boolean use_mean: if `True` aggregate word vectors with mean of all
+    :param str text: text input
+    :param bool use_mean: if `True` aggregate word vectors with mean of all
                              word vectors. Otherwise, aggregate with summation
                              of all word vectors
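
A brief hedged sketch of the two documented functions; both assume the thai2fit word-vector model and its dependencies (e.g. gensim) are available, and the Thai words are illustrative:

from pythainlp.word_vector import sentence_vectorizer, similarity

print(similarity("แมว", "หมา"))          # cosine similarity of two in-vocabulary words
vec = sentence_vectorizer("ฉันชอบแมว", use_mean=True)
print(vec.shape)                          # a 300-dimension aggregate vector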

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
 python-crfsuite==0.9.*
 requests==2.23.*
 tinydb==4.1.*
-tqdm==4.46.*
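
With tqdm removed from the base requirements, the download progress bar in pythainlp.corpus.core._download() becomes opt-in: it is shown only when tqdm is importable, and installing it separately (for example, pip install tqdm) restores the previous behaviour; otherwise the download finishes with a plain "Done." message.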
