
Commit 841a4a5

Merge pull request #409 from PyThaiNLP/rm-default-tqdm
Remove tqdm from base requirements
2 parents: 4460a35 + 5a319a1

17 files changed: +164 additions, −106 deletions

pythainlp/benchmarks/word_tokenization.py

Lines changed: 5 additions & 4 deletions
@@ -2,11 +2,11 @@
 
 import re
 import sys
+from typing import List, Tuple
 
 import numpy as np
 import pandas as pd
 
-
 SEPARATOR = "|"
 
 # regex for removing to a space surrounded by separators, i.e. | |
@@ -65,7 +65,7 @@ def _flatten_result(my_dict: dict, sep: str = ":") -> dict:
     return dict(items)
 
 
-def benchmark(ref_samples: list, samples: list):
+def benchmark(ref_samples: List[str], samples: List[str]) -> pd.DataFrame:
     """
     Performace benchmark of samples.
 
@@ -257,8 +257,9 @@ def _find_word_boudaries(bin_reps) -> list:
 
 
 def _find_words_correctly_tokenised(
-    ref_boundaries: list, predicted_boundaries: list
-) -> tuple:
+    ref_boundaries: List[Tuple[int, int]],
+    predicted_boundaries: List[Tuple[int, int]],
+) -> Tuple[int]:
     """
     Find whether each word is correctly tokenized.
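
For orientation, a minimal usage sketch of the newly annotated benchmark() function. It assumes each sample is a "|"-separated tokenization of the same underlying text (SEPARATOR = "|" as shown above); the sample strings below are illustrative only.

from pythainlp.benchmarks.word_tokenization import benchmark

# Illustrative samples: reference vs. predicted tokenization of the same text.
ref_samples = ["ผม|ไม่|ชอบ|กิน|ผัก"]
samples = ["ผม|ไม่|ชอบ|กินผัก"]

df = benchmark(ref_samples, samples)  # per-sample metrics as a pandas DataFrame
print(df.describe())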

pythainlp/corpus/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -51,14 +51,23 @@
 
 
 def corpus_path() -> str:
+    """
+    Get path where corpus files are kept locally.
+    """
     return _CORPUS_PATH
 
 
 def corpus_db_url() -> str:
+    """
+    Get remote URL of corpus catalog.
+    """
     return _CORPUS_DB_URL
 
 
 def corpus_db_path() -> str:
+    """
+    Get local path of corpus catalog.
+    """
     return _CORPUS_DB_PATH
 
 
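
The three helpers documented above can be exercised directly; a quick sketch (actual values depend on the local installation):

from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path

print(corpus_path())     # local directory where corpus files are kept
print(corpus_db_url())   # remote URL of the corpus catalog (db.json)
print(corpus_db_path())  # local path of the downloaded corpus catalog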

pythainlp/corpus/core.py

Lines changed: 31 additions & 12 deletions
@@ -11,7 +11,6 @@
 import requests
 from requests.exceptions import HTTPError
 from tinydb import Query, TinyDB
-from tqdm import tqdm
 
 from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
 from pythainlp.tools import get_full_data_path
@@ -49,7 +48,7 @@ def get_corpus(filename: str) -> frozenset:
     `this file
     <https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_
 
-    :param string filename: filename of the corpus to be read
+    :param str filename: filename of the corpus to be read
 
     :return: :mod:`frozenset` consist of lines in the file
     :rtype: :mod:`frozenset`
@@ -82,7 +81,7 @@ def get_corpus_path(name: str) -> Union[str, None]:
     """
     Get corpus path.
 
-    :param string name: corpus name
+    :param str name: corpus name
     :return: path to the corpus or **None** of the corpus doesn't
              exist in the device
    :rtype: str
@@ -134,17 +133,28 @@ def _download(url: str, dst: str) -> int:
    @param: url to download file
    @param: dst place to put the file
    """
-    _CHUNK_SIZE = 1024 * 64
+    _CHUNK_SIZE = 64 * 1024  # 64 KiB
 
     file_size = int(urlopen(url).info().get("Content-Length", -1))
     r = requests.get(url, stream=True)
     with open(get_full_data_path(dst), "wb") as f:
-        pbar = tqdm(total=int(r.headers["Content-Length"]))
+        pbar = None
+        try:
+            from tqdm import tqdm
+
+            pbar = tqdm(total=int(r.headers["Content-Length"]))
+        except ImportError:
+            pbar = None
+
         for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
             if chunk:
                 f.write(chunk)
-                pbar.update(len(chunk))
-        pbar.close()
+                if pbar:
+                    pbar.update(len(chunk))
+        if pbar:
+            pbar.close()
+        else:
+            print("Done.")
     return file_size
 
 
@@ -164,15 +174,21 @@ def _check_hash(dst: str, md5: str) -> None:
         raise Exception("Hash does not match expected.")
 
 
-def download(name: str, force: bool = False) -> None:
+def download(
+    name: str, force: bool = False, url: str = None
+) -> bool:
     """
     Download corpus.
 
     The available corpus names can be seen in this file:
    https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json
 
-    :param string name: corpus name
+    :param str name: corpus name
     :param bool force: force download
+    :param str url: URL of the corpus catalog
+    :return: **True** if the corpus is found and succesfully downloaded.
+             Otherwise, it returns **False**.
+    :rtype: bool
 
     :Example:
     ::
@@ -189,9 +205,12 @@ def download(name: str, force: bool = False) -> None:
         ``$HOME/pythainlp-data/``
         (e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``).
     """
-    corpus_db = get_corpus_db(corpus_db_url())
+    if not url:
+        url = corpus_db_url()
+
+    corpus_db = get_corpus_db(url)
     if not corpus_db:
-        print(f"Cannot download corpus database from: {corpus_db_url()}")
+        print(f"Cannot download corpus catalog from: {url}")
         return False
 
     corpus_db = corpus_db.json()
@@ -247,7 +266,7 @@ def remove(name: str) -> bool:
     """
     Remove corpus
 
-    :param string name: corpus name
+    :param str name: corpus name
     :return: **True** if the corpus is found and succesfully removed.
             Otherwise, it returns **False**.
    :rtype: bool
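
A brief hedged usage sketch of the updated download() with its new optional catalog URL; the corpus name and the alternative catalog URL below are purely illustrative:

from pythainlp.corpus import download

# Uses the default catalog from corpus_db_url() when `url` is omitted.
ok = download("thai2fit_wv")

# Illustrative alternative catalog; download() returns False when the
# catalog or the corpus entry cannot be fetched.
ok = download("thai2fit_wv", force=True, url="https://example.com/db.json")
print(ok)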

pythainlp/soundex/core.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ def soundex(text: str, engine: str = DEFAULT_SOUNDEX_ENGINE) -> str:
     """
     This function converts Thai text into phonetic code.
 
-    :param string text: word
+    :param str text: word
     :param str engine: soundex engine
     :return: Soundex code
     :rtype: str
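
For reference, a short hedged example of the documented function; the input word is illustrative and "lk82" is assumed to be one of the selectable engines:

from pythainlp.soundex import soundex

print(soundex("บูรณะ"))                  # default engine
print(soundex("บูรณะ", engine="lk82"))   # assumed alternative engine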

pythainlp/tag/named_entity.py

Lines changed: 3 additions & 3 deletions
@@ -89,10 +89,10 @@ def get_ner(
         """
         This function tags named-entitiy from text in IOB format.
 
-        :param string text: text in Thai to be tagged
-        :param boolean pos: To include POS tags in the results (`True`) or
+        :param str text: text in Thai to be tagged
+        :param bool pos: To include POS tags in the results (`True`) or
                             exclude (`False`). The defualt value is `True`
-        :param boolean tag: output like html tag.
+        :param bool tag: output like html tag.
         :return: a list of tuple associated with tokenized word, NER tag,
                  POS tag (if the parameter `pos` is specified as `True`),
                  and output like html tag (if the parameter `tag` is
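
A small usage sketch of get_ner() as documented above; it assumes the function is exposed as a method of ThaiNameTagger in pythainlp.tag.named_entity (the class itself is not shown in this hunk), and the input sentence is illustrative:

from pythainlp.tag.named_entity import ThaiNameTagger

ner = ThaiNameTagger()
# With pos=True, each item is expected to be a (word, POS tag, NER tag) tuple.
print(ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบ", pos=True, tag=False))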

pythainlp/tools/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
     "get_full_data_path",
     "get_pythainlp_data_path",
     "get_pythainlp_path",
-    "PYTHAINLP_DATA_DIR"
+    "PYTHAINLP_DATA_DIR",
 ]
 
 from pythainlp.tools.path import (

pythainlp/tools/path.py

Lines changed: 3 additions & 2 deletions
@@ -49,8 +49,9 @@ def get_pythainlp_data_path() -> str:
         get_pythainlp_data_path()
         # output: '/root/pythainlp-data'
     """
-    path = os.getenv('PYTHAINLP_DATA_DIR',
-                     os.path.join("~", PYTHAINLP_DATA_DIR))
+    path = os.getenv(
+        "PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DATA_DIR)
+    )
     path = os.path.expanduser(path)
     os.makedirs(path, exist_ok=True)
     return path
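
The reformatted lookup still honours the PYTHAINLP_DATA_DIR environment variable; a small sketch with an illustrative override path:

import os

os.environ["PYTHAINLP_DATA_DIR"] = "/tmp/pythainlp-data"  # illustrative override

from pythainlp.tools import get_pythainlp_data_path

print(get_pythainlp_data_path())  # expands, creates, and returns the directory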

pythainlp/transliterate/core.py

Lines changed: 4 additions & 1 deletion
@@ -3,6 +3,7 @@
 DEFAULT_ROMANIZE_ENGINE = "royin"
 DEFAULT_TRANSLITERATE_ENGINE = "thaig2p"
 
+
 def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
     """
     This function renders Thai words in the Latin alphabet or "romanization",
@@ -51,7 +52,9 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
     return romanize(text)
 
 
-def transliterate(text: str, engine: str = DEFAULT_TRANSLITERATE_ENGINE) -> str:
+def transliterate(
+    text: str, engine: str = DEFAULT_TRANSLITERATE_ENGINE
+) -> str:
     """
     This function transliterates Thai text.
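
As a quick reminder of the two reformatted entry points, a hedged usage sketch; outputs depend on the installed engines, and the default "thaig2p" engine may need extra dependencies:

from pythainlp.transliterate import romanize, transliterate

print(romanize("ทดสอบ"))       # default engine: "royin"
print(transliterate("ทดสอบ"))  # default engine: "thaig2p"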

pythainlp/word_vector/core.py

Lines changed: 4 additions & 4 deletions
@@ -177,8 +177,8 @@ def similarity(word1: str, word2: str) -> float:
     """
     This function computae cosine similarity between two words.
 
-    :param string word1: first word to be compared
-    :param string word2: second word to be compared
+    :param str word1: first word to be compared
+    :param str word2: second word to be compared
 
     :raises KeyError: if either `word1` or `word2` is not in the vocabulary
                       of the model.
@@ -218,8 +218,8 @@ def sentence_vectorizer(text: str, use_mean: bool = True) -> ndarray:
     Then, word vectors are aggregatesd into one vector of 300 dimension
     by calulating either mean, or summation of all word vectors.
 
-    :param string text: text input
-    :param boolean use_mean: if `True` aggregate word vectors with mean of all
+    :param str text: text input
+    :param bool use_mean: if `True` aggregate word vectors with mean of all
                              word vectors. Otherwise, aggregate with summation
                              of all word vectors
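
A brief hedged sketch of the two documented functions; both assume the thai2fit word-vector model and its dependencies (e.g. gensim) are available, and the Thai words are illustrative:

from pythainlp.word_vector import sentence_vectorizer, similarity

print(similarity("แมว", "หมา"))          # cosine similarity of two in-vocabulary words
vec = sentence_vectorizer("ฉันชอบแมว", use_mean=True)
print(vec.shape)                          # a 300-dimension aggregate vector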

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
 python-crfsuite==0.9.*
 requests==2.23.*
 tinydb==4.1.*
-tqdm==4.46.*
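
With tqdm removed from the base requirements, the download progress bar in pythainlp.corpus.core._download() becomes opt-in: it is shown only when tqdm is importable, and installing it separately (for example, pip install tqdm) restores the previous behaviour; otherwise the download finishes with a plain "Done." message.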
