Rename download() argument `corpus_db_url` to `url`

bact · bact · commit 4d3660e2622f · 2020-05-22T21:51:19.000+01:00
diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py
@@ -175,7 +175,7 @@ def _check_hash(dst: str, md5: str) -> None:
 
 
 def download(
-    name: str, force: bool = False, corpus_db_url: str = None
+    name: str, force: bool = False, url: str = None
 ) -> bool:
     """
     Download corpus.
@@ -185,7 +185,7 @@ def download(
 
     :param str name: corpus name
     :param bool force: force download
-    :param str
+    :param str url: URL of the corpus catalog
     :return: **True** if the corpus is found and succesfully downloaded.
              Otherwise, it returns **False**.
     :rtype: bool
@@ -205,12 +205,12 @@ def download(
     ``$HOME/pythainlp-data/``
     (e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``).
     """
-    if not corpus_db_url:
-        corpus_db_url = corpus_db_url()
+    if not url:
+        url = corpus_db_url()
 
-    corpus_db = get_corpus_db(corpus_db_url)
+    corpus_db = get_corpus_db(url)
     if not corpus_db:
-        print(f"Cannot download corpus database from: {corpus_db_url}")
+        print(f"Cannot download corpus catalog from: {url}")
         return False
 
     corpus_db = corpus_db.json()
diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py
@@ -3,7 +3,7 @@
     "get_full_data_path",
     "get_pythainlp_data_path",
     "get_pythainlp_path",
-    "PYTHAINLP_DATA_DIR"
+    "PYTHAINLP_DATA_DIR",
 ]
 
 from pythainlp.tools.path import (
diff --git a/pythainlp/tools/path.py b/pythainlp/tools/path.py
@@ -49,8 +49,9 @@ def get_pythainlp_data_path() -> str:
         get_pythainlp_data_path()
         # output: '/root/pythainlp-data'
     """
-    path = os.getenv('PYTHAINLP_DATA_DIR',
-                     os.path.join("~", PYTHAINLP_DATA_DIR))
+    path = os.getenv(
+        "PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DATA_DIR)
+    )
     path = os.path.expanduser(path)
     os.makedirs(path, exist_ok=True)
     return path
diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py
@@ -9,68 +9,70 @@
 
 
 class TestBenchmarksPackage(unittest.TestCase):
-
     def test_preprocessing(self):
-        self.assertIsNotNone(word_tokenization.preprocessing(
-            txt="ทดสอบ การ ทำ ความสะอาด ข้อมูล<tag>ok</tag>"
-        ))
+        self.assertIsNotNone(
+            word_tokenization.preprocessing(
+                txt="ทดสอบ การ ทำ ความสะอาด ข้อมูล<tag>ok</tag>"
+            )
+        )
 
     def test_benchmark_not_none(self):
-        self.assertIsNotNone(word_tokenization.benchmark(
-            ["วัน", "จัน", "ทร์", "สี", "เหลือง"],
-            ["วัน", "จันทร์", "สี", "เหลือง"]
-        ))
+        self.assertIsNotNone(
+            word_tokenization.benchmark(
+                ["วัน", "จัน", "ทร์", "สี", "เหลือง"],
+                ["วัน", "จันทร์", "สี", "เหลือง"],
+            )
+        )
 
     def test_binary_representation(self):
         sentence = "อากาศ|ร้อน|มาก|ครับ"
         rept = word_tokenization._binary_representation(sentence)
 
         self.assertEqual(
-            [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
-            rept.tolist()
+            [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0], rept.tolist()
         )
 
     def test_compute_stats(self):
-        for pair in TEST_DATA['sentences']:
-            exp, act = pair['expected'], pair['actual']
+        for pair in TEST_DATA["sentences"]:
+            exp, act = pair["expected"], pair["actual"]
 
             result = word_tokenization.compute_stats(
                 word_tokenization.preprocessing(exp),
-                word_tokenization.preprocessing(act)
-            ) 
+                word_tokenization.preprocessing(act),
+            )
 
             self.assertIsNotNone(result)
 
     def test_benchmark(self):
         expected = []
         actual = []
-        for pair in TEST_DATA['sentences']:
-            expected.append(pair['expected'])
-            actual.append(pair['actual'])
+        for pair in TEST_DATA["sentences"]:
+            expected.append(pair["expected"])
+            actual.append(pair["actual"])
 
         df = word_tokenization.benchmark(expected, actual)
 
         self.assertIsNotNone(df)
 
     def test_count_correctly_tokenised_words(self):
-        for d in TEST_DATA['binary_sentences']:
-            sample = np.array(list(d['actual'])).astype(int)
-            ref_sample = np.array(list(d['expected'])).astype(int)
+        for d in TEST_DATA["binary_sentences"]:
+            sample = np.array(list(d["actual"])).astype(int)
+            ref_sample = np.array(list(d["expected"])).astype(int)
 
             sb = list(word_tokenization._find_word_boudaries(sample))
             rb = list(word_tokenization._find_word_boudaries(ref_sample))
 
             # in binary [{0, 1}, ...]
-            correctly_tokenized_words = word_tokenization\
-                ._find_words_correctly_tokenised(rb, sb)
+            correctly_tokenized_words = word_tokenization._find_words_correctly_tokenised(
+                rb, sb
+            )
 
             self.assertEqual(
-                np.sum(correctly_tokenized_words),
-                d['expected_count']
+                np.sum(correctly_tokenized_words), d["expected_count"]
             )
 
     def test_words_correctly_tokenised(self):
-        r = [(0, 2), (2, 10), (10, 12) ]
+        r = [(0, 2), (2, 10), (10, 12)]
         s = [(0, 10), (10, 12)]
 
         expected = "01"
@@ -79,10 +81,7 @@ def test_words_correctly_tokenised(self):
         self.assertEqual(expected, "".join(np.array(labels).astype(str)))
 
     def test_flatten_result(self):
-        result = dict(
-            key1=dict(v1=6),
-            key2=dict(v2=7)
-        )
+        result = dict(key1=dict(v1=6), key2=dict(v2=7))
 
         actual = word_tokenization._flatten_result(result)
-        self.assertEqual(actual, {'key1:v1': 6, 'key2:v2': 7})
+        self.assertEqual(actual, {"key1:v1": 6, "key2:v2": 7})
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
@@ -44,7 +44,7 @@ def test_corpus(self):
         self.assertTrue(download(name="test", force=True))  # force download
         self.assertTrue(download(name="test"))  # try download existing
         self.assertFalse(
-            download(name="test", corpus_db_url="wrongurl")
+            download(name="test", url="wrongurl")
         )  # URL not exist
         self.assertFalse(
             download(name="XxxXXxxx817d37sf")
diff --git a/tests/test_soundex.py b/tests/test_soundex.py
@@ -6,7 +6,6 @@
 
 
 class TestSoundexPackage(unittest.TestCase):
-
     def test_soundex(self):
         self.assertIsNotNone(soundex("a", engine="lk82"))
         self.assertIsNotNone(soundex("a", engine="udom83"))
diff --git a/tests/test_tag.py b/tests/test_tag.py
@@ -5,13 +5,10 @@
 from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram
 from pythainlp.tag.locations import tag_provinces
 from pythainlp.tag.named_entity import ThaiNameTagger
-from pythainlp.tokenize import (
-    word_tokenize,
-)
+from pythainlp.tokenize import word_tokenize
 
 
 class TestTagPackage(unittest.TestCase):
-
     def test_pos_tag(self):
         tokens = ["ผม", "รัก", "คุณ"]
 
@@ -81,108 +78,102 @@ def test_ner(self):
                 """คณะวิทยาศาสตร์ประยุกต์และวิศวกรรมศาสตร์ มหาวิทยาลัยขอนแก่น
                 วิทยาเขตหนองคาย 112 หมู่ 7 บ้านหนองเดิ่น ตำบลหนองกอมเกาะ อำเภอเมือง
                 จังหวัดหนองคาย 43000""",
-                tag=True
+                tag=True,
             )
         )
 
         # arguement `tag` is True
         self.assertEqual(
-            ner.get_ner(
-                "วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
-                tag=True
-            ),
+            ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", tag=True),
             "วันที่ <DATE>15 ก.ย. 61</DATE> "
-            "ทดสอบระบบเวลา <TIME>14:49 น.</TIME>")
+            "ทดสอบระบบเวลา <TIME>14:49 น.</TIME>",
+        )
 
         self.assertEqual(
             ner.get_ner(
-                "url = https://thainlp.org/pythainlp/docs/2.0/",
-                tag=True
+                "url = https://thainlp.org/pythainlp/docs/2.0/", tag=True
             ),
-            "url = <URL>https://thainlp.org/pythainlp/docs/2.0/</URL>")
+            "url = <URL>https://thainlp.org/pythainlp/docs/2.0/</URL>",
+        )
 
         self.assertEqual(
-            ner.get_ner(
-                "example@gmail.com",
-                tag=True
-            ),
-            "<EMAIL>example@gmail.com</EMAIL>")
+            ner.get_ner("example@gmail.com", tag=True),
+            "<EMAIL>example@gmail.com</EMAIL>",
+        )
 
         self.assertEqual(
-            ner.get_ner(
-                "รหัสไปรษณีย์ 19130",
-                tag=True
-            ),
-            "รหัสไปรษณีย์ <ZIP>19130</ZIP>")
+            ner.get_ner("รหัสไปรษณีย์ 19130", tag=True),
+            "รหัสไปรษณีย์ <ZIP>19130</ZIP>",
+        )
 
         self.assertEqual(
-            ner.get_ner(
-                "เบอร์โทรศัพท์ 091-123-4567",
-                tag=True
-            ),
-            "เบอร์โทรศัพท์ <PHONE>091-123-4567</PHONE>")
+            ner.get_ner("เบอร์โทรศัพท์ 091-123-4567", tag=True),
+            "เบอร์โทรศัพท์ <PHONE>091-123-4567</PHONE>",
+        )
 
         self.assertEqual(
-            ner.get_ner(
-                "อาจารย์เอกพล ประจำคณะวิศวกรรมศาสตร์ ",
-                tag=True
-            ),
+            ner.get_ner("อาจารย์เอกพล ประจำคณะวิศวกรรมศาสตร์ ", tag=True),
             "<PERSON>อาจารย์เอกพล</PERSON> ประจำ<ORGANIZATION>"
-            "คณะวิศวกรรมศาสตร์</ORGANIZATION> ")
+            "คณะวิศวกรรมศาสตร์</ORGANIZATION> ",
+        )
 
         self.assertEqual(
             ner.get_ner(
                 "มาตรา 80 ปพพ ให้ใช้อัตราภาษีร้อยละ 10.0"
                 " ในการคำนวณภาษีมูลค่าเพิ่ม",
-                tag=True
+                tag=True,
             ),
             "<LAW>มาตรา 80 ปพพ</LAW> "
             "ให้ใช้อัตราภาษี<PERCENT>ร้อยละ 10.0</PERCENT>"
-            " ในการคำนวณภาษีมูลค่าเพิ่ม")
+            " ในการคำนวณภาษีมูลค่าเพิ่ม",
+        )
 
         self.assertEqual(
-            ner.get_ner(
-                "ยาว 20 เซนติเมตร",
-                tag=True
-            ),
-            "ยาว <LEN>20 เซนติเมตร</LEN>")
+            ner.get_ner("ยาว 20 เซนติเมตร", tag=True),
+            "ยาว <LEN>20 เซนติเมตร</LEN>",
+        )
 
         self.assertEqual(
-            ner.get_ner(
-                "1 บาท",
-                pos=True,
-                tag=True),
-            "<MONEY>1 บาท</MONEY>")
+            ner.get_ner("1 บาท", pos=True, tag=True), "<MONEY>1 บาท</MONEY>"
+        )
 
         self.assertEqual(
-            ner.get_ner(
-                "ไทย",
-                pos=False,
-                tag=True
-            ),
-            "<LOCATION>ไทย</LOCATION>")
+            ner.get_ner("ไทย", pos=False, tag=True), "<LOCATION>ไทย</LOCATION>"
+        )
 
         # arguement `tag` is False and `pos` is True
         self.assertEqual(
-            ner.get_ner(
-                "ไทย",
-                pos=True,
-                tag=False
-            ),
-            [('ไทย', 'PROPN', 'B-LOCATION')])
+            ner.get_ner("ไทย", pos=True, tag=False),
+            [("ไทย", "PROPN", "B-LOCATION")],
+        )
 
         # arguement `tag` is False and `pos` is False
         self.assertEqual(
             ner.get_ner(
                 "วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",
                 pos=False,
-                tag=False
+                tag=False,
             ),
-            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'),
-             (' ', 'I-DATE'), ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'),
-             ('61', 'I-DATE'), (' ', 'O'), ('ทดสอบ', 'O'), ('ระบบ', 'O'),
-             ('เวลา', 'O'), (' ', 'O'), ('14', 'B-TIME'), (':', 'I-TIME'),
-             ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')])
+            [
+                ("วันที่", "O"),
+                (" ", "O"),
+                ("15", "B-DATE"),
+                (" ", "I-DATE"),
+                ("ก.ย.", "I-DATE"),
+                (" ", "I-DATE"),
+                ("61", "I-DATE"),
+                (" ", "O"),
+                ("ทดสอบ", "O"),
+                ("ระบบ", "O"),
+                ("เวลา", "O"),
+                (" ", "O"),
+                ("14", "B-TIME"),
+                (":", "I-TIME"),
+                ("49", "I-TIME"),
+                (" ", "I-TIME"),
+                ("น.", "I-TIME"),
+            ],
+        )
 
         # self.assertEqual(
         #     ner.get_ner("แมวทำอะไรตอนห้าโมงเช้า"),
diff --git a/tests/test_tools.py b/tests/test_tools.py
@@ -2,8 +2,18 @@
 
 import unittest
 
+from pythainlp.tools import (
+    get_full_data_path,
+    get_pythainlp_data_path,
+    get_pythainlp_path,
+)
 
-class TestToolsPackage(unittest.TestCase):
 
-    def setUp():
-        pass
+class TestToolsPackage(unittest.TestCase):
+    def test_path(self):
+        """Path helpers return str; full data path ends with the file name."""
+        name = "ttc_freq.txt"
+        self.assertTrue(get_full_data_path(name).endswith(name))
+        self.assertIsInstance(get_pythainlp_data_path(), str)
+        # Call the helper: the bare function object is never a str.
+        self.assertIsInstance(get_pythainlp_path(), str)
diff --git a/tests/test_transliterate.py b/tests/test_transliterate.py
diff --git a/tests/test_word_vector.py b/tests/test_word_vector.py

Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@`
`3`	`3`	`"get_full_data_path",`
`4`	`4`	`"get_pythainlp_data_path",`
`5`	`5`	`"get_pythainlp_path",`
`6`		`- "PYTHAINLP_DATA_DIR"`
	`6`	`+ "PYTHAINLP_DATA_DIR",`
`7`	`7`	`]`
`8`	`8`
`9`	`9`	`from pythainlp.tools.path import (`