Merge pull request #325 from PyThaiNLP/dev

bact · web-flow · commit ca27105c0834 · 2019-12-06T16:52:30.000Z
Update 2.1 branch from dev branch.
diff --git a/README.md b/README.md
@@ -24,7 +24,7 @@ PyThaiNLP is a Python package for text processing and linguistic analysis, simil
 **This is a document for development branch (post 2.0). Things will break.**
 
 - The latest stable release is [2.0.7](https://github.com/PyThaiNLP/pythainlp/releases)
-- The latest development release is [2.1.dev7](https://github.com/PyThaiNLP/pythainlp/releases). See the ongoing [2.1 change log](https://github.com/PyThaiNLP/pythainlp/issues/181).
+- The latest development release is [2.1.dev8](https://github.com/PyThaiNLP/pythainlp/releases). See the ongoing [2.1 change log](https://github.com/PyThaiNLP/pythainlp/issues/181).
 - 📫 follow our [PyThaiNLP](https://www.facebook.com/pythainlp/) Facebook page
 
 
@@ -68,7 +68,6 @@ pip install pythainlp[extra1,extra2,...]
 ```
 
 where `extras` can be
-  - `artagger` (to support artagger part-of-speech tagger)
   - `attacut` (to support attacut, a fast and accurate tokenizer)
   - `icu` (for ICU, International Components for Unicode, support in transliteration and tokenization)
   - `ipa` (for IPA, International Phonetic Alphabet, support in transliteration)
@@ -177,7 +176,6 @@ pip install pythainlp[extra1,extra2,...]
 ```
 
 โดยที่ `extras` คือ
-  - `artagger` (สำหรับตัวติดป้ายกำกับชนิดคำ artagger)
   - `attacut` (ตัวตัดคำที่แม่นกว่า `newmm` เมื่อเทียบกับชุดข้อมูล BEST)
   - `icu` (สำหรับการถอดตัวสะกดเป็นสัทอักษรและการตัดคำด้วย ICU)
   - `ipa` (สำหรับการถอดตัวสะกดเป็นสัทอักษรสากล (IPA))
diff --git a/appveyor.docs.yml b/appveyor.docs.yml
@@ -42,8 +42,8 @@ install:
   - export LD_LIBRARY_PATH=/usr/local/lib
   - sudo pip3 install -r requirements.txt
   - sudo pip3 install torch==1.2.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
-  - sudo pip3 install --upgrade artagger emoji epitran gensim numpy pandas pyicu sklearn-crfsuite ssg
-  - sudo pip3 install --upgrade "tensorflow==1.14,<2"deepcut
+  - sudo pip3 install --upgrade emoji epitran gensim numpy pandas pyicu sklearn-crfsuite ssg
+  - sudo pip3 install --upgrade "tensorflow>=2,<3" deepcut
   - sudo pip3 install --upgrade boto smart_open sphinx sphinx-rtd-theme
 
 #---------------------------------#
diff --git a/appveyor.yml b/appveyor.yml
@@ -48,7 +48,6 @@ environment:
     PYTHONIOENCODING: "utf-8"
     ICU_VERSION: "64.2"
     DISTUTILS_USE_SDK: "1"
-    ARTAGGER_PKG: "https://github.com/franziz/artagger/archive/master.zip"
     PYTHAINLP_DATA_DIR: "%LOCALAPPDATA%/pythainlp-data"
 
   matrix:
@@ -101,7 +100,6 @@ install:
   - pip install "tensorflow>=2,<3" deepcut
   - pip install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
   - pip install %PYICU_PKG%
-  - pip install %ARTAGGER_PKG%
   - pip install -e .[full]
 
 #---------------------------------#
diff --git a/docs/api/tag.rst b/docs/api/tag.rst
@@ -207,14 +207,10 @@ unigram
 
 Unigram tagger doesn't take the ordering of words in the list into account.
 
-artagger
-++++++++
-
-`artagger <https://github.com/franziz/artagger>`_ is an implementation of `RDRPOSTagger <https://github.com/datquocnguyen/RDRPOSTagger>`_ for tagging POS in Thai language.
 
 References
 ----------
 
 .. [#Sornlertlamvanich_2000] Takahashi, Naoto & Isahara, Hitoshi & Sornlertlamvanich, Virach. (2000).
             Building a Thai part-of-speech tagged corpus (ORCHID). 
-            ournal of the Acoustical Society of Japan (E). 20. 10.1250/ast.20.189. 
+            Journal of the Acoustical Society of Japan (E). 20. 10.1250/ast.20.189. 
diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst
@@ -14,7 +14,6 @@ For some functionalities, like named entity recognition, extra packages may be n
     pip install pythainlp[extra1,extra2,...]
 
 where ``extras`` can be
-  - ``artagger`` (to support artagger part-of-speech tagger)
   - ``attacut`` (to support attacut, a fast and accurate tokenizer)
   - ``icu`` (for ICU, International Components for Unicode, support in transliteration and tokenization)
   - ``ipa`` (for IPA, International Phonetic Alphabet, support in transliteration)
diff --git a/pythainlp/cli/soundex.py b/pythainlp/cli/soundex.py
@@ -7,7 +7,7 @@
 class App:
 
     def __init__(self, argv):
-        parser = argparse.ArgumentParser("sounddex")
+        parser = argparse.ArgumentParser("soundex")
         parser.add_argument(
             "--text",
             type=str,
diff --git a/pythainlp/corpus/words_th.txt b/pythainlp/corpus/words_th.txt
@@ -15594,7 +15594,6 @@
 ชิงไหวชิงพริบ
 ชิงฮื้อ
 ชิชะ
-ชิชิ
 ชิณณะ
 ชิด
 ชิดขวา
diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
@@ -104,23 +104,14 @@ def _orchid_to_ud(tag) -> List[Tuple[str, str]]:
     _i = 0
     temp = []
     while _i < len(tag):
-        temp.append((tag[_i][0], _UD_Exception(tag[_i][0], _TAG_MAP_UD[tag[_i][1]])))
+        temp.append(
+            (tag[_i][0], _UD_Exception(tag[_i][0], _TAG_MAP_UD[tag[_i][1]]))
+        )
         _i += 1
 
     return temp
 
 
-def _artagger_tag(words: List[str], corpus: str = None) -> List[Tuple[str, str]]:
-    if not words:
-        return []
-
-    from artagger import Tagger
-
-    words_ = Tagger().tag(" ".join(words))
-
-    return [(word.word, word.tag) for word in words_]
-
-
 def pos_tag(
     words: List[str], engine: str = "perceptron", corpus: str = "orchid"
 ) -> List[Tuple[str, str]]:
@@ -132,7 +123,6 @@ def pos_tag(
     :param str engine:
         * *perceptron* - perceptron tagger (default)
         * *unigram* - unigram tagger
-        * *artagger* - RDR POS tagger
     :param str corpus:
         * *orchid* - annotated Thai academic articles namedly
           `Orchid <https://www.academia.edu/9127599/Thai_Treebank>`_ (default)
@@ -145,10 +135,6 @@ def pos_tag(
     :return: returns a list of labels regarding which part of speech it is
     :rtype: list[tuple[str, str]]
 
-    :Note:
-        * *artagger*, only support one sentence and the sentence must
-          be tokenized beforehand.
-
     :Example:
 
     Tag words with corpus `orchid` (default)::
@@ -187,8 +173,7 @@ def pos_tag(
         #   ('ใน', 'ADP'), ('อาคาร', 'NOUN'), ('หลบภัย', 'NOUN'),
         #   ('ของ', 'ADP'), ('นายก', 'NOUN'), ('เชอร์ชิล', 'PROPN')]
 
-    Tag words with different engines including *perceptron*, *unigram*,
-    and *artagger*::
+    Tag words with different engines including *perceptron* and *unigram*::
 
         from pythainlp.tag import pos_tag
 
@@ -204,12 +189,6 @@ def pos_tag(
         # output:
         # [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
         #   ('<space>', None), ('<equal>', None), ('3', 'NUM')]
-
-        pos_tag(words, engine='artagger', corpus='orchid')
-        # output:
-        # [('เก้าอี้', 'NCMN'), ('มี', 'VSTA'), ('จำนวน', 'NCMN'),
-        #   ('ขา', 'NCMN'), ('<space>', 'PUNC'),
-        #   ('<equal>', 'PUNC'), ('3', 'NCNM')]
     """
 
     # NOTE:
@@ -222,8 +201,6 @@ def pos_tag(
 
     if engine == "perceptron":
         from .perceptron import tag as tag_
-    elif engine == "artagger":
-        tag_ = _artagger_tag
     else:  # default, use "unigram" ("old") engine
         from .unigram import tag as tag_
     _tag = tag_(words, corpus=corpus)
@@ -235,7 +212,9 @@ def pos_tag(
 
 
 def pos_tag_sents(
-    sentences: List[List[str]], engine: str = "perceptron", corpus: str = "orchid"
+    sentences: List[List[str]],
+    engine: str = "perceptron",
+    corpus: str = "orchid",
 ) -> List[List[Tuple[str, str]]]:
     """
     The function tag multiple list of tokenized words into Part-of-Speech
@@ -245,7 +224,6 @@ def pos_tag_sents(
     :param str engine:
         * *perceptron* - perceptron tagger (default)
         * *unigram* - unigram tagger
-        * *artagger* - RDR POS tagger
     :param str corpus:
         * *orchid* - annotated Thai academic articles namedly\
             `Orchid <https://www.academia.edu/9127599/Thai_Treebank>`_\
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
@@ -418,7 +418,7 @@ class Tokenizer:
         #   'ผิดปกติ', 'ของ', 'การ', 'พูด']
 
     Tokenizer object instantiated with a file path containing list of
-    word separated with *newline*  and explicitly set a new tokeneizer
+    word separated with *newline* and explicitly set a new tokenizer
     after initiation::
 
         PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txtt'
diff --git a/setup.py b/setup.py
@@ -43,8 +43,7 @@
 ]
 
 extras = {
-    "artagger": ["artagger>=0.1.0.3"],
-    "attacut": ["attacut>=1.0.4"],
+    "attacut": ["attacut>=1.0.6"],
     "benchmarks": ["numpy>=1.16", "pandas>=0.24"],
     "icu": ["pyicu>=2.3"],
     "ipa": ["epitran>=1.1"],
@@ -54,7 +53,6 @@
     "thai2fit": ["emoji>=0.5.1", "gensim>=3.2.0", "numpy>=1.16"],
     "thai2rom": ["torch>=1.0.0", "numpy>=1.16"],
     "full": [
-        "artagger>=0.1.0.3",
         "attacut>=1.0.4",
         "emoji>=0.5.1",
         "epitran>=1.1",
diff --git a/tests/test_tag.py b/tests/test_tag.py
@@ -44,14 +44,6 @@ def test_pos_tag(self):
         self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
         self.assertEqual(perceptron.tag([], corpus="orchid"), [])
 
-        self.assertIsNotNone(pos_tag(None, engine="artagger"))
-        self.assertIsNotNone(pos_tag([], engine="artagger"))
-        self.assertIsNotNone(pos_tag(tokens, engine="artagger"))
-        self.assertEqual(
-            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="artagger"),
-            [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
-        )
-
         self.assertEqual(pos_tag_sents(None), [])
         self.assertEqual(pos_tag_sents([]), [])
         self.assertEqual(
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
@@ -196,17 +196,26 @@ def test_word_tokenize_newmm(self):
         self.assertIsNotNone(word_tokenize(long_text, engine="newmm"))
         self.assertIsNotNone(word_tokenize(long_text, engine="newmm-safe"))
 
-        short_danger_text = """
+        danger_text1 = """
+    ชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิชิ
+    """
+        danger_text2 = """
     ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้าน
     """
-        long_danger_text = """
+        danger_text3 = """
     ด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านหน้าด้านกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกก
     """
         self.assertIsNotNone(
-            word_tokenize(short_danger_text, engine="newmm-safe")
+            word_tokenize(danger_text1, engine="newmm")
+        )
+        self.assertIsNotNone(
+            word_tokenize(danger_text1, engine="newmm-safe")
+        )
+        self.assertIsNotNone(
+            word_tokenize(danger_text2, engine="newmm-safe")
         )
         self.assertIsNotNone(
-            word_tokenize(long_danger_text, engine="newmm-safe")
+            word_tokenize(danger_text3, engine="newmm-safe")
         )
 
     def test_word_tokenize_attacut(self):