Skip to content

Commit 1954736

Browse files
authored
Merge pull request #200 from bact/dev
Improvement in romanization
2 parents 2ca00e3 + 47cd40a commit 1954736

File tree

17 files changed

+164
-112
lines changed

17 files changed

+164
-112
lines changed

README-pypi.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
![PyThaiNLP Logo](https://avatars0.githubusercontent.com/u/32934255?s=200&v=4)
22

3-
# PyThaiNLP 2.0.3
3+
# PyThaiNLP
44

55
PyThaiNLP is a Python library for natural language processing (NLP) of Thai language.
66

77
PyThaiNLP includes Thai word tokenizers, transliterators, soundex converters, part-of-speech taggers, and spell checkers.
88

99
📫 follow us on Facebook [PyThaiNLP](https://www.facebook.com/pythainlp/)
1010

11-
## What's new in version 2.0 ?
11+
## What's new in 2.0 ?
1212

1313
- Terminate Python 2 support. Remove all Python 2 compatibility code.
1414
- Improved `word_tokenize` ("newmm" and "mm" engine) and `dict_word_tokenize`

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ $ pip install pythainlp[extra1,extra2,...]
6565
where ```extras``` can be
6666
- ```artagger``` (to support artagger part-of-speech tagger)*
6767
- ```deepcut``` (to support deepcut machine-learnt tokenizer)
68-
- ```icu``` (for ICU support in transliteration and tokenization)
69-
- ```ipa``` (for International Phonetic Alphabet support in transliteration)
68+
- ```icu``` (for ICU, International Components for Unicode, support in transliteration and tokenization)
69+
- ```ipa``` (for IPA, International Phonetic Alphabet, support in transliteration)
7070
- ```ml``` (to support fastai 1.0.22 ULMFiT models)
7171
- ```ner``` (for named-entity recognizer)
7272
- ```thai2fit``` (for Thai word vector)

notebooks/pythainlp-get-started.ipynb

Lines changed: 39 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@
386386
"cell_type": "markdown",
387387
"metadata": {},
388388
"source": [
389-
"### Thai Character Cluster (TCC)\n",
389+
"### Subword and Thai Character Cluster (TCC)\n",
390390
"\n",
391391
"According to [Character Cluster Based Thai Information Retrieval](https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval) (Theeramunkong et al. 2004)."
392392
]
@@ -408,31 +408,11 @@
408408
}
409409
],
410410
"source": [
411-
"from pythainlp.tokenize import subword_tokenize\n",
411+
"from pythainlp import subword_tokenize\n",
412412
"\n",
413413
"subword_tokenize(\"ประเทศไทย\")"
414414
]
415415
},
416-
{
417-
"cell_type": "code",
418-
"execution_count": 16,
419-
"metadata": {},
420-
"outputs": [
421-
{
422-
"data": {
423-
"text/plain": [
424-
"False"
425-
]
426-
},
427-
"execution_count": 16,
428-
"metadata": {},
429-
"output_type": "execute_result"
430-
}
431-
],
432-
"source": [
433-
"isinstance(subword_tokenize(\"ประเทศไทย\", engine=\"etcc\"), str)"
434-
]
435-
},
436416
{
437417
"cell_type": "markdown",
438418
"metadata": {},
@@ -442,7 +422,7 @@
442422
},
443423
{
444424
"cell_type": "code",
445-
"execution_count": 17,
425+
"execution_count": 16,
446426
"metadata": {},
447427
"outputs": [
448428
{
@@ -451,20 +431,20 @@
451431
"['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']"
452432
]
453433
},
454-
"execution_count": 17,
434+
"execution_count": 16,
455435
"metadata": {},
456436
"output_type": "execute_result"
457437
}
458438
],
459439
"source": [
460-
"from pythainlp import tcc\n",
440+
"from pythainlp.tokenize import tcc\n",
461441
"\n",
462442
"tcc.segment(\"ประเทศไทย\")"
463443
]
464444
},
465445
{
466446
"cell_type": "code",
467-
"execution_count": 18,
447+
"execution_count": 17,
468448
"metadata": {},
469449
"outputs": [
470450
{
@@ -473,7 +453,7 @@
473453
"{1, 3, 5, 6, 8, 9}"
474454
]
475455
},
476-
"execution_count": 18,
456+
"execution_count": 17,
477457
"metadata": {},
478458
"output_type": "execute_result"
479459
}
@@ -484,7 +464,7 @@
484464
},
485465
{
486466
"cell_type": "code",
487-
"execution_count": 19,
467+
"execution_count": 18,
488468
"metadata": {},
489469
"outputs": [
490470
{
@@ -509,7 +489,7 @@
509489
},
510490
{
511491
"cell_type": "code",
512-
"execution_count": 20,
492+
"execution_count": 19,
513493
"metadata": {},
514494
"outputs": [
515495
{
@@ -518,7 +498,7 @@
518498
"'maeo'"
519499
]
520500
},
521-
"execution_count": 20,
501+
"execution_count": 19,
522502
"metadata": {},
523503
"output_type": "execute_result"
524504
}
@@ -531,21 +511,34 @@
531511
},
532512
{
533513
"cell_type": "code",
534-
"execution_count": 21,
514+
"execution_count": 20,
535515
"metadata": {},
536516
"outputs": [
537517
{
538-
"name": "stdout",
539-
"output_type": "stream",
540-
"text": [
541-
"mɛːw\n"
542-
]
518+
"data": {
519+
"text/plain": [
520+
"'mɛːw'"
521+
]
522+
},
523+
"execution_count": 20,
524+
"metadata": {},
525+
"output_type": "execute_result"
543526
}
544527
],
545528
"source": [
546529
"from pythainlp.transliterate import transliterate\n",
547530
"\n",
548-
"print(transliterate(\"แมว\"))"
531+
"transliterate(\"แมว\")"
532+
]
533+
},
534+
{
535+
"cell_type": "code",
536+
"execution_count": 21,
537+
"metadata": {},
538+
"outputs": [],
539+
"source": [
540+
"#!pip3 install pythainlp[icu]\n",
541+
"#transliterate(\"แมว\", engine=\"icu\")"
549542
]
550543
},
551544
{
@@ -736,15 +729,15 @@
736729
{
737730
"data": {
738731
"text/plain": [
739-
"[('งวงช้าง', 12),\n",
740-
" ('เทิบทาบ', 7),\n",
741-
" ('กริน', 3),\n",
742-
" ('นาภี', 2),\n",
743-
" ('แด่วๆ', 3),\n",
744-
" ('คู่ใจ', 7),\n",
745-
" ('คุณพ่อ', 732),\n",
746-
" ('สิ้น', 755),\n",
747-
" ('เยาะ', 150)]"
732+
"[('ลุ่น', 4),\n",
733+
" ('คั่น', 53),\n",
734+
" ('ไก่ป่า', 29),\n",
735+
" ('ปริพาชก', 4),\n",
736+
" ('สิกขาบท', 4),\n",
737+
" ('คัดลายมือ', 2),\n",
738+
" ('เลียบ', 53),\n",
739+
" ('เกือบๆ', 6),\n",
740+
" ('จันทรคติ', 6)]"
748741
]
749742
},
750743
"execution_count": 28,

pythainlp/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@
2727
from pythainlp.soundex import soundex
2828
from pythainlp.spell import correct, spell
2929
from pythainlp.tag import pos_tag
30-
from pythainlp.tokenize import sent_tokenize, tcc, word_tokenize, Tokenizer
30+
from pythainlp.tokenize import (
31+
Tokenizer,
32+
sent_tokenize,
33+
subword_tokenize,
34+
word_tokenize,
35+
)
3136
from pythainlp.transliterate import romanize, transliterate
3237
from pythainlp.util import collate, thai_strftime

pythainlp/soundex/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# [KSS97] https://linux.thai.net/~thep/soundex/soundex.html
1313

1414

15-
def soundex(text: str, engine="udom83") -> str:
15+
def soundex(text: str, engine: str = "udom83") -> str:
1616
"""
1717
Thai Soundex
1818
@@ -24,9 +24,7 @@ def soundex(text: str, engine="udom83") -> str:
2424
* metasound
2525
:return: soundex code
2626
"""
27-
if engine == "udom83":
28-
_soundex = udom83
29-
elif engine == "lk82":
27+
if engine == "lk82":
3028
_soundex = lk82
3129
elif engine == "metasound":
3230
_soundex = metasound

pythainlp/spell/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
__all__ = ["DEFAULT_SPELL_CHECKER", "correct", "spell", "NorvigSpellChecker"]
1111

1212

13-
def spell(word: str, engine="pn") -> List[str]:
13+
def spell(word: str, engine: str = "pn") -> List[str]:
1414
"""
1515
:param str word: word to check spelling
1616
:param str engine:
@@ -21,7 +21,7 @@ def spell(word: str, engine="pn") -> List[str]:
2121
return DEFAULT_SPELL_CHECKER.spell(word)
2222

2323

24-
def correct(word: str, engine="pn") -> str:
24+
def correct(word: str, engine: str = "pn") -> str:
2525
"""
2626
:param str word: word to correct spelling
2727
:param str engine:

pythainlp/summarize/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,16 @@
33
Summarization
44
"""
55

6+
from typing import List
7+
68
from pythainlp.tokenize import sent_tokenize
79

810
from .freq import FrequencySummarizer
911

1012

11-
def summarize(text, n, engine="frequency", tokenizer="newmm"):
13+
def summarize(
14+
text: str, n: int, engine: str = "frequency", tokenizer: str = "newmm"
15+
) -> List[str]:
1216
"""
1317
Thai text summarization
1418

pythainlp/summarize/freq.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from collections import defaultdict
66
from heapq import nlargest
77
from string import punctuation
8+
from typing import List
89

910
from pythainlp.corpus import thai_stopwords
1011
from pythainlp.tokenize import sent_tokenize, word_tokenize
@@ -36,7 +37,7 @@ def __compute_frequencies(self, word_tokenized_sents):
3637
def __rank(self, ranking, n: int):
3738
return nlargest(n, ranking, key=ranking.get)
3839

39-
def summarize(self, text: str, n: int, tokenizer: str):
40+
def summarize(self, text: str, n: int, tokenizer: str) -> List[str]:
4041
sents = sent_tokenize(text)
4142
word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents]
4243
self.__freq = self.__compute_frequencies(word_tokenized_sents)

pythainlp/tokenize/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def word_tokenize(
2323
:Parameters for engine:
2424
* newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
2525
* longest - dictionary-based, Longest Matching
26-
* icu - wrapper for ICU, dictionary-based
26+
* icu - wrapper for ICU (International Components for Unicode, using PyICU), dictionary-based
2727
* deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
2828
* ulmfit - use newmm engine with a specific dictionary for use with thai2vec
2929
:return: list of words, tokenized from the text

pythainlp/tokenize/pyicu.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22
"""
3-
Wrapper for ICU word segmentation
3+
Wrapper for PyICU word segmentation
4+
https://github.com/ovalhub/pyicu
45
"""
56
import re
67
from typing import List

0 commit comments

Comments (0)