
Commit bb2c7b4

Merge pull request #697 from PyThaiNLP/add-word_detokenize
Add word_detokenize
2 parents: ff77556 + 218fd27

File tree

5 files changed: +240 −1 lines changed

docs/api/tokenize.rst

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ Modules
 .. autofunction:: sent_tokenize
 .. autofunction:: subword_tokenize
 .. autofunction:: word_tokenize
+.. autofunction:: word_detokenize
 .. autoclass:: Tokenizer
     :members:

notebooks/word_detokenize.ipynb

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pythainlp.tokenize import word_detokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ผมเลี้ยง 5 ตัว'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_detokenize([\"ผม\",\"เลี้ยง\",\"5\",\"ตัว\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['ผม', 'เลี้ยง', ' ', '5', ' ', 'ตัว']]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_detokenize([\"ผม\",\"เลี้ยง\",\" \",\"5\",\"ตัว\"],\"list\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ผมเลี้ยง 5 5 ตัว'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_detokenize([\"ผม\",\"เลี้ยง\",\"5\",\"5\",\"ตัว\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ภาษาไทยหรือภาษาไทยกลางเป็นภาษาในกลุ่มภาษาไทซึ่งเป็นกลุ่มย่อยของตระกูลภาษาขร้า - ไท และเป็นภาษาราชการและภาษาประจำชาติของประเทศไทย'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_detokenize(['ภาษาไทย', 'หรือ', 'ภาษาไทย', 'กลาง', 'เป็น', 'ภาษา', 'ใน', 'กลุ่ม', 'ภาษา', 'ไท', 'ซึ่ง', 'เป็น', 'กลุ่มย่อย', 'ของ', 'ตระกูล', 'ภาษา', 'ข', 'ร้า', '-', 'ไท', 'และ', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ผมเลี้ยง 5 5 ตัว ๆ คนดี'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_detokenize([\"ผม\",\"เลี้ยง\",\"5\",\"5\",\"ตัว\",\"ๆ\",\"คน\",\"ดี\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "48b90c76b600d2ec6cf3e350b23a5df9176e3eef7b22ad90377f14c1de9c1bf6"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
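
The notebook exercises word_detokenize on hand-built token lists; pairing it with word_tokenize gives a round trip. A minimal sketch, assuming the default newmm engine splits the sample as shown (the intermediate token list is illustrative and engine-dependent); note keep_whitespace=False, since the detokenizer re-inserts spaces around non-Thai tokens itself:

from pythainlp.tokenize import word_tokenize, word_detokenize

text = "ผมเลี้ยง 5 ตัว"
# Drop the original spaces; word_detokenize restores them.
tokens = word_tokenize(text, keep_whitespace=False)
# e.g. ['ผม', 'เลี้ยง', '5', 'ตัว']
print(word_detokenize(tokens))
# -> 'ผมเลี้ยง 5 ตัว'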

pythainlp/tokenize/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,7 @@
     "sent_tokenize",
     "subword_tokenize",
     "word_tokenize",
+    "word_detokenize"
 ]

 from pythainlp.corpus import thai_syllables, thai_words

@@ -31,6 +32,7 @@
     sent_tokenize,
     subword_tokenize,
     word_tokenize,
+    word_detokenize,
 )

 from pythainlp.corpus import get_corpus as _get_corpus

pythainlp/tokenize/core.py

Lines changed: 60 additions & 1 deletion
@@ -14,6 +14,7 @@
     DEFAULT_WORD_DICT_TRIE,
     DEFAULT_WORD_TOKENIZE_ENGINE,
 )
+from pythainlp import thai_characters
 from pythainlp.util.trie import Trie, dict_trie

@@ -45,6 +46,64 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
     return segment(doc)


+def word_detokenize(
+    segments: Union[List[List[str]], List[str]], output: str = "str"
+) -> Union[str, List[str]]:
+    """
+    Word detokenizer.
+
+    This function joins the list of words in each sentence back into text.
+
+    :param segments: a list of words (one sentence) or a list of lists of words (several sentences)
+    :param str output: the output type ("str" or "list")
+    :return: the Thai text
+    :rtype: Union[str, List[str]]
+    """
+    _list_all = []
+    if isinstance(segments[0], str):
+        segments = [segments]
+    for i, s in enumerate(segments):
+        _list_sents = []
+        _add_index = []
+        _space_index = []
+        _mark_index = []
+        for j, w in enumerate(s):
+            if j > 0:
+                # previous word
+                p_w = s[j - 1]
+                # if w is a number or another script, and neither w nor
+                # the previous word is a space, insert a space before w
+                if (
+                    w[0] not in thai_characters
+                    and not w.isspace()
+                    and not p_w.isspace()
+                ):
+                    _list_sents.append(" ")
+                    _add_index.append(j)
+                # if the previous word is a number or another script and
+                # not a space, insert a space after it
+                elif p_w[0] not in thai_characters and not p_w.isspace():
+                    _list_sents.append(" ")
+                    _add_index.append(j)
+                # if w is the Thai iteration mark, set it off with spaces
+                elif w == "ๆ":
+                    if not p_w.isspace():
+                        _list_sents.append(" ")
+                    _mark_index.append(j)
+                elif w.isspace() and j - 1 not in _space_index:
+                    _space_index.append(j)
+                elif j - 1 in _mark_index:
+                    _list_sents.append(" ")
+            _list_sents.append(w)
+        _list_all.append(_list_sents)
+    if output == "list":
+        return _list_all
+    else:
+        _text = []
+        for i in _list_all:
+            _temp = ""
+            for j in i:
+                _temp += j
+            _text.append(_temp)
+        return " ".join(_text)
+
+
 def word_tokenize(
     text: str,
     custom_dict: Trie = None,

@@ -63,7 +122,7 @@ def word_tokenize(
         for end of phrase in Thai.
         Otherwise, whitespaces are omitted.
     :return: list of words
-    :rtype: list[str]
+    :rtype: List[str]
     **Options for engine**
         * *newmm* (default) - dictionary-based, Maximum Matching +
           Thai Character Cluster
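
To see the spacing rules in action, here is a small usage sketch; the inputs and expected outputs are taken directly from the notebook and tests in this commit:

from pythainlp.tokenize import word_detokenize

# Non-Thai tokens (the digits "5" and "10") get a space on each side.
print(word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]))
# -> 'ผมเลี้ยง 5 ตัว'

# The Thai iteration mark "ๆ" is also set off by spaces on both sides.
print(word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]))
# -> 'ผมเลี้ยง 5 10 ตัว ๆ คนดี'

# output="list" returns token lists with the inserted spaces visible.
print(word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"))
# -> [['ผม', 'เลี้ยง', ' ', '5', ' ', 'ตัว']]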

tests/test_tokenize.py

Lines changed: 30 additions & 0 deletions
@@ -22,6 +22,7 @@
     sefr_cut,
     tltk,
     oskut,
+    word_detokenize,
 )
 from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
 from pythainlp.util import dict_trie

@@ -640,3 +641,32 @@ def test_oskut(self):
         self.assertIsNotNone(
             oskut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="scads"),
         )
+
+    def test_word_detokenize(self):
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]),
+            "ผมเลี้ยง 5 ตัว"
+        )
+        self.assertEqual(word_detokenize(
+            ["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
+            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]]
+        )
+        self.assertEqual(
+            word_detokenize(
+                ["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]
+            ),
+            "ผมเลี้ยง 5 10 ตัว ๆ คนดี"
+        )
+        self.assertEqual(
+            word_detokenize(
+                ["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]
+            ),
+            "ผมเลี้ยง 5 ตัว ๆ คนดี"
+        )
+        self.assertTrue(
+            isinstance(word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), str)
+        )
+        self.assertEqual(
+            word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]),
+            "ม่ายย ผมเลี้ยง 5 ตัว"
+        )
