
Commit bb2c7b4

Merge pull request #697 from PyThaiNLP/add-word_detokenize
Add word_detokenize
2 parents: ff77556 + 218fd27

File tree

5 files changed: +240 −1 lines changed

docs/api/tokenize.rst

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ Modules
 .. autofunction:: sent_tokenize
 .. autofunction:: subword_tokenize
 .. autofunction:: word_tokenize
+.. autofunction:: word_detokenize
 .. autoclass:: Tokenizer
     :members:

notebooks/word_detokenize.ipynb

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pythainlp.tokenize import word_detokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ผมเลี้ยง 5 ตัว'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_detokenize([\"ผม\",\"เลี้ยง\",\"5\",\"ตัว\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['ผม', 'เลี้ยง', ' ', '5', ' ', 'ตัว']]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_detokenize([\"ผม\",\"เลี้ยง\",\" \",\"5\",\"ตัว\"],\"list\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ผมเลี้ยง 5 5 ตัว'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_detokenize([\"ผม\",\"เลี้ยง\",\"5\",\"5\",\"ตัว\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ภาษาไทยหรือภาษาไทยกลางเป็นภาษาในกลุ่มภาษาไทซึ่งเป็นกลุ่มย่อยของตระกูลภาษาขร้า - ไท และเป็นภาษาราชการและภาษาประจำชาติของประเทศไทย'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_detokenize(['ภาษาไทย', 'หรือ', 'ภาษาไทย', 'กลาง', 'เป็น', 'ภาษา', 'ใน', 'กลุ่ม', 'ภาษา', 'ไท', 'ซึ่ง', 'เป็น', 'กลุ่มย่อย', 'ของ', 'ตระกูล', 'ภาษา', 'ข', 'ร้า', '-', 'ไท', 'และ', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ผมเลี้ยง 5 5 ตัว ๆ คนดี'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_detokenize([\"ผม\",\"เลี้ยง\",\"5\",\"5\",\"ตัว\",\"ๆ\",\"คน\",\"ดี\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "48b90c76b600d2ec6cf3e350b23a5df9176e3eef7b22ad90377f14c1de9c1bf6"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
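
The notebook exercises word_detokenize on hand-built token lists; pairing it with word_tokenize gives a round trip. A minimal sketch, assuming the default newmm engine splits the sample as shown (the intermediate token list is illustrative and engine-dependent); note keep_whitespace=False, since the detokenizer re-inserts spaces around non-Thai tokens itself:

from pythainlp.tokenize import word_tokenize, word_detokenize

text = "ผมเลี้ยง 5 ตัว"
# Drop the original spaces; word_detokenize restores them.
tokens = word_tokenize(text, keep_whitespace=False)
# e.g. ['ผม', 'เลี้ยง', '5', 'ตัว']
print(word_detokenize(tokens))
# -> 'ผมเลี้ยง 5 ตัว'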

pythainlp/tokenize/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,7 @@
     "sent_tokenize",
     "subword_tokenize",
     "word_tokenize",
+    "word_detokenize"
 ]

 from pythainlp.corpus import thai_syllables, thai_words

@@ -31,6 +32,7 @@
     sent_tokenize,
     subword_tokenize,
     word_tokenize,
+    word_detokenize,
 )

 from pythainlp.corpus import get_corpus as _get_corpus

pythainlp/tokenize/core.py

Lines changed: 60 additions & 1 deletion
@@ -14,6 +14,7 @@
     DEFAULT_WORD_DICT_TRIE,
     DEFAULT_WORD_TOKENIZE_ENGINE,
 )
+from pythainlp import thai_characters
 from pythainlp.util.trie import Trie, dict_trie

@@ -45,6 +46,64 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
     return segment(doc)


+def word_detokenize(
+    segments: Union[List[List[str]], List[str]], output: str = "str"
+) -> Union[str, List[str]]:
+    """
+    Word detokenizer.
+
+    This function joins the list of words in each sentence back into text.
+
+    :param segments: a list of words (one sentence) or a list of lists of words (several sentences)
+    :param str output: the output type ("str" or "list")
+    :return: the Thai text
+    :rtype: Union[str, List[str]]
+    """
+    _list_all = []
+    if isinstance(segments[0], str):
+        segments = [segments]
+    for i, s in enumerate(segments):
+        _list_sents = []
+        _add_index = []
+        _space_index = []
+        _mark_index = []
+        for j, w in enumerate(s):
+            if j > 0:
+                # previous word
+                p_w = s[j - 1]
+                # if w is a number or another script, and neither w nor
+                # the previous word is a space, insert a space before w
+                if (
+                    w[0] not in thai_characters
+                    and not w.isspace()
+                    and not p_w.isspace()
+                ):
+                    _list_sents.append(" ")
+                    _add_index.append(j)
+                # if the previous word is a number or another script and
+                # not a space, insert a space after it
+                elif p_w[0] not in thai_characters and not p_w.isspace():
+                    _list_sents.append(" ")
+                    _add_index.append(j)
+                # if w is the Thai iteration mark, set it off with spaces
+                elif w == "ๆ":
+                    if not p_w.isspace():
+                        _list_sents.append(" ")
+                    _mark_index.append(j)
+                elif w.isspace() and j - 1 not in _space_index:
+                    _space_index.append(j)
+                elif j - 1 in _mark_index:
+                    _list_sents.append(" ")
+            _list_sents.append(w)
+        _list_all.append(_list_sents)
+    if output == "list":
+        return _list_all
+    else:
+        _text = []
+        for i in _list_all:
+            _temp = ""
+            for j in i:
+                _temp += j
+            _text.append(_temp)
+        return " ".join(_text)
+
+
 def word_tokenize(
     text: str,
     custom_dict: Trie = None,

@@ -63,7 +122,7 @@ def word_tokenize(
         for end of phrase in Thai.
         Otherwise, whitespaces are omitted.
     :return: list of words
-    :rtype: list[str]
+    :rtype: List[str]
     **Options for engine**
         * *newmm* (default) - dictionary-based, Maximum Matching +
           Thai Character Cluster
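
To see the spacing rules in action, here is a small usage sketch; the inputs and expected outputs are taken directly from the notebook and tests in this commit:

from pythainlp.tokenize import word_detokenize

# Non-Thai tokens (the digits "5" and "10") get a space on each side.
print(word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]))
# -> 'ผมเลี้ยง 5 ตัว'

# The Thai iteration mark "ๆ" is also set off by spaces on both sides.
print(word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]))
# -> 'ผมเลี้ยง 5 10 ตัว ๆ คนดี'

# output="list" returns token lists with the inserted spaces visible.
print(word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"))
# -> [['ผม', 'เลี้ยง', ' ', '5', ' ', 'ตัว']]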

tests/test_tokenize.py

Lines changed: 30 additions & 0 deletions
@@ -22,6 +22,7 @@
     sefr_cut,
     tltk,
     oskut,
+    word_detokenize,
 )
 from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
 from pythainlp.util import dict_trie

@@ -640,3 +641,32 @@ def test_oskut(self):
         self.assertIsNotNone(
             oskut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="scads"),
         )
+
+    def test_word_detokenize(self):
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]),
+            "ผมเลี้ยง 5 ตัว"
+        )
+        self.assertEqual(word_detokenize(
+            ["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
+            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]]
+        )
+        self.assertEqual(
+            word_detokenize(
+                ["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]
+            ),
+            "ผมเลี้ยง 5 10 ตัว ๆ คนดี"
+        )
+        self.assertEqual(
+            word_detokenize(
+                ["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]
+            ),
+            "ผมเลี้ยง 5 ตัว ๆ คนดี"
+        )
+        self.assertTrue(
+            isinstance(word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), str)
+        )
+        self.assertEqual(
+            word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]),
+            "ม่ายย ผมเลี้ยง 5 ตัว"
+        )
