Skip to content

Commit b546f89

Browse files
committed
Update word_detokenize rule
- Fixed space bug
1 parent 451abd6 commit b546f89

File tree

2 files changed

+9
-1
lines changed

2 files changed

+9
-1
lines changed

pythainlp/tokenize/core.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,9 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
81 81   _list_sents.append(" ")
82 82   _add_index.append(j)
83 83   else:
84    - pass
   84 + _add_index.append(j)
   85 + elif w.isspace():
   86 + _add_index.append(j)
85 87   elif j-1 in _add_index:
86 88   _list_sents.append(" ")
87 89   _list_sents.append(w)

tests/test_tokenize.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -657,6 +657,12 @@ def test_word_detokenize(self):
657 657   ),
658 658   "ผมเลี้ยง 5 10 ตัว ๆ คนดี"
659 659   )
    660 + self.assertEqual(
    661 + word_detokenize(
    662 + ["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]
    663 + ),
    664 + "ผมเลี้ยง 5 ตัว ๆ คนดี"
    665 + )
660 666   self.assertTrue(
661 667   isinstance(word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), str)
662 668   )

0 commit comments

Comments
 (0)