
Commit c932d3f

Merge pull request #479 from PyThaiNLP/add-clause
Add clause tokenize model
2 parents 008470a + b8ab03e commit c932d3f

File tree: 6 files changed (+117 -9 lines)


docs/api/tokenize.rst

Lines changed: 3 additions & 2 deletions
@@ -8,10 +8,11 @@ The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chu
 Modules
 -------
 
+.. autofunction:: clause_tokenize
 .. autofunction:: sent_tokenize
-.. autofunction:: word_tokenize
-.. autofunction:: syllable_tokenize
 .. autofunction:: subword_tokenize
+.. autofunction:: syllable_tokenize
+.. autofunction:: word_tokenize
 .. autoclass:: Tokenizer
     :members:
 

pythainlp/tokenize/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@
     "THAI2FIT_TOKENIZER",
     "Tokenizer",
     "Trie",
+    "clause_tokenize",
     "sent_tokenize",
     "subword_tokenize",
     "syllable_tokenize",
@@ -27,6 +28,7 @@
 
 from pythainlp.tokenize.core import (
     Tokenizer,
+    clause_tokenize,
     sent_tokenize,
     subword_tokenize,
     syllable_tokenize,

pythainlp/tokenize/core.py

Lines changed: 27 additions & 5 deletions
@@ -8,14 +8,39 @@
 from pythainlp.tokenize import (
     DEFAULT_SENT_TOKENIZE_ENGINE,
     DEFAULT_SUBWORD_TOKENIZE_ENGINE,
+    DEFAULT_SYLLABLE_DICT_TRIE,
     DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
-    DEFAULT_WORD_TOKENIZE_ENGINE,
     DEFAULT_WORD_DICT_TRIE,
-    DEFAULT_SYLLABLE_DICT_TRIE,
+    DEFAULT_WORD_TOKENIZE_ENGINE,
 )
 from pythainlp.util.trie import Trie, dict_trie
 
 
+def clause_tokenize(doc: List[str]) -> List[List[str]]:
+    """
+    Clause tokenizer (or clause segmentation).
+
+    Tokenizes a running word list into a list of clauses (lists of strings),
+    split by a CRF model trained on the LST20 corpus.
+
+    :param list[str] doc: word list to be clause-tokenized
+    :return: list of clauses
+    :rtype: list[list[str]]
+
+    :Example:
+
+        from pythainlp.tokenize import clause_tokenize
+
+        clause_tokenize(["ฉัน","นอน","และ","คุณ","เล่น","มือถือ","ส่วน","น้อง","เขียน","โปรแกรม"])
+        [['ฉัน', 'นอน'],
+        ['และ', 'คุณ', 'เล่น', 'มือถือ'],
+        ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
+    """
+    from .crfcls import segment
+
+    return segment(doc)
+
+
 def word_tokenize(
     text: str,
     custom_dict: Trie = None,
@@ -50,9 +75,6 @@ def word_tokenize(
     `DeepCut <https://github.com/rkcosmos/deepcut>`_,
     learning-based approach
 
-    .. warning::
-        * the option for engine named *ulmfit* has been deprecated since \
-          PyThaiNLP version 2.1
     :Note:
         - The parameter **custom_dict** can be provided as an argument \
           only for *newmm*, *longest*, and *attacut* engine.
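
Since clause_tokenize() takes a list of words rather than raw text, a typical call chains it after word_tokenize(). A minimal usage sketch, assuming the lst20-cls model is fetched through pythainlp.corpus the first time it is used:

# Usage sketch: word-tokenize raw text first, then group the words into clauses.
# Assumes the "lst20-cls" CRF model is available via pythainlp.corpus.
from pythainlp.tokenize import clause_tokenize, word_tokenize

text = "ฉันนอนและคุณเล่นมือถือ"
words = word_tokenize(text)        # list of words, e.g. ["ฉัน", "นอน", "และ", ...]
clauses = clause_tokenize(words)   # list of clauses, each a list of words
print(clauses)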

pythainlp/tokenize/crfcls.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+"""
+Clause segmenter
+"""
+from typing import List
+
+import pycrfsuite
+from pythainlp.corpus import get_corpus_path
+from pythainlp.tag import pos_tag
+
+
+def _doc2features(doc, i):
+    # features from current word
+    curr_word = doc[i][0]
+    curr_pos = doc[i][1]
+    features = {
+        "word.curr_word": curr_word,
+        "word.curr_isspace": curr_word.isspace(),
+        "word.curr_isdigit": curr_word.isdigit(),
+        "word.curr_postag": curr_pos,
+    }
+
+    # features from previous word
+    if i > 0:
+        prev_word = doc[i - 1][0]
+        prev_pos = doc[i - 1][1]
+        features["word.prev_word"] = prev_word
+        features["word.prev_isspace"] = prev_word.isspace()
+        features["word.prev_isdigit"] = prev_word.isdigit()
+        features["word.prev_postag"] = prev_pos
+    else:
+        features["BOS"] = True  # Beginning of Sequence
+
+    # features from next word
+    if i < len(doc) - 1:
+        next_word = doc[i + 1][0]
+        next_pos = doc[i + 1][1]
+        features["word.next_word"] = next_word
+        features["word.next_isspace"] = next_word.isspace()
+        features["word.next_isdigit"] = next_word.isdigit()
+        features["word.next_postag"] = next_pos
+    else:
+        features["EOS"] = True  # End of Sequence
+
+    return features
+
+
+def _extract_features(doc):
+    return [_doc2features(doc, i) for i in range(len(doc))]
+
+
+_CORPUS_NAME = "lst20-cls"
+tagger = pycrfsuite.Tagger()
+tagger.open(get_corpus_path(_CORPUS_NAME))
+
+
+def segment(doc: List[str]) -> List[List[str]]:
+    word_tags = pos_tag(doc, corpus="lst20")
+    features = _extract_features(word_tags)
+    word_markers = list(zip(doc, tagger.tag(features)))
+
+    clauses = []
+    temp = []
+    len_doc = len(doc) - 1
+    for i, word_marker in enumerate(word_markers):
+        word, marker = word_marker
+        if marker == "E_CLS" or i == len_doc:
+            temp.append(word)
+            clauses.append(temp)
+            temp = []
+        else:
+            temp.append(word)
+
+    return clauses
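
segment() POS-tags the input words with the lst20 tagset, turns each (word, POS) pair into a feature dict, and lets the pretrained CRF assign a clause marker to every token; a clause is closed whenever the marker is E_CLS, or at the end of the input. A small sketch of just that grouping step, with a hand-written marker sequence standing in for the tagger.tag() output (the B_CLS/I_CLS label names are assumptions here; only E_CLS appears in the code above):

# Sketch of the clause-grouping loop only; markers are hand-written here,
# whereas segment() gets them from the CRF via tagger.tag(features).
words = ["ฉัน", "นอน", "และ", "คุณ", "เล่น", "มือถือ"]
markers = ["B_CLS", "E_CLS", "B_CLS", "I_CLS", "I_CLS", "E_CLS"]  # assumed label names

clauses, temp = [], []
for i, (word, marker) in enumerate(zip(words, markers)):
    temp.append(word)
    if marker == "E_CLS" or i == len(words) - 1:  # close the clause on E_CLS or at end of input
        clauses.append(temp)
        temp = []

print(clauses)  # [['ฉัน', 'นอน'], ['และ', 'คุณ', 'เล่น', 'มือถือ']]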

pythainlp/tokenize/crfcut.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-CRFCut - Thai sentence segmentor.
+CRFCut - Thai sentence segmenter.
 
 Thai sentence segmentation using conditional random field,
 default model trained on TED dataset

tests/test_tokenize.py

Lines changed: 10 additions & 1 deletion
@@ -2,7 +2,12 @@
 
 import unittest
 
-from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE, Tokenizer, attacut
+from pythainlp.tokenize import (
+    DEFAULT_WORD_DICT_TRIE,
+    Tokenizer,
+    attacut,
+    clause_tokenize,
+)
 from pythainlp.tokenize import deepcut as tokenize_deepcut
 from pythainlp.tokenize import etcc, longest, multi_cut, newmm
 from pythainlp.tokenize import pyicu as tokenize_pyicu
@@ -184,6 +189,10 @@ def setUp(self):
             "กกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกก"
         )
 
+    def test_clause_tokenize(self):
+        self.assertIsNotNone(clause_tokenize(["ฉัน", "ทดสอบ"]))
+        self.assertIsInstance(clause_tokenize(["ฉัน", "ทดสอบ"]), list)
+
     def test_Tokenizer(self):
         t_test = Tokenizer(DEFAULT_WORD_DICT_TRIE)
         self.assertEqual(t_test.word_tokenize(""), [])
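
The added test only checks that clause_tokenize() returns something and that it is a list. A slightly stricter check, sketched below and not part of this commit, could also verify the nested structure and that the input words round-trip in order (which holds because segment() appends every input word exactly once):

    # Hypothetical stricter test (not in this commit): clauses are lists of the
    # original words, in order, with nothing dropped or duplicated.
    def test_clause_tokenize_structure(self):
        words = ["ฉัน", "นอน", "และ", "คุณ", "เล่น", "มือถือ"]
        clauses = clause_tokenize(words)
        self.assertTrue(all(isinstance(clause, list) for clause in clauses))
        self.assertEqual([w for clause in clauses for w in clause], words)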
