
Commit 6ef5420

Refactor and revise variable names
1 parent 5ec63ce commit 6ef5420

7 files changed (+91, -78 lines)

docs/api/tokenize.rst

Lines changed: 2 additions & 2 deletions
@@ -10,9 +10,9 @@ Modules

 .. autofunction:: clause_tokenize
 .. autofunction:: sent_tokenize
-.. autofunction:: word_tokenize
-.. autofunction:: syllable_tokenize
 .. autofunction:: subword_tokenize
+.. autofunction:: syllable_tokenize
+.. autofunction:: word_tokenize
 .. autoclass:: Tokenizer
     :members:

pythainlp/tokenize/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
     "THAI2FIT_TOKENIZER",
     "Tokenizer",
     "Trie",
+    "clause_tokenize",
     "sent_tokenize",
     "subword_tokenize",
     "syllable_tokenize",

pythainlp/tokenize/core.py

Lines changed: 6 additions & 5 deletions
@@ -8,10 +8,10 @@
 from pythainlp.tokenize import (
     DEFAULT_SENT_TOKENIZE_ENGINE,
     DEFAULT_SUBWORD_TOKENIZE_ENGINE,
+    DEFAULT_SYLLABLE_DICT_TRIE,
     DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
-    DEFAULT_WORD_TOKENIZE_ENGINE,
     DEFAULT_WORD_DICT_TRIE,
-    DEFAULT_SYLLABLE_DICT_TRIE,
+    DEFAULT_WORD_TOKENIZE_ENGINE,
 )
 from pythainlp.util.trie import Trie, dict_trie

@@ -20,7 +20,7 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
     """
     Clause tokenizer. (or Clause segmentation)

-    Tokenizes running word list into list of claues (list of strings).
+    Tokenizes running word list into list of clauses (list of strings).
     split by CRF trained on LST20 Corpus.

     :param str doc: word list to be clause

@@ -36,8 +36,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
         ['และ', 'คุณ', 'เล่น', 'มือถือ'],
         ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
     """
-    from .lst20 import clause_tokenize as cla
-    return cla(doc)
+    from .crfcls import segment
+
+    return segment(doc)


 def word_tokenize(
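After this change, clause_tokenize delegates to the new crfcls module instead of lst20. A minimal usage sketch (the exact clause boundaries depend on the pretrained lst20-cls CRF model, so the printed result is illustrative):

from pythainlp.tokenize import clause_tokenize

# input: a list of words that has already been word-tokenized
words = ["ฉัน", "ทดสอบ", "และ", "คุณ", "เล่น", "มือถือ"]
clauses = clause_tokenize(words)
print(clauses)
# a list of clauses, each a list of words, e.g.
# [['ฉัน', 'ทดสอบ'], ['และ', 'คุณ', 'เล่น', 'มือถือ']]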

pythainlp/tokenize/crfcls.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@ (entire file added)

# -*- coding: utf-8 -*-
"""
Clause segmenter
"""
from typing import List

import pycrfsuite
from pythainlp.corpus import get_corpus_path
from pythainlp.tag import pos_tag


def _doc2features(doc, i):
    # features from current word
    curr_word = doc[i][0]
    curr_pos = doc[i][1]
    features = {
        "word.word": curr_word,
        "word.isspace": curr_word.isspace(),
        "word.isdigit()": curr_word.isdigit(),
        "postag": curr_pos,
    }

    # features from previous word
    if i > 0:
        prev_word = doc[i - 1][0]
        prev_pos = doc[i - 1][1]
        features["word.prevword"] = prev_word
        features["word.previsspace"] = prev_word.isspace()
        features["word.prevwordisdigit"] = prev_word.isdigit()
        features["word.prepostag"] = prev_pos
    else:
        features["BOS"] = True  # Beginning of Sequence

    # features from next word
    if i < len(doc) - 1:
        next_word = doc[i + 1][0]
        next_pos = doc[i + 1][1]
        features["word.nextword"] = next_word
        features["word.nextisspace"] = next_word.isspace()
        features["word.nextwordisdigit"] = next_word.isdigit()
        features["word.nextpostag"] = next_pos
    else:
        features["EOS"] = True  # End of Sequence

    return features


def _extract_features(doc):
    return [_doc2features(doc, i) for i in range(len(doc))]


_CORPUS_NAME = "lst20-cls"
tagger = pycrfsuite.Tagger()
tagger.open(get_corpus_path(_CORPUS_NAME))


def segment(doc: List[str]) -> List[List[str]]:
    word_tags = pos_tag(doc, corpus="lst20")
    features = _extract_features(word_tags)
    word_markers = list(zip(doc, tagger.tag(features)))

    clauses = []
    temp = []
    len_doc = len(doc) - 1
    for i, word_marker in enumerate(word_markers):
        word, marker = word_marker
        if marker == "E_CLS" or i == len_doc:
            temp.append(word)
            clauses.append(temp)
            temp = []
        else:
            temp.append(word)

    return clauses
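segment() first POS-tags the word list with the LST20 tagset, turns each (word, POS) pair into CRF features, asks the pretrained lst20-cls tagger for one marker per word, and then groups words into clauses, closing a clause on an "E_CLS" marker or at the last word. The grouping step can be read in isolation; a self-contained sketch with a hand-written (hypothetical) marker sequence standing in for the CRF output:

from typing import List, Tuple


def group_clauses(word_markers: List[Tuple[str, str]]) -> List[List[str]]:
    # same rule as segment(): close the current clause on "E_CLS" or at the last word
    clauses, temp = [], []
    last = len(word_markers) - 1
    for i, (word, marker) in enumerate(word_markers):
        temp.append(word)
        if marker == "E_CLS" or i == last:
            clauses.append(temp)
            temp = []
    return clauses


# marker names other than "E_CLS" are invented for illustration
print(group_clauses([("ฉัน", "X"), ("ทดสอบ", "E_CLS"), ("เสร็จ", "X")]))
# [['ฉัน', 'ทดสอบ'], ['เสร็จ']]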

pythainlp/tokenize/crfcut.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-CRFCut - Thai sentence segmentor.
+CRFCut - Thai sentence segmenter.

 Thai sentence segmentation using conditional random field,
 default model trained on TED dataset

pythainlp/tokenize/lst20.py

Lines changed: 0 additions & 64 deletions
This file was deleted.

tests/test_tokenize.py

Lines changed: 7 additions & 6 deletions
@@ -2,7 +2,12 @@

 import unittest

-from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE, Tokenizer, attacut
+from pythainlp.tokenize import (
+    DEFAULT_WORD_DICT_TRIE,
+    Tokenizer,
+    attacut,
+    clause_tokenize,
+)
 from pythainlp.tokenize import deepcut as tokenize_deepcut
 from pythainlp.tokenize import etcc, longest, multi_cut, newmm
 from pythainlp.tokenize import pyicu as tokenize_pyicu

@@ -15,7 +20,6 @@
 )
 from pythainlp.tokenize.ssg import segment as ssg_segment
 from pythainlp.util import dict_trie
-from pythainlp.tokenize import clause_tokenize


 class TestTokenizePackage(unittest.TestCase):

@@ -187,10 +191,7 @@ def setUp(self):

     def test_clause_tokenize(self):
         self.assertIsNotNone(clause_tokenize(["ฉัน", "ทดสอบ"]))
-        self.assertIsInstance(
-            clause_tokenize(["ฉัน", "ทดสอบ"]),
-            list
-        )
+        self.assertIsInstance(clause_tokenize(["ฉัน", "ทดสอบ"]), list)

     def test_Tokenizer(self):
         t_test = Tokenizer(DEFAULT_WORD_DICT_TRIE)
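To exercise only the updated test, one option is unittest's name-based loader (a sketch, assuming the repository root is on sys.path and tests/ is importable as a package; otherwise run the test file directly):

import unittest

# load and run only test_clause_tokenize from the updated test module
suite = unittest.defaultTestLoader.loadTestsFromName(
    "tests.test_tokenize.TestTokenizePackage.test_clause_tokenize"
)
unittest.TextTestRunner(verbosity=2).run(suite)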
