
Commit c932d3f

Merge pull request #479 from PyThaiNLP/add-clause
Add clause tokenize model
2 parents 008470a + b8ab03e commit c932d3f

File tree: 6 files changed (+117 -9 lines)


docs/api/tokenize.rst

Lines changed: 3 additions & 2 deletions
@@ -8,10 +8,11 @@ The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chu
 Modules
 -------
 
+.. autofunction:: clause_tokenize
 .. autofunction:: sent_tokenize
-.. autofunction:: word_tokenize
-.. autofunction:: syllable_tokenize
 .. autofunction:: subword_tokenize
+.. autofunction:: syllable_tokenize
+.. autofunction:: word_tokenize
 .. autoclass:: Tokenizer
     :members:
 

pythainlp/tokenize/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@
     "THAI2FIT_TOKENIZER",
     "Tokenizer",
     "Trie",
+    "clause_tokenize",
     "sent_tokenize",
     "subword_tokenize",
     "syllable_tokenize",
@@ -27,6 +28,7 @@
 
 from pythainlp.tokenize.core import (
     Tokenizer,
+    clause_tokenize,
     sent_tokenize,
     subword_tokenize,
     syllable_tokenize,

pythainlp/tokenize/core.py

Lines changed: 27 additions & 5 deletions
@@ -8,14 +8,39 @@
 from pythainlp.tokenize import (
     DEFAULT_SENT_TOKENIZE_ENGINE,
     DEFAULT_SUBWORD_TOKENIZE_ENGINE,
+    DEFAULT_SYLLABLE_DICT_TRIE,
     DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
-    DEFAULT_WORD_TOKENIZE_ENGINE,
     DEFAULT_WORD_DICT_TRIE,
-    DEFAULT_SYLLABLE_DICT_TRIE,
+    DEFAULT_WORD_TOKENIZE_ENGINE,
 )
 from pythainlp.util.trie import Trie, dict_trie
 
 
+def clause_tokenize(doc: List[str]) -> List[List[str]]:
+    """
+    Clause tokenizer (or clause segmentation).
+
+    Tokenizes a running word list into a list of clauses (lists of strings),
+    split by a CRF model trained on the LST20 corpus.
+
+    :param list[str] doc: word list to be clause-tokenized
+    :return: list of clauses
+    :rtype: list[list[str]]
+
+    :Example:
+
+        from pythainlp.tokenize import clause_tokenize
+
+        clause_tokenize(["ฉัน","นอน","และ","คุณ","เล่น","มือถือ","ส่วน","น้อง","เขียน","โปรแกรม"])
+        [['ฉัน', 'นอน'],
+        ['และ', 'คุณ', 'เล่น', 'มือถือ'],
+        ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
+    """
+    from .crfcls import segment
+
+    return segment(doc)
+
+
 def word_tokenize(
     text: str,
     custom_dict: Trie = None,
@@ -50,9 +75,6 @@ def word_tokenize(
     `DeepCut <https://github.com/rkcosmos/deepcut>`_,
     learning-based approach
 
-    .. warning::
-        * the option for engine named *ulmfit* has been deprecated since \
-          PyThaiNLP version 2.1
     :Note:
         - The parameter **custom_dict** can be provided as an argument \
           only for *newmm*, *longest*, and *attacut* engine.
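
Since clause_tokenize() takes a list of words rather than raw text, a typical call chains it after word_tokenize(). A minimal usage sketch, assuming the lst20-cls model is fetched through pythainlp.corpus the first time it is used:

# Usage sketch: word-tokenize raw text first, then group the words into clauses.
# Assumes the "lst20-cls" CRF model is available via pythainlp.corpus.
from pythainlp.tokenize import clause_tokenize, word_tokenize

text = "ฉันนอนและคุณเล่นมือถือ"
words = word_tokenize(text)        # list of words, e.g. ["ฉัน", "นอน", "และ", ...]
clauses = clause_tokenize(words)   # list of clauses, each a list of words
print(clauses)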

pythainlp/tokenize/crfcls.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+"""
+Clause segmenter
+"""
+from typing import List
+
+import pycrfsuite
+from pythainlp.corpus import get_corpus_path
+from pythainlp.tag import pos_tag
+
+
+def _doc2features(doc, i):
+    # features from current word
+    curr_word = doc[i][0]
+    curr_pos = doc[i][1]
+    features = {
+        "word.curr_word": curr_word,
+        "word.curr_isspace": curr_word.isspace(),
+        "word.curr_isdigit": curr_word.isdigit(),
+        "word.curr_postag": curr_pos,
+    }
+
+    # features from previous word
+    if i > 0:
+        prev_word = doc[i - 1][0]
+        prev_pos = doc[i - 1][1]
+        features["word.prev_word"] = prev_word
+        features["word.prev_isspace"] = prev_word.isspace()
+        features["word.prev_isdigit"] = prev_word.isdigit()
+        features["word.prev_postag"] = prev_pos
+    else:
+        features["BOS"] = True  # Beginning of Sequence
+
+    # features from next word
+    if i < len(doc) - 1:
+        next_word = doc[i + 1][0]
+        next_pos = doc[i + 1][1]
+        features["word.next_word"] = next_word
+        features["word.next_isspace"] = next_word.isspace()
+        features["word.next_isdigit"] = next_word.isdigit()
+        features["word.next_postag"] = next_pos
+    else:
+        features["EOS"] = True  # End of Sequence
+
+    return features
+
+
+def _extract_features(doc):
+    return [_doc2features(doc, i) for i in range(len(doc))]
+
+
+_CORPUS_NAME = "lst20-cls"
+tagger = pycrfsuite.Tagger()
+tagger.open(get_corpus_path(_CORPUS_NAME))
+
+
+def segment(doc: List[str]) -> List[List[str]]:
+    word_tags = pos_tag(doc, corpus="lst20")
+    features = _extract_features(word_tags)
+    word_markers = list(zip(doc, tagger.tag(features)))
+
+    clauses = []
+    temp = []
+    len_doc = len(doc) - 1
+    for i, word_marker in enumerate(word_markers):
+        word, marker = word_marker
+        if marker == "E_CLS" or i == len_doc:
+            temp.append(word)
+            clauses.append(temp)
+            temp = []
+        else:
+            temp.append(word)
+
+    return clauses
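
segment() POS-tags the input words with the lst20 tagset, turns each (word, POS) pair into a feature dict, and lets the pretrained CRF assign a clause marker to every token; a clause is closed whenever the marker is E_CLS, or at the end of the input. A small sketch of just that grouping step, with a hand-written marker sequence standing in for the tagger.tag() output (the B_CLS/I_CLS label names are assumptions here; only E_CLS appears in the code above):

# Sketch of the clause-grouping loop only; markers are hand-written here,
# whereas segment() gets them from the CRF via tagger.tag(features).
words = ["ฉัน", "นอน", "และ", "คุณ", "เล่น", "มือถือ"]
markers = ["B_CLS", "E_CLS", "B_CLS", "I_CLS", "I_CLS", "E_CLS"]  # assumed label names

clauses, temp = [], []
for i, (word, marker) in enumerate(zip(words, markers)):
    temp.append(word)
    if marker == "E_CLS" or i == len(words) - 1:  # close the clause on E_CLS or at end of input
        clauses.append(temp)
        temp = []

print(clauses)  # [['ฉัน', 'นอน'], ['และ', 'คุณ', 'เล่น', 'มือถือ']]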

pythainlp/tokenize/crfcut.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-CRFCut - Thai sentence segmentor.
+CRFCut - Thai sentence segmenter.
 
 Thai sentence segmentation using conditional random field,
 default model trained on TED dataset

tests/test_tokenize.py

Lines changed: 10 additions & 1 deletion
@@ -2,7 +2,12 @@
 
 import unittest
 
-from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE, Tokenizer, attacut
+from pythainlp.tokenize import (
+    DEFAULT_WORD_DICT_TRIE,
+    Tokenizer,
+    attacut,
+    clause_tokenize,
+)
 from pythainlp.tokenize import deepcut as tokenize_deepcut
 from pythainlp.tokenize import etcc, longest, multi_cut, newmm
 from pythainlp.tokenize import pyicu as tokenize_pyicu
@@ -184,6 +189,10 @@ def setUp(self):
             "กกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกกก"
         )
 
+    def test_clause_tokenize(self):
+        self.assertIsNotNone(clause_tokenize(["ฉัน", "ทดสอบ"]))
+        self.assertIsInstance(clause_tokenize(["ฉัน", "ทดสอบ"]), list)
+
     def test_Tokenizer(self):
         t_test = Tokenizer(DEFAULT_WORD_DICT_TRIE)
         self.assertEqual(t_test.word_tokenize(""), [])
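
The added test only checks that clause_tokenize() returns something and that it is a list. A slightly stricter check, sketched below and not part of this commit, could also verify the nested structure and that the input words round-trip in order (which holds because segment() appends every input word exactly once):

    # Hypothetical stricter test (not in this commit): clauses are lists of the
    # original words, in order, with nothing dropped or duplicated.
    def test_clause_tokenize_structure(self):
        words = ["ฉัน", "นอน", "และ", "คุณ", "เล่น", "มือถือ"]
        clauses = clause_tokenize(words)
        self.assertTrue(all(isinstance(clause, list) for clause in clauses))
        self.assertEqual([w for clause in clauses for w in clause], words)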
