
Commit 66e6753

Merge pull request #277 from PyThaiNLP/trie
Change from `marisa-trie` to a Trie implementation written in python
2 parents 5cf1520 + e59cc6d commit 66e6753

8 files changed: +47 −7 lines

docs/notes/installation.rst

Lines changed: 2 additions & 1 deletion
@@ -12,6 +12,7 @@ For some advanced functionalities, like word vector, extra packages may be needed
 where extras can be

 - ``artagger`` (to support artagger part-of-speech tagger)*
+- ``attacut`` - Fast and Reasonably Accurate Word Tokenizer for Thai (https://github.com/PyThaiNLP/attacut)
 - ``deepcut`` (to support deepcut machine-learnt tokenizer)
 - ``icu`` (for ICU support in transliteration and tokenization)
 - ``ipa`` (for International Phonetic Alphabet support in transliteration)

@@ -39,7 +40,7 @@ Note for installation on Windows:
 - Install them with pip. For example: `pip install marisa_trie‑0.7.5‑cp36‑cp36m‑win32.whl`

 - Option 2 (advanced):
-- You can also try to install them with a command: `pip install marisa-trie pyicu`
+- You can also try to install them with a command: `pip install pyicu`
 - With this, pip will try to build the libraries directly from source files.
 - This will take some time and need a set of build tools to be installed in your system, for example Microsoft Visual C++ Compiler. It also requires some technical skills on how things are getting built on Windows system, as you may need to configure some environment variables to accommodate the build process.
 - For PyICU, before the installation, you have to set ``ICU_VERSION`` environment variable to ICU version in your system. For example, ``set ICU_VERSION=62.1``.
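After installing an extra, the corresponding tokenizer is selected by name at call time. A minimal sketch (it assumes PyThaiNLP's `word_tokenize` API and that the engine name `attacut` matches the extra listed above; the sample text is illustrative):

```python
# Minimal sketch: using an optional tokenizer after installing its extra,
# e.g. `pip install pythainlp[attacut]`. Engine name is assumed to match
# the extra's name; the input string is only an example.
from pythainlp.tokenize import word_tokenize

text = "ทดสอบการตัดคำ"  # "test word segmentation"
print(word_tokenize(text))                    # default dictionary-based engine
print(word_tokenize(text, engine="attacut"))  # requires the `attacut` extra
```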

pythainlp/tokenize/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 import warnings
 from typing import Iterable, List, Union

-from marisa_trie import Trie
+from .trie import Trie
 from pythainlp.corpus import thai_syllables, thai_words

 DEFAULT_DICT_TRIE = Trie(thai_words())
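`DEFAULT_DICT_TRIE` is now built by the pure-Python `Trie` from the bundled Thai word list. The same class can back a custom dictionary; here is a sketch under the assumption that `word_tokenize` accepts such a trie via `custom_dict`, mirroring the `segment()` signature shown in deepcut.py below (the added word is made up):

```python
# Sketch: build a custom dictionary trie the same way DEFAULT_DICT_TRIE is built,
# extending the bundled word list with a project-specific term (illustrative).
# Assumes word_tokenize takes such a trie via custom_dict.
from pythainlp.corpus import thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.tokenize.trie import Trie

custom_words = set(thai_words()) | {"พีทีเอ็นแอลพี"}  # made-up extra term
custom_trie = Trie(custom_words)

print(word_tokenize("ข้อความตัวอย่าง", custom_dict=custom_trie))
```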

pythainlp/tokenize/deepcut.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@

 import deepcut

-from marisa_trie import Trie
+from .trie import Trie


 def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]:

pythainlp/tokenize/longest.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@

 from pythainlp.tokenize import DEFAULT_DICT_TRIE

-from marisa_trie import Trie
+from .trie import Trie

 _FRONT_DEP_CHAR = [
     "ะ",

pythainlp/tokenize/multi_cut.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@

 from pythainlp.tokenize import DEFAULT_DICT_TRIE

-from marisa_trie import Trie
+from .trie import Trie


 class LatticeString(str):

pythainlp/tokenize/newmm.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 from heapq import heappop, heappush  # for priority queue
 from typing import List

-from marisa_trie import Trie
+from .trie import Trie
 from pythainlp.tokenize import DEFAULT_DICT_TRIE

 from .tcc import tcc_pos
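The dictionary-based tokenizers touched above (newmm, longest, multi_cut) only need prefix lookups from the trie: at each position they ask which dictionary words start there and treat those as candidate segment boundaries. The sketch below shows that pattern with a toy greedy longest-match segmenter; it is illustrative only, not the actual newmm algorithm:

```python
# Illustrative greedy longest-match segmentation using Trie.prefixes();
# the real engines add their own heuristics and fallbacks for
# out-of-vocabulary text.
from pythainlp.tokenize.trie import Trie

def greedy_segment(text, trie):
    tokens = []
    i = 0
    while i < len(text):
        matches = trie.prefixes(text[i:])   # dictionary words starting at position i
        if matches:
            word = max(matches, key=len)    # take the longest match
        else:
            word = text[i]                  # no match: emit a single character
        tokens.append(word)
        i += len(word)
    return tokens

toy_dict = Trie(["แมว", "กิน", "ปลา"])       # toy dictionary: cat, eat, fish
print(greedy_segment("แมวกินปลา", toy_dict))  # ['แมว', 'กิน', 'ปลา']
```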

pythainlp/tokenize/trie.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+class Trie:
+
+    class Node(object):
+        __slots__ = 'end', 'children'
+
+        def __init__(self):
+            self.end = False
+            self.children = {}
+
+    def __init__(self, words):
+        self.words = words
+        self.root = Trie.Node()
+        for word in words:
+            cur = self.root
+            for ch in word:
+                node = cur.children.get(ch)
+                if not node:
+                    node = Trie.Node()
+                    cur.children[ch] = node
+                cur = node
+            cur.end = True
+
+    def prefixes(self, text):
+        res = []
+        cur = self.root
+        for i, ch in enumerate(text):
+            node = cur.children.get(ch)
+            if not node:
+                break
+            if node.end:
+                res.append(text[:i+1])
+            cur = node
+        return res
+
+    def __contains__(self, key):
+        return key in self.words
+
+    def __iter__(self):
+        yield from self.words
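The new class keeps the small surface the tokenizers used from `marisa_trie.Trie`: construction from an iterable of words, `prefixes()`, membership testing, and iteration. A quick usage sketch (the sample words are illustrative):

```python
# Usage sketch for the new pure-Python Trie added in this commit.
from pythainlp.tokenize.trie import Trie

trie = Trie(["ab", "abc", "b"])

print("abc" in trie)          # True  (membership via the stored word collection)
print(list(trie))             # ['ab', 'abc', 'b']  (iteration over the words)
print(trie.prefixes("abcd"))  # ['ab', 'abc']  (dictionary words that prefix the text)
```

Being plain Python, it removes the compiled-extension dependency (hence the requirements.txt and Windows installation changes in this commit), likely trading some memory footprint and lookup speed relative to marisa-trie's compact structure.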

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,4 @@
 dill==0.3.*
-marisa-trie==0.7.*
 nltk==3.4.*
 pytz==2019.2
 requests==2.22.*
