
Commit 66e6753

Merge pull request #277 from PyThaiNLP/trie
Change from `marisa-trie` to a Trie implementation written in python
2 parents 5cf1520 + e59cc6d commit 66e6753

8 files changed: +47 −7 lines

docs/notes/installation.rst

Lines changed: 2 additions & 1 deletion
@@ -12,6 +12,7 @@ For some advanced functionalities, like word vector, extra packages may be needed
 where extras can be

 - ``artagger`` (to support artagger part-of-speech tagger)*
+- ``attacut`` - Fast and Reasonably Accurate Word Tokenizer for Thai (https://github.com/PyThaiNLP/attacut)
 - ``deepcut`` (to support deepcut machine-learnt tokenizer)
 - ``icu`` (for ICU support in transliteration and tokenization)
 - ``ipa`` (for International Phonetic Alphabet support in transliteration)

@@ -39,7 +40,7 @@ Note for installation on Windows:
 - Install them with pip. For example: `pip install marisa_trie‑0.7.5‑cp36‑cp36m‑win32.whl`

 - Option 2 (advanced):
-- You can also try to install them with a command: `pip install marisa-trie pyicu`
+- You can also try to install them with a command: `pip install pyicu`
 - With this, pip will try to build the libraries directly from source files.
 - This will take some time and need a set of build tools to be installed in your system, for example Microsoft Visual C++ Compiler. It also requires some technical skills on how things are getting built on Windows system, as you may need to configure some environment variables to accommodate the build process.
 - For PyICU, before the installation, you have to set ``ICU_VERSION`` environment variable to ICU version in your system. For example, ``set ICU_VERSION=62.1``.
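After installing an extra, the corresponding tokenizer is selected by name at call time. A minimal sketch (it assumes PyThaiNLP's `word_tokenize` API and that the engine name `attacut` matches the extra listed above; the sample text is illustrative):

```python
# Minimal sketch: using an optional tokenizer after installing its extra,
# e.g. `pip install pythainlp[attacut]`. Engine name is assumed to match
# the extra's name; the input string is only an example.
from pythainlp.tokenize import word_tokenize

text = "ทดสอบการตัดคำ"  # "test word segmentation"
print(word_tokenize(text))                    # default dictionary-based engine
print(word_tokenize(text, engine="attacut"))  # requires the `attacut` extra
```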

pythainlp/tokenize/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 import warnings
 from typing import Iterable, List, Union

-from marisa_trie import Trie
+from .trie import Trie
 from pythainlp.corpus import thai_syllables, thai_words

 DEFAULT_DICT_TRIE = Trie(thai_words())
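`DEFAULT_DICT_TRIE` is now built by the pure-Python `Trie` from the bundled Thai word list. The same class can back a custom dictionary; here is a sketch under the assumption that `word_tokenize` accepts such a trie via `custom_dict`, mirroring the `segment()` signature shown in deepcut.py below (the added word is made up):

```python
# Sketch: build a custom dictionary trie the same way DEFAULT_DICT_TRIE is built,
# extending the bundled word list with a project-specific term (illustrative).
# Assumes word_tokenize takes such a trie via custom_dict.
from pythainlp.corpus import thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.tokenize.trie import Trie

custom_words = set(thai_words()) | {"พีทีเอ็นแอลพี"}  # made-up extra term
custom_trie = Trie(custom_words)

print(word_tokenize("ข้อความตัวอย่าง", custom_dict=custom_trie))
```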

pythainlp/tokenize/deepcut.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@

 import deepcut

-from marisa_trie import Trie
+from .trie import Trie


 def segment(text: str, custom_dict: Union[Trie, List[str], str] = None) -> List[str]:

pythainlp/tokenize/longest.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@

 from pythainlp.tokenize import DEFAULT_DICT_TRIE

-from marisa_trie import Trie
+from .trie import Trie

 _FRONT_DEP_CHAR = [
     "ะ",

pythainlp/tokenize/multi_cut.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@

 from pythainlp.tokenize import DEFAULT_DICT_TRIE

-from marisa_trie import Trie
+from .trie import Trie


 class LatticeString(str):

pythainlp/tokenize/newmm.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 from heapq import heappop, heappush  # for priority queue
 from typing import List

-from marisa_trie import Trie
+from .trie import Trie
 from pythainlp.tokenize import DEFAULT_DICT_TRIE

 from .tcc import tcc_pos
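The dictionary-based tokenizers touched above (newmm, longest, multi_cut) only need prefix lookups from the trie: at each position they ask which dictionary words start there and treat those as candidate segment boundaries. The sketch below shows that pattern with a toy greedy longest-match segmenter; it is illustrative only, not the actual newmm algorithm:

```python
# Illustrative greedy longest-match segmentation using Trie.prefixes();
# the real engines add their own heuristics and fallbacks for
# out-of-vocabulary text.
from pythainlp.tokenize.trie import Trie

def greedy_segment(text, trie):
    tokens = []
    i = 0
    while i < len(text):
        matches = trie.prefixes(text[i:])   # dictionary words starting at position i
        if matches:
            word = max(matches, key=len)    # take the longest match
        else:
            word = text[i]                  # no match: emit a single character
        tokens.append(word)
        i += len(word)
    return tokens

toy_dict = Trie(["แมว", "กิน", "ปลา"])       # toy dictionary: cat, eat, fish
print(greedy_segment("แมวกินปลา", toy_dict))  # ['แมว', 'กิน', 'ปลา']
```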

pythainlp/tokenize/trie.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+class Trie:
+
+    class Node(object):
+        __slots__ = 'end', 'children'
+
+        def __init__(self):
+            self.end = False
+            self.children = {}
+
+    def __init__(self, words):
+        self.words = words
+        self.root = Trie.Node()
+        for word in words:
+            cur = self.root
+            for ch in word:
+                node = cur.children.get(ch)
+                if not node:
+                    node = Trie.Node()
+                    cur.children[ch] = node
+                cur = node
+            cur.end = True
+
+    def prefixes(self, text):
+        res = []
+        cur = self.root
+        for i, ch in enumerate(text):
+            node = cur.children.get(ch)
+            if not node:
+                break
+            if node.end:
+                res.append(text[:i+1])
+            cur = node
+        return res
+
+    def __contains__(self, key):
+        return key in self.words
+
+    def __iter__(self):
+        yield from self.words
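The new class keeps the small surface the tokenizers used from `marisa_trie.Trie`: construction from an iterable of words, `prefixes()`, membership testing, and iteration. A quick usage sketch (the sample words are illustrative):

```python
# Usage sketch for the new pure-Python Trie added in this commit.
from pythainlp.tokenize.trie import Trie

trie = Trie(["ab", "abc", "b"])

print("abc" in trie)          # True  (membership via the stored word collection)
print(list(trie))             # ['ab', 'abc', 'b']  (iteration over the words)
print(trie.prefixes("abcd"))  # ['ab', 'abc']  (dictionary words that prefix the text)
```

Being plain Python, it removes the compiled-extension dependency (hence the requirements.txt and Windows installation changes in this commit), likely trading some memory footprint and lookup speed relative to marisa-trie's compact structure.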

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,4 @@
 dill==0.3.*
-marisa-trie==0.7.*
 nltk==3.4.*
 pytz==2019.2
 requests==2.22.*
