Skip to content

Commit 9c6ecd7

Browse files
authored
Merge pull request #407 from PyThaiNLP/thainer-newcrf
Port Thai NER from sklearn-crfsuite to python-crfsuite
2 parents a4bbac9 + 25e7b87 commit 9c6ecd7

File tree

5 files changed, with 9 additions and 20 deletions.

README.md

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,6 @@ where `extras` can be
8383
- `icu` (for ICU, International Components for Unicode, support in transliteration and tokenization)
8484
- `ipa` (for IPA, International Phonetic Alphabet, support in transliteration)
8585
- `ml` (to support ULMFiT models for classification)
86-
- `ner` (for named-entity recognizer)
8786
- `thai2fit` (for Thai word vector)
8887
- `thai2rom` (for machine-learnt romanization)
8988
- `wordnet` (for Thai WordNet API)
@@ -205,7 +204,6 @@ pip install pythainlp[extra1,extra2,...]
205204
- `icu` (สำหรับการถอดตัวสะกดเป็นสัทอักษรและการตัดคำด้วย ICU)
206205
- `ipa` (สำหรับการถอดตัวสะกดเป็นสัทอักษรสากล (IPA))
207206
- `ml` (สำหรับการรองรับโมเดล ULMFiT)
208-
- `ner` (สำหรับการติดป้ายชื่อเฉพาะ (named-entity))
209207
- `thai2fit` (สำหรับ word vector)
210208
- `thai2rom` (สำหรับการถอดตัวสะกดเป็นอักษรละติน)
211209
- `wordnet` (สำหรับ API WordNet ภาษาไทย)

docs/notes/installation.rst

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ where ``extras`` can be
1818
- ``icu`` (for ICU, International Components for Unicode, support in transliteration and tokenization)
1919
- ``ipa`` (for IPA, International Phonetic Alphabet, support in transliteration)
2020
- ``ml`` (to support ULMFiT models for classification)
21-
- ``ner`` (for named-entity recognizer)
2221
- ``thai2fit`` (for Thai word vector)
2322
- ``thai2rom`` (for machine-learnt romanization)
2423
- ``full`` (install everything)

pythainlp/corpus/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@
3939
_CORPUS_DB_URL = (
4040
"https://raw.githubusercontent.com/"
4141
+ "PyThaiNLP/pythainlp-corpus/"
42-
+ "2.1/db.json"
42+
+ "2.2/db.json"
4343
)
4444

4545
_CORPUS_DB_FILENAME = "db.json"

pythainlp/tag/named_entity.py

Lines changed: 7 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77

88
from typing import List, Tuple, Union
99

10-
import sklearn_crfsuite
10+
import pycrfsuite
1111
from pythainlp.corpus import download, get_corpus_path, thai_stopwords
1212
from pythainlp.tag import pos_tag
1313
from pythainlp.tokenize import word_tokenize
@@ -76,18 +76,12 @@ def __init__(self):
7676
"""
7777
Thai named-entity recognizer
7878
"""
79-
self.__data_path = get_corpus_path("thainer-1-3")
79+
self.__data_path = get_corpus_path("thainer-1-4")
8080
if not self.__data_path:
81-
download("thainer-1-3")
82-
self.__data_path = get_corpus_path("thainer-1-3")
83-
self.crf = sklearn_crfsuite.CRF(
84-
algorithm="lbfgs",
85-
c1=0.1,
86-
c2=0.1,
87-
max_iterations=500,
88-
all_possible_transitions=True,
89-
model_filename=self.__data_path,
90-
)
81+
download("thainer-1-4")
82+
self.__data_path = get_corpus_path("thainer-1-4")
83+
self.crf = pycrfsuite.Tagger()
84+
self.crf.open(self.__data_path)
9185

9286
def get_ner(
9387
self, text: str, pos: bool = True, tag: bool = False
@@ -148,7 +142,7 @@ def get_ner(
148142
self.__tokens, engine="perceptron", corpus="orchid_ud"
149143
)
150144
self.__x_test = self.__extract_features(self.__pos_tags)
151-
self.__y = self.crf.predict_single(self.__x_test)
145+
self.__y = self.crf.tag(self.__x_test)
152146

153147
self.sent_ner = [
154148
(self.__pos_tags[i][0], data) for i, data in enumerate(self.__y)

setup.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -51,8 +51,7 @@
5151
"benchmarks": ["numpy>=1.16.1", "pandas>=0.24"],
5252
"icu": ["pyicu>=2.3"],
5353
"ipa": ["epitran>=1.1"],
54-
"ml": ["numpy>=1.16.1", "torch>=1.0.0"],
55-
"ner": ["sklearn-crfsuite>=0.3.6"],
54+
"ml": ["numpy>=1.16", "torch>=1.0.0"],
5655
"ssg": ["ssg>=0.0.6"],
5756
"thai2fit": ["emoji>=0.5.1", "gensim>=3.2.0", "numpy>=1.16.1"],
5857
"thai2rom": ["torch>=1.0.0", "numpy>=1.16.1"],
@@ -66,7 +65,6 @@
6665
"numpy>=1.16.1",
6766
"pandas>=0.24",
6867
"pyicu>=2.3",
69-
"sklearn-crfsuite>=0.3.6",
7068
"ssg>=0.0.6",
7169
"torch>=1.0.0",
7270
],

0 commit comments

Comments
 (0)