Skip to content

Commit ddd785d

Browse files
authored
Merge pull request #752 from noppayut/lst20-deprecation-warning
Doc: Lst20 deprecation warning for 3.1.1 (#749)
2 parents 74e59cc + ecddb84 commit ddd785d

File tree

7 files changed

+170
-60
lines changed

7 files changed

+170
-60
lines changed

pythainlp/augment/wordnet.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@
66
"WordNetAug",
77
"postype2wordnet",
88
]
9-
9+
import warnings
1010
from pythainlp.corpus import wordnet
1111
from collections import OrderedDict
1212
from pythainlp.tokenize import word_tokenize
1313
from pythainlp.tag import pos_tag
14+
from pythainlp.util.messages import deprecation_message
1415
from typing import List
1516
from nltk.corpus import wordnet as wn
1617
import itertools
@@ -127,9 +128,15 @@ def postype2wordnet(pos: str, corpus: str):
127128
* *lst20* - LST20 Corpus
128129
* *orchid* - Orchid Corpus
129130
"""
130-
if corpus not in ['lst20', 'orchid']:
131+
if corpus not in ["lst20", "orchid"]:
131132
return None
132-
if corpus == 'lst20':
133+
if corpus == "lst20":
134+
dep_msg = deprecation_message(
135+
[("corpus", "lst20")],
136+
"function `wordnet.postype2wordnet`",
137+
"4.0.0",
138+
)
139+
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
133140
return lst20[pos]
134141
else:
135142
return orchid[pos]
@@ -139,14 +146,12 @@ class WordNetAug:
139146
"""
140147
Text Augment using wordnet
141148
"""
149+
142150
def __init__(self):
143151
pass
144152

145153
def find_synonyms(
146-
self,
147-
word: str,
148-
pos: str = None,
149-
postag_corpus: str = "lst20"
154+
self, word: str, pos: str = None, postag_corpus: str = "lst20"
150155
) -> List[str]:
151156
"""
152157
Find synonyms from wordnet
@@ -162,13 +167,13 @@ def find_synonyms(
162167
self.list_synsets = wordnet.synsets(word)
163168
else:
164169
self.p2w_pos = postype2wordnet(pos, postag_corpus)
165-
if self.p2w_pos != '':
170+
if self.p2w_pos != "":
166171
self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos)
167172
else:
168173
self.list_synsets = wordnet.synsets(word)
169174

170175
for self.synset in wordnet.synsets(word):
171-
for self.syn in self.synset.lemma_names(lang='tha'):
176+
for self.syn in self.synset.lemma_names(lang="tha"):
172177
self.synonyms.append(self.syn)
173178

174179
self.synonyms_without_duplicates = list(
@@ -182,7 +187,7 @@ def augment(
182187
tokenize: object = word_tokenize,
183188
max_syn_sent: int = 6,
184189
postag: bool = True,
185-
postag_corpus: str = "lst20"
190+
postag_corpus: str = "lst20",
186191
) -> List[List[str]]:
187192
"""
188193
Text Augment using wordnet
@@ -210,10 +215,19 @@ def augment(
210215
('เรา', 'ชอบ', 'ไปยัง', 'ร.ร.'),
211216
('เรา', 'ชอบ', 'ไปยัง', 'รร.')]
212217
"""
218+
if postag_corpus.startswith("lst20"):
219+
dep_msg = deprecation_message(
220+
[("postag_corpus", "lst20")],
221+
"method `WordNetAug.augment`",
222+
"4.0.0",
223+
)
224+
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
225+
213226
new_sentences = []
214227
self.list_words = tokenize(sentence)
215228
self.list_synonym = []
216229
self.p_all = 1
230+
217231
if postag:
218232
self.list_pos = pos_tag(self.list_words, corpus=postag_corpus)
219233
for word, pos in self.list_pos:

pythainlp/tag/named_entity.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import warnings
66
from typing import List, Tuple, Union
77

8+
from pythainlp.util.messages import deprecation_message
9+
810

911
class NER:
1012
"""
@@ -30,43 +32,53 @@ class NER:
3032
3133
**Note**: for tltk engine, It's support ner model from tltk only.
3234
"""
35+
3336
def __init__(self, engine: str, corpus: str = "thainer") -> None:
37+
if any([arg.startswith("lst20") for arg in (engine, corpus)]):
38+
dep_msg = deprecation_message(
39+
[("engine", "lst20_onnx"), ("corpus", "lst20")],
40+
"`named_entity.NER`",
41+
"4.0.0",
42+
)
43+
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
3444
self.load_engine(engine=engine, corpus=corpus)
3545

3646
def load_engine(self, engine: str, corpus: str) -> None:
3747
self.name_engine = engine
3848
self.engine = None
3949
if engine == "thainer" and corpus == "thainer":
4050
from pythainlp.tag.thainer import ThaiNameTagger
51+
4152
self.engine = ThaiNameTagger()
4253
elif engine == "lst20_onnx":
4354
from pythainlp.tag.lst20_ner_onnx import LST20_NER_ONNX
55+
4456
self.engine = LST20_NER_ONNX()
4557
elif engine == "wangchanberta":
4658
from pythainlp.wangchanberta import ThaiNameTagger
47-
if corpus=="lst20":
48-
warnings.warn("""
59+
60+
if corpus == "lst20":
61+
warnings.warn(
62+
"""
4963
LST20 corpus are free for research and open source only.\n
5064
If you want to use in Commercial use, please contract NECTEC.\n
5165
https://www.facebook.com/dancearmy/posts/10157641945708284
52-
""")
66+
"""
67+
)
5368
self.engine = ThaiNameTagger(dataset_name=corpus)
5469
elif engine == "tltk":
5570
from pythainlp.tag import tltk
71+
5672
self.engine = tltk
5773
else:
5874
raise ValueError(
5975
"NER class not support {0} engine or {1} corpus.".format(
60-
engine,
61-
corpus
76+
engine, corpus
6277
)
6378
)
6479

6580
def tag(
66-
self,
67-
text,
68-
pos=True,
69-
tag=False
81+
self, text, pos=True, tag=False
7082
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
7183
"""
7284
This function tags named-entitiy from text in IOB format.
@@ -103,7 +115,10 @@ def tag(
103115
"""wangchanberta is not support part-of-speech tag.
104116
It have not part-of-speech tag in output."""
105117
)
106-
if self.name_engine == "wangchanberta" or self.name_engine == "lst20_onnx":
118+
if (
119+
self.name_engine == "wangchanberta"
120+
or self.name_engine == "lst20_onnx"
121+
):
107122
return self.engine.get_ner(text, tag=tag)
108123
else:
109124
return self.engine.get_ner(text, tag=tag, pos=pos)
@@ -119,11 +134,13 @@ class NNER:
119134
**Options for engine**
120135
* *thai_nner* - Thai NER engine
121136
"""
137+
122138
def __init__(self, engine: str = "thai_nner") -> None:
123139
self.load_engine(engine)
124140

125141
def load_engine(self, engine: str = "thai_nner") -> None:
126142
from pythainlp.tag.thai_nner import Thai_NNER
143+
127144
self.engine = Thai_NNER()
128145

129146
def tag(self, text) -> Tuple[List[str], List[dict]]:

pythainlp/tag/perceptron.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from pythainlp.corpus import corpus_path, get_corpus_path
1010
from pythainlp.tag import PerceptronTagger, lst20, orchid
11+
from pythainlp.util.messages import deprecation_message
1112

1213
_ORCHID_FILENAME = "pos_orchid_perceptron.json"
1314
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)
@@ -38,11 +39,13 @@ def _pud_tagger():
3839

3940
def _lst20_tagger():
4041
global _LST20_TAGGER
41-
warnings.warn("""
42+
warnings.warn(
43+
"""
4244
LST20 corpus are free for research and open source only.\n
4345
If you want to use in Commercial use, please contract NECTEC.\n
4446
https://www.facebook.com/dancearmy/posts/10157641945708284
45-
""")
47+
"""
48+
)
4649
if not _LST20_TAGGER:
4750
path = get_corpus_path(_LST20_TAGGER_NAME, version="0.2.4")
4851
_LST20_TAGGER = PerceptronTagger(path=path)
@@ -69,6 +72,12 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
6972
word_tags = _orchid_tagger().tag(words)
7073
word_tags = orchid.post_process(word_tags, to_ud)
7174
elif corpus == "lst20" or corpus == "lst20_ud":
75+
dep_msg = deprecation_message(
76+
[("postag_corpus", "lst20"), ("postag_corpus", "lst20_ud")],
77+
"function `perceptron.tag`",
78+
"4.0.0",
79+
)
80+
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
7281
words = lst20.pre_process(words)
7382
word_tags = _lst20_tagger().tag(words)
7483
word_tags = lst20.post_process(word_tags, to_ud)

pythainlp/tag/pos_tag.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
# -*- coding: utf-8 -*-
22
from typing import List, Tuple
3+
import warnings
4+
5+
from pythainlp.util.messages import deprecation_message
36

47

58
def pos_tag(
6-
words: List[str],
7-
engine: str = "perceptron",
8-
corpus: str = "orchid"
9+
words: List[str], engine: str = "perceptron", corpus: str = "orchid"
910
) -> List[Tuple[str, str]]:
1011
"""
1112
Marks words with part-of-speech (POS) tags, such as 'NOUN' and 'VERB'.
@@ -98,21 +99,29 @@ def pos_tag(
9899

99100
_support_corpus = ["lst20", "lst20_ud", "orchid", "orchid_ud", "pud"]
100101

102+
if corpus.startswith("lst20"):
103+
dep_msg = deprecation_message(
104+
[("corpus", "lst20"), ("corpus", "lst20_ud")],
105+
"function `pos_tag.pos_tag`",
106+
"4.0.0",
107+
)
108+
101109
if engine == "perceptron" and corpus in _support_corpus:
102110
from pythainlp.tag.perceptron import tag as tag_
103111
elif engine == "wangchanberta" and corpus == "lst20":
104112
from pythainlp.wangchanberta.postag import pos_tag as tag_
105-
words = ''.join(words)
113+
114+
words = "".join(words)
106115
elif engine == "tltk":
107116
from pythainlp.tag.tltk import pos_tag as tag_
117+
108118
corpus = "tnc"
109119
elif engine == "unigram" and corpus in _support_corpus: # default
110120
from pythainlp.tag.unigram import tag as tag_
111121
else:
112122
raise ValueError(
113123
"pos_tag not support {0} engine or {1} corpus.".format(
114-
engine,
115-
corpus
124+
engine, corpus
116125
)
117126
)
118127

@@ -169,4 +178,12 @@ def pos_tag_sents(
169178
if not sentences:
170179
return []
171180

181+
if corpus.startswith("lst20"):
182+
dep_msg = deprecation_message(
183+
[("corpus", "lst20"), ("corpus", "lst20_ud")],
184+
"function `pos_tag.pos_tag_sents`",
185+
"4.0.0",
186+
)
187+
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
188+
172189
return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]

pythainlp/tag/unigram.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from pythainlp.corpus import corpus_path, get_corpus_path
1111
from pythainlp.tag import lst20, orchid
12+
from pythainlp.util.messages import deprecation_message
1213

1314
_ORCHID_FILENAME = "pos_orchid_unigram.json"
1415
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)
@@ -42,11 +43,13 @@ def _pud_tagger():
4243

4344
def _lst20_tagger():
4445
global _LST20_TAGGER
45-
warnings.warn("""
46+
warnings.warn(
47+
"""
4648
LST20 corpus are free for research and open source only.\n
4749
If you want to use in Commercial use, please contract NECTEC.\n
4850
https://www.facebook.com/dancearmy/posts/10157641945708284
49-
""")
51+
"""
52+
)
5053
if not _LST20_TAGGER:
5154
path = get_corpus_path(_LST20_TAGGER_NAME)
5255
with open(path, encoding="utf-8-sig") as fh:
@@ -84,6 +87,12 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
8487
word_tags = _find_tag(words, _orchid_tagger())
8588
word_tags = orchid.post_process(word_tags, to_ud)
8689
elif corpus == "lst20" or corpus == "lst20_ud":
90+
dep_msg = deprecation_message(
91+
[("corpus", "lst20"), ("corpus", "lst20_ud")],
92+
"function `unigram.tag`",
93+
"4.0.0",
94+
)
95+
warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
8796
words = lst20.pre_process(words)
8897
word_tags = _find_tag(words, _lst20_tagger())
8998
word_tags = lst20.post_process(word_tags, to_ud)

pythainlp/util/messages.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from typing import List, Tuple
2+
from warnings import warn
3+
4+
5+
def deprecation_message(
    deprecated_items: List[Tuple[str, str]],
    module_name: str,
    last_effective_version: str,
    recommended_action: str = "",
) -> str:
    """
    Build the text of a deprecation notice.

    The message has the form
    ``<items> of <module_name> will be deprecated in version <version>.``
    optionally followed by a space and ``recommended_action``.

    How ``<items>`` is rendered:

    * one ``(name, value)`` pair -> ``name='value'``
    * several pairs sharing one name -> ``name=['v1', 'v2', ...]``
    * pairs with different names -> ``n1='v1', n2='v2', ...``
    * no pairs -> the ``<items> of`` prefix is omitted entirely

    :param deprecated_items: ``(argument name, argument value)`` pairs
        that are being deprecated
    :param module_name: description of the function/method/class that
        owns the deprecated items
    :param last_effective_version: version string inserted after
        "will be deprecated in version"
    :param recommended_action: optional sentence appended to the message
    :return: the formatted message; this function only builds the string
        and does not emit a warning itself
    """
    if not deprecated_items:
        # Nothing specific to name: avoid a message that starts with " of".
        described = ""
    else:
        distinct_names = {name for name, _ in deprecated_items}
        if len(distinct_names) == 1:
            # All pairs refer to the same argument; show its value(s) once.
            name = deprecated_items[0][0]
            if len(deprecated_items) == 1:
                values = deprecated_items[0][1]
            else:
                values = [value for _, value in deprecated_items]
            described = f"{name}={values!r}"
        else:
            described = ", ".join(
                f"{name}={value!r}" for name, value in deprecated_items
            )

    dep_msg = f"{described} of {module_name}" if described else module_name
    dep_msg += f" will be deprecated in version {last_effective_version}."

    if recommended_action:
        dep_msg += " " + recommended_action

    return dep_msg

0 commit comments

Comments
 (0)