Merge pull request #752 from noppayut/lst20-deprecation-warning

wannaphong · web-flow · commit ddd785d0e89b · 2022-10-31T00:32:09.000+07:00
Doc: LST20 deprecation warning for 3.1.1 (#749)
diff --git a/pythainlp/augment/wordnet.py b/pythainlp/augment/wordnet.py
@@ -6,11 +6,12 @@
     "WordNetAug",
     "postype2wordnet",
 ]
-
+import warnings
 from pythainlp.corpus import wordnet
 from collections import OrderedDict
 from pythainlp.tokenize import word_tokenize
 from pythainlp.tag import pos_tag
+from pythainlp.util.messages import deprecation_message
 from typing import List
 from nltk.corpus import wordnet as wn
 import itertools
@@ -127,9 +128,15 @@ def postype2wordnet(pos: str, corpus: str):
         * *lst20* - LST20 Corpus
         * *orchid* - Orchid Corpus
     """
-    if corpus not in ['lst20', 'orchid']:
+    if corpus not in ["lst20", "orchid"]:
         return None
-    if corpus == 'lst20':
+    if corpus == "lst20":
+        dep_msg = deprecation_message(
+            [("corpus", "lst20")],
+            "function `wordnet.postype2wordnet`",
+            "4.0.0",
+        )
+        warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
         return lst20[pos]
     else:
         return orchid[pos]
@@ -139,14 +146,12 @@ class WordNetAug:
     """
     Text Augment using wordnet
     """
+
     def __init__(self):
         pass
 
     def find_synonyms(
-        self,
-        word: str,
-        pos: str = None,
-        postag_corpus: str = "lst20"
+        self, word: str, pos: str = None, postag_corpus: str = "lst20"
     ) -> List[str]:
         """
         Find synonyms from wordnet
@@ -162,13 +167,13 @@ def find_synonyms(
             self.list_synsets = wordnet.synsets(word)
         else:
             self.p2w_pos = postype2wordnet(pos, postag_corpus)
-            if self.p2w_pos != '':
+            if self.p2w_pos != "":
                 self.list_synsets = wordnet.synsets(word, pos=self.p2w_pos)
             else:
                 self.list_synsets = wordnet.synsets(word)
 
         for self.synset in wordnet.synsets(word):
-            for self.syn in self.synset.lemma_names(lang='tha'):
+            for self.syn in self.synset.lemma_names(lang="tha"):
                 self.synonyms.append(self.syn)
 
         self.synonyms_without_duplicates = list(
@@ -182,7 +187,7 @@ def augment(
         tokenize: object = word_tokenize,
         max_syn_sent: int = 6,
         postag: bool = True,
-        postag_corpus: str = "lst20"
+        postag_corpus: str = "lst20",
     ) -> List[List[str]]:
         """
         Text Augment using wordnet
@@ -210,10 +215,19 @@ def augment(
              ('เรา', 'ชอบ', 'ไปยัง', 'ร.ร.'),
              ('เรา', 'ชอบ', 'ไปยัง', 'รร.')]
         """
+        if postag_corpus.startswith("lst20"):
+            dep_msg = deprecation_message(
+                [("postag_corpus", "lst20")],
+                "method `WordNetAug.augment`",
+                "4.0.0",
+            )
+            warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
+
         new_sentences = []
         self.list_words = tokenize(sentence)
         self.list_synonym = []
         self.p_all = 1
+
         if postag:
             self.list_pos = pos_tag(self.list_words, corpus=postag_corpus)
             for word, pos in self.list_pos:
diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py
@@ -5,6 +5,8 @@
 import warnings
 from typing import List, Tuple, Union
 
+from pythainlp.util.messages import deprecation_message
+
 
 class NER:
     """
@@ -30,43 +32,53 @@ class NER:
 
     **Note**: for tltk engine, It's support ner model from tltk only.
     """
+
     def __init__(self, engine: str, corpus: str = "thainer") -> None:
+        if any([arg.startswith("lst20") for arg in (engine, corpus)]):
+            dep_msg = deprecation_message(
+                [("engine", "lst20_onnx"), ("corpus", "lst20")],
+                "`named_entity.NER`",
+                "4.0.0",
+            )
+            warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
         self.load_engine(engine=engine, corpus=corpus)
 
     def load_engine(self, engine: str, corpus: str) -> None:
         self.name_engine = engine
         self.engine = None
         if engine == "thainer" and corpus == "thainer":
             from pythainlp.tag.thainer import ThaiNameTagger
+
             self.engine = ThaiNameTagger()
         elif engine == "lst20_onnx":
             from pythainlp.tag.lst20_ner_onnx import LST20_NER_ONNX
+
             self.engine = LST20_NER_ONNX()
         elif engine == "wangchanberta":
             from pythainlp.wangchanberta import ThaiNameTagger
-            if corpus=="lst20":
-                warnings.warn("""
+
+            if corpus == "lst20":
+                warnings.warn(
+                    """
                 LST20 corpus are free for research and open source only.\n
                 If you want to use in Commercial use, please contract NECTEC.\n
                 https://www.facebook.com/dancearmy/posts/10157641945708284
-                """)
+                """
+                )
             self.engine = ThaiNameTagger(dataset_name=corpus)
         elif engine == "tltk":
             from pythainlp.tag import tltk
+
             self.engine = tltk
         else:
             raise ValueError(
                 "NER class not support {0} engine or {1} corpus.".format(
-                    engine,
-                    corpus
+                    engine, corpus
                 )
             )
 
     def tag(
-        self,
-        text,
-        pos=True,
-        tag=False
+        self, text, pos=True, tag=False
     ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
         """
         This function tags named-entitiy from text in IOB format.
@@ -103,7 +115,10 @@ def tag(
                 """wangchanberta is not support part-of-speech tag.
                 It have not part-of-speech tag in output."""
             )
-        if self.name_engine == "wangchanberta" or self.name_engine == "lst20_onnx":
+        if (
+            self.name_engine == "wangchanberta"
+            or self.name_engine == "lst20_onnx"
+        ):
             return self.engine.get_ner(text, tag=tag)
         else:
             return self.engine.get_ner(text, tag=tag, pos=pos)
@@ -119,11 +134,13 @@ class NNER:
     **Options for engine**
         * *thai_nner* - Thai NER engine
     """
+
     def __init__(self, engine: str = "thai_nner") -> None:
         self.load_engine(engine)
 
     def load_engine(self, engine: str = "thai_nner") -> None:
         from pythainlp.tag.thai_nner import Thai_NNER
+
         self.engine = Thai_NNER()
 
     def tag(self, text) -> Tuple[List[str], List[dict]]:
diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py
@@ -8,6 +8,7 @@
 
 from pythainlp.corpus import corpus_path, get_corpus_path
 from pythainlp.tag import PerceptronTagger, lst20, orchid
+from pythainlp.util.messages import deprecation_message
 
 _ORCHID_FILENAME = "pos_orchid_perceptron.json"
 _ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)
@@ -38,11 +39,13 @@ def _pud_tagger():
 
 def _lst20_tagger():
     global _LST20_TAGGER
-    warnings.warn("""
+    warnings.warn(
+        """
     LST20 corpus are free for research and open source only.\n
     If you want to use in Commercial use, please contract NECTEC.\n
     https://www.facebook.com/dancearmy/posts/10157641945708284
-    """)
+    """
+    )
     if not _LST20_TAGGER:
         path = get_corpus_path(_LST20_TAGGER_NAME, version="0.2.4")
         _LST20_TAGGER = PerceptronTagger(path=path)
@@ -69,6 +72,12 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
         word_tags = _orchid_tagger().tag(words)
         word_tags = orchid.post_process(word_tags, to_ud)
     elif corpus == "lst20" or corpus == "lst20_ud":
+        dep_msg = deprecation_message(
+            [("postag_corpus", "lst20"), ("postag_corpus", "lst20_ud")],
+            "function `perceptron.tag`",
+            "4.0.0",
+        )
+        warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
         words = lst20.pre_process(words)
         word_tags = _lst20_tagger().tag(words)
         word_tags = lst20.post_process(word_tags, to_ud)
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
@@ -1,11 +1,12 @@
 # -*- coding: utf-8 -*-
 from typing import List, Tuple
+import warnings
+
+from pythainlp.util.messages import deprecation_message
 
 
 def pos_tag(
-    words: List[str],
-    engine: str = "perceptron",
-    corpus: str = "orchid"
+    words: List[str], engine: str = "perceptron", corpus: str = "orchid"
 ) -> List[Tuple[str, str]]:
     """
     Marks words with part-of-speech (POS) tags, such as 'NOUN' and 'VERB'.
@@ -98,21 +99,29 @@ def pos_tag(
 
     _support_corpus = ["lst20", "lst20_ud", "orchid", "orchid_ud", "pud"]
 
+    if corpus.startswith("lst20"):
+        dep_msg = deprecation_message(
+            [("corpus", "lst20"), ("corpus", "lst20_ud")],
+            "function `pos_tag.pos_tag`",
+            "4.0.0",
+        )
+
     if engine == "perceptron" and corpus in _support_corpus:
         from pythainlp.tag.perceptron import tag as tag_
     elif engine == "wangchanberta" and corpus == "lst20":
         from pythainlp.wangchanberta.postag import pos_tag as tag_
-        words = ''.join(words)
+
+        words = "".join(words)
     elif engine == "tltk":
         from pythainlp.tag.tltk import pos_tag as tag_
+
         corpus = "tnc"
     elif engine == "unigram" and corpus in _support_corpus:  # default
         from pythainlp.tag.unigram import tag as tag_
     else:
         raise ValueError(
             "pos_tag not support {0} engine or {1} corpus.".format(
-                engine,
-                corpus
+                engine, corpus
             )
         )
 
@@ -169,4 +178,12 @@ def pos_tag_sents(
     if not sentences:
         return []
 
+    if corpus.startswith("lst20"):
+        dep_msg = deprecation_message(
+            [("corpus", "lst20"), ("corpus", "lst20_ud")],
+            "function `pos_tag.pos_tag_sents`",
+            "4.0.0",
+        )
+        warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
+
     return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py
@@ -9,6 +9,7 @@
 
 from pythainlp.corpus import corpus_path, get_corpus_path
 from pythainlp.tag import lst20, orchid
+from pythainlp.util.messages import deprecation_message
 
 _ORCHID_FILENAME = "pos_orchid_unigram.json"
 _ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)
@@ -42,11 +43,13 @@ def _pud_tagger():
 
 def _lst20_tagger():
     global _LST20_TAGGER
-    warnings.warn("""
+    warnings.warn(
+        """
     LST20 corpus are free for research and open source only.\n
     If you want to use in Commercial use, please contract NECTEC.\n
     https://www.facebook.com/dancearmy/posts/10157641945708284
-    """)
+    """
+    )
     if not _LST20_TAGGER:
         path = get_corpus_path(_LST20_TAGGER_NAME)
         with open(path, encoding="utf-8-sig") as fh:
@@ -84,6 +87,12 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
         word_tags = _find_tag(words, _orchid_tagger())
         word_tags = orchid.post_process(word_tags, to_ud)
     elif corpus == "lst20" or corpus == "lst20_ud":
+        dep_msg = deprecation_message(
+            [("corpus", "lst20"), ("corpus", "lst20_ud")],
+            "function `unigram.tag`",
+            "4.0.0",
+        )
+        warnings.warn(dep_msg, DeprecationWarning, stacklevel=2)
         words = lst20.pre_process(words)
         word_tags = _find_tag(words, _lst20_tagger())
         word_tags = lst20.post_process(word_tags, to_ud)
diff --git a/pythainlp/util/messages.py b/pythainlp/util/messages.py
@@ -0,0 +1,36 @@
+from typing import List, Tuple
+from warnings import warn
+
+
+def deprecation_message(
+    deprecated_items: List[Tuple[str, str]],
+    module_name: str,
+    last_effective_version: str,
+    recommended_action: str = "",
+):
+
+    dep_item_names = list(set([itm for itm, _ in deprecated_items]))
+    is_same_item = len(dep_item_names) == 1
+    if is_same_item:
+        single_item = len(deprecated_items) == 1
+        values = (
+            deprecated_items[0][1]
+            if single_item
+            else [val for _, val in deprecated_items]
+        )
+        dep_msg = f"{dep_item_names[0]}={repr(values)}"
+    else:
+        dep_msg = ", ".join(
+            [
+                f"{dep_item}={repr(dep_value)}"
+                for dep_item, dep_value in deprecated_items
+            ]
+        )
+
+    dep_msg += f" of {module_name}"
+    dep_msg += f" will be deprecated in version {last_effective_version}."
+
+    if recommended_action:
+        dep_msg += " " + recommended_action
+
+    return dep_msg
diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py