Add transformers_ud

wannaphong · wannaphong · commit c8835428c43d · 2022-09-17T13:51:48.000+07:00
diff --git a/pythainlp/parse/core.py b/pythainlp/parse/core.py
@@ -2,11 +2,13 @@
 _tagger = None
 _tagger_name = ""
 
-def dependency_parsing(text: str, engine: str="esupar")->str:
+def dependency_parsing(text: str, model: str=None, engine: str="esupar")->str:
     """
     Dependency Parsing
 
     :param str text: text to do dependency parsing
+    :param str model: name of the model to use with the engine \
+        (for esupar and transformers_ud only)
     :param str engine: the name dependency parser
     :return: str (conllu)
 
@@ -17,6 +19,41 @@ def dependency_parsing(text: str, engine: str="esupar")->str:
         * *spacy_thai* - Tokenizer, POS-tagger, and dependency-parser \
             for Thai language, working on Universal Dependencies. \
             `GitHub <https://github.com/KoichiYasuoka/spacy-thai>`_
+        * *transformers_ud* - TransformersUD \
+            `GitHub <https://github.com/KoichiYasuoka/>`_
+
+    **Options for model (esupar engine)**
+        * *th* (default) - KoichiYasuoka/roberta-base-thai-spm-upos model \
+            `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-upos>`_
+        * *KoichiYasuoka/deberta-base-thai-upos* - DeBERTa(V2) model \
+            pre-trained on Thai Wikipedia texts for POS-tagging and \
+            dependency-parsing `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-upos>`_
+        * *KoichiYasuoka/roberta-base-thai-syllable-upos* - RoBERTa model \
+            pre-trained on Thai Wikipedia texts for POS-tagging and \
+            dependency-parsing. (syllable level) `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-syllable-upos>`_
+        * *KoichiYasuoka/roberta-base-thai-char-upos* - RoBERTa model \
+            pre-trained on Thai Wikipedia texts for POS-tagging \
+            and dependency-parsing. (char level) `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-char-upos>`_
+
+    If you want to train a model for esupar, see \
+    `GitHub <https://github.com/KoichiYasuoka/esupar>`_
+
+    **Options for model (transformers_ud engine)**
+        * *KoichiYasuoka/deberta-base-thai-ud-head* (default) - \
+            DeBERTa(V2) model pretrained on Thai Wikipedia texts \
+            for dependency-parsing (head-detection on Universal \
+            Dependencies) as question-answering, derived from \
+            deberta-base-thai. \
+            Trained on th_blackboard.conll. `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head>`_
+        * *KoichiYasuoka/roberta-base-thai-spm-ud-head* - \
+            RoBERTa model pretrained on Thai Wikipedia texts \
+            for dependency-parsing. `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-ud-head>`_
 
     :Example:
     ::
@@ -40,7 +77,10 @@ def dependency_parsing(text: str, engine: str="esupar")->str:
     if _tagger_name != engine:
         if engine == "esupar":
             from pythainlp.parse.esupar_engine import Parse
-            _tagger = Parse()
+            _tagger = Parse(model=model)
+        elif engine == "transformers_ud":
+            from pythainlp.parse.transformers_ud import Parse
+            _tagger = Parse(model=model)
         elif engine == "spacy_thai":
             from pythainlp.parse.spacy_thai_engine import Parse
             _tagger = Parse()
diff --git a/pythainlp/parse/esupar_engine.py b/pythainlp/parse/esupar_engine.py
@@ -9,6 +9,8 @@
 
 class Parse:
     def __init__(self, model: str="th") -> None:
+        if model == None:
+            model = "th"
         self.nlp=esupar.load(model)
 
     def __call__(self, text):
diff --git a/pythainlp/parse/transformers_ud.py b/pythainlp/parse/transformers_ud.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""
+TransformersUD
+
+Author: Prof. Koichi Yasuoka
+
+This tagger is provided under the terms of the Apache-2.0 License.
+
+The source: https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head
+
+GitHub: https://github.com/KoichiYasuoka
+"""
+import os
+import numpy
+import torch
+import ufal.chu_liu_edmonds
+from transformers import (
+    AutoTokenizer,
+    AutoModelForQuestionAnswering,
+    AutoModelForTokenClassification,
+    AutoConfig,
+    TokenClassificationPipeline
+)
+from transformers.utils import cached_file
+
+
+class Parse:
+    """Dependency parser for the transformers_ud engine; calling an instance returns CoNLL-U text."""
+    def __init__(self, model: str="KoichiYasuoka/deberta-base-thai-ud-head") -> None:
+        """Load the tokenizer, the question-answering model and the "deprel"/"tagger" sub-models of *model*."""
+        if model is None:  # None selects the default model
+            model = "KoichiYasuoka/deberta-base-thai-ud-head"
+        self.tokenizer=AutoTokenizer.from_pretrained(model)
+        self.model=AutoModelForQuestionAnswering.from_pretrained(model)
+        x=AutoModelForTokenClassification.from_pretrained
+        if os.path.isdir(model):  # local directory: the sub-models are its "deprel" and "tagger" subdirectories
+            d,t=x(os.path.join(model,"deprel")),x(os.path.join(model,"tagger"))
+        else:  # not a local directory: locate each sub-model's config and weights via cached_file
+            c=AutoConfig.from_pretrained(cached_file(model,"deprel/config.json"))
+            d=x(cached_file(model,"deprel/pytorch_model.bin"),config=c)
+            s=AutoConfig.from_pretrained(cached_file(model,"tagger/config.json"))
+            t=x(cached_file(model,"tagger/pytorch_model.bin"),config=s)
+        self.deprel=TokenClassificationPipeline(
+            model=d,
+            tokenizer=self.tokenizer,
+            aggregation_strategy="simple"
+        )
+        self.tagger=TokenClassificationPipeline(model=t, tokenizer=self.tokenizer)
+
+    def __call__(self, text: str)->str:
+        """Parse *text* and return the dependency tree as a CoNLL-U formatted string."""
+        w=[(t["start"],t["end"],t["entity_group"]) for t in self.deprel(text)]  # (start, end, relation label) per word span
+        z,n={t["start"]:t["entity"].split("|") for t in self.tagger(text)},len(w)  # tagger output keyed by start offset
+        r,m=[text[s:e] for s,e,p in w],numpy.full((n+1,n+1),numpy.nan)  # word forms; (dependent, head) scores, index 0 = root
+        v,c=self.tokenizer(r,add_special_tokens=False)["input_ids"],[]
+        for i,t in enumerate(v):  # one input per word: [CLS] word [SEP], then all words with word i replaced by [MASK]
+            q=[self.tokenizer.cls_token_id]+t+[self.tokenizer.sep_token_id]
+            c.append([q]+v[0:i]+[[self.tokenizer.mask_token_id]]+v[i+1:]+[[q[-1]]])
+        b=[[len(sum(x[0:j+1],[])) for j in range(len(x))] for x in c]  # cumulative segment lengths of each input
+        with torch.no_grad():
+            d=self.model(
+                input_ids=torch.tensor([sum(x,[]) for x in c]),
+                token_type_ids=torch.tensor([[0]*x[0]+[1]*(x[-1]-x[0]) for x in b])
+            )
+        s,e=d.start_logits.tolist(),d.end_logits.tolist()
+        for i in range(n):
+            for j in range(n):  # i==j scores the [MASK] span and is stored in column 0 (root)
+                m[i+1,0 if i==j else j+1]=s[i][b[i][j]]+e[i][b[i][j+1]-1]
+        h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
+        if [0 for i in h if i==0]!=[0]:  # not exactly one root: keep a single root candidate and decode again
+            i=([p for s,e,p in w]+["root"]).index("root")
+            j=i+1 if i<n else numpy.nanargmax(m[:,0])
+            m[0:j,0]=m[j+1:,0]=numpy.nan
+            h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
+        u="# text = "+text.replace("\n"," ")+"\n"
+        for i,(s,e,p) in enumerate(w,1):
+            p="root" if h[i]==0 else "dep" if p=="root" else p  # make the label agree with the decoded head
+            u+="\t".join(
+                [str(i),r[i-1],"_",z[s][0][2:],"_","|".join(z[s][1:]),str(h[i]),p,"_","_" if i<n and e<w[i][0] else "SpaceAfter=No"]
+            )+"\n"
+        return u+"\n"
diff --git a/tests/test_parse.py b/tests/test_parse.py
@@ -7,4 +7,5 @@
 class TestParsePackage(unittest.TestCase):
     def test_dependency_parsing(self):
         self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="esupar"))
+        self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="transformers_ud"))
         self.assertIsNotNone(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai"))