
Commit 16158ba

Merge branch 'rm-default-tqdm' of https://github.com/PyThaiNLP/pythainlp into rm-default-tqdm

2 parents: a7f80c0 + 2d907a2

File tree: 8 files changed (+49 / -47 lines)


bin/word-tokenization-benchmark

Lines changed: 35 additions & 27 deletions

@@ -54,49 +54,57 @@ print(
 
 df_raw = word_tokenization.benchmark(expected, actual)
 
-df_res = df_raw.describe()
-df_res = df_res[
-    [
-        "char_level:tp",
-        "char_level:tn",
-        "char_level:fp",
-        "char_level:fn",
-        "char_level:precision",
-        "char_level:recall",
-        "char_level:f1",
-        "word_level:precision",
-        "word_level:recall",
-        "word_level:f1",
-    ]
+
+columns = [
+    "char_level:tp",
+    "char_level:fp",
+    "char_level:tn",
+    "char_level:fn",
+    "word_level:correctly_tokenised_words",
+    "word_level:total_words_in_sample",
+    "word_level:total_words_in_ref_sample",
 ]
 
-df_res = df_res.T.reset_index(0)
+statistics = dict()
+
+for c in columns:
+    statistics[c] = float(df_raw[c].sum())
+
+statistics["char_level:precision"] = statistics["char_level:tp"] / (
+    statistics["char_level:tp"] + statistics["char_level:fp"]
+)
 
-df_res["mean±std"] = df_res.apply(
-    lambda r: "%2.2f±%2.2f" % (r["mean"], r["std"]), axis=1
+statistics["char_level:recall"] = statistics["char_level:tp"] / (
+    statistics["char_level:tp"] + statistics["char_level:fn"]
 )
 
-df_res["metric"] = df_res["index"]
+statistics["word_level:precision"] = statistics["word_level:correctly_tokenised_words"] \
+    / statistics["word_level:total_words_in_sample"]
+
+statistics["word_level:recall"] = statistics["word_level:correctly_tokenised_words"] \
+    / statistics["word_level:total_words_in_ref_sample"]
 
 print("============== Benchmark Result ==============")
-print(df_res[["metric", "mean±std", "min", "max"]].to_string(index=False))
 
+for c in ["tp", "fn", "tn", "fp", "precision", "recall"]:
+    c = f"char_level:{c}"
+    v = statistics[c]
+    print(f"{c:>40s} {v:.4f}")
 
-if args.save_details:
-    data = {}
-    for r in df_res.to_dict("records"):
-        metric = r["index"]
-        del r["index"]
-        data[metric] = r
+for c in ["total_words_in_sample", "total_words_in_ref_sample", "correctly_tokenised_words", "precision", "recall"]:
+    c = f"word_level:{c}"
+    v = statistics[c]
+    print(f"{c:>40s} {v:.4f}")
 
+if args.save_details:
     dir_name = os.path.dirname(args.input_file)
     file_name = args.input_file.split("/")[-1].split(".")[0]
 
     res_path = "%s/eval-%s.yml" % (dir_name, file_name)
     print("Evaluation result is saved to %s" % res_path)
 
     with open(res_path, "w", encoding="utf-8") as outfile:
-        yaml.dump(data, outfile, default_flow_style=False)
+        yaml.dump(statistics, outfile, default_flow_style=False)
 
     res_path = "%s/eval-details-%s.json" % (dir_name, file_name)
     print("Details of comparisons is saved to %s" % res_path)
@@ -110,6 +118,6 @@ if args.save_details:
 
             samples.append(dict(metrics=r, expected=expected, actual=actual, id=i))
 
-        details = dict(metrics=data, samples=samples)
+        details = dict(metrics=statistics, samples=samples)
 
         json.dump(details, f, ensure_ascii=False)
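The rewritten script sums the raw character- and word-level counts over the whole corpus and derives micro-averaged precision and recall from those sums, instead of reporting the per-sample mean±std that the old describe()-based code produced. A minimal standalone sketch of that micro-averaging step (the count values are made up purely for illustration):

# Micro-averaged precision/recall from counts summed over all samples.
def micro_precision_recall(tp, fp, fn):
    """Corpus-level (micro-averaged) precision and recall."""
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return precision, recall

p, r = micro_precision_recall(tp=9500.0, fp=300.0, fn=200.0)
print("precision=%.4f recall=%.4f" % (p, r))  # precision=0.9694 recall=0.9794

Summing counts before dividing weights every character and word equally across the corpus, so long samples are no longer under-weighted the way they were under the per-sample average.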

pythainlp/benchmarks/word_tokenization.py

Lines changed: 3 additions & 10 deletions

@@ -185,10 +185,6 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
 
     correctly_tokenised_words = np.sum(tokenization_indicators)
 
-    w_precision = correctly_tokenised_words / np.sum(sample)
-    w_recall = correctly_tokenised_words / np.sum(ref_sample)
-    w_f1 = _f1(w_precision, w_recall)
-
     tokenization_indicators = list(
         map(lambda x: str(x), tokenization_indicators)
     )
@@ -199,14 +195,11 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
             "fp": c_fp,
             "tn": c_tn,
             "fn": c_fn,
-            "precision": c_precision,
-            "recall": c_recall,
-            "f1": c_f1,
         },
         "word_level": {
-            "precision": w_precision,
-            "recall": w_recall,
-            "f1": w_f1,
+            "correctly_tokenised_words": correctly_tokenised_words,
+            "total_words_in_sample": np.sum(sample),
+            "total_words_in_ref_sample": np.sum(ref_sample)
         },
         "global": {
             "tokenisation_indicators": "".join(tokenization_indicators)
Binary file not shown.

pythainlp/tag/perceptron.py

Lines changed: 4 additions & 4 deletions

@@ -3,20 +3,20 @@
 Perceptron Part-Of-Speech tagger
 """
 import os
+import pickle
 from typing import List, Tuple
 
-import dill
 from pythainlp.corpus import corpus_path
 from pythainlp.tag.orchid import tag_signs, tag_to_text
 
-_ORCHID_DATA_FILENAME = "orchid_pt_tagger.dill"
-_PUD_DATA_FILENAME = "ud_thai_pud_pt_tagger.dill"
+_ORCHID_DATA_FILENAME = "orchid_pt_tagger.pkl"
+_PUD_DATA_FILENAME = "ud_thai_pud_pt_tagger.pkl"
 
 
 def _load_tagger(filename):
     data_filename = os.path.join(corpus_path(), filename)
     with open(data_filename, "rb") as fh:
-        model = dill.load(fh)
+        model = pickle.load(fh)
     return model
 
 
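With dill dropped, the shipped tagger models must be plain-pickle files; the binary change above is presumably the re-serialized model data. A one-off migration could look like the sketch below, assuming the tagger object contains nothing that only dill can serialize (the file paths are hypothetical examples):

# One-time conversion sketch: load the old dill artifact, re-save it
# with the standard-library pickle so dill is no longer a dependency.
import pickle

import dill  # needed only for this migration, not at runtime

with open("orchid_pt_tagger.dill", "rb") as fh:
    model = dill.load(fh)

with open("orchid_pt_tagger.pkl", "wb") as fh:
    pickle.dump(model, fh, protocol=pickle.HIGHEST_PROTOCOL)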

requirements.txt

Lines changed: 0 additions & 1 deletion

@@ -1,4 +1,3 @@
-dill==0.3.*
 python-crfsuite==0.9.*
 requests==2.23.*
 tinydb==4.1.*

setup.py

Lines changed: 2 additions & 3 deletions

@@ -39,7 +39,6 @@
 """
 
 requirements = [
-    "dill>=0.3.0",
     "python-crfsuite>=0.9.6",
     "requests>=2.22.0",
     "tinydb>=3.0",
@@ -88,7 +87,7 @@
         "etcc.txt",
         "negations_th.txt",
         "orchid_pos_th.json",
-        "orchid_pt_tagger.dill",
+        "orchid_pt_tagger.pkl",
         "person_names_female_th.txt",
         "person_names_male_th.txt",
         "sentenceseg-crfcut-v2.model",
@@ -98,7 +97,7 @@
         "thailand_provinces_th.txt",
         "tnc_freq.txt",
         "ttc_freq.txt",
-        "ud_thai_pud_pt_tagger.dill",
+        "ud_thai_pud_pt_tagger.pkl",
         "ud_thai_pud_unigram_tagger.json",
         "words_th_thai2fit_201810.txt",
         "words_th.txt",

tests/test_tag.py

Lines changed: 5 additions & 2 deletions

@@ -5,7 +5,6 @@
 from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram
 from pythainlp.tag.locations import tag_provinces
 from pythainlp.tag.named_entity import ThaiNameTagger
-from pythainlp.tokenize import word_tokenize
 
 
 class TestTagPackage(unittest.TestCase):
@@ -26,7 +25,7 @@ def test_pos_tag(self):
         self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
         self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
         self.assertEqual(
-            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"),
+            pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
             [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
         )
 
@@ -40,6 +39,10 @@ def test_pos_tag(self):
         self.assertEqual(perceptron.tag([], corpus="pud"), [])
         self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
         self.assertEqual(perceptron.tag([], corpus="orchid"), [])
+        self.assertEqual(
+            pos_tag(["นักเรียน", "ถาม", "ครู"]),
+            [("นักเรียน", "NCMN"), ("ถาม", "VACT"), ("ครู", "NCMN")],
+        )
 
         self.assertEqual(pos_tag_sents(None), [])
         self.assertEqual(pos_tag_sents([]), [])
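The tests now pass pre-tokenized word lists to pos_tag directly, removing the dependency on word_tokenize, and the new assertion calls pos_tag with no engine argument, so it exercises the default path (the perceptron tagger and its new pickle-based loader). Usage, for reference; the expected tags are those from the test above, and the default engine/corpus are assumed to be perceptron/orchid:

from pythainlp.tag import pos_tag

# Pre-tokenized input, as in the new test; no engine given, so the
# default perceptron tagger loads the re-serialized .pkl model.
print(pos_tag(["นักเรียน", "ถาม", "ครู"]))
# [('นักเรียน', 'NCMN'), ('ถาม', 'VACT'), ('ครู', 'NCMN')]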
