Commit 4460a35

Author: Charin
Merge pull request #369 from PyThaiNLP/fix-tokenization-benchmark
[WIP] Fix tokenization benchmark statistics
2 parents 37fd7da + bec416f · commit 4460a35

File tree

2 files changed (+38, -37 lines)


bin/word-tokenization-benchmark

Lines changed: 35 additions & 27 deletions

@@ -54,49 +54,57 @@ print(
 
 df_raw = word_tokenization.benchmark(expected, actual)
 
-df_res = df_raw.describe()
-df_res = df_res[
-    [
-        "char_level:tp",
-        "char_level:tn",
-        "char_level:fp",
-        "char_level:fn",
-        "char_level:precision",
-        "char_level:recall",
-        "char_level:f1",
-        "word_level:precision",
-        "word_level:recall",
-        "word_level:f1",
-    ]
+
+columns = [
+    "char_level:tp",
+    "char_level:fp",
+    "char_level:tn",
+    "char_level:fn",
+    "word_level:correctly_tokenised_words",
+    "word_level:total_words_in_sample",
+    "word_level:total_words_in_ref_sample",
 ]
 
-df_res = df_res.T.reset_index(0)
+statistics = dict()
+
+for c in columns:
+    statistics[c] = float(df_raw[c].sum())
+
+statistics["char_level:precision"] = statistics["char_level:tp"] / (
+    statistics["char_level:tp"] + statistics["char_level:fp"]
+)
 
-df_res["mean±std"] = df_res.apply(
-    lambda r: "%2.2f±%2.2f" % (r["mean"], r["std"]), axis=1
+statistics["char_level:recall"] = statistics["char_level:tp"] / (
+    statistics["char_level:tp"] + statistics["char_level:fn"]
 )
 
-df_res["metric"] = df_res["index"]
+statistics["word_level:precision"] = statistics["word_level:correctly_tokenised_words"] \
+    / statistics["word_level:total_words_in_sample"]
+
+statistics["word_level:recall"] = statistics["word_level:correctly_tokenised_words"] \
+    / statistics["word_level:total_words_in_ref_sample"]
 
 print("============== Benchmark Result ==============")
-print(df_res[["metric", "mean±std", "min", "max"]].to_string(index=False))
 
+for c in ["tp", "fn", "tn", "fp", "precision", "recall"]:
+    c = f"char_level:{c}"
+    v = statistics[c]
+    print(f"{c:>40s} {v:.4f}")
 
-if args.save_details:
-    data = {}
-    for r in df_res.to_dict("records"):
-        metric = r["index"]
-        del r["index"]
-        data[metric] = r
+for c in ["total_words_in_sample", "total_words_in_ref_sample", "correctly_tokenised_words", "precision", "recall"]:
+    c = f"word_level:{c}"
+    v = statistics[c]
+    print(f"{c:>40s} {v:.4f}")
 
+if args.save_details:
     dir_name = os.path.dirname(args.input_file)
     file_name = args.input_file.split("/")[-1].split(".")[0]
 
     res_path = "%s/eval-%s.yml" % (dir_name, file_name)
     print("Evaluation result is saved to %s" % res_path)
 
     with open(res_path, "w", encoding="utf-8") as outfile:
-        yaml.dump(data, outfile, default_flow_style=False)
+        yaml.dump(statistics, outfile, default_flow_style=False)
 
     res_path = "%s/eval-details-%s.json" % (dir_name, file_name)
     print("Details of comparisons is saved to %s" % res_path)
@@ -110,6 +118,6 @@ if args.save_details:
 
             samples.append(dict(metrics=r, expected=expected, actual=actual, id=i))
 
-        details = dict(metrics=data, samples=samples)
+        details = dict(metrics=statistics, samples=samples)
 
         json.dump(details, f, ensure_ascii=False)
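
What changed, in short: the old script reported the mean ± std of each sample's own precision/recall (via pandas describe()), whereas the new script sums the raw character- and word-level counts over all samples and computes precision/recall once from the totals (micro-averaging). A minimal sketch of the difference, using a toy DataFrame with the same column names as df_raw; the counts below are hypothetical, not benchmark output:

import pandas as pd

# Toy per-sample counts in the same columns as df_raw (hypothetical values).
df_raw = pd.DataFrame({
    "char_level:tp": [90, 5],
    "char_level:fp": [10, 0],
})

# Old behaviour (roughly): average each sample's own precision (macro average).
per_sample_precision = df_raw["char_level:tp"] / (
    df_raw["char_level:tp"] + df_raw["char_level:fp"]
)
macro_precision = per_sample_precision.mean()  # (0.90 + 1.00) / 2 = 0.95

# New behaviour: sum the counts first, then compute precision once (micro average).
tp = float(df_raw["char_level:tp"].sum())  # 95
fp = float(df_raw["char_level:fp"].sum())  # 10
micro_precision = tp / (tp + fp)           # 95 / 105 ≈ 0.905

print(f"macro={macro_precision:.4f}  micro={micro_precision:.4f}")

The two figures generally differ: the macro average weights every sample equally regardless of its length, while the summed counts weight every character equally.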

pythainlp/benchmarks/word_tokenization.py

Lines changed: 3 additions & 10 deletions

@@ -185,10 +185,6 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
 
     correctly_tokenised_words = np.sum(tokenization_indicators)
 
-    w_precision = correctly_tokenised_words / np.sum(sample)
-    w_recall = correctly_tokenised_words / np.sum(ref_sample)
-    w_f1 = _f1(w_precision, w_recall)
-
     tokenization_indicators = list(
         map(lambda x: str(x), tokenization_indicators)
     )
@@ -199,14 +195,11 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
             "fp": c_fp,
             "tn": c_tn,
             "fn": c_fn,
-            "precision": c_precision,
-            "recall": c_recall,
-            "f1": c_f1,
         },
         "word_level": {
-            "precision": w_precision,
-            "recall": w_recall,
-            "f1": w_f1,
+            "correctly_tokenised_words": correctly_tokenised_words,
+            "total_words_in_sample": np.sum(sample),
+            "total_words_in_ref_sample": np.sum(ref_sample)
         },
         "global": {
             "tokenisation_indicators": "".join(tokenization_indicators)
