Skip to content

Commit df86a8c

Browse files
authored
Merge pull request #432 from PyThaiNLP/benchmark-to-cli
issue-424: Add benchmark to thainlp cli
2 parents 1b0c1c3 + 90af32a commit df86a8c

File tree

6 files changed

+196
-129
lines changed

6 files changed

+196
-129
lines changed

bin/word-tokenization-benchmark

Lines changed: 0 additions & 123 deletions
This file was deleted.

docs/notes/command_line.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,27 @@ You can use some thainlp functions directly from command line.
7373

7474
$ thainlp data --help
7575

76+
**Benchmark**::
77+
78+
thainlp benchmark word-tokenization --input-file <source> --test-file <label> [--save-details]
79+
80+
*Example*::
81+
82+
$ thainlp benchmark word-tokenization --input-file wisesight-1000-deepcut.txt --test-file wisesight-1000.label
83+
Benchmarking wisesight-1000-deepcut.txt against wisesight-1000.label with 993 samples in total
84+
============== Benchmark Result ==============
85+
char_level:tp 17654.0000
86+
char_level:fn 1153.0000
87+
char_level:tn 50755.0000
88+
char_level:fp 1478.0000
89+
char_level:precision 0.9227
90+
char_level:recall 0.9387
91+
word_level:total_words_in_sample 19132.0000
92+
word_level:total_words_in_ref_sample 18807.0000
93+
word_level:correctly_tokenised_words 15637.0000
94+
word_level:precision 0.8173
95+
word_level:recall 0.8314
96+
7697
**Help**::
7798

7899
thainlp --help

pythainlp/cli/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22
import sys
33
from argparse import ArgumentParser
44

5-
from pythainlp.cli import data, soundex, tag, tokenize
5+
from pythainlp.cli import data, soundex, tag, tokenize, benchmark
66

77
# a command should be a verb when possible
8-
COMMANDS = sorted(["data", "soundex", "tag", "tokenize"])
8+
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"])
99

1010
CLI_NAME = "thainlp"
1111

pythainlp/cli/benchmark.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
4+
import argparse
5+
import json
6+
import os
7+
8+
import yaml
9+
10+
from pythainlp import cli
11+
from pythainlp.benchmarks import word_tokenization
12+
13+
14+
def _read_file(path):
15+
with open(path, "r", encoding="utf-8") as f:
16+
lines = map(lambda r: r.strip(), f.readlines())
17+
return list(lines)
18+
19+
20+
class App:
    """Entry point for the ``thainlp benchmark`` sub-command.

    Parses only the positional ``task`` argument and dispatches to the
    matching task runner (currently ``word-tokenization`` only),
    forwarding the remaining argv untouched.
    """

    def __init__(self, argv):
        arg_parser = argparse.ArgumentParser(
            prog="benchmark",
            description=(
                "Benchmark for various tasks;"
                "currently, we have only for word tokenization."
            ),
            usage=(
                "thainlp benchmark [task] [task-options]\n\n"
                "tasks:\n\n"
                "word-tokenization benchmark word tokenization\n\n"
                "--"
            ),
        )
        arg_parser.add_argument(
            "task", type=str, help="[word-tokenization]"
        )

        # argv[0:2] are the program name and the "benchmark" command;
        # only argv[2] (the task name) is parsed here.
        parsed = arg_parser.parse_args(argv[2:3])
        cli.exit_if_empty(parsed.task, arg_parser)
        task_name = parsed.task.lower()

        remaining_argv = argv[3:]
        if task_name == "word-tokenization":
            WordTokenizationBenchmark(task_name, remaining_argv)
47+
48+
49+
class WordTokenizationBenchmark:
    """Runner for ``thainlp benchmark word-tokenization``.

    Compares a tokenized input file against a ground-truth test file,
    prints aggregate character-level and word-level metrics, and can
    optionally save the aggregated statistics (YAML) and per-sample
    details (JSON) next to the input file.
    """

    def __init__(self, name, argv):
        """Parse task options, run the benchmark, and report results.

        :param name: task name, used only for the usage string
        :param argv: task-specific command-line arguments
        :raises ValueError: if the input and test files differ in length
        """
        parser = argparse.ArgumentParser(**cli.make_usage("benchmark " + name))

        parser.add_argument(
            "--input-file",
            action="store",
            help="Path to input file to compare against the test file",
        )

        parser.add_argument(
            "--test-file",
            action="store",
            help="Path to test file i.e. ground truth",
        )

        parser.add_argument(
            "--save-details",
            default=False,
            action="store_true",
            help=(
                "Save comparison details to files (eval-XXX.json"
                " and eval-details-XXX.json)"
            )
        )

        args = parser.parse_args(argv)

        actual = _read_file(args.input_file)
        expected = _read_file(args.test_file)

        # Raise explicitly instead of using assert, which is stripped
        # when Python runs with -O.
        if len(actual) != len(expected):
            raise ValueError(
                "Input and test files do not have the same number of samples"
            )

        print(
            "Benchmarking %s against %s with %d samples in total"
            % (args.input_file, args.test_file, len(actual))
        )

        df_raw = word_tokenization.benchmark(expected, actual)

        statistics = self._compute_statistics(df_raw)
        self._print_report(statistics)

        if args.save_details:
            self._save_details(args.input_file, df_raw, statistics)

    @staticmethod
    def _compute_statistics(df_raw):
        """Aggregate per-sample metrics into totals plus precision/recall.

        :param df_raw: per-sample benchmark results, one row per sample,
            with the ``char_level:*`` and ``word_level:*`` count columns
        :return: dict mapping metric name to aggregated float value
        """
        columns = [
            "char_level:tp",
            "char_level:fp",
            "char_level:tn",
            "char_level:fn",
            "word_level:correctly_tokenised_words",
            "word_level:total_words_in_sample",
            "word_level:total_words_in_ref_sample",
        ]

        statistics = {c: float(df_raw[c].sum()) for c in columns}

        statistics["char_level:precision"] = statistics["char_level:tp"] / (
            statistics["char_level:tp"] + statistics["char_level:fp"]
        )

        statistics["char_level:recall"] = statistics["char_level:tp"] / (
            statistics["char_level:tp"] + statistics["char_level:fn"]
        )

        statistics["word_level:precision"] = \
            statistics["word_level:correctly_tokenised_words"] \
            / statistics["word_level:total_words_in_sample"]

        statistics["word_level:recall"] = \
            statistics["word_level:correctly_tokenised_words"] \
            / statistics["word_level:total_words_in_ref_sample"]

        return statistics

    @staticmethod
    def _print_report(statistics):
        """Print the aggregated metrics in a fixed, human-readable order."""
        print("============== Benchmark Result ==============")

        for metric in ["tp", "fn", "tn", "fp", "precision", "recall"]:
            key = f"char_level:{metric}"
            print(f"{key:>40s} {statistics[key]:.4f}")

        for metric in [
            "total_words_in_sample",
            "total_words_in_ref_sample",
            "correctly_tokenised_words",
            "precision",
            "recall"
        ]:
            key = f"word_level:{metric}"
            print(f"{key:>40s} {statistics[key]:.4f}")

    @staticmethod
    def _save_details(input_file, df_raw, statistics):
        """Write aggregated statistics (YAML) and per-sample comparison
        details (JSON) into the directory containing *input_file*.

        :param input_file: path of the benchmarked input file; output
            files are named after its basename without extension
        :param df_raw: per-sample benchmark results
        :param statistics: aggregated metrics from _compute_statistics
        """
        dir_name = os.path.dirname(input_file)
        # basename/splitext is portable (split("/") breaks on Windows)
        # and only drops the final extension instead of everything
        # after the first dot.
        file_name = os.path.splitext(os.path.basename(input_file))[0]

        # os.path.join keeps the path relative when dir_name is empty,
        # instead of producing a rooted "/eval-..." path.
        res_path = os.path.join(dir_name, "eval-%s.yml" % file_name)
        print("Evaluation result is saved to %s" % res_path)

        with open(res_path, "w", encoding="utf-8") as outfile:
            yaml.dump(statistics, outfile, default_flow_style=False)

        res_path = os.path.join(dir_name, "eval-details-%s.json" % file_name)
        print("Details of comparisons is saved to %s" % res_path)

        with open(res_path, "w", encoding="utf-8") as f:
            samples = []
            for i, record in enumerate(df_raw.to_dict("records")):
                # Pull the text columns out so the remaining record
                # holds only the numeric per-sample metrics.
                sample_expected = record.pop("expected")
                sample_actual = record.pop("actual")

                samples.append(
                    dict(
                        metrics=record,
                        expected=sample_expected,
                        actual=sample_actual,
                        id=i,
                    )
                )

            details = dict(metrics=statistics, samples=samples)
            json.dump(details, f, ensure_ascii=False)

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
python-crfsuite==0.9.*
22
requests==2.23.*
33
tinydb==4.1.*
4+
PyYAML==5.3.1
5+
numpy==1.18.5

setup.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646

4747
extras = {
4848
"attacut": ["attacut>=1.0.6"],
49-
"benchmarks": ["numpy>=1.16.1", "pandas>=0.24"],
49+
"benchmarks": ["numpy>=1.16.1", "pandas>=0.24", "PyYAML>=5.3.1"],
5050
"icu": ["pyicu>=2.3"],
5151
"ipa": ["epitran>=1.1"],
5252
"ml": ["numpy>=1.16", "torch>=1.0.0"],
@@ -55,6 +55,7 @@
5555
"thai2rom": ["torch>=1.0.0", "numpy>=1.16.1"],
5656
"wordnet": ["nltk>=3.3.*"],
5757
"full": [
58+
"PyYAML>=5.3.1",
5859
"attacut>=1.0.4",
5960
"emoji>=0.5.1",
6061
"epitran>=1.1",
@@ -130,9 +131,6 @@
130131
"Topic :: Text Processing :: General",
131132
"Topic :: Text Processing :: Linguistic",
132133
],
133-
scripts=[
134-
"bin/word-tokenization-benchmark",
135-
],
136134
entry_points={
137135
"console_scripts": [
138136
"thainlp = pythainlp.__main__:main",

0 commit comments

Comments
 (0)