
Commit 16158ba

Merge branch 'rm-default-tqdm' of https://github.com/PyThaiNLP/pythainlp into rm-default-tqdm

2 parents: a7f80c0 + 2d907a2

File tree: 8 files changed (+49 / -47 lines)


bin/word-tokenization-benchmark

Lines changed: 35 additions & 27 deletions

@@ -54,49 +54,57 @@ print(
 
 df_raw = word_tokenization.benchmark(expected, actual)
 
-df_res = df_raw.describe()
-df_res = df_res[
-    [
-        "char_level:tp",
-        "char_level:tn",
-        "char_level:fp",
-        "char_level:fn",
-        "char_level:precision",
-        "char_level:recall",
-        "char_level:f1",
-        "word_level:precision",
-        "word_level:recall",
-        "word_level:f1",
-    ]
+
+columns = [
+    "char_level:tp",
+    "char_level:fp",
+    "char_level:tn",
+    "char_level:fn",
+    "word_level:correctly_tokenised_words",
+    "word_level:total_words_in_sample",
+    "word_level:total_words_in_ref_sample",
 ]
 
-df_res = df_res.T.reset_index(0)
+statistics = dict()
+
+for c in columns:
+    statistics[c] = float(df_raw[c].sum())
+
+statistics["char_level:precision"] = statistics["char_level:tp"] / (
+    statistics["char_level:tp"] + statistics["char_level:fp"]
+)
 
-df_res["mean±std"] = df_res.apply(
-    lambda r: "%2.2f±%2.2f" % (r["mean"], r["std"]), axis=1
+statistics["char_level:recall"] = statistics["char_level:tp"] / (
+    statistics["char_level:tp"] + statistics["char_level:fn"]
 )
 
-df_res["metric"] = df_res["index"]
+statistics["word_level:precision"] = statistics["word_level:correctly_tokenised_words"] \
+    / statistics["word_level:total_words_in_sample"]
+
+statistics["word_level:recall"] = statistics["word_level:correctly_tokenised_words"] \
+    / statistics["word_level:total_words_in_ref_sample"]
 
 print("============== Benchmark Result ==============")
-print(df_res[["metric", "mean±std", "min", "max"]].to_string(index=False))
 
+for c in ["tp", "fn", "tn", "fp", "precision", "recall"]:
+    c = f"char_level:{c}"
+    v = statistics[c]
+    print(f"{c:>40s} {v:.4f}")
 
-if args.save_details:
-    data = {}
-    for r in df_res.to_dict("records"):
-        metric = r["index"]
-        del r["index"]
-        data[metric] = r
+for c in ["total_words_in_sample", "total_words_in_ref_sample", "correctly_tokenised_words", "precision", "recall"]:
+    c = f"word_level:{c}"
+    v = statistics[c]
+    print(f"{c:>40s} {v:.4f}")
 
+if args.save_details:
     dir_name = os.path.dirname(args.input_file)
     file_name = args.input_file.split("/")[-1].split(".")[0]
 
     res_path = "%s/eval-%s.yml" % (dir_name, file_name)
     print("Evaluation result is saved to %s" % res_path)
 
     with open(res_path, "w", encoding="utf-8") as outfile:
-        yaml.dump(data, outfile, default_flow_style=False)
+        yaml.dump(statistics, outfile, default_flow_style=False)
 
     res_path = "%s/eval-details-%s.json" % (dir_name, file_name)
     print("Details of comparisons is saved to %s" % res_path)
@@ -110,6 +118,6 @@ if args.save_details:
 
             samples.append(dict(metrics=r, expected=expected, actual=actual, id=i))
 
-        details = dict(metrics=data, samples=samples)
+        details = dict(metrics=statistics, samples=samples)
 
         json.dump(details, f, ensure_ascii=False)
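The rewritten script sums the raw character- and word-level counts over the whole corpus and derives micro-averaged precision and recall from those sums, instead of reporting the per-sample mean±std that the old describe()-based code produced. A minimal standalone sketch of that micro-averaging step (the count values are made up purely for illustration):

# Micro-averaged precision/recall from counts summed over all samples.
def micro_precision_recall(tp, fp, fn):
    """Corpus-level (micro-averaged) precision and recall."""
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return precision, recall

p, r = micro_precision_recall(tp=9500.0, fp=300.0, fn=200.0)
print("precision=%.4f recall=%.4f" % (p, r))  # precision=0.9694 recall=0.9794

Summing counts before dividing weights every character and word equally across the corpus, so long samples are no longer under-weighted the way they were under the per-sample average.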

pythainlp/benchmarks/word_tokenization.py

Lines changed: 3 additions & 10 deletions

@@ -185,10 +185,6 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
 
     correctly_tokenised_words = np.sum(tokenization_indicators)
 
-    w_precision = correctly_tokenised_words / np.sum(sample)
-    w_recall = correctly_tokenised_words / np.sum(ref_sample)
-    w_f1 = _f1(w_precision, w_recall)
-
     tokenization_indicators = list(
         map(lambda x: str(x), tokenization_indicators)
     )
@@ -199,14 +195,11 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
             "fp": c_fp,
             "tn": c_tn,
             "fn": c_fn,
-            "precision": c_precision,
-            "recall": c_recall,
-            "f1": c_f1,
         },
         "word_level": {
-            "precision": w_precision,
-            "recall": w_recall,
-            "f1": w_f1,
+            "correctly_tokenised_words": correctly_tokenised_words,
+            "total_words_in_sample": np.sum(sample),
+            "total_words_in_ref_sample": np.sum(ref_sample)
         },
         "global": {
             "tokenisation_indicators": "".join(tokenization_indicators)
Binary file not shown.

pythainlp/tag/perceptron.py

Lines changed: 4 additions & 4 deletions

@@ -3,20 +3,20 @@
 Perceptron Part-Of-Speech tagger
 """
 import os
+import pickle
 from typing import List, Tuple
 
-import dill
 from pythainlp.corpus import corpus_path
 from pythainlp.tag.orchid import tag_signs, tag_to_text
 
-_ORCHID_DATA_FILENAME = "orchid_pt_tagger.dill"
-_PUD_DATA_FILENAME = "ud_thai_pud_pt_tagger.dill"
+_ORCHID_DATA_FILENAME = "orchid_pt_tagger.pkl"
+_PUD_DATA_FILENAME = "ud_thai_pud_pt_tagger.pkl"
 
 
 def _load_tagger(filename):
     data_filename = os.path.join(corpus_path(), filename)
     with open(data_filename, "rb") as fh:
-        model = dill.load(fh)
+        model = pickle.load(fh)
     return model
 
 
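With dill dropped, the shipped tagger models must be plain-pickle files; the binary change above is presumably the re-serialized model data. A one-off migration could look like the sketch below, assuming the tagger object contains nothing that only dill can serialize (the file paths are hypothetical examples):

# One-time conversion sketch: load the old dill artifact, re-save it
# with the standard-library pickle so dill is no longer a dependency.
import pickle

import dill  # needed only for this migration, not at runtime

with open("orchid_pt_tagger.dill", "rb") as fh:
    model = dill.load(fh)

with open("orchid_pt_tagger.pkl", "wb") as fh:
    pickle.dump(model, fh, protocol=pickle.HIGHEST_PROTOCOL)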

requirements.txt

Lines changed: 0 additions & 1 deletion

@@ -1,4 +1,3 @@
-dill==0.3.*
 python-crfsuite==0.9.*
 requests==2.23.*
 tinydb==4.1.*

setup.py

Lines changed: 2 additions & 3 deletions

@@ -39,7 +39,6 @@
 """
 
 requirements = [
-    "dill>=0.3.0",
     "python-crfsuite>=0.9.6",
     "requests>=2.22.0",
     "tinydb>=3.0",
@@ -88,7 +87,7 @@
         "etcc.txt",
         "negations_th.txt",
         "orchid_pos_th.json",
-        "orchid_pt_tagger.dill",
+        "orchid_pt_tagger.pkl",
         "person_names_female_th.txt",
         "person_names_male_th.txt",
         "sentenceseg-crfcut-v2.model",
@@ -98,7 +97,7 @@
         "thailand_provinces_th.txt",
         "tnc_freq.txt",
         "ttc_freq.txt",
-        "ud_thai_pud_pt_tagger.dill",
+        "ud_thai_pud_pt_tagger.pkl",
         "ud_thai_pud_unigram_tagger.json",
         "words_th_thai2fit_201810.txt",
         "words_th.txt",

tests/test_tag.py

Lines changed: 5 additions & 2 deletions

@@ -5,7 +5,6 @@
 from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram
 from pythainlp.tag.locations import tag_provinces
 from pythainlp.tag.named_entity import ThaiNameTagger
-from pythainlp.tokenize import word_tokenize
 
 
 class TestTagPackage(unittest.TestCase):
@@ -26,7 +25,7 @@ def test_pos_tag(self):
         self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
         self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
         self.assertEqual(
-            pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"),
+            pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
             [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
         )
 
@@ -40,6 +39,10 @@ def test_pos_tag(self):
         self.assertEqual(perceptron.tag([], corpus="pud"), [])
         self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
         self.assertEqual(perceptron.tag([], corpus="orchid"), [])
+        self.assertEqual(
+            pos_tag(["นักเรียน", "ถาม", "ครู"]),
+            [("นักเรียน", "NCMN"), ("ถาม", "VACT"), ("ครู", "NCMN")],
+        )
 
         self.assertEqual(pos_tag_sents(None), [])
         self.assertEqual(pos_tag_sents([]), [])
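The tests now pass pre-tokenized word lists to pos_tag directly, removing the dependency on word_tokenize, and the new assertion calls pos_tag with no engine argument, so it exercises the default path (the perceptron tagger and its new pickle-based loader). Usage, for reference; the expected tags are those from the test above, and the default engine/corpus are assumed to be perceptron/orchid:

from pythainlp.tag import pos_tag

# Pre-tokenized input, as in the new test; no engine given, so the
# default perceptron tagger loads the re-serialized .pkl model.
print(pos_tag(["นักเรียน", "ถาม", "ครู"]))
# [('นักเรียน', 'NCMN'), ('ถาม', 'VACT'), ('ครู', 'NCMN')]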
