-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheval.py
More file actions
executable file
·160 lines (148 loc) · 6.13 KB
/
eval.py
File metadata and controls
executable file
·160 lines (148 loc) · 6.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/local/bin/python3
import os
import json
import pickle
import glob
import time
from dmtipci.dictionary import Dictionary
from dmtipci.find import Finder
from dmtipci.debug import _assert, __LINE__, __FILE__
from dmtipci import util, system
WORDNET_CACHE_VERSION = system.VERSION
def parse_wordnet_line(line, kind):
line = line.strip()
# {...} works for nouns and verbs
if line.startswith('{') and line.endswith('}'):
# information regarding the words
def_start = line.find('(')
def_end = line.rfind(')')
def_content = line[def_start+1:def_end-1]
# def_content is supposedly the definition of the word (aka gloss)
line = line[1:def_start]
# information about word usage context in square brackets: [] are not used, they have to be ignored
line_p = ''
in_bracket = False
for char in line:
if char == '[':
in_bracket = True
elif char == ']':
in_bracket = False
continue
if not in_bracket:
line_p += char
elements = [x.strip(',1234567890') for x in line_p.strip().split(' ') if len(x.strip(',')) > 0]
# process words formatted like noun.plant:pome
for i in range(0, len(elements)):
element = elements[i]
semi_c = element.rfind(':')
if '.' in element and semi_c > 0:
element = element[semi_c + 1:]
elements[i] = element
subject = elements[0]
if not subject.endswith(',@'): # only a single hypernym? disregard
i_hypernyms = [] # indirect hypernyms
for i in range(1, len(elements)):
if elements[i].endswith(',@') or elements[i].endswith(';c') or elements[i].endswith(';r') or elements[i].endswith(';u') or elements[i].endswith(',~'):
element = elements[i][:-2] # remove ,@
i_hypernyms.append(element.strip(',1234567890').upper())
elif elements[i].endswith(',@i') or elements[i].endswith(',~i'):
element = elements[i][:-3]
i_hypernyms.append(element.strip(',1234567890').upper())
return {
'subject': subject.upper(),
'i_hypernyms': i_hypernyms
}
return {}
def load_wordnet():
wordnet_cache_dir = 'wordnet_db/cache/'
wordnet_files = 'wordnet_db/dbfiles/'
os.makedirs(wordnet_cache_dir, exist_ok=True)
db = util.load_pickle(wordnet_cache_dir + 'wordnet.json.pd', 'wordnet', WORDNET_CACHE_VERSION)
if len(db) != 0:
return db
# Note: Only hypernyms are being parsed.
# Hypernyms are those with ,@ pointer syntax in wordnet, according to http://wordnet.princeton.edu/wordnet/man/wninput.5WN.html .
# Refer to https://github.com/kazazes/DMTIPCI/issues/1 for the reasons we are only interested in hypernyms.
# adj and adv are temporarily ignored because they use a more complex format
for filename in glob.glob(wordnet_files + 'noun.*'):
# nouns
print("Parsing", filename, "...")
for line in open(filename, 'r').readlines():
l = parse_wordnet_line(line, 'noun')
if l:
hns = []
if l['subject'] in db:
hns = db[l['subject']]
for key in l['i_hypernyms']:
if not key in hns:
hns.append(key)
if len(hns) > 0:
db[l['subject']] = hns
for filename in glob.glob(wordnet_files + 'verb.*'):
# nouns
print("Parsing", filename)
util.save_pickle(db, wordnet_cache_dir + 'wordnet', WORDNET_CACHE_VERSION)
if len(db) == 0:
print("DMTIPCI Evaluation requires WordNet, instructions:")
print("- Download from http://wordnetcode.princeton.edu/wn3.1.dict.tar.gz")
print("- Unpack and get a folder named 'dict', rename it to 'wordnet_db'")
print("- Place 'wordnet_db' in the same folder as", __FILE__())
print("- Re-run the script")
quit()
return db
def main():
db = load_wordnet()
print('WordNet has', len(db), 'entries.')
master_text = 'dict/pg29765.txt'
word_freq = 'dict/pg29765_word_freq'
word_infl = 'dict/pg29765_word_infl'
d = Dictionary()
d.updateFromGutenbergText(master_text)
d.updateWordInflection(word_infl)
d.updateWordFrequency(word_freq)
finder = Finder(d)
precision_total = 0
coverage_total = 0
total_counted = 0
last_checkpoint = time.time()
print('Evaluation started.')
for idx, word in enumerate(db):
dst = db[word]
src = finder.find(word, debug_print=False)
if src and 'N' in src:
src = src['N']
if not src:
continue
total_counted += 1
dst_ = []
for e in dst:
dst_ += e.split('_')
# print(src)
# print(dst)
# print(dst_)
correct = 0
for e in src:
for e_ in dst_:
if e in e_:
correct += 1
precision_total += correct / len(src)
# print('Precision', correct, 'of', len(src))
correct = 0
for e_ in dst_:
for e in src:
if e in e_:
correct += 1
# print('Coverage', correct, 'of', len(dst_))
coverage_total += correct / len(dst_)
checkpoint = time.time()
if checkpoint - last_checkpoint >= 15:
print('Precision: %.8f, Coverage (Recall): %.8f' % (precision_total / total_counted, coverage_total / total_counted), ' %d%%' % (100 * (idx + 1) / len(db)))
last_checkpoint = checkpoint
print('Final Precision: %.8f, Coverage (Recall): %.8f' % (precision_total / total_counted, coverage_total / total_counted))
if __name__ == '__main__':
print("╔╦╗╔╦╗╔╦╗╦╔═╗╔═╗╦")
print(" ║║║║║ ║ ║╠═╝║ ║")
print("═╩╝╩ ╩ ╩ ╩╩ ╚═╝╩")
print("- DMTIPCI Eval. -")
print(" v" + str(system.VERSION) + " ")
main()