-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
executable file
·110 lines (91 loc) · 2.98 KB
/
utils.py
File metadata and controls
executable file
·110 lines (91 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import urllib.parse
import redis
#from py2neo import Graph
import Levenshtein
import globals
# REST endpoint URL; presumably the "candidates" API of a DBpedia Spotlight
# deployment (inferred from the name and path -- confirm against the caller).
urlPostPrefixSpotlight = "http://spotlight.sztaki.hu:2222/rest/candidates"
# HTTP headers asking the server for a JSON response.
headers = {'Accept': 'application/json'}
# Module-wide Redis client built with the library's default connection
# arguments; the lookup helpers in this module read their keys through it.
rds=redis.Redis()
def getNearN(mentions, current, M=2, N=1):
    """Return the distinct mentions surrounding position ``current``.

    Collects up to ``M`` mentions immediately before ``current`` and up to
    ``N`` mentions immediately after it. The mention at ``current`` itself is
    not part of either window.

    Args:
        mentions: Sliceable sequence of hashable mentions.
        current: Index of the mention whose neighbours are wanted.
        M: Size of the look-behind window.
        N: Size of the look-ahead window.

    Returns:
        A list of the unique neighbouring mentions in order of first
        appearance (preceding mentions first, then following ones).
    """
    previous = mentions[max(current - M, 0):current]
    # A slice end is exclusive, so taking N items after `current` needs
    # current + N + 1 as the bound (the former current + N gave only N - 1,
    # i.e. nothing at all for the default N=1). Slicing already clamps to the
    # end of the sequence.
    upcoming = mentions[current + 1:current + N + 1]
    # dict.fromkeys de-duplicates like a set but keeps a deterministic order.
    return list(dict.fromkeys([*previous, *upcoming]))
def getLastN(mentions, current, N=10):
    """Return the mentions that precede position ``current``, at most ``N``.

    When at least ``N`` mentions precede ``current``, the result is the plain
    slice holding the last ``N`` of them (order and duplicates kept).
    Otherwise every preceding mention is returned, de-duplicated through a
    set, so the order of that shorter result is unspecified.
    """
    if current < N:
        # Fewer than N predecessors: take everything before `current`.
        return list(set(mentions[:current]))
    return mentions[current - N:current]
def setAnchorMention(m, em_objs):
    """Return the first object in ``em_objs`` whose mention can anchor ``m``.

    An object qualifies when ``isSubstring(m, obj.mention)`` holds, or when
    ``isAbbreviation(m, obj.mention)`` holds and ``m`` differs from that
    mention. Returns ``None`` when no object qualifies.
    """
    def _anchors(candidate):
        text = candidate.mention
        # `and` binds tighter than `or`; the parentheses only spell out the
        # grouping the expression always had.
        return isSubstring(m, text) or (isAbbreviation(m, text) and m != text)

    return next((obj for obj in em_objs if _anchors(obj)), None)
def sortAndReturnKeys(candScores):
    """Return the keys of ``candScores`` ordered by descending score.

    Scores are compared as floats, so numeric strings are accepted as values.
    Keys with equal scores keep the order in which the mapping yields them.
    """
    def _score(pair):
        return float(pair[1])

    ranked = sorted(candScores.items(), key=_score, reverse=True)
    return [key for key, _ in ranked]
def computeTP(url):
    """Return the value stored for ``url`` in ``globals.pkl``, or 0.0 if absent.

    ``globals`` here is the project module imported at the top of the file,
    not the builtin function of the same name.
    """
    table = globals.pkl
    if url in table:
        return table[url]
    return 0.0
def computeSS(s1, s2):
    """Return ``Levenshtein.ratio`` of the two strings.

    Both inputs are stripped of surrounding whitespace and lower-cased first,
    so the comparison ignores case and padding.
    """
    left = s1.strip().lower()
    right = s2.strip().lower()
    return Levenshtein.ratio(left, right)
def computePR(url):
    """Return the float stored in Redis under the key ``pr:<url>``.

    Yields 0.0 when the key is missing or its stored value is empty.
    """
    stored = rds.get('pr:%s' % url)
    if not stored:
        return 0.0
    return float(stored)
def neo4jPath(t):
    """Query Neo4j for the shortest path between two named ``Page`` nodes.

    Args:
        t: Indexable pair whose first two items are the page names.

    Returns:
        The result of ``evaluate()`` on the query cursor.
        NOTE(review): in py2neo ``evaluate()`` yields the first column of the
        first record -- here the ``length`` column -- or ``None`` when there
        is no match; confirm that callers want the length and not the path.
    """
    m1, m2 = t[0], t[1]
    # NOTE(review): `from py2neo import Graph` is commented out at the top of
    # this file, so `Graph` is undefined and this call raises NameError until
    # that import is restored.
    gn = Graph()
    # The names travel as query parameters instead of being interpolated into
    # the Cypher text, so a name containing quotes or backslashes can neither
    # break the query nor inject into it.
    # NOTE(review): `[LINKS_TO*1..10]` has no leading colon, which makes
    # LINKS_TO a variable name matching any relationship type;
    # `[:LINKS_TO*1..10]` may have been intended -- confirm before changing.
    query = (
        "MATCH path=shortestPath((m:Page {name:$m1})-[LINKS_TO*1..10]-(n:Page {name:$m2})) "
        "RETURN LENGTH(path) AS length, path, m, n"
    )
    return gn.run(query, m1=m1, m2=m2).evaluate()
def normalizeURL(s):
    """Reduce a Wikipedia/DBpedia URL to its percent-decoded page title.

    Every occurrence of the known Wikipedia and DBpedia URL prefixes is
    removed, surrounding whitespace and then surrounding double quotes are
    stripped, and percent-escapes are decoded. A falsy input (``None`` or an
    empty string) yields the ``'--NME--'`` marker instead.
    """
    if not s:
        return '--NME--'
    title = s
    for prefix in (
        "http://en.wikipedia.org/wiki/",
        "http://dbpedia.org/resource/",
        "http://dbpedia.org/page/",
    ):
        title = title.replace(prefix, "")
    title = title.strip().strip('"')
    return urllib.parse.unquote(title)
def getLinkDisambiguations(link):
    """Return the set stored in Redis under ``dis:<link>``, or ``None``.

    The stored bytes are decoded as UTF-8 and evaluated as a Python
    expression whose result is turned into a set. ``None`` is returned when
    the key is missing or its value is empty.
    """
    raw = rds.get('dis:%s' % link)
    if not raw:
        return None
    # NOTE(review): eval() executes whatever text is stored under this key,
    # which is only safe while the Redis contents are fully trusted. If the
    # stored values are plain literals, ast.literal_eval would be a safer
    # parser -- confirm the stored format before switching.
    return set(eval(raw.decode('UTF-8')))
def getLinkRedirect(link):
    """Return the value stored in Redis under ``rdr:<link>``, else ``link``.

    A stored value is decoded from UTF-8 bytes to ``str``. When the key is
    missing or empty, the input is handed back unchanged.
    """
    target = rds.get('rdr:%s' % link)
    return target.decode('UTF-8') if target else link
def getInitials(entity_string):
    """Return the upper-case initials of a multi-word string.

    The string is split on whitespace. With fewer than two words the result
    is ``None``. Otherwise it is the concatenation of the first character of
    each word that starts with an upper-case letter, which may be an empty
    string if no word does.
    """
    words = entity_string.split()
    if len(words) <= 1:
        return None
    return "".join(word[0] for word in words if word[0].isupper())
def isAbbreviation(m1, m2):
    """Tell whether ``m1`` is the initials-abbreviation of ``m2``.

    ``m1`` is compared, after removing dots and spaces, with the upper-case
    initials of ``m2`` as produced by ``getInitials``. The two must also
    share their first character.

    Args:
        m1: Candidate abbreviation, e.g. ``"U.N."``.
        m2: Candidate long form, e.g. ``"United Nations"``.

    Returns:
        ``True`` only when ``m1`` differs from ``m2``, ``m2`` has initials,
        and the compacted ``m1`` equals them; ``False`` otherwise, including
        when ``m1`` is empty or made up solely of dots and spaces.
    """
    if m1 == m2:
        return False
    # Drop dots and spaces so "U.N." and "U N" both compare as "UN".
    compact = m1.replace('.', '').replace(' ', '')
    # An empty candidate abbreviates nothing; returning here also keeps the
    # compact[0] lookup below from raising IndexError.
    if not compact or not m2:
        return False
    initials = getInitials(m2)
    if not initials:
        return False
    if compact[0] != m2[0]:
        return False
    return compact == initials
def isSubstring(m1, m2):
    """Tell whether ``m1`` occurs inside ``m2`` without being equal to it.

    Both the containment test and the equality test ignore case.
    """
    needle = m1.lower()
    haystack = m2.lower()
    if needle not in haystack:
        return False
    return needle != haystack
def analyzeEntities(articles, collection):
    """Count entity mentions and NIL mentions for one collection.

    Articles whose ``collection`` attribute equals ``collection`` contribute
    their ``entity_mentions``; the ``collection`` value of every other
    article is printed instead.

    Returns:
        A tuple ``(mention_count, nil_count)`` where ``nil_count`` is the
        number of counted mentions whose ``gold_link`` is ``'--NME--'``.
    """
    mention_count = 0
    nil_count = 0
    for article in articles:
        if article.collection == collection:
            mentions = article.entity_mentions
            mention_count += len(mentions)
            nil_count += sum(
                1 for mention in mentions if mention.gold_link == '--NME--'
            )
        else:
            print(article.collection)
    return mention_count, nil_count