# HumanLikeEL/utils.py (cltl/HumanLikeEL, master branch)
import urllib.parse
import redis
#from py2neo import Graph
import Levenshtein

import globals

# URL of a REST "candidates" endpoint (presumably a DBpedia Spotlight server,
# judging by the host name -- confirm). Not referenced by the functions below.
urlPostPrefixSpotlight = "http://spotlight.sztaki.hu:2222/rest/candidates"
# HTTP headers requesting a JSON response; presumably sent along with requests
# to the endpoint above -- verify against callers.
headers = {'Accept': 'application/json'}

# Shared Redis client, created at import time with no arguments, so the
# library's default connection settings apply. The helpers below read keys
# with the prefixes 'pr:', 'dis:' and 'rdr:' through it.
rds=redis.Redis()

def getNearN(mentions, current, M=2, N=1):
	"""Return the distinct mentions in a window around position `current`.

	The window covers up to `M` mentions before `current` and up to `N`
	mentions after it; the mention at `current` itself is excluded.

	Args:
		mentions: sequence of hashable mentions, in document order.
		current: index of the mention the window is centred on.
		M: number of preceding mentions to include.
		N: number of following mentions to include.

	Returns:
		A list of the unique mentions in the window. The list is built from
		a set, so its order is arbitrary and duplicates are collapsed.
	"""
	previous=mentions[max(current-M, 0):current]
	# Slice ends are exclusive: the window of N following mentions runs from
	# current+1 up to (not including) current+1+N. The earlier end bound of
	# current+N returned only N-1 items, i.e. nothing at all for the default
	# N=1. Slicing already clamps to the sequence length, so no min() needed.
	upcoming=mentions[current+1:current+1+N]
	return list(set(previous).union(upcoming))

def getLastN(mentions, current, N=10):
	"""Return the mentions that precede position `current`, at most `N` of them.

	When at least `N` mentions precede `current`, the result is the plain
	slice of the last `N` (document order, duplicates kept). When fewer
	precede it, everything before `current` is returned after passing through
	a set, so duplicates are dropped and the order is arbitrary.
	"""
	if current<N:
		# Short history: collapse duplicates (order is not preserved).
		return list(set(mentions[:current]))
	return mentions[current-N:current]

def setAnchorMention(m, em_objs):
	"""Return the first object in `em_objs` whose mention can anchor `m`.

	An object qualifies when `m` is a proper substring of its `mention`
	(see `isSubstring`), or when `m` is an abbreviation of it (see
	`isAbbreviation`) and differs from it. Returns None if nothing matches.
	"""
	for candidate in em_objs:
		surface=candidate.mention
		if isSubstring(m, surface):
			return candidate
		# `and` binds tighter than `or`, so the inequality check only ever
		# applied to the abbreviation case.
		if isAbbreviation(m, surface) and m!=surface:
			return candidate
	return None

def sortAndReturnKeys(candScores):
        """Return the keys of `candScores` ordered by descending score.

        Scores are compared as floats, so numeric strings are accepted. Keys
        with equal scores keep their original relative order.
        """
        ranked=sorted(candScores.items(), key=lambda pair:float(pair[1]), reverse=True)
        return [cand for cand, _ in ranked]

def computeTP(url):
	"""Look up `url` in the `globals.pkl` mapping, defaulting to 0.0.

	Returns the stored value when `url` is present in the mapping and 0.0
	otherwise. (What the stored score represents is defined by whoever
	populates `globals.pkl`.)
	"""
	table=globals.pkl
	if url in table:
		return table[url]
	return 0.0

def computeSS(s1, s2):
	"""Return `Levenshtein.ratio` of the two strings, ignoring case and outer whitespace."""
	left=s1.strip().lower()
	right=s2.strip().lower()
	return Levenshtein.ratio(left, right)

def computePR(url):
	"""Read the value stored in Redis under 'pr:<url>' and return it as a float.

	Returns 0.0 when the key is missing or its value is empty.
	"""
	raw=rds.get('pr:%s' % url)
	if not raw:
		return 0.0
	return float(raw)

def neo4jPath(t):
	"""Run a shortest-path query between two `Page` nodes and return its result.

	`t` is indexable; `t[0]` and `t[1]` are the `name` values of the two
	pages. The query searches paths of 1 to 10 hops and the function returns
	whatever `evaluate()` yields for it.

	NOTE(review): `Graph` is not defined in this file as shown -- the py2neo
	import at the top is commented out -- so calling this raises NameError
	unless `Graph` is provided some other way.
	NOTE(review): the page names are interpolated straight into the query
	text; a name containing a double quote will break (or alter) the query.
	"""
	source, target=t[0], t[1]
	graph=Graph()
	cypher="MATCH path=shortestPath((m:Page {name:\"%s\"})-[LINKS_TO*1..10]-(n:Page {name:\"%s\"})) RETURN LENGTH(path) AS length, path, m, n" % (source, target)
	return graph.run(cypher).evaluate()

def normalizeURL(s):
	"""Reduce a Wikipedia/DBpedia URL to its bare, percent-decoded page name.

	Removes the English Wikipedia and DBpedia (resource/page) URL prefixes,
	trims surrounding whitespace and then surrounding double quotes, and
	finally percent-decodes the remainder. A falsy input (None or an empty
	string) yields the NIL marker '--NME--'.
	"""
	if not s:
		return '--NME--'
	name=s
	for prefix in ("http://en.wikipedia.org/wiki/", "http://dbpedia.org/resource/", "http://dbpedia.org/page/"):
		name=name.replace(prefix, "")
	# Whitespace is trimmed before the quotes, so blanks inside the quotes stay.
	name=name.strip().strip('"')
	return urllib.parse.unquote(name)

def getLinkDisambiguations(link):
	"""Return the set stored in Redis under 'dis:<link>', or None if absent.

	The stored bytes are decoded as UTF-8, evaluated as a Python expression
	and converted to a set.
	"""
	raw=rds.get('dis:%s' % link)
	if not raw:
		return None
	# NOTE(review): eval() executes whatever expression is stored under the
	# key. If the Redis contents are not fully trusted, this should parse
	# literals only (e.g. ast.literal_eval) -- confirm the stored format first.
	return set(eval(raw.decode('UTF-8')))

def getLinkRedirect(link):
	"""Return the value stored in Redis under 'rdr:<link>', or `link` itself.

	A stored value is decoded from UTF-8 bytes; when the key is missing or
	empty the input is returned unchanged.
	"""
	target=rds.get('rdr:%s' % link)
	return target.decode('UTF-8') if target else link

def getInitials(entity_string):
        """Build initials from the capitalised words of a multi-word string.

        The string is split on whitespace. With fewer than two words the
        result is None; otherwise it is the concatenation of the first
        character of every word that starts with an uppercase letter (which
        may be an empty string when no word is capitalised).
        """
        words=entity_string.split()
        if len(words)<=1:
                return None
        return "".join(w[0] for w in words if w[0].isupper())

def isAbbreviation(m1, m2):
	"""Tell whether `m1` is the initials-style abbreviation of `m2`.

	Dots and spaces are removed from `m1` (so "U.S." is treated as "US"), and
	the result must equal the initials of `m2` as computed by `getInitials`
	and start with the same character as `m2`.

	Args:
		m1: the candidate abbreviation.
		m2: the candidate long form; may be empty or None.

	Returns:
		True when `m1` abbreviates `m2`; False otherwise, including when the
		two are identical, when `m1` is empty once dots and spaces are
		removed, or when `m2` has no initials.
	"""
	if m1==m2:
		return False
	compact=m1.replace('.', '').replace(' ', '')
	# An empty abbreviation (e.g. "" or "...") can never match; returning
	# early also keeps the first-character comparison below from raising
	# IndexError.
	if not compact or not m2:
		return False
	initials=getInitials(m2)
	if not initials:
		return False
	return compact[0]==m2[0] and compact==initials

def isSubstring(m1, m2):
	"""Case-insensitively test whether `m1` occurs inside `m2` without being equal to it."""
	needle=m1.lower()
	haystack=m2.lower()
	return needle!=haystack and needle in haystack

def analyzeEntities(articles, collection):
        """Count entity mentions, and NIL mentions, for one collection.

        Only articles whose `collection` attribute equals `collection` are
        counted; for every other article its collection value is printed.

        Returns:
                A tuple `(total, nils)`: the number of entity mentions in the
                matching articles, and how many of those have a `gold_link`
                equal to '--NME--'.
        """
        total=0
        nil_count=0
        for article in articles:
                if article.collection!=collection:
                        print(article.collection)
                        continue
                mentions=article.entity_mentions
                total+=len(mentions)
                nil_count+=sum(1 for e in mentions if e.gold_link=='--NME--')
        return total, nil_count