-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathhigher_order_experiments.py
More file actions
110 lines (98 loc) · 4.04 KB
/
higher_order_experiments.py
File metadata and controls
110 lines (98 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
Module containing experiments with higher order co-occurrence relations, as part of the
co-occurrence network representation.
"""
import pprint as pp
import plotter
import numpy
import networkx as nx
import scipy.spatial.distance
import data
import graph
import freq_representation
import graph_representation
import classify
import evaluation
numpy.set_printoptions(linewidth = 1000, precision = 3)
def test_retrieval(orders=[1,2,3],order_weights=[1.0,1.53,1.51]):
"""
Test retrieval using different combinations of higher orders and weightings of these.
The list *orders* define which higher order relations to include.
The relative importance of the orders are defined by *order_weights*.
"""
print '> Reading cases..'
descriptions_path = '../data/air/problem_descriptions_preprocessed'
description_texts, labels = data.read_files(descriptions_path)
filenames = data.get_file_names(descriptions_path)
solutions_path = '../data/air/solutions_preprocessed'
solution_texts, labels = data.read_files(solutions_path)
solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
print '> Creating representations..'
rep = []
for i, text in enumerate(description_texts):
print ' '+str(i)+"/"+str(len(description_texts))
g = graph_representation.construct_cooccurrence_network(text, orders=orders, order_weights=order_weights, doc_id='output/higher_order/air/'+labels[i]+'/'+filenames[i])
d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
rep.append(d)
rep = graph_representation.dicts_to_vectors(rep)
print '> Evaluating..'
score = evaluation.evaluate_retrieval(rep, solution_vectors)
print 'orders:', orders
print 'score:', score
fname = 'output/higher_order/results/retr'
with open(fname, 'a+') as f:
s = reduce(lambda x,y:str(x)+str(y), orders)
f.write(str(s)+' '+str(score)+'\n')
return score
def test_classification(orders=[1,2,3],order_weights=[1.0,1.53,1.51]):
"""
Test classification using different combinations of higher orders and weightings of these.
The list *orders* define which higher order relations to include.
The relative importance of the orders are defined by *order_weights*.
"""
print '> Reading cases..'
path = '../data/tasa/TASA900_text'
texts, labels = data.read_files(path)
filenames = data.get_file_names(path)
print '> Creating representations..'
rep = []
for i, text in enumerate(texts):
print ' '+str(i)+"/"+str(len(texts))
g = graph_representation.construct_cooccurrence_network(text, context='sentence', orders=orders, order_weights=order_weights, doc_id='output/higher_order/tasa/'+labels[i]+'/'+filenames[i])
d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
rep.append(d)
rep = graph_representation.dicts_to_vectors(rep)
print '> Evaluating..'
score = evaluation.evaluate_classification(rep, labels)
print 'orders:', orders
print 'score:', score
fname = 'output/higher_order/results/class'
with open(fname, 'a+') as f:
s = reduce(lambda x,y:str(x)+str(y), orders)
f.write(str(s)+' '+str(score)+'\n')
return score
def test_vocabulary_size(path = '../data/air/problem_descriptions_preprocessed'):
"""
Print vocabulary sizes for documents in dataset.
"""
texts, labels = data.read_files(path)
lengths = []
for text in texts:
text = text.split(' ')
l = len(list(set(text)))
lengths.append(l)
print ' ',l
lengths = numpy.array(lengths)
print 'avg', lengths.mean()
print 'max', lengths.max()
print 'min', lengths.min()
def test_combinations():
    """
    Run both the classification and the retrieval experiment for every
    non-empty subset of the higher orders {1, 2, 3}.
    """
    subsets = [[1], [2], [3], [1, 2], [1, 3], [2, 3], [1, 2, 3]]
    for subset in subsets:
        test_classification(subset)
        test_retrieval(subset)
# Entry point: sweep all order combinations for both evaluation tasks.
if __name__ == "__main__":
    test_combinations()