-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathhigher_order_experiments.py
More file actions
110 lines (98 loc) · 4.04 KB
/
higher_order_experiments.py
File metadata and controls
110 lines (98 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
Module containing experiments with higher order co-occurrence relations, as part of the
co-occurrence network representation.
"""
import pprint as pp
import plotter
import numpy
import networkx as nx
import scipy.spatial.distance
import data
import graph
import freq_representation
import graph_representation
import classify
import evaluation
numpy.set_printoptions(linewidth = 1000, precision = 3)
def test_retrieval(orders=[1,2,3],order_weights=[1.0,1.53,1.51]):
"""
Test retrieval using different combinations of higher orders and weightings of these.
The list *orders* define which higher order relations to include.
The relative importance of the orders are defined by *order_weights*.
"""
print '> Reading cases..'
descriptions_path = '../data/air/problem_descriptions_preprocessed'
description_texts, labels = data.read_files(descriptions_path)
filenames = data.get_file_names(descriptions_path)
solutions_path = '../data/air/solutions_preprocessed'
solution_texts, labels = data.read_files(solutions_path)
solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
print '> Creating representations..'
rep = []
for i, text in enumerate(description_texts):
print ' '+str(i)+"/"+str(len(description_texts))
g = graph_representation.construct_cooccurrence_network(text, orders=orders, order_weights=order_weights, doc_id='output/higher_order/air/'+labels[i]+'/'+filenames[i])
d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
rep.append(d)
rep = graph_representation.dicts_to_vectors(rep)
print '> Evaluating..'
score = evaluation.evaluate_retrieval(rep, solution_vectors)
print 'orders:', orders
print 'score:', score
fname = 'output/higher_order/results/retr'
with open(fname, 'a+') as f:
s = reduce(lambda x,y:str(x)+str(y), orders)
f.write(str(s)+' '+str(score)+'\n')
return score
def test_classification(orders=[1,2,3],order_weights=[1.0,1.53,1.51]):
"""
Test classification using different combinations of higher orders and weightings of these.
The list *orders* define which higher order relations to include.
The relative importance of the orders are defined by *order_weights*.
"""
print '> Reading cases..'
path = '../data/tasa/TASA900_text'
texts, labels = data.read_files(path)
filenames = data.get_file_names(path)
print '> Creating representations..'
rep = []
for i, text in enumerate(texts):
print ' '+str(i)+"/"+str(len(texts))
g = graph_representation.construct_cooccurrence_network(text, context='sentence', orders=orders, order_weights=order_weights, doc_id='output/higher_order/tasa/'+labels[i]+'/'+filenames[i])
d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
rep.append(d)
rep = graph_representation.dicts_to_vectors(rep)
print '> Evaluating..'
score = evaluation.evaluate_classification(rep, labels)
print 'orders:', orders
print 'score:', score
fname = 'output/higher_order/results/class'
with open(fname, 'a+') as f:
s = reduce(lambda x,y:str(x)+str(y), orders)
f.write(str(s)+' '+str(score)+'\n')
return score
def test_vocabulary_size(path = '../data/air/problem_descriptions_preprocessed'):
"""
Print vocabulary sizes for documents in dataset.
"""
texts, labels = data.read_files(path)
lengths = []
for text in texts:
text = text.split(' ')
l = len(list(set(text)))
lengths.append(l)
print ' ',l
lengths = numpy.array(lengths)
print 'avg', lengths.mean()
print 'max', lengths.max()
print 'min', lengths.min()
def test_combinations():
    """
    Run both the classification and the retrieval experiment for every
    non-empty subset of the higher orders {1, 2, 3}.
    """
    subsets = [[1], [2], [3], [1, 2], [1, 3], [2, 3], [1, 2, 3]]
    for subset in subsets:
        test_classification(subset)
        test_retrieval(subset)
# Entry point: sweep all order combinations for both evaluation tasks.
if __name__ == "__main__":
    test_combinations()