ArticleClassification/read_data.py at master · smokys123/ArticleClassification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python3
import random
import os
import pandas as pd


class document_DataSet(object):
    """Container for tokenized documents together with vocabulary lookups.

    Every entry of ``data`` is expected to be a tuple of the form
    ``(class, number of sentences, [number of words per sentence],
    [[w1, w2, ...], ...])``.

    Attributes:
        data: the full list of entries handed to the constructor.
        valid_idxs: indices into ``data`` that belong to this set.
        valid_data: the entries of ``data`` selected by ``valid_idxs``
            (``data`` itself when no indices were given).
        num_examples: number of selected entries.
        classes / nclass: class labels and their count.
        vocabulary / nvoc: vocabulary words and vocabulary size; ``nvoc``
            is one larger than the word list to account for ``<pad>``.
        word_to_idx / idx_to_word: word <-> integer id mappings, with id 0
            reserved for ``<pad>``.
        class_to_idx / idx_to_class: label <-> integer id mappings in the
            order of ``classes``.
    """

    def __init__(self, data, classes, valid_idxs=None):
        """Build the data set and its lookup tables.

        Note that construction reads the vocabulary file from disk (see
        ``get_vocabulary``).

        Args:
            data: list of document tuples (see the class docstring).
            classes: sequence of class labels.
            valid_idxs: optional indices into ``data`` restricting the set;
                ``None`` selects every entry.
        """
        self.data = data
        total_num_examples = self.get_data_size()
        self.valid_idxs = range(total_num_examples) if valid_idxs is None else valid_idxs
        # valid_idxs must be assigned before get_valid_data() is called.
        self.valid_data = data if valid_idxs is None else self.get_valid_data()
        self.num_examples = len(self.valid_idxs)
        self.classes = classes
        self.nclass = len(classes)
        self.vocabulary = self.get_vocabulary()
        # vocabulary size, +1 is for the special symbol <pad>
        self.nvoc = len(self.vocabulary) + 1
        self.word_to_idx = self.get_word_to_idx()
        self.idx_to_word = self.get_idx_to_word()
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}
        self.idx_to_class = {i: c for i, c in enumerate(self.classes)}

    def get_valid_data(self):
        """Return the entries of ``self.data`` selected by ``self.valid_idxs``."""
        return [self.data[idx] for idx in self.valid_idxs]

    def get_data_size(self):
        """Return the number of entries in ``self.data``."""
        return len(self.data)

    def get_vocabulary(self):
        """Read the vocabulary for the entire set from disk.

        The words are loaded from ``dataset/BBC/vocabulary.txt`` located
        next to this module; the file content is split on whitespace.

        Returns:
            list of vocabulary words in file order.

        Raises:
            OSError: if the vocabulary file cannot be opened.
        """
        current_path = os.path.dirname(os.path.abspath(__file__))
        voc_path = os.path.join(current_path, 'dataset', 'BBC', 'vocabulary.txt')
        # The context manager closes the file even if reading fails.
        with open(voc_path, 'r') as fp:
            return fp.read().split()

    def get_word_to_idx(self):
        """Return a word -> id mapping; ids start at 1, ``<pad>`` is 0.

        No special symbols such as <s> or </s> are added.
        """
        word2idx = {word: i + 1 for i, word in enumerate(self.vocabulary)}
        word2idx["<pad>"] = 0
        return word2idx

    def _word_to_idx(self):
        """Replace, in place, every word of the valid documents by its id.

        Only the first ``num_word_in_sen[i]`` positions of sentence ``i``
        are converted, so previously appended padding is left untouched.

        Raises:
            KeyError: if a word is missing from ``self.word_to_idx``.
        """
        lookup = self.word_to_idx
        for idx in range(self.num_examples):
            entry = self.valid_data[idx]
            num_sen_in_doc, num_word_in_sen, doc = entry[1], entry[2], entry[3]
            for i in range(num_sen_in_doc):
                sentence = doc[i]
                for j in range(num_word_in_sen[i]):
                    sentence[j] = lookup[sentence[j]]

    def padding_doc(self):
        """Pad, in place, the sentences of each valid document with 0.

        For every document, each sentence receives as many zeros as its
        recorded word count falls short of the largest recorded word count
        in that document.
        """
        for idx in range(self.num_examples):
            entry = self.valid_data[idx]
            num_sen_in_doc, num_word_in_sen, doc = entry[1], entry[2], entry[3]
            max_len_sen = max(num_word_in_sen, default=0)
            for i in range(num_sen_in_doc):
                # Padding is derived from the recorded counts, not from the
                # current sentence length.
                doc[i].extend([0] * (max_len_sen - num_word_in_sen[i]))

    def get_idx_to_word(self):
        """Return an id -> word mapping; ids start at 1, 0 is ``<pad>``.

        No special symbols such as <s> or </s> are added.
        """
        idx2word = {i + 1: word for i, word in enumerate(self.vocabulary)}
        idx2word[0] = "<pad>"
        return idx2word

    def _sort_by_len(self):
        """Not implemented."""
        raise NotImplementedError()

    def _sort_by_senlen(self):
        """Not implemented."""
        raise NotImplementedError()

    def split_set(self):
        """Not implemented."""
        raise NotImplementedError()

    def get_by_idxs(self, idxs):
        """Not implemented."""
        raise NotImplementedError()

    def get_one(self, idx):
        """Not implemented."""
        raise NotImplementedError()

    def get_batches(self, batch_size, num_batches=None, shuffle=False):
        """Return all valid data.

        The ``batch_size``, ``num_batches`` and ``shuffle`` arguments are
        currently ignored; no batching or shuffling is performed.
        """
        return self.valid_data

    def get_max_len(self):
        """Return the largest length field (index 1) among the valid entries.

        For the tuple layout described in the class docstring this is the
        number of sentences in a document. Returns 0 for an empty set.
        """
        # Index access instead of tuple unpacking: entries have four fields.
        return max((self.data[i][1] for i in self.valid_idxs), default=0)

    def get_mean_len(self):
        """Return the mean of the length field (index 1) over valid entries.

        Raises:
            ZeroDivisionError: if the set contains no examples.
        """
        tot_len = sum(self.data[i][1] for i in self.valid_idxs)
        return tot_len / self.num_examples

    def show_length_distribution(self):
        """Not implemented."""
        raise NotImplementedError()


def read_bbc(config):
    """Load the BBC news train/test CSV files and wrap them in data sets.

    The files ``train_data.csv`` and ``test_data.csv`` are read (encoding
    ISO-8859-1) from ``<module dir>/<config.data_dir>/<config.data_type>``.
    Each row's ``news`` text is split into sentences on ``'.'`` and into
    words on whitespace; its ``type`` column is the class label.

    Only the first 1000 training and the first 200 test documents are
    kept, and each list is shuffled afterwards.

    Args:
        config: object exposing ``data_dir`` and ``data_type`` attributes.

    Returns:
        tuple ``(train_set, test_set)`` of ``document_DataSet`` instances.
    """
    # Sentences longer than this many words are cut into consecutive chunks.
    truncate_length = 50
    # Documents with exactly this many sentences are skipped.
    # NOTE(review): presumably filters one known outlier document - confirm.
    excluded_num_sentences = 94
    classes = ['business', 'entertainment', 'politics', 'sport', 'tech']

    def preprocess(document):
        """Tokenize ``document`` into sentences of at most ``truncate_length`` words.

        Returns ``(sentences, number of sentences, [words per sentence])``.
        """
        fdoc = []
        for part in document.split('.'):
            if len(part.strip()) == 0:
                continue
            sen = part.split()
            # Slice the sentence into consecutive pieces; a sentence that is
            # already short enough yields a single piece.
            for start in range(0, len(sen), truncate_length):
                fdoc.append(sen[start:start + truncate_length])

        num_word_in_sen = [len(x) for x in fdoc]
        return fdoc, len(fdoc), num_word_in_sen

    def make_data(df):
        """Convert a data frame with 'type' and 'news' columns to tuples."""
        data = []
        # Iterate the columns directly; DataFrame.ix no longer exists in
        # current pandas releases.
        for target_class, text in zip(df['type'], df['news']):
            news, num_sen_in_doc, num_word_in_sen = preprocess(text)
            if num_sen_in_doc != excluded_num_sentences:
                data.append((target_class, num_sen_in_doc,
                             num_word_in_sen, news))
        return data

    current_path = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(current_path,
                             config.data_dir,
                             config.data_type)
    train_path = os.path.join(data_path, 'train_data.csv')
    test_path = os.path.join(data_path, 'test_data.csv')

    train_df = pd.read_csv(train_path, encoding='ISO-8859-1')
    test_df = pd.read_csv(test_path, encoding='ISO-8859-1')

    # data -- list of tuples
    # (class, num of sen, num words in each sen,
    #  [[w1, w2, w3], [w1, w2], ... ])
    # The subsets are taken before shuffling, i.e. always the leading rows.
    train_data = make_data(train_df)[:1000]
    test_data = make_data(test_df)[:200]

    random.shuffle(train_data)
    random.shuffle(test_data)

    # Each data set receives its own copy of the class list.
    return (document_DataSet(train_data, classes=list(classes)),
            document_DataSet(test_data, classes=list(classes)))