NLP2/preprocessing.py at master · friediisch/NLP2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import spacy as sp
import os
from pathlib import Path
import json
import sys
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokens import Span
import numpy as np
import keras
import editdistance
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten, Add
from keras.layers.embeddings import Embedding
import keras.backend as K
from keras.layers import Lambda
from time import time
import gensim
import sys
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from collections import Counter
from random import shuffle
import utils
from datetime import datetime


# Build the feature data set.
#
# Every JSON file under `fp` is read and turned into a list of per-candidate
# feature dicts by `utils.process_document`. Each document is then downsampled
# on its own: all positive instances are kept, together with at most
# `negative_ratio` negative instances per positive one. Documents without any
# positive instance contribute nothing to `data`.
data = []

fp = 'data/LRECjson/'
# Maximum number of negative instances kept per positive instance.
negative_ratio = 10
doc_count = 0
files = os.listdir(Path(fp))
start = datetime.now()
shuffle(files)
for jsonfile in files:
#for jsonfile in ['../data/LRECjson/2018_1049.json']:
    # doc_id is the position in the shuffled file order; it advances even for
    # documents that are skipped below.
    doc_id = doc_count
    doc_count += 1
    path = str(fp + str(jsonfile))

    title, abstract, keywords, text = utils.read_document(path)

    # Skip documents for which any of the four fields came back as None.
    if None in [title, abstract, keywords, text]:
        continue
    doc_data = utils.process_document(title, abstract, keywords, text, doc_id = doc_id, jsonfile= jsonfile, verbose=1)

    if doc_data is None:
        continue

    # Downsample the document's instances.
    # doc_data: [{dict of chunk1 features}, {dict of chunk2 features}, {}, ...]
    labels = [int(instance['label']) for instance in doc_data]

    positive_examples = sum(labels)

    if positive_examples > 0:

        # Sample a fixed ratio of negative to positive instances. The negative
        # draw happens before the positive one, as in the original code, so the
        # order in which the global NumPy RNG is consumed is unchanged.
        # dtype=int keeps the index arrays integral even when one of them is
        # empty; an empty float array would turn `idx` into floats, which
        # cannot be used to index a list.
        neg_idx = [i for i, label in enumerate(labels) if label == 0] # indices of negative examples
        neg_idx = np.random.choice(np.array(neg_idx, dtype=int),
                                    min(positive_examples*negative_ratio, len(neg_idx)),
                                    replace=False)

        pos_idx = [i for i, label in enumerate(labels) if label == 1] # indices of positive examples
        pos_idx = np.random.choice(np.array(pos_idx, dtype=int), positive_examples, replace=False)

        idx = np.hstack((pos_idx, neg_idx))

        doc_data = [doc_data[i] for i in idx]

        data += doc_data

        # Progress is reported only for documents that contributed instances.
        # The fraction is scaled by 100 so that it matches the '%' suffix.
        print('Progress: ', str(np.round(100 * doc_count / len(files), 2)), '%')
        # ETA = average wall-clock time per visited document * documents left.
        print('ETA: ', str(((datetime.now()-start)/doc_count)*(len(files)-doc_count)))


#     # write temporary feature file
#     # with open(Path('../data/models/features/data_2_tmp.json'), 'w+') as f:
#     #     json.dump(data, f)


# # class balancing:
# labels = [int(instance['label']) for instance in data]

# positive_examples = sum(labels)

# negative_ratio = 25

# # sample a fixed ratio of negative to positive labels
# neg_idx = [i for i in range(len(labels)) if labels[i] == 0] # indices of negative examples
# neg_idx = np.random.choice(np.array(neg_idx),
#                             positive_examples*negative_ratio,
#                             replace=False)

# pos_idx = [i for i in range(len(labels)) if labels[i] == 1] # indices of positive examples
# pos_idx = np.random.choice(np.array(pos_idx), positive_examples, replace=False)

# idx = np.hstack((pos_idx, neg_idx))

# data = [data[i] for i in idx]


# Persist all collected instances as one JSON document.
with Path('data/full.json').open('w+') as out_file:
    json.dump(data, out_file)