-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
103 lines (90 loc) · 3.95 KB
/
utils.py
File metadata and controls
103 lines (90 loc) · 3.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import re
import csv
def read_psuedo_data(data_path):
    """
    Reads Data from directory and converts it to lists
    Walks <data_path>/{train,test}/{pos,neg} and treats every file found there as one review.
    Each review is lower-cased, stripped, normalized and split on whitespace.
    At most 5000 reviews are returned per split. Files are collected "pos" first and then "neg",
    so the kept prefix is filled with positive reviews before any negative ones.
    Labels are truncated with the same limit so reviews and labels stay index-aligned.
    @param data_path : (str) Path to the base data directory
    @returns train_reviews : (List[List[str]]) List of all reviews in train dir - lower case
    @returns test_reviews : (List[List[str]]) List of all reviews in test dir - lower case
    @returns train_labels : (List[int]) List of all labels in train dir - encoded (1 = pos, 0 = neg)
    @returns test_labels : (List[int]) List of all labels in test dir - encoded (1 = pos, 0 = neg)
    """
    max_reviews = 5000
    reviews = {"train": [], "test": []}
    labels = {"train": [], "test": []}
    for category in ("train", "test"):
        folder = os.path.join(data_path, category)
        for sentiment in ("pos", "neg"):
            path = os.path.join(folder, sentiment)
            label = 1 if sentiment == "pos" else 0
            for file_name in os.listdir(path):
                # The context manager closes the handle even when reading or decoding fails
                with open(os.path.join(path, file_name), encoding='utf-8') as file:
                    sent = file.read().lower().strip()
                reviews[category].append(normalize_sentence(sent).split())
                labels[category].append(label)
    # Slice labels together with the reviews; otherwise label i no longer belongs to review i
    return (
        reviews["train"][:max_reviews],
        reviews["test"][:max_reviews],
        labels["train"][:max_reviews],
        labels["test"][:max_reviews],
    )
def read_data(data_path, train_size=1900, test_size=500):
    """
    Reads Data from a csv file and converts it to lists
    The first row is treated as a header and skipped. For every other row, row[1] is used as
    the review text (lower-cased, stripped, normalized, split on whitespace) and row[4] is
    parsed as the integer label. Data rows 1..train_size form the train split, all later rows
    form the test split, which is then cut to its first test_size entries.
    @param data_path : (str) Path to the csv file
    @param train_size : (int) Number of leading data rows assigned to the train split
    @param test_size : (int) Maximum number of rows kept in the test split
    @returns train_reviews : (List[List[str]]) List of all reviews in train split - lower case
    @returns test_reviews : (List[List[str]]) List of all reviews in test split - lower case
    @returns train_labels : (List[int]) List of all labels in train split - encoded
    @returns test_labels : (List[int]) List of all labels in test split - encoded
    """
    train_reviews, test_reviews = [], []
    train_labels, test_labels = [], []
    # newline="" leaves line-break handling to the csv module, which is required for
    # quoted fields that contain embedded newlines
    with open(data_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        for i, row in enumerate(reader):
            if i == 0:
                # Header row - nothing to parse
                continue
            sent = normalize_sentence(row[1].lower().strip()).split()
            label = int(row[4])
            if i > train_size:
                test_reviews.append(sent)
                test_labels.append(label)
            else:
                train_reviews.append(sent)
                train_labels.append(label)
    return train_reviews, test_reviews[:test_size], train_labels, test_labels[:test_size]
def pad_sentences(word_list, pad_token='<pad>', seq_len=500):
    """
    Bring every sentence to exactly seq_len tokens
    Sentences longer than seq_len are cut after seq_len tokens, shorter ones are
    extended on the right with pad_token. The input lists are not modified.
    @param word_list : (List[List[str]]) List of words for each sentence
    @param pad_token : (str) Token to be used for padding
    @param seq_len : (int) Length of padded sequences
    @returns padded_sents : (List[List[str]]) List of padded sentences represented as word lists
    """
    def _fit(tokens):
        # Clip first, then fill whatever is still missing up to seq_len
        clipped = tokens[:seq_len] if len(tokens) > seq_len else tokens
        return clipped + [pad_token] * (seq_len - len(clipped))

    return [_fit(tokens) for tokens in word_list]
def normalize_sentence(sent):
    """
    Normalize a sentence - removing all unnecessary characters
    Steps, in order: drop '<br />' markup, collapse a run of one repeated non-word
    character into a single occurrence, put a space in front of '.', '!' and '?',
    and replace every run of characters other than letters and '.', '!', '?' with one space.
    @param sent : (str) Unnormalized sentence
    @returns norm_sent : (str) Normalized sentence
    """
    text = sent.replace('<br />', '')
    # (pattern, replacement) pairs - applied strictly in this order
    substitutions = (
        (r'(\W)(?=\1)', ''),
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
if __name__ == "__main__":
    # Manual smoke check: load the csv and print the sizes of the train and test splits
    train_reviews, test_reviews, _train_labels, _test_labels = read_data("./data/data.csv")
    print(len(train_reviews), len(test_reviews))