-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpreprocessing.py
More file actions
115 lines (91 loc) · 3.71 KB
/
preprocessing.py
File metadata and controls
115 lines (91 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
"""ITeung
# Preprocessing
"""
import csv
import datetime
import io
import os
import pickle
import random
import re
from collections import Counter

import numpy as np
import pandas as pd
import requests

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Module-level NLP resources, built once at import time.
factory = StemmerFactory()
stemmer = factory.create_stemmer()  # Sastrawi stemmer: Indonesian word -> root form

# Strips punctuation from sentences. NOTE: '-' and "'" are deliberately
# absent from the character class, so hyphenated/possessive forms survive.
punct_re_escape = re.compile('[%s]' % re.escape('!"#$%&()*+,./:;<=>?@[\\]^_`{|}~'))

# Canned fallback replies (unused in this file; presumably consumed by the
# chatbot runtime — confirm before removing).
unknowns = ["gak paham","kurang ngerti","I don't know"]

# Slang lookup table from a headerless two-column CSV:
# column 0 = slang form, column 1 = standard form.
list_indonesia_slang = pd.read_csv('./dataset/daftar-slang-bahasa-indonesia.csv', header=None).to_numpy()
data_slang = {key: value for key, value in list_indonesia_slang}
def dynamic_switcher(dict_data, key):
    """Return the value stored under *key* in *dict_data*, or None if absent."""
    try:
        return dict_data[key]
    except KeyError:
        return None
def check_normal_word(word_input):
    """Map a slang word to its standard form via ``data_slang``.

    Words without a (truthy) slang entry are returned unchanged.
    """
    return dynamic_switcher(data_slang, word_input) or word_input
def normalize_sentence(sentence):
    """Clean one raw chat sentence for training.

    Pipeline: lowercase and strip punctuation, remove the bot name
    ('iteung'/'teung') and common Indonesian filler particles, collapse
    laughter runs ('wkwk', 'xixi', 'haha'), then slang-normalize and stem
    every remaining token to its root form.

    Returns '' when nothing is left after cleaning.
    """
    sentence = punct_re_escape.sub('', sentence.lower())
    # Remove the bot's name and filler particles (the leading space on
    # ' wah'/' dong'/etc. keeps word-internal matches intact).
    sentence = sentence.replace('iteung', '').replace('\n', '').replace(' wah','').replace('wow','').replace(' dong','').replace(' sih','').replace(' deh','')
    sentence = sentence.replace('teung', '')
    # Collapse laughter. NOTE(review): these nested quantifiers such as
    # (w?)+ can backtrack badly on pathological input — consider simplifying.
    sentence = re.sub(r'((wk)+(w?)+(k?)+)+', '', sentence)
    sentence = re.sub(r'((xi)+(x?)+(i?)+)+', '', sentence)
    sentence = re.sub(r'((h(a|i|e)h)((a|i|e)?)+(h?)+((a|i|e)?)+)+', '', sentence)
    sentence = ' '.join(sentence.split())
    if not sentence:
        return sentence
    # Slang-normalize + stem each token. Joining the stems (instead of the
    # old `normal_sentence = " "` + `+=` accumulation) fixes the stray
    # leading/trailing spaces that leaked into the exported dataset.
    roots = (stemmer.stem(check_normal_word(word)) for word in sentence.split(" "))
    return punct_re_escape.sub('', ' '.join(roots))
# Load the question/answer pairs ('|'-separated; only the two columns we use).
df = pd.read_csv('./dataset/qa.csv', sep='|', usecols=['question', 'answer'])

# Token-length histograms for questions and answers; their describe() output
# motivates the 13/29-token cut-offs used by the export loop below.
question_length = Counter()
answer_length = Counter()
for _, row in df.iterrows():
    # str() guards against NaN cells (the export loop already did this;
    # the original histogram loop did not and would crash on NaN).
    # NOTE(review): normalize is applied twice and then stemmed once more —
    # looks like intentional belt-and-braces, but confirm it is needed.
    question = normalize_sentence(str(row['question']))
    question = normalize_sentence(question)
    question = stemmer.stem(question)
    question_length[len(question.split())] += 1
    answer_length[len(str(row['answer']).split())] += 1

# (length_data, total_sentences) tables sorted by length.
df_question_length = pd.DataFrame(
    sorted(question_length.items()), columns=['length_data', 'total_sentences'])
df_question_length.describe()

df_answer_length = pd.DataFrame(
    sorted(answer_length.items()), columns=['length_data', 'total_sentences'])
df_answer_length.describe()
# Export the cleaned pairs, one per line, in the format:
#   {question}|<START> {answer} <END>
# keeping only questions of 1–12 tokens and answers under 29 tokens.
filename = './dataset/clean_qa.txt'
with open(filename, 'w', encoding='utf-8') as f:
    for _, row in df.iterrows():
        question = normalize_sentence(str(row['question']))
        question = normalize_sentence(question)
        question = stemmer.stem(question)
        # Answers keep raw wording; only the bot name is swapped for 'aku'.
        answer = str(row['answer']).lower().replace('iteung', 'aku').replace('\n', ' ')
        q_tokens = len(question.split())  # hoisted: was computed three times
        if 0 < q_tokens < 13 and len(answer.split()) < 29:
            # NOTE(review): the braces are emitted *literally* into the file
            # (they are not placeholders) — confirm the downstream reader
            # really expects '{...}' wrapping; the older commented-out
            # variant wrote a tab-separated line without braces.
            print(f"{{{question}}}|<START> {{{answer}}} <END>", file=f)