-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessing.py
More file actions
37 lines (30 loc) · 1.03 KB
/
processing.py
File metadata and controls
37 lines (30 loc) · 1.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import classifier
# Punctuation characters handed to cleanText() when tokens are normalised.
redundantChar = list(",.!?")

# Words that removeCommonWords() filters out of a bag of words.
commonWords = [
    "i",
    "to",
    "will",
    "of",
    "i'm",
    "does",
    "a",
    "be",
    "would",
    "is",
    "should",
    "you",
    "what",
]
def cleanText(text, listOfChar):
    """Return ``text`` lower-cased with the given characters trimmed off both ends.

    Args:
        text: The string (typically a single token) to normalise.
        listOfChar: Iterable of strings; every character they contain is
            stripped from the start and end of ``text``. Occurrences in the
            interior of ``text`` are kept.

    Returns:
        A new, normalised string. ``text`` itself is never modified.
    """
    # str is immutable: strip() and lower() return new strings, so their
    # results must be kept rather than discarded.
    # Stripping all characters together also removes mixed runs such as "?!",
    # which one-character-at-a-time stripping would leave partly in place.
    return text.strip("".join(listOfChar)).lower()
def vectorize(dataList, wordBag, bag):
    """Tokenise every string in ``dataList`` and append the tokens to both bags.

    Each entry is split on whitespace; every resulting token is passed through
    ``cleanText`` with ``redundantChar`` and the result is appended to
    ``wordBag`` and to ``bag``.

    Args:
        dataList: Iterable of strings to tokenise.
        wordBag: List that receives every cleaned token (mutated in place).
        bag: Second list that receives the same cleaned tokens (mutated in
            place).

    Returns:
        None. The two lists are extended as a side effect.
    """
    for sentence in dataList:
        for token in sentence.split():
            cleaned = cleanText(token, redundantChar)
            wordBag.append(cleaned)
            bag.append(cleaned)
def makeDictionary(wordbag, dictionary, bag):
    """Fill ``dictionary`` with one ``classifier.condProb`` value per word.

    Every word in ``wordbag`` is first registered in ``dictionary`` with a
    placeholder of 0. Then each key in ``dictionary`` (including any key that
    was present before the call) is overwritten with
    ``classifier.condProb(count, len(bag), len(wordbag))``, where ``count`` is
    ``classifier.countWord(key, bag)``.

    Args:
        wordbag: Collection of words to register as keys.
        dictionary: Mapping updated in place.
        bag: Collection of words handed to ``classifier.countWord``.

    Returns:
        None. ``dictionary`` is mutated as a side effect.
    """
    total_words = len(bag)
    vocabulary_size = len(wordbag)

    # Register every word first so the scoring pass below sees all keys.
    for word in wordbag:
        dictionary[word] = 0

    # Only existing keys are reassigned, so the dict does not change size
    # while it is being iterated.
    # NOTE(review): presumably condProb returns a smoothed conditional
    # probability -- confirm against the classifier module.
    for word in dictionary:
        occurrences = classifier.countWord(word, bag)
        dictionary[word] = classifier.condProb(
            occurrences, total_words, vocabulary_size
        )
def removeCommonWords(setBag):
    """Return the distinct words of ``setBag`` that are not in ``commonWords``.

    Args:
        setBag: Iterable of hashable words; duplicates are collapsed.

    Returns:
        A new ``set`` holding every element of ``setBag`` that does not equal
        any entry of the module-level ``commonWords`` list. ``setBag`` is not
        modified.
    """
    # set.difference accepts any iterable, so the stop-word list can be passed
    # directly instead of collecting the matches with a nested loop first.
    return set(setBag).difference(commonWords)