# nlpcoursework/processing.py at master · raister21/nlpcoursework · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import classifier

# Punctuation characters handed to cleanText when tokens are normalised.
redundantChar = [
    ",",
    ".",
    "!",
    "?",
]

# Words that removeCommonWords filters out of a bag of words.
commonWords = [
    "i",
    "to",
    "will",
    "of",
    "i'm",
    "does",
    "a",
    "be",
    "would",
    "is",
    "should",
    "you",
    "what",
]

def cleanText(text, listOfChar):
    """Return ``text`` lower-cased with unwanted edge characters removed.

    Args:
        text: The string (typically a single token) to normalise.
        listOfChar: Iterable of one-character strings to remove from the
            start and end of ``text``. Characters in the middle of
            ``text`` are left untouched.

    Returns:
        A new, normalised string. ``text`` itself is never modified.
    """
    # Strings are immutable: str.strip and str.lower return new objects, so
    # their results have to be captured rather than discarded.
    # Joining the characters strips any mix of them in one pass, so a token
    # such as "what?!" becomes "what" regardless of the order in listOfChar.
    unwanted = "".join(listOfChar)
    return text.strip(unwanted).lower()

def vectorize(dataList, wordBag, bag):
    """Tokenise every string in ``dataList`` and record the cleaned tokens.

    Each element of ``dataList`` is split on whitespace. Every resulting
    piece is passed through ``cleanText`` with the module-level
    ``redundantChar`` list, and the result is appended to both ``wordBag``
    and ``bag``.

    Args:
        dataList: Iterable of strings to tokenise.
        wordBag: List that receives every cleaned token (mutated in place).
        bag: Second list that receives the same tokens (mutated in place).

    Returns:
        None. The results are delivered through ``wordBag`` and ``bag``.
    """
    # Lazily flatten "sentence -> whitespace-separated pieces" into one stream.
    pieces = (piece for sentence in dataList for piece in sentence.split())
    for piece in pieces:
        cleaned = cleanText(piece, redundantChar)
        wordBag.append(cleaned)
        bag.append(cleaned)

def makeDictionary(wordbag, dictionary, bag):
    """Populate ``dictionary`` in place with one score per word.

    Every word in ``wordbag`` is first registered in ``dictionary`` with a
    placeholder value of 0. Then each key of ``dictionary`` is assigned
    ``classifier.condProb(count, len(bag), len(wordbag))``, where ``count``
    is ``classifier.countWord(key, bag)``.

    Args:
        wordbag: Sized collection of words to register as keys.
        dictionary: Mapping to fill; mutated in place.
        bag: Sized collection of words passed to ``classifier.countWord``.

    Returns:
        None. The result is delivered through ``dictionary``.
    """
    totalWords = len(bag)
    vocabularySize = len(wordbag)

    # Register every word with a placeholder score of 0.
    dictionary.update(dict.fromkeys(wordbag, 0))

    # Covers every key in the dictionary, including any that were already
    # present before this call. Only values change, so iterating the dict
    # while assigning is safe.
    for word in dictionary:
        occurrences = classifier.countWord(word, bag)
        dictionary[word] = classifier.condProb(
            occurrences, totalWords, vocabularySize
        )

def removeCommonWords(setBag):
    """Return the words of ``setBag`` that are not listed in ``commonWords``.

    Args:
        setBag: Iterable of hashable words (list, set, generator, ...).
            It is not modified.

    Returns:
        A new ``set`` containing every element of ``setBag`` that does not
        compare equal to an entry of the module-level ``commonWords`` list.
    """
    # A single pass over setBag keeps one-shot iterables (generators,
    # iterators) working: consuming the input in a scan and then again to
    # build the result set would leave nothing for the second pass.
    return {word for word in setBag if word not in commonWords}