main.py
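# Tweet-analysis helpers: token counting, top-k / min-count queries, tf-idf
# salience scoring, and n-gram extraction over tweet entities and text.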
import math

from util import sort_count_pairs
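# NOTE (assumption): sort_count_pairs comes from the course-provided util
# module and is assumed to sort (item, count) pairs in descending order of
# count; only that ordering behavior is relied on below.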
def count_tokens(tokens):
    # Map each token to the number of times it appears.
    counter = {}
    for token in tokens:
        counter[token] = counter.get(token, 0) + 1
    return counter

def sorted_token_list(tokens):
    token_dict = count_tokens(tokens)
    token_list = sort_count_pairs(list(token_dict.items()))
    return token_list

def find_top_k(tokens, k):
    # The k most frequent tokens, most frequent first.
    token_list = sorted_token_list(tokens)
    top_k = token_list[:k]
    ans_list = []
    for item in top_k:
        ans_list.append(item[0])
    return ans_list

def find_min_count(tokens, min_count):
    # The set of tokens that occur at least min_count times.
    token_dict = count_tokens(tokens)
    ans_set = set()
    for token, count in token_dict.items():
        if count >= min_count:
            ans_set.add(token)
    return ans_set

def tf(t, d, TD, TL):
    # Augmented term frequency: 0.5 + 0.5 * f(t, d) / max_f(d), where TD maps
    # terms of document d to their counts and TL is the sorted (term, count)
    # list, so TL[0][1] is the highest count in d.
    f_td = TD[t]
    max_count = TL[0][1]
    return 0.5 + (0.5 * (f_td / max_count))

def idf(t, D):
    # Inverse document frequency: log(N / number of documents containing t).
    num_documents = len(D)
    num_containing = 0
    for doc in D:
        if t in doc:
            num_containing += 1
    return math.log(num_documents / num_containing)

def tf_idf(t, d, D, TD, TL):
    return tf(t, d, TD, TL) * idf(t, D)

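# Worked example (illustrative numbers, not from the assignment): if a term
# occurs 2 times in a document whose most frequent term occurs 2 times, and it
# appears in 1 of 4 documents, then
#   tf     = 0.5 + 0.5 * (2 / 2) = 1.0
#   idf    = log(4 / 1)          ~= 1.386
#   tf_idf = 1.0 * 1.386         ~= 1.386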
def find_salient(docs, threshold):
    # For each document, the set of terms whose tf-idf exceeds the threshold.
    ans_list = []
    for doc in docs:
        salient = set()
        token_dict = count_tokens(doc)
        token_list = sorted_token_list(doc)
        for term in token_dict:
            tf_idf_val = tf_idf(term, doc, docs, token_dict, token_list)
            if tf_idf_val > threshold:
                salient.add(term)
        ans_list.append(salient)
    return ans_list

def find_text_in_entities(tweets, entity_desc):
    # Pull one field out of every entity of the requested type.
    key, subkey = entity_desc[0], entity_desc[1]
    data_list = []
    for tweet in tweets:
        for data in tweet['entities'][key]:
            data_list.append(data[subkey])
    return data_list

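# Hedged usage sketch: entity_desc is assumed to be a (key, subkey) pair naming
# an entity type and the field to extract from each entity, e.g. with a
# Twitter-style tweet (hypothetical data):
#
#   tweet = {"entities": {"hashtags": [{"text": "python"}, {"text": "cs"}]}}
#   find_text_in_entities([tweet], ("hashtags", "text"))  # -> ["python", "cs"]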
def find_top_k_entities(tweets, entity_desc, k):
    data_list = find_text_in_entities(tweets, entity_desc)
    return find_top_k(data_list, k)

def find_min_count_entities(tweets, entity_desc, min_count):
    data_list = find_text_in_entities(tweets, entity_desc)
    return find_min_count(data_list, min_count)

def clean_text(text, case_sensitive, stop_words):
    # Split text into words, lower-casing it unless the caller asked for
    # case-sensitive matching, and optionally dropping stop words.
    if not case_sensitive:
        text = text.lower()
    split_text = text.split()
    cleaned = []
    for word in split_text:
        # word = word.strip(PUNCTUATION)
        clean = True
        if stop_words and word in STOP_WORDS:
            clean = False
        # TODO: also drop words starting with STOP_PREFIXES
        # (still have to code the input for this):
        # if word.startswith(STOP_PREFIXES):
        #     clean = False
        if clean and len(word) > 0:
            cleaned.append(word)
    return cleaned

def return_ngrams(text, n, case_sensitive, stop_words):
    # All contiguous n-grams of the cleaned text, as tuples.
    words = clean_text(text, case_sensitive, stop_words)
    n_grams = []
    for p in range(0, len(words) - (n - 1)):
        n_gram = []
        for i in range(0, n):
            n_gram.append(words[i + p])
        n_grams.append(tuple(n_gram))
    return n_grams

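# For example, with stop-word filtering disabled:
#   return_ngrams("the cat in the hat", 2, False, False)
#   -> [('the', 'cat'), ('cat', 'in'), ('in', 'the'), ('the', 'hat')]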
def return_all_ngrams(tweets, n, case_sensitive, stop_words):
    # One list of n-grams per tweet; append (not extend) keeps each tweet as
    # its own document for find_salient.
    all_ngrams = []
    for tweet in tweets:
        new_ngrams = return_ngrams(tweet["abridged_text"], n, case_sensitive,
                                   stop_words)
        all_ngrams.append(new_ngrams)
    return all_ngrams

def find_top_k_ngrams(tweets, n, case_sensitive, k):
    stop_words = True
    all_ngrams = return_all_ngrams(tweets, n, case_sensitive, stop_words)
    # Flatten the per-tweet lists so n-grams are counted across all tweets
    # (count_tokens needs hashable items, not lists).
    flat_ngrams = [ngram for tweet_ngrams in all_ngrams for ngram in tweet_ngrams]
    return find_top_k(flat_ngrams, k)

def find_min_count_ngrams(tweets, n, case_sensitive, min_count):
    stop_words = True
    all_ngrams = return_all_ngrams(tweets, n, case_sensitive, stop_words)
    flat_ngrams = [ngram for tweet_ngrams in all_ngrams for ngram in tweet_ngrams]
    return find_min_count(flat_ngrams, min_count)

def find_salient_ngrams(tweets, n, case_sensitive, threshold):
    stop_words = False
    all_ngrams = return_all_ngrams(tweets, n, case_sensitive, stop_words)
    return find_salient(all_ngrams, threshold)

# Placeholder constants: the real values for STOP_PREFIXES and STOP_WORDS were
# not in this file, so these are assumed stand-ins.
PUNCTUATION = ['#']
STOP_PREFIXES = ("#", "@", "http")
STOP_WORDS = {"a", "an", "the", "to", "and", "in", "on"}

tweets = [{"abridged_text": "the cat in the hat"},
          {"abridged_text": "don't let the cat on the hat"},
          {"abridged_text": "the cat's hat"},
          {"abridged_text": "the hat cat"}]

ans = find_salient_ngrams(tweets, 2, False, 1.33)
print(ans)
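# This is expected to print, up to ordering within each set:
#   [{('cat', 'in'), ('in', 'the')},
#    {("don't", 'let'), ('let', 'the'), ('cat', 'on'), ('on', 'the')},
#    {('the', "cat's"), ("cat's", 'hat')},
#    {('hat', 'cat')}]
# since log(4/1) ~= 1.386 > 1.33 while log(4/2) and log(4/3) fall below it.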