286 changes: 280 additions & 6 deletions ml/predict.py
@@ -1,20 +1,26 @@
# Command line usage: `python predict.py "Insert design problem here."`

import os
import re
import sys

import nltk
import numpy as np
import pandas as pd
from fcmeans import FCM
from nltk import PorterStemmer
from matplotlib import pyplot as plt
from matplotlib import cm as cm
from nltk import PorterStemmer, word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from sklearn import cluster
from sklearn.cluster import AgglomerativeClustering, BisectingKMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn_extra.cluster import KMedoids
from unidecode import unidecode
from wordcloud import WordCloud

try:
nltk.find("corpora/stopwords")
@@ -55,6 +61,256 @@ def do_output(text: str = "") -> None:
    print(text)


# from PatternKMeans (last edited Jan 2, 2023)
# Transforms a centroids dataframe into a dictionary for use with a WordCloud.
def centroidsDict(centroids, index):
    # Each row of `a` is a (term, weight) pair, sorted by descending weight.
    a = centroids.T[index].sort_values(ascending=False).reset_index().values
    return {term: weight for term, weight in a}
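
# For example, if `centroids` has one row per cluster and one column per term,
# centroidsDict(centroids, 0) returns something like
# {"provid": 0.41, "creat": 0.33, ...} (terms in descending order of weight),
# which WordCloud.generate_from_frequencies() accepts directly. (Values here
# are illustrative.)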



# Removes a list of words (i.e. stopwords) from a tokenized list.
def removeWords(listOfTokens, listOfWords):
    return [token for token in listOfTokens if token not in listOfWords]


# Applies stemming to a list of tokenized words.
def applyStemming(listOfTokens, stemmer):
    return [stemmer.stem(token) for token in listOfTokens]


# Applies lemmatization to a list of tokenized words.
def applyLemmatization(listOfTokens, lemmatizer):
    return [lemmatizer.lemmatize(token) for token in listOfTokens]
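
# A quick illustration of applyStemming() (outputs assume NLTK's PorterStemmer
# and are illustrative):
#   applyStemming(["creating", "relations", "specified"], PorterStemmer())
#   -> ["creat", "relat", "specifi"]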


# Collects any words composed of 2 or fewer, or 21 or more, letters (to be removed).
def twoLetters(listOfTokens):
    twoLetterWord = []
    for token in listOfTokens:
        if len(token) <= 2 or len(token) >= 21:
            twoLetterWord.append(token)
    return twoLetterWord


# Collects any words that are neither verbs nor adjectives (to be removed).
def notVerbs(listOfTokens):
    notVerb = []
    for token in listOfTokens:
        # Tag once per token instead of twice.
        tag = pos_tag(word_tokenize(token), tagset="universal")[0][1]
        if tag != "VERB" and tag != "ADJ":
            notVerb.append(token)
    return notVerb
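
# Note: tagging a single token out of context is only a heuristic; e.g. a lone
# "support" is typically tagged NOUN even where it would be a verb in context,
# so some verbs may be filtered out here.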


def processCorpus(corpus, language, stemmer):
    stopwords = nltk.corpus.stopwords.words(language)
    param_stemmer = stemmer

    # enumerate() avoids corpus.index(document), which misbehaves on duplicates.
    for index, document in enumerate(corpus):
        corpus[index] = str(document).replace(
            "\ufffd", "8"
        )  # Replaces the Unicode replacement character '�' with '8'
        corpus[index] = corpus[index].replace(",", "")  # Removes commas
        corpus[index] = corpus[index].rstrip("\n")  # Removes line breaks
        corpus[index] = corpus[index].casefold()  # Makes all letters lowercase

        corpus[index] = re.sub(
            r"[\W_]+", " ", corpus[index]
        )  # Removes special characters and leaves only words
        corpus[index] = re.sub(
            r"\S*\d\S*", " ", corpus[index]
        )  # Removes numbers and words concatenated with numbers, e.g. h4ck3r, and road names such as BR-381
        corpus[index] = re.sub(
            r"\S*@\S*\s?", " ", corpus[index]
        )  # Removes emails and mentions (words with @)
        corpus[index] = re.sub(r"http\S+", "", corpus[index])  # Removes URLs with http
        corpus[index] = re.sub(r"www\S+", "", corpus[index])  # Removes URLs with www

        listOfTokens = word_tokenize(corpus[index])
        twoLetterWord = twoLetters(listOfTokens)
        notVerb = notVerbs(listOfTokens)

        listOfTokens = removeWords(listOfTokens, stopwords)
        listOfTokens = removeWords(listOfTokens, twoLetterWord)
        listOfTokens = removeWords(listOfTokens, notVerb)

        listOfTokens = applyStemming(listOfTokens, param_stemmer)
        # listOfTokens = applyLemmatization(listOfTokens, lemmatizer)

        corpus[index] = " ".join(listOfTokens)
        corpus[index] = unidecode(corpus[index])

    return corpus
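
# A small illustration of processCorpus() (output is illustrative; only tokens
# tagged as verbs or adjectives survive notVerbs(), and survivors are stemmed):
#   processCorpus(["Define a family of algorithms and encapsulate each one."],
#                 language="english", stemmer=PorterStemmer())
#   -> ["defin encapsul"]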


# Generates a word cloud of the most frequent and influential words in each
# cluster, for every clustering algorithm. Returns a list of figure objects.
def generateWordClouds(df, df_labels, cleaned_text):
    # df_labels: patterns vs. the cluster (0, 1, or 2) each algorithm assigned
    # them to; the bottom row is the input text, e.g.:
    #    agglomerative  bi_kmeans_inertia  bi_kmeans_lg_cluster  fuzzy_cmeans  kmeans  pam_euclidean  pam_manhattan
    # 0  1              0                  0                     0             1       1              0
    # cleaned_text: patterns vs. top keywords; the bottom row is the input text, e.g.:
    # 0  provid creat relat depend specifi consid suppo...
    wordcloud = WordCloud(max_font_size=100, background_color="white")
    list_plots = []
    cleaned_text = pd.Series(cleaned_text)
    list_keywords = cleaned_text.str.split()
    # For each algorithm:
    for algo in algorithms:
        # For each cluster c:
        for c in range(3):
            # From df_labels, get the patterns labeled c under the current algorithm
            df_cluster = df.loc[df_labels[algo] == c][["name", "category", algo]].copy()
            # cluster_keywords = {"word": {"score": 3 + 4 + 5, "count": 3}} so
            # that we can take the average score and build a word cloud from it
            cluster_keywords = {}
            # For each pattern in this cluster, accumulate its keywords
            # (indexing by the original DataFrame index, not enumeration order)
            for p in df_cluster.index:
                for i, word in enumerate(list_keywords[p]):
                    if word in cluster_keywords:
                        # A position closer to the beginning means higher priority
                        cluster_keywords[word]["score"] += i
                        # Increment count so we can take the average later
                        cluster_keywords[word]["count"] += 1
                    else:
                        cluster_keywords[word] = {"score": i, "count": 1}
            # Take the average score for each word, then normalize it into a
            # number between 0 and 1. Remember, lower scores are higher priority.
            word_cloud_input = {}
            for word, stats in cluster_keywords.items():
                avg_score = stats["score"] / stats["count"]
                # Clamp at 0 so WordCloud never receives a negative frequency
                word_cloud_input[word] = max(0.0, 1 - avg_score / 100)
            if not word_cloud_input:
                continue  # Skip empty clusters; generate_from_frequencies() would raise
            # Plot!
            wordcloud.generate_from_frequencies(word_cloud_input)
            fig = plt.figure()
            plt.suptitle(f"Cluster {c}, Algo {algo}")
            plt.imshow(wordcloud)
            plt.axis("off")
            # Append fig to the list of plots
            list_plots.append(fig)
    # Return the list of plots
    return list_plots
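
# For example, with the df, df_labels, and corpus built in main() below:
#   plots = generateWordClouds(df, df_labels, corpus)
#   for i, fig in enumerate(plots):
#       fig.savefig(f"wordcloud_{i}.png")  # filename is just an example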


# TODO: Adapt the code for using k-means for silhouette, word clouds, etc for
# all clustering algorithms in use. Call/merge into do_cluster()?
def run_KMeans(max_k, data):
    max_k += 1
    kmeans_results = dict()
    for k in range(2, max_k):
        kmeans = cluster.KMeans(
            n_clusters=k,
            init="k-means++",
            n_init=10,
            tol=0.0001,
            random_state=1,
            algorithm="lloyd",  # "full" was renamed to "lloyd" in scikit-learn 1.0
        )

        kmeans_results.update({k: kmeans.fit(data)})

    return kmeans_results


def printAvg(avg_dict):
    for avg in sorted(avg_dict.keys(), reverse=True):
        print("Avg: {}\tK: {}".format(avg.round(4), avg_dict[avg]))


def plotSilhouette(df, n_clusters, kmeans_labels, silhouette_avg):
    fig, ax1 = plt.subplots(1)
    fig.set_size_inches(8, 6)
    ax1.set_xlim([-0.2, 1])
    ax1.set_ylim([0, len(df) + (n_clusters + 1) * 10])

    ax1.axvline(
        x=silhouette_avg, color="red", linestyle="--"
    )  # The vertical line for the average silhouette score of all the values
    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.title(
        ("Silhouette analysis for K = %d" % n_clusters), fontsize=10, fontweight="bold"
    )

    y_lower = 10
    sample_silhouette_values = silhouette_samples(
        df, kmeans_labels
    )  # Compute the silhouette score for each sample
    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[kmeans_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        ax1.text(
            -0.05, y_lower + 0.5 * size_cluster_i, str(i)
        )  # Label the silhouette plots with their cluster numbers at the middle
        y_lower = (
            y_upper + 10
        )  # Compute the new y_lower for the next plot, with a gap of 10
    plt.show()


def silhouette(kmeans_dict, df, plot=False):
    df = df.to_numpy()
    avg_dict = dict()
    for n_clusters, kmeans in kmeans_dict.items():
        kmeans_labels = kmeans.predict(df)
        silhouette_avg = silhouette_score(
            df, kmeans_labels
        )  # Average score across all samples
        avg_dict.update({silhouette_avg: n_clusters})

        if plot:
            plotSilhouette(df, n_clusters, kmeans_labels, silhouette_avg)
    # Return the scores so callers can inspect them, e.g. via printAvg()
    return avg_dict
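
# A minimal sketch of how these pieces fit together (the k range is an example):
#   kmeans_results = run_KMeans(max_k=10, data=tfidf_matrix)
#   avg_dict = silhouette(kmeans_results, tfidf_matrix, plot=True)
#   printAvg(avg_dict)  # highest average silhouette score first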


def preprocess(series: pd.Series) -> pd.Series:
    # Lowercase
    series = series.str.lower()
@@ -76,6 +332,9 @@ def preprocess(series: pd.Series) -> pd.Series:
    return series


# TODO: After clustering the patterns, automatically label each cluster.
# Each cluster can have a word cloud, and the clusters can be labeled
# using the most important words in each cluster as a guide.
def do_cluster(df_weighted: pd.DataFrame) -> pd.DataFrame:
    # This is the DataFrame we will return that contains all the labels.
    df = pd.DataFrame()
@@ -176,10 +435,19 @@ def main(design_problem: str = ""):
    )
    df = pd.concat([df, new_row.to_frame().T], ignore_index=True)

    # Preprocess
    cleaned_text = preprocess(df["overview"])
    # Create a dense tfidf matrix
    tfidf_matrix = do_weighting("Tfidf", cleaned_text)
    # # Preprocess
    # cleaned_text = preprocess(df["overview"])
    # # Create a dense tfidf matrix
    # tfidf_matrix = do_weighting("Tfidf", cleaned_text)
    # Preprocess the corpus
    corpus = df["overview"].tolist()
    corpus = processCorpus(corpus, language="english", stemmer=stemmer)
    # Vectorize the data with sublinear tf-idf weighting
    vectorizer = TfidfVectorizer(sublinear_tf=True)
    X = vectorizer.fit_transform(corpus)
    tfidf_matrix = pd.DataFrame(
        data=X.toarray(), columns=vectorizer.get_feature_names_out()
    )
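    # Note: X is sparse; .toarray() densifies it because do_cluster() expects a
    # dense DataFrame. Fine at this corpus size, but worth revisiting if the
    # vocabulary grows.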
    # Perform clustering
    df_labels = do_cluster(tfidf_matrix)
    # Append (horizontally) the cluster labels to the original DF
@@ -193,6 +461,12 @@ def main(design_problem: str = ""):
    max_len_pattern = df["name"].str.len().max()
    max_len = len(max(algorithms_pretty, key=len))

    # Generate word clouds for each clustering algorithm
    plots = generateWordClouds(df, df_labels, corpus)
    for cloud in plots:
        cloud.show()

    do_output()
    for i, algorithm in enumerate(algorithms):
        do_output(f"{algorithms_pretty[i]}")
@@ -241,4 +515,4 @@


if __name__ == "__main__":
    main()
    main()