From aebe48cc8afda6ec07eb420a4b266efb4f27498d Mon Sep 17 00:00:00 2001
From: remi
Date: Wed, 26 Apr 2023 21:36:54 -0500
Subject: [PATCH 1/5] Added WordCloud-related functions to predict.py

---
 ml/predict.py | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/ml/predict.py b/ml/predict.py
index 4999f13..32b2d88 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -16,6 +16,8 @@
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn_extra.cluster import KMedoids
 
+from wordcloud import WordCloud
+
 try:
     nltk.find("corpora/stopwords")
 except LookupError:
@@ -55,6 +57,32 @@ def do_output(text: str = "") -> None:
     print(text)
 
 
+# from PatternKMeans (last edited Jan 2, 2023)
+# Transforms a centroids dataframe into a dictionary to be used on a WordCloud.
+def centroidsDict(centroids, index):
+    a = centroids.T[index].sort_values(ascending=False).reset_index().values
+    centroid_dict = dict()
+
+    for i in range(0, len(a)):
+        centroid_dict.update({a[i, 0]: a[i, 1]})
+
+    return centroid_dict
+
+
+# Generates a word cloud of the most frequent and influential words in a cluster.
+def generateWordClouds(centroids):
+    wordcloud = WordCloud(max_font_size=100, background_color="white")
+    for i in range(0, len(centroids)):
+        centroid_dict = centroidsDict(centroids, i)
+        wordcloud.generate_from_frequencies(centroid_dict)
+
+        plt.figure()
+        plt.title("Cluster {}".format(i))
+        plt.imshow(wordcloud)
+        plt.axis("off")
+        plt.show()
+
+
 def preprocess(series: pd.Series) -> pd.Series:
     # Lowercase
     series = series.str.lower()
@@ -76,6 +104,9 @@ def preprocess(series: pd.Series) -> pd.Series:
     return series
 
 
+# TODO: After clustering the patterns, automatically label each cluster.
+# Each cluster can have a word cloud, and the clusters can be labeled
+# using the most important words in each cluster as a guide.
 def do_cluster(df_weighted: pd.DataFrame) -> pd.DataFrame:
     # This is the DataFrame we will return that contains all the labels.
     df = pd.DataFrame()
@@ -241,4 +272,4 @@ def main(design_problem: str = ""):
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
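
Both helpers above assume that `centroids` is a DataFrame with one row per cluster centre and one column per vocabulary term: centroidsDict() transposes it, sorts one cluster's terms by weight, and builds the word -> weight mapping that WordCloud.generate_from_frequencies() expects. A minimal sketch of how such a frame could be assembled and passed in is shown below; the toy corpus and the vectorizer/kmeans names are illustrative only and not part of the patch.

# Illustrative sketch (not part of the patch): build a per-cluster centroids
# DataFrame for generateWordClouds() from a fitted KMeans model.
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

toy_corpus = [
    "encapsulate a request as an object",
    "define a family of interchangeable algorithms",
    "provide a placeholder that controls access to another object",
]
vectorizer = TfidfVectorizer()
weights = vectorizer.fit_transform(toy_corpus)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=1).fit(weights)

# Rows = cluster centres, columns = terms, the orientation centroidsDict() expects.
centroids = pd.DataFrame(
    kmeans.cluster_centers_, columns=vectorizer.get_feature_names_out()
)
generateWordClouds(centroids)
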
From 40aff770a869607d4d3426b66b2f8d95042f2496 Mon Sep 17 00:00:00 2001
From: remi
Date: Wed, 26 Apr 2023 21:47:40 -0500
Subject: [PATCH 2/5] Added more functions from PatternKMeans

---
 ml/predict.py | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

diff --git a/ml/predict.py b/ml/predict.py
index 32b2d88..26500b3 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pandas as pd
 from fcmeans import FCM
+from matplotlib import pyplot as plt
+from matplotlib import cm as cm
 from nltk import PorterStemmer
 from nltk.corpus import stopwords
 from nltk.tag import pos_tag
@@ -15,7 +17,7 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn_extra.cluster import KMedoids
-
+from sklearn.metrics import silhouette_samples, silhouette_score
 from wordcloud import WordCloud
 
 try:
@@ -83,6 +85,90 @@ def generateWordClouds(centroids):
         plt.show()
 
 
+def run_KMeans(max_k, data):
+    max_k += 1
+    kmeans_results = dict()
+    for k in range(2, max_k):
+        kmeans = cluster.KMeans(
+            n_clusters=k,
+            init="k-means++",
+            n_init=10,
+            tol=0.0001
+            # , n_jobs = -1
+            ,
+            random_state=1,
+            algorithm="full",
+        )
+
+        kmeans_results.update({k: kmeans.fit(data)})
+
+    return kmeans_results
+
+
+def printAvg(avg_dict):
+    for avg in sorted(avg_dict.keys(), reverse=True):
+        print("Avg: {}\tK:{}".format(avg.round(4), avg_dict[avg]))
+
+
+def plotSilhouette(df, n_clusters, kmeans_labels, silhouette_avg):
+    fig, ax1 = plt.subplots(1)
+    fig.set_size_inches(8, 6)
+    ax1.set_xlim([-0.2, 1])
+    ax1.set_ylim([0, len(df) + (n_clusters + 1) * 10])
+
+    ax1.axvline(
+        x=silhouette_avg, color="red", linestyle="--"
+    )  # The vertical line for average silhouette score of all the values
+    ax1.set_yticks([])  # Clear the yaxis labels / ticks
+    ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
+    plt.title(
+        ("Silhouette analysis for K = %d" % n_clusters), fontsize=10, fontweight="bold"
+    )
+
+    y_lower = 10
+    sample_silhouette_values = silhouette_samples(
+        df, kmeans_labels
+    )  # Compute the silhouette scores for each sample
+    for i in range(n_clusters):
+        ith_cluster_silhouette_values = sample_silhouette_values[kmeans_labels == i]
+        ith_cluster_silhouette_values.sort()
+
+        size_cluster_i = ith_cluster_silhouette_values.shape[0]
+        y_upper = y_lower + size_cluster_i
+
+        color = cm.nipy_spectral(float(i) / n_clusters)
+        ax1.fill_betweenx(
+            np.arange(y_lower, y_upper),
+            0,
+            ith_cluster_silhouette_values,
+            facecolor=color,
+            edgecolor=color,
+            alpha=0.7,
+        )
+
+        ax1.text(
+            -0.05, y_lower + 0.5 * size_cluster_i, str(i)
+        )  # Label the silhouette plots with their cluster numbers at the middle
+        y_lower = (
+            y_upper + 10
+        )  # Compute the new y_lower for next plot. 10 for the 0 samples
+    plt.show()
+
+
+def silhouette(kmeans_dict, df, plot=False):
+    df = df.to_numpy()
+    avg_dict = dict()
+    for n_clusters, kmeans in kmeans_dict.items():
+        kmeans_labels = kmeans.predict(df)
+        silhouette_avg = silhouette_score(
+            df, kmeans_labels
+        )  # Average Score for all Samples
+        avg_dict.update({silhouette_avg: n_clusters})
+
+        if plot:
+            plotSilhouette(df, n_clusters, kmeans_labels, silhouette_avg)
+
+
 def preprocess(series: pd.Series) -> pd.Series:
     # Lowercase
     series = series.str.lower()
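
run_KMeans() and silhouette() are meant to be used as a pair: the first fits one KMeans model per k from 2 up to max_k, the second computes the average silhouette score for each fitted model and, when plot=True, hands it to plotSilhouette(). A rough usage sketch under that assumption follows; the toy documents stand in for the weighted pattern matrix that predict.py builds elsewhere. Note that, depending on the installed scikit-learn version, the algorithm="full" argument in run_KMeans may emit a deprecation warning (newer releases call it "lloyd").

# Rough usage sketch (illustrative data, not part of the patch).
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "factory creates product objects",
    "observer notifies registered subscribers",
    "adapter converts one interface to another",
    "builder constructs a complex object step by step",
    "proxy controls access to another object",
    "decorator adds behaviour to an object dynamically",
]
tfidf_matrix = pd.DataFrame(TfidfVectorizer().fit_transform(docs).toarray())

kmeans_results = run_KMeans(max_k=4, data=tfidf_matrix)  # fits k = 2, 3, 4
silhouette(kmeans_results, tfidf_matrix, plot=True)      # one silhouette plot per k
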
From a638b4352a52c93da3c921978bc28bee612c8a4b Mon Sep 17 00:00:00 2001
From: remi
Date: Fri, 28 Apr 2023 02:57:53 -0500
Subject: [PATCH 3/5] New method for generating word clouds

---
 ml/predict.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/ml/predict.py b/ml/predict.py
index 26500b3..197822e 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -72,7 +72,8 @@ def centroidsDict(centroids, index):
     return centroid_dict
 
 
 # Generates a word cloud of the most frequent and influential words in a cluster.
-def generateWordClouds(centroids):
+# Takes in a dataframe and generates a plot. Returns nothing.
+def generateWordClouds(centroids):  # centroids is a dataframe
     wordcloud = WordCloud(max_font_size=100, background_color="white")
     for i in range(0, len(centroids)):
         centroid_dict = centroidsDict(centroids, i)
         wordcloud.generate_from_frequencies(centroid_dict)
@@ -85,6 +86,62 @@ def generateWordClouds(centroids):
         plt.show()
 
 
+# Generates a word cloud of the most frequent and influential words in a cluster. Returns a list of figure objects
+def generateWordClouds(df, df_labels, cleaned_text):
+    # df_labels: patterns vs [0,1,2] cluster each algo classified as. bottom row is input text
+    #    agglomerative  bi_kmeans_inertia  bi_kmeans_lg_cluster  fuzzy_cmeans  kmeans  pam_euclidean  pam_manhattan
+    # 0              1                  0                     0             0       1              1              0
+    # cleaned_text: patterns vs top keywords. bottom row is input text
+    # ex: 0    provid creat relat depend specifi consid suppo...
+    wordcloud = WordCloud(max_font_size=100, background_color="white")
+
+    list_plots = []
+    # For each algo:
+    for a, algo in enumerate(algorithms):
+        # For each cluster c:
+        for c in range(3):
+            # From df_labels, get the patterns w/ value c under the current algo
+            df_cluster = df.loc[df_labels[algo] == c][["name", "category", algo]].copy()
+            # cluster_keywords = {"word": {"score": 3+4+5, "count": 3}} so
+            # that we can take the avg score and make a word cloud with that
+            cluster_keywords = {}
+            # For each of these patterns:
+            for pattern in df_cluster:
+                # From cleaned_text, get the keywords as a list
+                list_keywords = cleaned_text.str.split()
+                # Append keywords
+                print(list_keywords)
+                for w, keywords in enumerate(list_keywords):
+                    for word in keywords:
+                        print(word)
+                        if word in cluster_keywords:
+                            cluster_keywords[word][
+                                "score"
+                            ] += w  # closer to the beginning is higher priority
+                            cluster_keywords[word][
+                                "count"
+                            ] += 1  # increment count so that can take the avg later
+                        else:
+                            cluster_keywords[word] = {"score": w, "count": 1}
+            # Take the average score for each word, then normalize it into a number between 0 and 1
+            # Remember, lower scores are higher frequency
+            word_cloud_input = {}
+            for word in cluster_keywords:
+                word_cloud_input[word] = 1 - (float(int(word[0]) / int(word[1])) / 100)
+            # Plot!
+            wordcloud.generate_from_frequencies(word_cloud_input)
+            fig = plt.figure()
+            fig.title(f"Cluster {a}, Algo {algo}")
+            fig.imshow(wordcloud)
+            fig.axis("off")
+            # Append fig to list of plots
+            list_plots.append(fig)
+    # Return list of plots
+    return list_plots
+
+
+# TODO: Adapt the code for using k-means for silhouette, word clouds, etc for
+# all clustering algorithms in use. Call/merge into do_cluster()?
 def run_KMeans(max_k, data):
     max_k += 1
     kmeans_results = dict()
     for k in range(2, max_k):
         kmeans = cluster.KMeans(
             n_clusters=k,
             init="k-means++",
             n_init=10,
             tol=0.0001
             # , n_jobs = -1
             ,
             random_state=1,
             algorithm="full",
         )
@@ -310,6 +367,9 @@ def main(design_problem: str = ""):
     max_len_pattern = df["name"].str.len().max()
     max_len = len(max(algorithms_pretty, key=len))
 
+    # Generate word clouds for each clustering algorithm
+    generateWordClouds(df, df_labels, cleaned_text)
+
     do_output()
     for i, algorithm in enumerate(algorithms):
         do_output(f"{algorithms_pretty[i]}")
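
The scoring that the comments above describe can be traced with a small worked example: every keyword accumulates its position in each keyword list it occurs in, the positions are averaged, and the average is mapped onto a 0-1 weight so that words near the front of the lists come out heavier in the cloud. The standalone sketch below follows the scheme from the comments (the exact expression used in this patch is revised in the next commit):

# Worked example of the position-based scoring (illustrative, standalone).
keyword_lists = [
    ["provid", "creat", "relat", "depend"],
    ["creat", "provid", "specifi"],
]

cluster_keywords = {}
for keywords in keyword_lists:
    for position, word in enumerate(keywords):
        entry = cluster_keywords.setdefault(word, {"score": 0, "count": 0})
        entry["score"] += position  # closer to the beginning -> lower score
        entry["count"] += 1

word_cloud_input = {
    word: 1 - (stats["score"] / stats["count"]) / 100
    for word, stats in cluster_keywords.items()
}
# "provid": positions 0 and 1 -> average 0.5 -> weight 0.995
# "depend": position 3        -> average 3.0 -> weight 0.97
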
From d413b2d9f84aabe668dabe5dcb0afe36e0a388e7 Mon Sep 17 00:00:00 2001
From: remi
Date: Fri, 28 Apr 2023 10:10:01 -0500
Subject: [PATCH 4/5] Different clusters and algos for word clouds

---
 ml/predict.py | 57 +++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/ml/predict.py b/ml/predict.py
index 197822e..6b4e6e9 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -71,19 +71,19 @@ def centroidsDict(centroids, index):
     return centroid_dict
 
 
-# Generates a word cloud of the most frequent and influential words in a cluster.
-# Takes in a dataframe and generates a plot. Returns nothing.
-def generateWordClouds(centroids):  # centroids is a dataframe
-    wordcloud = WordCloud(max_font_size=100, background_color="white")
-    for i in range(0, len(centroids)):
-        centroid_dict = centroidsDict(centroids, i)
-        wordcloud.generate_from_frequencies(centroid_dict)
-
-        plt.figure()
-        plt.title("Cluster {}".format(i))
-        plt.imshow(wordcloud)
-        plt.axis("off")
-        plt.show()
+# # Generates a word cloud of the most frequent and influential words in a cluster.
+# # Takes in a dataframe and generates a plot. Returns nothing.
+# def generateWordClouds(centroids):  # centroids is a dataframe
+#     wordcloud = WordCloud(max_font_size=100, background_color="white")
+#     for i in range(0, len(centroids)):
+#         centroid_dict = centroidsDict(centroids, i)
+#         wordcloud.generate_from_frequencies(centroid_dict)
+#
+#         plt.figure()
+#         plt.title("Cluster {}".format(i))
+#         plt.imshow(wordcloud)
+#         plt.axis("off")
+#         plt.show()
 
 
 # Generates a word cloud of the most frequent and influential words in a cluster. Returns a list of figure objects
@@ -94,8 +94,8 @@ def generateWordClouds(df, df_labels, cleaned_text):
     # df_labels: patterns vs [0,1,2] cluster each algo classified as. bottom row is input text
     #    agglomerative  bi_kmeans_inertia  bi_kmeans_lg_cluster  fuzzy_cmeans  kmeans  pam_euclidean  pam_manhattan
     # 0              1                  0                     0             0       1              1              0
     # cleaned_text: patterns vs top keywords. bottom row is input text
     # ex: 0    provid creat relat depend specifi consid suppo...
     wordcloud = WordCloud(max_font_size=100, background_color="white")
-
-    list_plots = []
+    list_plots = []
+    list_keywords = cleaned_text.str.split()
     # For each algo:
     for a, algo in enumerate(algorithms):
         # For each cluster c:
@@ -106,18 +106,16 @@ def generateWordClouds(df, df_labels, cleaned_text):
             # that we can take the avg score and make a word cloud with that
             cluster_keywords = {}
             # For each of these patterns:
-            for pattern in df_cluster:
+            for p, pattern in enumerate(df_cluster.iloc[:, 0]):
                 # From cleaned_text, get the keywords as a list
-                list_keywords = cleaned_text.str.split()
+                list_keywords_pattern = list_keywords[p]
                 # Append keywords
-                print(list_keywords)
                 for w, keywords in enumerate(list_keywords):
-                    for word in keywords:
-                        print(word)
+                    for i, word in enumerate(keywords):
                         if word in cluster_keywords:
                             cluster_keywords[word][
                                 "score"
-                            ] += w  # closer to the beginning is higher priority
+                            ] += i  # closer to the beginning is higher priority
                             cluster_keywords[word][
                                 "count"
                             ] += 1  # increment count so that can take the avg later
@@ -127,13 +125,19 @@ def generateWordClouds(df, df_labels, cleaned_text):
             # Remember, lower scores are higher frequency
             word_cloud_input = {}
             for word in cluster_keywords:
-                word_cloud_input[word] = 1 - (float(int(word[0]) / int(word[1])) / 100)
+                word_cloud_input[word] = 1 - (
+                    float(
+                        int(cluster_keywords[word]["score"])
+                        / int(cluster_keywords[word]["count"])
+                    )
+                    / 100
+                )
             # Plot!
             wordcloud.generate_from_frequencies(word_cloud_input)
             fig = plt.figure()
-            fig.title(f"Cluster {a}, Algo {algo}")
-            fig.imshow(wordcloud)
-            fig.axis("off")
+            plt.suptitle(f"Cluster {a}, Algo {algo}")
+            plt.imshow(wordcloud)
+            plt.axis("off")
             # Append fig to list of plots
             list_plots.append(fig)
     # Return list of plots
@@ -368,7 +372,10 @@ def main(design_problem: str = ""):
     max_len = len(max(algorithms_pretty, key=len))
 
     # Generate word clouds for each clustering algorithm
-    generateWordClouds(df, df_labels, cleaned_text)
+    plots = generateWordClouds(df, df_labels, cleaned_text)
+    for cloud in plots:
+        cloud.show()
+    # breakpoint()
 
     do_output()
     for i, algorithm in enumerate(algorithms):
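
Because generateWordClouds() now returns the Figure objects instead of drawing them itself, the caller decides how to present them; the patched main() simply calls cloud.show() on each. An alternative (illustrative only, not part of the patch) is to write the figures to disk, which is also the safer route under a non-interactive matplotlib backend:

# Illustrative alternative to cloud.show() in main(); the file names are made up.
plots = generateWordClouds(df, df_labels, cleaned_text)
for i, fig in enumerate(plots):
    fig.savefig(f"wordcloud_{i:02d}.png", bbox_inches="tight")
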
From 16464329f47c82398ed28d712fd32531f1d30be3 Mon Sep 17 00:00:00 2001
From: remi
Date: Fri, 28 Apr 2023 12:50:36 -0500
Subject: [PATCH 5/5] More work on word clouds

---
 ml/predict.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 97 insertions(+), 7 deletions(-)

diff --git a/ml/predict.py b/ml/predict.py
index 6b4e6e9..44b9f4f 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -1,6 +1,7 @@
 # Command line usage: `python predict.py "Insert design problem here."`
 
 import os
+import re
 import sys
 
 import nltk
@@ -9,7 +10,7 @@
 import numpy as np
 import pandas as pd
 from fcmeans import FCM
 from matplotlib import pyplot as plt
 from matplotlib import cm as cm
-from nltk import PorterStemmer
+from nltk import PorterStemmer, word_tokenize
 from nltk.corpus import stopwords
 from nltk.tag import pos_tag
 from sklearn import cluster
@@ -18,6 +19,7 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn_extra.cluster import KMedoids
 from sklearn.metrics import silhouette_samples, silhouette_score
+from unidecode import unidecode
 from wordcloud import WordCloud
 
 try:
@@ -85,6 +87,83 @@ def centroidsDict(centroids, index):
 #         plt.axis("off")
 #         plt.show()
 
+# removes a list of words (ie. stopwords) from a tokenized list.
+def removeWords(listOfTokens, listOfWords):
+    return [token for token in listOfTokens if token not in listOfWords]
+
+
+# applies stemming to a list of tokenized words
+def applyStemming(listOfTokens, stemmer):
+    return [stemmer.stem(token) for token in listOfTokens]
+
+
+# applied lemmatization to a list of tokenized words
+def applyLemmatization(listOfTokens, lemmatizer):
+    return [lemmatizer.lemmatize(token) for token in listOfTokens]
+
+
+# removes any words composed of less than 2 or more than 21 letters
+def twoLetters(listOfTokens):
+    twoLetterWord = []
+    for token in listOfTokens:
+        if len(token) <= 2 or len(token) >= 21:
+            twoLetterWord.append(token)
+    return twoLetterWord
+
+
+# removes any words that aren't verbs
+def notVerbs(listOfTokens):
+    notVerb = []
+    for token in listOfTokens:
+        if (
+            pos_tag(word_tokenize(token), tagset="universal")[0][1] != "VERB"
+            and pos_tag(word_tokenize(token), tagset="universal")[0][1] != "ADJ"
+        ):
+            notVerb.append(token)
+    return notVerb
+
+
+def processCorpus(corpus, language, stemmer):
+    stopwords = nltk.corpus.stopwords.words(language)
+    param_stemmer = stemmer
+
+    for document in corpus:
+        index = corpus.index(document)
+        corpus[index] = str(corpus[index]).replace(
+            "\ufffd", "8"
+        )  # Replaces the ASCII '�' symbol with '8'
+        corpus[index] = corpus[index].replace(",", "")  # Removes commas
+        corpus[index] = corpus[index].rstrip("\n")  # Removes line breaks
+        corpus[index] = corpus[index].casefold()  # Makes all letters lowercase
+
+        corpus[index] = re.sub(
+            "\W_", " ", corpus[index]
+        )  # removes specials characters and leaves only words
+        corpus[index] = re.sub(
+            "\S*\d\S*", " ", corpus[index]
+        )  # removes numbers and words concatenated with numbers IE h4ck3r. Removes road names such as BR-381.
+        corpus[index] = re.sub(
+            "\S*@\S*\s?", " ", corpus[index]
+        )  # removes emails and mentions (words with @)
+        corpus[index] = re.sub(r"http\S+", "", corpus[index])  # removes URLs with http
+        corpus[index] = re.sub(r"www\S+", "", corpus[index])  # removes URLs with www
+
+        listOfTokens = word_tokenize(corpus[index])
+        twoLetterWord = twoLetters(listOfTokens)
+        notVerb = notVerbs(listOfTokens)
+
+        listOfTokens = removeWords(listOfTokens, stopwords)
+        listOfTokens = removeWords(listOfTokens, twoLetterWord)
+        listOfTokens = removeWords(listOfTokens, notVerb)
+
+        listOfTokens = applyStemming(listOfTokens, param_stemmer)
+        # listOfTokens = applyLemmatization(listOfTokens, lemmatizer)
+
+        corpus[index] = " ".join(listOfTokens)
+        corpus[index] = unidecode(corpus[index])
+
+    return corpus
+
 
 # Generates a word cloud of the most frequent and influential words in a cluster. Returns a list of figure objects
 def generateWordClouds(df, df_labels, cleaned_text):
@@ -95,7 +174,9 @@ def generateWordClouds(df, df_labels, cleaned_text):
     # cleaned_text: patterns vs top keywords. bottom row is input text
     # ex: 0    provid creat relat depend specifi consid suppo...
     wordcloud = WordCloud(max_font_size=100, background_color="white")
     list_plots = []
+    cleaned_text = pd.Series(cleaned_text)
     list_keywords = cleaned_text.str.split()
+    # list_keywords = cleaned_text
     # For each algo:
     for a, algo in enumerate(algorithms):
         # For each cluster c:
@@ -354,10 +435,19 @@ def main(design_problem: str = ""):
     )
     df = pd.concat([df, new_row.to_frame().T], ignore_index=True)
 
-    # Preprocess
-    cleaned_text = preprocess(df["overview"])
-    # Create a dense tfidf matrix
-    tfidf_matrix = do_weighting("Tfidf", cleaned_text)
+    # # Preprocess
+    # cleaned_text = preprocess(df["overview"])
+    # # Create a dense tfidf matrix
+    # tfidf_matrix = do_weighting("Tfidf", cleaned_text)
+    # pre process
+    corpus = df["overview"].tolist()
+    corpus = processCorpus(corpus, language="english", stemmer=stemmer)
+    # vectorize data
+    vectorizer = TfidfVectorizer(sublinear_tf=True)
+    X = vectorizer.fit_transform(corpus)
+    tfidf_matrix = pd.DataFrame(
+        data=X.toarray(), columns=vectorizer.get_feature_names_out()
+    )
     # Perform clustering
     df_labels = do_cluster(tfidf_matrix)
     # Append (horizontally) the cluster labels to the original DF
@@ -372,10 +462,10 @@ def main(design_problem: str = ""):
     max_len = len(max(algorithms_pretty, key=len))
 
     # Generate word clouds for each clustering algorithm
-    plots = generateWordClouds(df, df_labels, cleaned_text)
+    plots = generateWordClouds(df, df_labels, corpus)
     for cloud in plots:
         cloud.show()
-    # breakpoint()
+    breakpoint()
 
     do_output()
     for i, algorithm in enumerate(algorithms):
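
Taken together, this last patch swaps the old preprocess()/do_weighting() path in main() for processCorpus() followed by a TfidfVectorizer. End to end, the new path looks roughly like the sketch below (illustrative corpus; it assumes the NLTK data that word_tokenize and pos_tag rely on, such as punkt, averaged_perceptron_tagger and universal_tagset, has been downloaded):

# Rough end-to-end sketch of the new preprocessing + weighting path (toy corpus).
import pandas as pd
from nltk import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = PorterStemmer()
corpus = [
    "The Observer pattern notifies dependent objects about state changes.",
    "The Adapter pattern converts one interface into another that clients expect.",
]
corpus = processCorpus(corpus, language="english", stemmer=stemmer)  # stemmed keyword strings

vectorizer = TfidfVectorizer(sublinear_tf=True)
X = vectorizer.fit_transform(corpus)
tfidf_matrix = pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())
print(tfidf_matrix.shape)  # (2, number of surviving stemmed terms)
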