From aebe48cc8afda6ec07eb420a4b266efb4f27498d Mon Sep 17 00:00:00 2001
From: remi
Date: Wed, 26 Apr 2023 21:36:54 -0500
Subject: [PATCH 1/5] Added WordCloud-related functions to predict.py

---
 ml/predict.py | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/ml/predict.py b/ml/predict.py
index 4999f13..32b2d88 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -16,6 +16,8 @@
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn_extra.cluster import KMedoids
 
+from wordcloud import WordCloud
+
 try:
     nltk.find("corpora/stopwords")
 except LookupError:
@@ -55,6 +57,32 @@ def do_output(text: str = "") -> None:
     print(text)
 
 
+# from PatternKMeans (last edited Jan 2, 2023)
+# Transforms a centroids dataframe into a dictionary to be used on a WordCloud.
+def centroidsDict(centroids, index):
+    a = centroids.T[index].sort_values(ascending=False).reset_index().values
+    centroid_dict = dict()
+
+    for i in range(0, len(a)):
+        centroid_dict.update({a[i, 0]: a[i, 1]})
+
+    return centroid_dict
+
+
+# Generates a word cloud of the most frequent and influential words in a cluster.
+def generateWordClouds(centroids):
+    wordcloud = WordCloud(max_font_size=100, background_color="white")
+    for i in range(0, len(centroids)):
+        centroid_dict = centroidsDict(centroids, i)
+        wordcloud.generate_from_frequencies(centroid_dict)
+
+        plt.figure()
+        plt.title("Cluster {}".format(i))
+        plt.imshow(wordcloud)
+        plt.axis("off")
+        plt.show()
+
+
 def preprocess(series: pd.Series) -> pd.Series:
     # Lowercase
     series = series.str.lower()
@@ -76,6 +104,9 @@ def preprocess(series: pd.Series) -> pd.Series:
     return series
 
 
+# TODO: After clustering the patterns, automatically label each cluster.
+# Each cluster can have a word cloud, and the clusters can be labeled
+# using the most important words in each cluster as a guide.
 def do_cluster(df_weighted: pd.DataFrame) -> pd.DataFrame:
     # This is the DataFrame we will return that contains all the labels.
     df = pd.DataFrame()
@@ -241,4 +272,4 @@ def main(design_problem: str = ""):
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
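
Both helpers above assume that `centroids` is a DataFrame with one row per cluster centre and one column per vocabulary term: centroidsDict() transposes it, sorts one cluster's terms by weight, and builds the word -> weight mapping that WordCloud.generate_from_frequencies() expects. A minimal sketch of how such a frame could be assembled and passed in is shown below; the toy corpus and the vectorizer/kmeans names are illustrative only and not part of the patch.

# Illustrative sketch (not part of the patch): build a per-cluster centroids
# DataFrame for generateWordClouds() from a fitted KMeans model.
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

toy_corpus = [
    "encapsulate a request as an object",
    "define a family of interchangeable algorithms",
    "provide a placeholder that controls access to another object",
]
vectorizer = TfidfVectorizer()
weights = vectorizer.fit_transform(toy_corpus)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=1).fit(weights)

# Rows = cluster centres, columns = terms, the orientation centroidsDict() expects.
centroids = pd.DataFrame(
    kmeans.cluster_centers_, columns=vectorizer.get_feature_names_out()
)
generateWordClouds(centroids)
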
From 40aff770a869607d4d3426b66b2f8d95042f2496 Mon Sep 17 00:00:00 2001
From: remi
Date: Wed, 26 Apr 2023 21:47:40 -0500
Subject: [PATCH 2/5] Added more functions from PatternKMeans

---
 ml/predict.py | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

diff --git a/ml/predict.py b/ml/predict.py
index 32b2d88..26500b3 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pandas as pd
 from fcmeans import FCM
+from matplotlib import pyplot as plt
+from matplotlib import cm as cm
 from nltk import PorterStemmer
 from nltk.corpus import stopwords
 from nltk.tag import pos_tag
@@ -15,7 +17,7 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn_extra.cluster import KMedoids
-
+from sklearn.metrics import silhouette_samples, silhouette_score
 from wordcloud import WordCloud
 
 try:
@@ -83,6 +85,90 @@ def generateWordClouds(centroids):
         plt.show()
 
 
+def run_KMeans(max_k, data):
+    max_k += 1
+    kmeans_results = dict()
+    for k in range(2, max_k):
+        kmeans = cluster.KMeans(
+            n_clusters=k,
+            init="k-means++",
+            n_init=10,
+            tol=0.0001
+            # , n_jobs = -1
+            ,
+            random_state=1,
+            algorithm="full",
+        )
+
+        kmeans_results.update({k: kmeans.fit(data)})
+
+    return kmeans_results
+
+
+def printAvg(avg_dict):
+    for avg in sorted(avg_dict.keys(), reverse=True):
+        print("Avg: {}\tK:{}".format(avg.round(4), avg_dict[avg]))
+
+
+def plotSilhouette(df, n_clusters, kmeans_labels, silhouette_avg):
+    fig, ax1 = plt.subplots(1)
+    fig.set_size_inches(8, 6)
+    ax1.set_xlim([-0.2, 1])
+    ax1.set_ylim([0, len(df) + (n_clusters + 1) * 10])
+
+    ax1.axvline(
+        x=silhouette_avg, color="red", linestyle="--"
+    )  # The vertical line for average silhouette score of all the values
+    ax1.set_yticks([])  # Clear the yaxis labels / ticks
+    ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
+    plt.title(
+        ("Silhouette analysis for K = %d" % n_clusters), fontsize=10, fontweight="bold"
+    )
+
+    y_lower = 10
+    sample_silhouette_values = silhouette_samples(
+        df, kmeans_labels
+    )  # Compute the silhouette scores for each sample
+    for i in range(n_clusters):
+        ith_cluster_silhouette_values = sample_silhouette_values[kmeans_labels == i]
+        ith_cluster_silhouette_values.sort()
+
+        size_cluster_i = ith_cluster_silhouette_values.shape[0]
+        y_upper = y_lower + size_cluster_i
+
+        color = cm.nipy_spectral(float(i) / n_clusters)
+        ax1.fill_betweenx(
+            np.arange(y_lower, y_upper),
+            0,
+            ith_cluster_silhouette_values,
+            facecolor=color,
+            edgecolor=color,
+            alpha=0.7,
+        )
+
+        ax1.text(
+            -0.05, y_lower + 0.5 * size_cluster_i, str(i)
+        )  # Label the silhouette plots with their cluster numbers at the middle
+        y_lower = (
+            y_upper + 10
+        )  # Compute the new y_lower for next plot. 10 for the 0 samples
+    plt.show()
+
+
+def silhouette(kmeans_dict, df, plot=False):
+    df = df.to_numpy()
+    avg_dict = dict()
+    for n_clusters, kmeans in kmeans_dict.items():
+        kmeans_labels = kmeans.predict(df)
+        silhouette_avg = silhouette_score(
+            df, kmeans_labels
+        )  # Average Score for all Samples
+        avg_dict.update({silhouette_avg: n_clusters})
+
+        if plot:
+            plotSilhouette(df, n_clusters, kmeans_labels, silhouette_avg)
+
+
 def preprocess(series: pd.Series) -> pd.Series:
     # Lowercase
     series = series.str.lower()
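
run_KMeans() and silhouette() are meant to be used as a pair: the first fits one KMeans model per k from 2 up to max_k, the second computes the average silhouette score for each fitted model and, when plot=True, hands it to plotSilhouette(). A rough usage sketch under that assumption follows; the toy documents stand in for the weighted pattern matrix that predict.py builds elsewhere. Note that, depending on the installed scikit-learn version, the algorithm="full" argument in run_KMeans may emit a deprecation warning (newer releases call it "lloyd").

# Rough usage sketch (illustrative data, not part of the patch).
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "factory creates product objects",
    "observer notifies registered subscribers",
    "adapter converts one interface to another",
    "builder constructs a complex object step by step",
    "proxy controls access to another object",
    "decorator adds behaviour to an object dynamically",
]
tfidf_matrix = pd.DataFrame(TfidfVectorizer().fit_transform(docs).toarray())

kmeans_results = run_KMeans(max_k=4, data=tfidf_matrix)  # fits k = 2, 3, 4
silhouette(kmeans_results, tfidf_matrix, plot=True)      # one silhouette plot per k
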
From a638b4352a52c93da3c921978bc28bee612c8a4b Mon Sep 17 00:00:00 2001
From: remi
Date: Fri, 28 Apr 2023 02:57:53 -0500
Subject: [PATCH 3/5] New method for generating word clouds

---
 ml/predict.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/ml/predict.py b/ml/predict.py
index 26500b3..197822e 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -72,7 +72,8 @@ def centroidsDict(centroids, index):
     return centroid_dict
 
 
 # Generates a word cloud of the most frequent and influential words in a cluster.
-def generateWordClouds(centroids):
+# Takes in a dataframe and generates a plot. Returns nothing.
+def generateWordClouds(centroids):  # centroids is a dataframe
     wordcloud = WordCloud(max_font_size=100, background_color="white")
     for i in range(0, len(centroids)):
         centroid_dict = centroidsDict(centroids, i)
         wordcloud.generate_from_frequencies(centroid_dict)
@@ -85,6 +86,62 @@ def generateWordClouds(centroids):
         plt.show()
 
 
+# Generates a word cloud of the most frequent and influential words in a cluster. Returns a list of figure objects
+def generateWordClouds(df, df_labels, cleaned_text):
+    # df_labels: patterns vs [0,1,2] cluster each algo classified as. bottom row is input text
+    #    agglomerative  bi_kmeans_inertia  bi_kmeans_lg_cluster  fuzzy_cmeans  kmeans  pam_euclidean  pam_manhattan
+    # 0              1                  0                     0             0       1              1              0
+    # cleaned_text: patterns vs top keywords. bottom row is input text
+    # ex: 0    provid creat relat depend specifi consid suppo...
+    wordcloud = WordCloud(max_font_size=100, background_color="white")
+
+    list_plots = []
+    # For each algo:
+    for a, algo in enumerate(algorithms):
+        # For each cluster c:
+        for c in range(3):
+            # From df_labels, get the patterns w/ value c under the current algo
+            df_cluster = df.loc[df_labels[algo] == c][["name", "category", algo]].copy()
+            # cluster_keywords = {"word": {"score": 3+4+5, "count": 3}} so
+            # that we can take the avg score and make a word cloud with that
+            cluster_keywords = {}
+            # For each of these patterns:
+            for pattern in df_cluster:
+                # From cleaned_text, get the keywords as a list
+                list_keywords = cleaned_text.str.split()
+                # Append keywords
+                print(list_keywords)
+                for w, keywords in enumerate(list_keywords):
+                    for word in keywords:
+                        print(word)
+                        if word in cluster_keywords:
+                            cluster_keywords[word][
+                                "score"
+                            ] += w  # closer to the beginning is higher priority
+                            cluster_keywords[word][
+                                "count"
+                            ] += 1  # increment count so that can take the avg later
+                        else:
+                            cluster_keywords[word] = {"score": w, "count": 1}
+            # Take the average score for each word, then normalize it into a number between 0 and 1
+            # Remember, lower scores are higher frequency
+            word_cloud_input = {}
+            for word in cluster_keywords:
+                word_cloud_input[word] = 1 - (float(int(word[0]) / int(word[1])) / 100)
+            # Plot!
+            wordcloud.generate_from_frequencies(word_cloud_input)
+            fig = plt.figure()
+            fig.title(f"Cluster {a}, Algo {algo}")
+            fig.imshow(wordcloud)
+            fig.axis("off")
+            # Append fig to list of plots
+            list_plots.append(fig)
+    # Return list of plots
+    return list_plots
+
+
+# TODO: Adapt the code for using k-means for silhouette, word clouds, etc for
+# all clustering algorithms in use. Call/merge into do_cluster()?
 def run_KMeans(max_k, data):
     max_k += 1
     kmeans_results = dict()
     for k in range(2, max_k):
         kmeans = cluster.KMeans(
             n_clusters=k,
             init="k-means++",
             n_init=10,
             tol=0.0001
             # , n_jobs = -1
             ,
             random_state=1,
             algorithm="full",
         )
@@ -310,6 +367,9 @@ def main(design_problem: str = ""):
     max_len_pattern = df["name"].str.len().max()
     max_len = len(max(algorithms_pretty, key=len))
 
+    # Generate word clouds for each clustering algorithm
+    generateWordClouds(df, df_labels, cleaned_text)
+
     do_output()
     for i, algorithm in enumerate(algorithms):
         do_output(f"{algorithms_pretty[i]}")
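
The scoring that the comments above describe can be traced with a small worked example: every keyword accumulates its position in each keyword list it occurs in, the positions are averaged, and the average is mapped onto a 0-1 weight so that words near the front of the lists come out heavier in the cloud. The standalone sketch below follows the scheme from the comments (the exact expression used in this patch is revised in the next commit):

# Worked example of the position-based scoring (illustrative, standalone).
keyword_lists = [
    ["provid", "creat", "relat", "depend"],
    ["creat", "provid", "specifi"],
]

cluster_keywords = {}
for keywords in keyword_lists:
    for position, word in enumerate(keywords):
        entry = cluster_keywords.setdefault(word, {"score": 0, "count": 0})
        entry["score"] += position  # closer to the beginning -> lower score
        entry["count"] += 1

word_cloud_input = {
    word: 1 - (stats["score"] / stats["count"]) / 100
    for word, stats in cluster_keywords.items()
}
# "provid": positions 0 and 1 -> average 0.5 -> weight 0.995
# "depend": position 3        -> average 3.0 -> weight 0.97
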
From d413b2d9f84aabe668dabe5dcb0afe36e0a388e7 Mon Sep 17 00:00:00 2001
From: remi
Date: Fri, 28 Apr 2023 10:10:01 -0500
Subject: [PATCH 4/5] Different clusters and algos for word clouds

---
 ml/predict.py | 57 +++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/ml/predict.py b/ml/predict.py
index 197822e..6b4e6e9 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -71,19 +71,19 @@ def centroidsDict(centroids, index):
     return centroid_dict
 
 
-# Generates a word cloud of the most frequent and influential words in a cluster.
-# Takes in a dataframe and generates a plot. Returns nothing.
-def generateWordClouds(centroids):  # centroids is a dataframe
-    wordcloud = WordCloud(max_font_size=100, background_color="white")
-    for i in range(0, len(centroids)):
-        centroid_dict = centroidsDict(centroids, i)
-        wordcloud.generate_from_frequencies(centroid_dict)
-
-        plt.figure()
-        plt.title("Cluster {}".format(i))
-        plt.imshow(wordcloud)
-        plt.axis("off")
-        plt.show()
+# # Generates a word cloud of the most frequent and influential words in a cluster.
+# # Takes in a dataframe and generates a plot. Returns nothing.
+# def generateWordClouds(centroids):  # centroids is a dataframe
+#     wordcloud = WordCloud(max_font_size=100, background_color="white")
+#     for i in range(0, len(centroids)):
+#         centroid_dict = centroidsDict(centroids, i)
+#         wordcloud.generate_from_frequencies(centroid_dict)
+#
+#         plt.figure()
+#         plt.title("Cluster {}".format(i))
+#         plt.imshow(wordcloud)
+#         plt.axis("off")
+#         plt.show()
 
 
 # Generates a word cloud of the most frequent and influential words in a cluster. Returns a list of figure objects
@@ -94,8 +94,8 @@ def generateWordClouds(df, df_labels, cleaned_text):
     # df_labels: patterns vs [0,1,2] cluster each algo classified as. bottom row is input text
     #    agglomerative  bi_kmeans_inertia  bi_kmeans_lg_cluster  fuzzy_cmeans  kmeans  pam_euclidean  pam_manhattan
     # 0              1                  0                     0             0       1              1              0
     # cleaned_text: patterns vs top keywords. bottom row is input text
     # ex: 0    provid creat relat depend specifi consid suppo...
     wordcloud = WordCloud(max_font_size=100, background_color="white")
-
-    list_plots = []
+    list_plots = []
+    list_keywords = cleaned_text.str.split()
     # For each algo:
     for a, algo in enumerate(algorithms):
         # For each cluster c:
@@ -106,18 +106,16 @@ def generateWordClouds(df, df_labels, cleaned_text):
             # that we can take the avg score and make a word cloud with that
             cluster_keywords = {}
             # For each of these patterns:
-            for pattern in df_cluster:
+            for p, pattern in enumerate(df_cluster.iloc[:, 0]):
                 # From cleaned_text, get the keywords as a list
-                list_keywords = cleaned_text.str.split()
+                list_keywords_pattern = list_keywords[p]
                 # Append keywords
-                print(list_keywords)
                 for w, keywords in enumerate(list_keywords):
-                    for word in keywords:
-                        print(word)
+                    for i, word in enumerate(keywords):
                         if word in cluster_keywords:
                             cluster_keywords[word][
                                 "score"
-                            ] += w  # closer to the beginning is higher priority
+                            ] += i  # closer to the beginning is higher priority
                             cluster_keywords[word][
                                 "count"
                             ] += 1  # increment count so that can take the avg later
@@ -127,13 +125,19 @@ def generateWordClouds(df, df_labels, cleaned_text):
             # Remember, lower scores are higher frequency
             word_cloud_input = {}
             for word in cluster_keywords:
-                word_cloud_input[word] = 1 - (float(int(word[0]) / int(word[1])) / 100)
+                word_cloud_input[word] = 1 - (
+                    float(
+                        int(cluster_keywords[word]["score"])
+                        / int(cluster_keywords[word]["count"])
+                    )
+                    / 100
+                )
             # Plot!
             wordcloud.generate_from_frequencies(word_cloud_input)
             fig = plt.figure()
-            fig.title(f"Cluster {a}, Algo {algo}")
-            fig.imshow(wordcloud)
-            fig.axis("off")
+            plt.suptitle(f"Cluster {a}, Algo {algo}")
+            plt.imshow(wordcloud)
+            plt.axis("off")
             # Append fig to list of plots
             list_plots.append(fig)
     # Return list of plots
@@ -368,7 +372,10 @@ def main(design_problem: str = ""):
     max_len = len(max(algorithms_pretty, key=len))
 
     # Generate word clouds for each clustering algorithm
-    generateWordClouds(df, df_labels, cleaned_text)
+    plots = generateWordClouds(df, df_labels, cleaned_text)
+    for cloud in plots:
+        cloud.show()
+    # breakpoint()
 
     do_output()
     for i, algorithm in enumerate(algorithms):
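
Because generateWordClouds() now returns the Figure objects instead of drawing them itself, the caller decides how to present them; the patched main() simply calls cloud.show() on each. An alternative (illustrative only, not part of the patch) is to write the figures to disk, which is also the safer route under a non-interactive matplotlib backend:

# Illustrative alternative to cloud.show() in main(); the file names are made up.
plots = generateWordClouds(df, df_labels, cleaned_text)
for i, fig in enumerate(plots):
    fig.savefig(f"wordcloud_{i:02d}.png", bbox_inches="tight")
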
From 16464329f47c82398ed28d712fd32531f1d30be3 Mon Sep 17 00:00:00 2001
From: remi
Date: Fri, 28 Apr 2023 12:50:36 -0500
Subject: [PATCH 5/5] More work on word clouds

---
 ml/predict.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 97 insertions(+), 7 deletions(-)

diff --git a/ml/predict.py b/ml/predict.py
index 6b4e6e9..44b9f4f 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -1,6 +1,7 @@
 # Command line usage: `python predict.py "Insert design problem here."`
 
 import os
+import re
 import sys
 
 import nltk
@@ -9,7 +10,7 @@
 import numpy as np
 import pandas as pd
 from fcmeans import FCM
 from matplotlib import pyplot as plt
 from matplotlib import cm as cm
-from nltk import PorterStemmer
+from nltk import PorterStemmer, word_tokenize
 from nltk.corpus import stopwords
 from nltk.tag import pos_tag
 from sklearn import cluster
@@ -18,6 +19,7 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn_extra.cluster import KMedoids
 from sklearn.metrics import silhouette_samples, silhouette_score
+from unidecode import unidecode
 from wordcloud import WordCloud
 
 try:
@@ -85,6 +87,83 @@ def centroidsDict(centroids, index):
 #         plt.axis("off")
 #         plt.show()
 
+# removes a list of words (ie. stopwords) from a tokenized list.
+def removeWords(listOfTokens, listOfWords):
+    return [token for token in listOfTokens if token not in listOfWords]
+
+
+# applies stemming to a list of tokenized words
+def applyStemming(listOfTokens, stemmer):
+    return [stemmer.stem(token) for token in listOfTokens]
+
+
+# applied lemmatization to a list of tokenized words
+def applyLemmatization(listOfTokens, lemmatizer):
+    return [lemmatizer.lemmatize(token) for token in listOfTokens]
+
+
+# removes any words composed of less than 2 or more than 21 letters
+def twoLetters(listOfTokens):
+    twoLetterWord = []
+    for token in listOfTokens:
+        if len(token) <= 2 or len(token) >= 21:
+            twoLetterWord.append(token)
+    return twoLetterWord
+
+
+# removes any words that aren't verbs
+def notVerbs(listOfTokens):
+    notVerb = []
+    for token in listOfTokens:
+        if (
+            pos_tag(word_tokenize(token), tagset="universal")[0][1] != "VERB"
+            and pos_tag(word_tokenize(token), tagset="universal")[0][1] != "ADJ"
+        ):
+            notVerb.append(token)
+    return notVerb
+
+
+def processCorpus(corpus, language, stemmer):
+    stopwords = nltk.corpus.stopwords.words(language)
+    param_stemmer = stemmer
+
+    for document in corpus:
+        index = corpus.index(document)
+        corpus[index] = str(corpus[index]).replace(
+            "\ufffd", "8"
+        )  # Replaces the ASCII '�' symbol with '8'
+        corpus[index] = corpus[index].replace(",", "")  # Removes commas
+        corpus[index] = corpus[index].rstrip("\n")  # Removes line breaks
+        corpus[index] = corpus[index].casefold()  # Makes all letters lowercase
+
+        corpus[index] = re.sub(
+            "\W_", " ", corpus[index]
+        )  # removes specials characters and leaves only words
+        corpus[index] = re.sub(
+            "\S*\d\S*", " ", corpus[index]
+        )  # removes numbers and words concatenated with numbers IE h4ck3r. Removes road names such as BR-381.
+        corpus[index] = re.sub(
+            "\S*@\S*\s?", " ", corpus[index]
+        )  # removes emails and mentions (words with @)
+        corpus[index] = re.sub(r"http\S+", "", corpus[index])  # removes URLs with http
+        corpus[index] = re.sub(r"www\S+", "", corpus[index])  # removes URLs with www
+
+        listOfTokens = word_tokenize(corpus[index])
+        twoLetterWord = twoLetters(listOfTokens)
+        notVerb = notVerbs(listOfTokens)
+
+        listOfTokens = removeWords(listOfTokens, stopwords)
+        listOfTokens = removeWords(listOfTokens, twoLetterWord)
+        listOfTokens = removeWords(listOfTokens, notVerb)
+
+        listOfTokens = applyStemming(listOfTokens, param_stemmer)
+        # listOfTokens = applyLemmatization(listOfTokens, lemmatizer)
+
+        corpus[index] = " ".join(listOfTokens)
+        corpus[index] = unidecode(corpus[index])
+
+    return corpus
+
 
 # Generates a word cloud of the most frequent and influential words in a cluster. Returns a list of figure objects
 def generateWordClouds(df, df_labels, cleaned_text):
@@ -95,7 +174,9 @@ def generateWordClouds(df, df_labels, cleaned_text):
     # cleaned_text: patterns vs top keywords. bottom row is input text
     # ex: 0    provid creat relat depend specifi consid suppo...
     wordcloud = WordCloud(max_font_size=100, background_color="white")
     list_plots = []
+    cleaned_text = pd.Series(cleaned_text)
     list_keywords = cleaned_text.str.split()
+    # list_keywords = cleaned_text
     # For each algo:
     for a, algo in enumerate(algorithms):
         # For each cluster c:
@@ -354,10 +435,19 @@ def main(design_problem: str = ""):
     )
     df = pd.concat([df, new_row.to_frame().T], ignore_index=True)
 
-    # Preprocess
-    cleaned_text = preprocess(df["overview"])
-    # Create a dense tfidf matrix
-    tfidf_matrix = do_weighting("Tfidf", cleaned_text)
+    # # Preprocess
+    # cleaned_text = preprocess(df["overview"])
+    # # Create a dense tfidf matrix
+    # tfidf_matrix = do_weighting("Tfidf", cleaned_text)
+    # pre process
+    corpus = df["overview"].tolist()
+    corpus = processCorpus(corpus, language="english", stemmer=stemmer)
+    # vectorize data
+    vectorizer = TfidfVectorizer(sublinear_tf=True)
+    X = vectorizer.fit_transform(corpus)
+    tfidf_matrix = pd.DataFrame(
+        data=X.toarray(), columns=vectorizer.get_feature_names_out()
+    )
     # Perform clustering
     df_labels = do_cluster(tfidf_matrix)
     # Append (horizontally) the cluster labels to the original DF
@@ -372,10 +462,10 @@ def main(design_problem: str = ""):
     max_len = len(max(algorithms_pretty, key=len))
 
     # Generate word clouds for each clustering algorithm
-    plots = generateWordClouds(df, df_labels, cleaned_text)
+    plots = generateWordClouds(df, df_labels, corpus)
     for cloud in plots:
         cloud.show()
-    # breakpoint()
+    breakpoint()
 
     do_output()
     for i, algorithm in enumerate(algorithms):
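
Taken together, this last patch swaps the old preprocess()/do_weighting() path in main() for processCorpus() followed by a TfidfVectorizer. End to end, the new path looks roughly like the sketch below (illustrative corpus; it assumes the NLTK data that word_tokenize and pos_tag rely on, such as punkt, averaged_perceptron_tagger and universal_tagset, has been downloaded):

# Rough end-to-end sketch of the new preprocessing + weighting path (toy corpus).
import pandas as pd
from nltk import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = PorterStemmer()
corpus = [
    "The Observer pattern notifies dependent objects about state changes.",
    "The Adapter pattern converts one interface into another that clients expect.",
]
corpus = processCorpus(corpus, language="english", stemmer=stemmer)  # stemmed keyword strings

vectorizer = TfidfVectorizer(sublinear_tf=True)
X = vectorizer.fit_transform(corpus)
tfidf_matrix = pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())
print(tfidf_matrix.shape)  # (2, number of surviving stemmed terms)
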