328 changes: 244 additions & 84 deletions ml/predict.py
@@ -7,14 +7,16 @@
import numpy as np
import pandas as pd
from fcmeans import FCM
from nltk import PorterStemmer
from nltk import PorterStemmer, word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from sklearn import cluster
from sklearn.cluster import AgglomerativeClustering, BisectingKMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn_extra.cluster import KMedoids
from unidecode import unidecode
import re

try:
nltk.find("corpora/stopwords")
@@ -76,43 +78,112 @@ def preprocess(series: pd.Series) -> pd.Series:
return series


def do_cluster(df_weighted: pd.DataFrame) -> pd.DataFrame:
def do_cluster(df_weighted: pd.DataFrame, algo: str) -> pd.DataFrame:
# This is the DataFrame we will return that contains all the labels.
df = pd.DataFrame()

# Agglomerative (hierarchical)
agg = AgglomerativeClustering(n_clusters=3)
df["agglomerative"] = agg.fit_predict(df_weighted)
if algo == "agglomerative":
# Agglomerative (hierarchical)
agg = AgglomerativeClustering(n_clusters=3)
df["agglomerative"] = agg.fit_predict(df_weighted)
elif algo == "fuzzy_cmeans":
# Fuzzy c-means
final_df_np = df_weighted.to_numpy()
fcm = FCM(n_clusters=3, m=1.67, max_iter=270)
fcm.fit(final_df_np)
df["fuzzy_cmeans"] = fcm.predict(final_df_np)
elif algo == "kmeans":
# K-means
km = cluster.KMeans(n_clusters=3, n_init="auto")
df["kmeans"] = km.fit_predict(df_weighted)
elif algo == "bi_kmeans_inertia":
# Bisecting k-means (inertia)
bisect_inertia = BisectingKMeans(n_clusters=3)
df["bi_kmeans_inertia"] = bisect_inertia.fit_predict(df_weighted)
elif algo == "bi_kmeans_lg_cluster":
# Bisecting k-means (largest cluster)
bisect_lg_cluster = BisectingKMeans(
n_clusters=3, bisecting_strategy="largest_cluster"
)
df["bi_kmeans_lg_cluster"] = bisect_lg_cluster.fit_predict(df_weighted)
elif algo == "pam_euclidean":
# K-medoids (Euclidean distance)
kmed_euclidean = KMedoids(n_clusters=3)
df["pam_euclidean"] = kmed_euclidean.fit_predict(df_weighted)
elif algo == "pam_manhattan":
# K-medoids (Manhattan distance)
kmed_manhattan = KMedoids(n_clusters=3, metric="manhattan")
df["pam_manhattan"] = kmed_manhattan.fit_predict(df_weighted)
else:
print("ERROR: Invalid algorithm!")

# Bisecting k-means
bisect_inertia = BisectingKMeans(n_clusters=3)
bisect_lg_cluster = BisectingKMeans(
n_clusters=3, bisecting_strategy="largest_cluster"
)
df["bi_kmeans_inertia"] = bisect_inertia.fit_predict(df_weighted)
df["bi_kmeans_lg_cluster"] = bisect_lg_cluster.fit_predict(df_weighted)
return df
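# Illustrative usage (names follow main() below):
#   df_labels = do_cluster(tfidf_matrix, "kmeans")
#   df_labels["kmeans"]  # one cluster id in {0, 1, 2} per document
# An unrecognised `algo` only prints the error and returns an empty DataFrame.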


# Better for this to be an enum, but the syntax is a bit tricky.
weighting_methods = {"Binary", "Count", "Tfidf"}

# Fuzzy c-means
final_df_np = df_weighted.to_numpy()
fcm = FCM(n_clusters=3, random_state=9)
fcm.fit(final_df_np)
df["fuzzy_cmeans"] = fcm.predict(final_df_np)
# removes a list of words (e.g. stopwords) from a tokenized list.
def removeWords(listOfTokens, listOfWords):
return [token for token in listOfTokens if token not in listOfWords]

# K-means
km = cluster.KMeans(n_clusters=3, n_init=10, random_state=9)
df["kmeans"] = km.fit_predict(df_weighted)

# K-medoids
kmed_euclidean = KMedoids(n_clusters=3)
kmed_manhattan = KMedoids(n_clusters=3, metric="manhattan")
df["pam_euclidean"] = kmed_euclidean.fit_predict(df_weighted)
df["pam_manhattan"] = kmed_manhattan.fit_predict(df_weighted)
# applies stemming to a list of tokenized words
def applyStemming(listOfTokens, stemmer):
return [stemmer.stem(token) for token in listOfTokens]
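# e.g. applyStemming(["running", "cats"], PorterStemmer()) -> ["run", "cat"]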

return df

# applies lemmatization to a list of tokenized words
def applyLemmatization(listOfTokens, lemmatizer):
return [lemmatizer.lemmatize(token) for token in listOfTokens]

# Better for this to be an enum, but the syntax is a bit tricky.
weighting_methods = {"Binary", "Count", "Tfidf"}

# collects words of fewer than 3 or more than 20 letters so they can be removed later
def twoLetters(listOfTokens):
twoLetterWord = []
for token in listOfTokens:
if len(token) <= 2 or len(token) >= 21:
twoLetterWord.append(token)
return twoLetterWord
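# e.g. twoLetters(["a", "observer", "go"]) -> ["a", "go"]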


def processCorpus(corpus, language, stemmer):
stopwords = nltk.corpus.stopwords.words(language)
param_stemmer = stemmer

for index, document in enumerate(corpus):
corpus[index] = str(document).replace(
"\ufffd", "8"
) # Replaces the Unicode replacement character '�' with '8'
corpus[index] = corpus[index].replace(",", "") # Removes commas
corpus[index] = corpus[index].rstrip("\n") # Removes line breaks
corpus[index] = corpus[index].casefold() # Makes all letters lowercase

corpus[index] = re.sub(
r"\W_", " ", corpus[index]
) # removes special characters and leaves only words
corpus[index] = re.sub(
r"\S*\d\S*", " ", corpus[index]
) # removes numbers and words concatenated with numbers, e.g. h4ck3r, and road names such as BR-381
corpus[index] = re.sub(
r"\S*@\S*\s?", " ", corpus[index]
) # removes emails and mentions (words with @)
corpus[index] = re.sub(r"http\S+", "", corpus[index]) # removes URLs with http
corpus[index] = re.sub(r"www\S+", "", corpus[index]) # removes URLs with www

listOfTokens = word_tokenize(corpus[index])
twoLetterWord = twoLetters(listOfTokens)

listOfTokens = removeWords(listOfTokens, stopwords)
listOfTokens = removeWords(listOfTokens, twoLetterWord)

listOfTokens = applyStemming(listOfTokens, param_stemmer)

corpus[index] = " ".join(listOfTokens)
corpus[index] = unidecode(corpus[index])

return corpus
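# Illustrative usage, assuming `stemmer` is a PorterStemmer instance as the import suggests:
#   corpus = processCorpus(df["overview"].tolist(), language="english", stemmer=stemmer)
# The list is modified in place and also returned.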


# Output: DataFrame with dense values
@@ -122,7 +193,7 @@ def do_weighting(method: str, series: pd.Series) -> pd.DataFrame:
elif method == "Count":
vectorizer = CountVectorizer()
elif method == "Tfidf":
vectorizer = TfidfVectorizer()
vectorizer = TfidfVectorizer(sublinear_tf=True)
else:
print("Error. Did not pass valid weighting method")
return
@@ -150,7 +221,61 @@ def ordinal(n: int) -> str:
return str(n) + suffix


# Source: https://danielcaraway.github.io/html/sklearn_cosine_similarity.html
def cosine_sim(df, df_col, class_no, pos_to_last, predicted_labels):
unigram_count = CountVectorizer(encoding="latin-1", binary=False)
unigram_count_stop_remove = CountVectorizer(
encoding="latin-1", binary=False, stop_words="english"
)

# get the list of candidate patterns
txts = df_col.loc[predicted_labels == class_no] # where label == class_no
vecs = unigram_count.fit_transform(txts)

cos_sim = cosine_similarity(vecs[-pos_to_last], vecs)

return cos_sim, txts
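# Note: as called from main(), pos_to_last=1 makes vecs[-pos_to_last] the design-problem
# row (the last document in txts), so cos_sim is a single 1 x len(txts) row of
# similarities between the problem and every description in its cluster.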


# Written by Akash
class PredictedPattern:
allPatternsPredicted = []
# "Stores name and reccurence "

def __init__(self, name, timesPredicted, cosineSimPercent):
self.name = name
self.timesPredicted = timesPredicted
self.cosineSimPercent = cosineSimPercent
self.clusterPercent = 0
self.totalPercent = 0

def assignPercentages(self, clusterPercent, totalPercent):
self.clusterPercent = clusterPercent
self.totalPercent = totalPercent


# Calculates three different percent match metrics between a design problem and a design pattern
# Written by Akash
def CalculatePercent(predictedPattern, clusterWeight, cosSimWeight, times_clustered):
totalPercent = 0
clusterPercent = 0
cosSimPercent = 0

clusterPercent = int((predictedPattern.timesPredicted / times_clustered) * 100)
cosSimPercent = predictedPattern.cosineSimPercent
totalPercent = int(
(((clusterPercent * clusterWeight) + (cosSimPercent * cosSimWeight)) / 2)
+ ((2 - clusterWeight - cosSimWeight) * 70)
)

return clusterPercent, cosSimPercent, totalPercent
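# Worked example with the weights used in main() (clusterWeight=0.9, cosSimWeight=0.8):
# a pattern predicted in 7 of 10 runs with an 85% cosine match gives
#   clusterPercent = int(7 / 10 * 100) = 70
#   totalPercent   = int(((70 * 0.9) + (85 * 0.8)) / 2 + (2 - 0.9 - 0.8) * 70)
#                  = int(65.5 + 21.0) = 86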


def main(design_problem: str = ""):
num_of_times_clustered = 10
clusterWeight = 0.9
cossimWeight = 0.8

# Reset output
output.clear()

Expand All @@ -176,66 +301,101 @@ def main(design_problem: str = ""):
)
df = pd.concat([df, new_row.to_frame().T], ignore_index=True)

# Preprocess
cleaned_text = preprocess(df["overview"])
# Create a dense tfidf matrix
tfidf_matrix = do_weighting("Tfidf", cleaned_text)
# # Preprocess
# cleaned_text = preprocess(df["overview"])
# # Create a dense tfidf matrix
# tfidf_matrix = do_weighting("Tfidf", cleaned_text)

# preprocess the corpus
corpus = df["overview"].tolist()
corpus = processCorpus(corpus, language="english", stemmer=stemmer)

# vectorize data
vectorizer = TfidfVectorizer(sublinear_tf=True)
X = vectorizer.fit_transform(corpus)
tfidf_matrix = pd.DataFrame(
data=X.toarray(), columns=vectorizer.get_feature_names_out()
)
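# tfidf_matrix: one row per pattern description (plus the design problem as the
# last row), one column per vocabulary term, with sublinear tf scaling applied.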
# Perform clustering
df_labels = do_cluster(tfidf_matrix)
# Append (horizontally) the cluster labels to the original DF
df = pd.concat([df, df_labels], axis=1)

# Used for cosine similarity
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["overview"])

# Used for output formatting
max_len_pattern = df["name"].str.len().max()
max_len = len(max(algorithms_pretty, key=len))

do_output()
for i, algorithm in enumerate(algorithms):
do_output(f"{algorithms_pretty[i]}")
do_output("-" * max_len)

# Get the cluster number of the design problem
predicted_cluster = df[algorithm].iloc[-1]

"""
1. Filter all rows where the cluster number matches
2. Select the name, category, and `algorithm` rows
3. Remove the design problem row
4. Make a copy to make the logic more clear
"""
df_problem_cluster = (
df.loc[df[algorithm] == predicted_cluster][["name", "category", algorithm]]
.iloc[:-1]
.copy()
)
for a, algo in enumerate(algorithms):
predictedPatterns = []
PredictedPattern.allPatternsPredicted.clear()

# Calculate cosine similarity for all patterns in the cluster vs. design problem
df_problem_cluster["match"] = cosine_similarity(
X[df_problem_cluster.index], X[-1]
).flatten()
for j in range(num_of_times_clustered):
df_labels = do_cluster(tfidf_matrix, algo)
df[algo] = df_labels[algo]

# Calculate the most likely category
predicted_category = (
df_problem_cluster.groupby("category")["match"].mean().idxmax()
)
do_output(f"Category is most likely {predicted_category}\n")

# Display the matching patterns by match % in descending order
for index, (name, percent) in enumerate(
sorted(
zip(df_problem_cluster["name"], df_problem_cluster["match"]),
key=lambda x: x[1],
reverse=True,
cos_sim, txts = cosine_sim(
df, df["overview"], df[algo].iloc[df.index[-1]], 1, df[algo]
)
):
do_output(
f"{ordinal(index + 1).ljust(4)} pattern: {name.ljust(max_len_pattern)} {percent:.0%}"

# get problem row
n = df.index[-1]
problemRow = df.iloc[[n]]

sim_sorted_doc_idx = cos_sim.argsort()
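# argsort() orders column indices by ascending similarity, so the most similar
# documents sit at the end of the row; len(txts) - (i + 2) below walks backwards
# from there, skipping the last entry (the design problem matched against itself).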

# loop over the patterns that fall in the same cluster as the design problem
for i in range(len(txts) - 1):
# description of the predicted pattern
patternDesc = txts.iloc[sim_sorted_doc_idx[-1][len(txts) - (i + 2)]]
# pattern name matching the description
patternName = (df["name"][(df["overview"] == patternDesc)]).to_string(
index=False
)

# percent cos sim match
percentMatch = int(
(cos_sim[0][sim_sorted_doc_idx[-1][len(txts) - (i + 2)]]) * 100
)

# tally how often this pattern is predicted across the repeated clustering runs
if patternName in PredictedPattern.allPatternsPredicted:
# print(patternName)
patternIndex = PredictedPattern.allPatternsPredicted.index(
patternName
)
# print(predictedPatterns[patternIndex].timesPredicted)

# print(patternIndex)
predictedPatterns[patternIndex].timesPredicted += 1
else:
predictedPatterns.append(
PredictedPattern(patternName, 1, percentMatch)
)
PredictedPattern.allPatternsPredicted.append(patternName)

print("\n", algo, "\n")
for predictedPattern in predictedPatterns:
clusterPercent, cosSimPercent, totalPercent = CalculatePercent(
predictedPattern,
clusterWeight,
cossimWeight,
num_of_times_clustered,
)
do_output()
predictedPattern.assignPercentages(clusterPercent, totalPercent)
predictedPatterns = sorted(
predictedPatterns, key=lambda x: x.totalPercent, reverse=True
)
for k, predictedPattern in enumerate(predictedPatterns, start=1):
print(
"{} pattern: {:<25}{}% CosSim match {}% Clustering Match {}% Total Match".format(
ordinal(k),
predictedPattern.name,
predictedPattern.cosineSimPercent,
predictedPattern.clusterPercent,
predictedPattern.totalPercent,
)
)

# clean up: drop the design-problem row and this algorithm's label column
df = df.drop(index=n)
df = df.drop(columns=[algo])

return output
