diff --git a/ml/predict.py b/ml/predict.py
index 745d72b..a58da6e 100644
--- a/ml/predict.py
+++ b/ml/predict.py
@@ -7,7 +7,7 @@ import numpy as np
 import pandas as pd
 from fcmeans import FCM
-from nltk import PorterStemmer
+from nltk import PorterStemmer, word_tokenize
 from nltk.corpus import stopwords
 from nltk.tag import pos_tag
 from sklearn import cluster
@@ -15,6 +15,8 @@ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn_extra.cluster import KMedoids
+from unidecode import unidecode
+import re
 
 try:
     nltk.find("corpora/stopwords")
@@ -76,43 +78,112 @@ def preprocess(series: pd.Series) -> pd.Series:
     return series
 
 
-def do_cluster(df_weighted: pd.DataFrame) -> pd.DataFrame:
+def do_cluster(df_weighted: pd.DataFrame, algo: str) -> pd.DataFrame:
-    # This is the DataFrame we will return that contains all the labels.
+    # This is the DataFrame we will return with the selected algorithm's labels.
     df = pd.DataFrame()
 
-    # Agglomerative (hierarchical)
-    agg = AgglomerativeClustering(n_clusters=3)
-    df["agglomerative"] = agg.fit_predict(df_weighted)
+    if algo == "agglomerative":
+        # Agglomerative (hierarchical)
+        agg = AgglomerativeClustering(n_clusters=3)
+        df["agglomerative"] = agg.fit_predict(df_weighted)
+    elif algo == "fuzzy_cmeans":
+        # Fuzzy c-means
+        final_df_np = df_weighted.to_numpy()
+        fcm = FCM(n_clusters=3, m=1.67, max_iter=270)
+        fcm.fit(final_df_np)
+        df["fuzzy_cmeans"] = fcm.predict(final_df_np)
+    elif algo == "kmeans":
+        # K-means
+        km = cluster.KMeans(n_clusters=3, n_init="auto")
+        df["kmeans"] = km.fit_predict(df_weighted)
+    elif algo == "bi_kmeans_inertia":
+        # Bisecting k-means (inertia)
+        bisect_inertia = BisectingKMeans(n_clusters=3)
+        df["bi_kmeans_inertia"] = bisect_inertia.fit_predict(df_weighted)
+    elif algo == "bi_kmeans_lg_cluster":
+        # Bisecting k-means (largest cluster)
+        bisect_lg_cluster = BisectingKMeans(
+            n_clusters=3, bisecting_strategy="largest_cluster"
+        )
+        df["bi_kmeans_lg_cluster"] = bisect_lg_cluster.fit_predict(df_weighted)
+    elif algo == "pam_euclidean":
+        # K-medoids (Euclidean distance)
+        kmed_euclidean = KMedoids(n_clusters=3)
+        df["pam_euclidean"] = kmed_euclidean.fit_predict(df_weighted)
+    elif algo == "pam_manhattan":
+        # K-medoids (Manhattan distance)
+        kmed_manhattan = KMedoids(n_clusters=3, metric="manhattan")
+        df["pam_manhattan"] = kmed_manhattan.fit_predict(df_weighted)
+    else:
+        raise ValueError(f"Invalid algorithm: {algo!r}")
 
-    # Bisecting k-means
-    bisect_inertia = BisectingKMeans(n_clusters=3)
-    bisect_lg_cluster = BisectingKMeans(
-        n_clusters=3, bisecting_strategy="largest_cluster"
-    )
-    df["bi_kmeans_inertia"] = bisect_inertia.fit_predict(df_weighted)
-    df["bi_kmeans_lg_cluster"] = bisect_lg_cluster.fit_predict(df_weighted)
+    return df
+
+
+# Better for this to be an enum, but the syntax is a bit tricky.
+weighting_methods = {"Binary", "Count", "Tfidf"}
 
-    # Fuzzy c-means
-    final_df_np = df_weighted.to_numpy()
-    fcm = FCM(n_clusters=3, random_state=9)
-    fcm.fit(final_df_np)
-    df["fuzzy_cmeans"] = fcm.predict(final_df_np)
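+
+# A minimal sketch of the enum alternative mentioned above; `WeightingMethod`
+# is a hypothetical name, not used elsewhere in this patch:
+#
+#     from enum import Enum
+#
+#     class WeightingMethod(Enum):
+#         BINARY = "Binary"
+#         COUNT = "Count"
+#         TFIDF = "Tfidf"
+#
+#     # callers would then pass e.g. WeightingMethod.TFIDF.value
+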
+# removes a list of words (i.e. stopwords) from a tokenized list
+def removeWords(listOfTokens, listOfWords):
+    return [token for token in listOfTokens if token not in listOfWords]
 
-    # K-means
-    km = cluster.KMeans(n_clusters=3, n_init=10, random_state=9)
-    df["kmeans"] = km.fit_predict(df_weighted)
 
-    # K-medoids
-    kmed_euclidean = KMedoids(n_clusters=3)
-    kmed_manhattan = KMedoids(n_clusters=3, metric="manhattan")
-    df["pam_euclidean"] = kmed_euclidean.fit_predict(df_weighted)
-    df["pam_manhattan"] = kmed_manhattan.fit_predict(df_weighted)
+# applies stemming to a list of tokenized words
+def applyStemming(listOfTokens, stemmer):
+    return [stemmer.stem(token) for token in listOfTokens]
 
-    return df
+# applies lemmatization to a list of tokenized words
+def applyLemmatization(listOfTokens, lemmatizer):
+    return [lemmatizer.lemmatize(token) for token in listOfTokens]
 
-# Better for this to be an enum, but the syntax is a bit tricky.
-weighting_methods = {"Binary", "Count", "Tfidf"}
+
+# collects tokens shorter than 3 or longer than 20 characters so they can be removed
+def twoLetters(listOfTokens):
+    twoLetterWord = []
+    for token in listOfTokens:
+        if len(token) <= 2 or len(token) >= 21:
+            twoLetterWord.append(token)
+    return twoLetterWord
+
+
+def processCorpus(corpus, language, stemmer):
+    stopwords = nltk.corpus.stopwords.words(language)
+
+    # enumerate() is used instead of corpus.index(document), which returns the
+    # wrong index when two documents are identical
+    for index, document in enumerate(corpus):
+        corpus[index] = str(document).replace(
+            "\ufffd", "8"
+        )  # Replaces the Unicode replacement character '�' with '8'
+        corpus[index] = corpus[index].replace(",", "")  # Removes commas
+        corpus[index] = corpus[index].rstrip("\n")  # Removes line breaks
+        corpus[index] = corpus[index].casefold()  # Makes all letters lowercase
+
+        corpus[index] = re.sub(
+            r"[\W_]", " ", corpus[index]
+        )  # removes special characters and underscores, leaving only words
+        corpus[index] = re.sub(
+            r"\S*\d\S*", " ", corpus[index]
+        )  # removes numbers and words concatenated with numbers, e.g. h4ck3r, and road names such as BR-381
+        corpus[index] = re.sub(
+            r"\S*@\S*\s?", " ", corpus[index]
+        )  # removes emails and mentions (words with @)
+        corpus[index] = re.sub(r"http\S+", "", corpus[index])  # removes URLs with http
+        corpus[index] = re.sub(r"www\S+", "", corpus[index])  # removes URLs with www
+
+        listOfTokens = word_tokenize(corpus[index])
+        twoLetterWord = twoLetters(listOfTokens)
+
+        listOfTokens = removeWords(listOfTokens, stopwords)
+        listOfTokens = removeWords(listOfTokens, twoLetterWord)
+
+        listOfTokens = applyStemming(listOfTokens, stemmer)
+
+        corpus[index] = " ".join(listOfTokens)
+        corpus[index] = unidecode(corpus[index])  # strips any remaining accents
+
+    return corpus
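+
+
+# A rough usage sketch of processCorpus (made-up sample text; PorterStemmer is
+# already imported from nltk at the top of this module):
+#
+#     docs = ["Observers must be notified when the subject's state changes."]
+#     processCorpus(docs, language="english", stemmer=PorterStemmer())
+#     # -> roughly ["observ must notifi subject state chang"]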
 
 
 # Output: DataFrame with dense values
@@ -122,7 +193,7 @@ def do_weighting(method: str, series: pd.Series) -> pd.DataFrame:
     elif method == "Count":
         vectorizer = CountVectorizer()
     elif method == "Tfidf":
-        vectorizer = TfidfVectorizer()
+        vectorizer = TfidfVectorizer(sublinear_tf=True)
     else:
         print("Error. Did not pass valid weighting method")
         return
@@ -150,7 +221,61 @@ def ordinal(n: int) -> str:
     return str(n) + suffix
 
 
+# Source: https://danielcaraway.github.io/html/sklearn_cosine_similarity.html
+def cosine_sim(df_col, class_no, pos_to_last, predicted_labels):
+    unigram_count = CountVectorizer(encoding="latin-1", binary=False)
+
+    # get the list of candidate patterns
+    txts = df_col.loc[predicted_labels == class_no]  # where label == class_no
+    vecs = unigram_count.fit_transform(txts)
+
+    # similarity of the document `pos_to_last` positions from the end against all others
+    cos_sim = cosine_similarity(vecs[-pos_to_last], vecs)
+
+    return cos_sim, txts
+
+
+# Written by Akash
+# Stores a pattern's name, how many clustering runs predicted it, and its
+# match percentages
+class PredictedPattern:
+    # every pattern name predicted so far, shared across instances
+    allPatternsPredicted = []
+
+    def __init__(self, name, timesPredicted, cosineSimPercent):
+        self.name = name
+        self.timesPredicted = timesPredicted
+        self.cosineSimPercent = cosineSimPercent
+        self.clusterPercent = 0
+        self.totalPercent = 0
+
+    def assignPercentages(self, clusterPercent, totalPercent):
+        self.clusterPercent = clusterPercent
+        self.totalPercent = totalPercent
+
+
+# Calculates three different percent match metrics between a design problem
+# and a design pattern
+# Written by Akash
+def CalculatePercent(predictedPattern, clusterWeight, cosSimWeight, times_clustered):
+    clusterPercent = int((predictedPattern.timesPredicted / times_clustered) * 100)
+    cosSimPercent = predictedPattern.cosineSimPercent
+    totalPercent = int(
+        (((clusterPercent * clusterWeight) + (cosSimPercent * cosSimWeight)) / 2)
+        + ((2 - clusterWeight - cosSimWeight) * 70)
+    )
+
+    return clusterPercent, cosSimPercent, totalPercent
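+
+
+# Worked example with the weights set in main() below: a pattern predicted in
+# 7 of 10 clustering runs (clusterPercent = 70) with cosineSimPercent = 80,
+# clusterWeight = 0.9, and cosSimWeight = 0.8 gives
+#
+#     totalPercent = int(((70 * 0.9 + 80 * 0.8) / 2) + ((2 - 0.9 - 0.8) * 70))
+#                  = int(63.5 + 21.0)
+#                  = 84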
+
+
 def main(design_problem: str = ""):
+    num_of_times_clustered = 10
+    clusterWeight = 0.9
+    cosSimWeight = 0.8
+
     # Reset output
     output.clear()
@@ -176,66 +301,101 @@ def main(design_problem: str = ""):
     )
     df = pd.concat([df, new_row.to_frame().T], ignore_index=True)
 
     # Preprocess
-    cleaned_text = preprocess(df["overview"])
-    # Create a dense tfidf matrix
-    tfidf_matrix = do_weighting("Tfidf", cleaned_text)
+    corpus = df["overview"].tolist()
+    corpus = processCorpus(corpus, language="english", stemmer=stemmer)
+
+    # Vectorize the cleaned corpus into a dense tf-idf matrix
+    vectorizer = TfidfVectorizer(sublinear_tf=True)
+    X = vectorizer.fit_transform(corpus)
+    tfidf_matrix = pd.DataFrame(
+        data=X.toarray(), columns=vectorizer.get_feature_names_out()
+    )
 
     # Perform clustering
-    df_labels = do_cluster(tfidf_matrix)
-    # Append (horizontally) the cluster labels to the original DF
-    df = pd.concat([df, df_labels], axis=1)
-
-    # Used for cosine similarity
-    vectorizer = CountVectorizer()
-    X = vectorizer.fit_transform(df["overview"])
-
-    # Used for output formatting
-    max_len_pattern = df["name"].str.len().max()
-    max_len = len(max(algorithms_pretty, key=len))
-
-    do_output()
-    for i, algorithm in enumerate(algorithms):
-        do_output(f"{algorithms_pretty[i]}")
-        do_output("-" * max_len)
-
-        # Get the cluster number of the design problem
-        predicted_cluster = df[algorithm].iloc[-1]
-
-        """
-        1. Filter all rows where the cluster number matches
-        2. Select the name, category, and `algorithm` rows
-        3. Remove the design problem row
-        4. Make a copy to make the logic more clear
-        """
-        df_problem_cluster = (
-            df.loc[df[algorithm] == predicted_cluster][["name", "category", algorithm]]
-            .iloc[:-1]
-            .copy()
-        )
-
-        # Calculate cosine similarity for all patterns in the cluster vs. design problem
-        df_problem_cluster["match"] = cosine_similarity(
-            X[df_problem_cluster.index], X[-1]
-        ).flatten()
-
-        # Calculate the most likely category
-        predicted_category = (
-            df_problem_cluster.groupby("category")["match"].mean().idxmax()
-        )
-        do_output(f"Category is most likely {predicted_category}\n")
-
-        # Display the matching patterns by match % in descending order
-        for index, (name, percent) in enumerate(
-            sorted(
-                zip(df_problem_cluster["name"], df_problem_cluster["match"]),
-                key=lambda x: x[1],
-                reverse=True,
-            )
-        ):
-            do_output(
-                f"{ordinal(index + 1).ljust(4)} pattern: {name.ljust(max_len_pattern)} {percent:.0%}"
-            )
-        do_output()
+    for algo in algorithms:
+        predictedPatterns = []
+        PredictedPattern.allPatternsPredicted.clear()
+
+        for _ in range(num_of_times_clustered):
+            df_labels = do_cluster(tfidf_matrix, algo)
+            df[algo] = df_labels[algo]
+
+            # Cosine similarity between the design problem (the last row) and
+            # every pattern in its predicted cluster
+            cos_sim, txts = cosine_sim(
+                df["overview"], df[algo].iloc[-1], 1, df[algo]
+            )
+
+            sim_sorted_doc_idx = cos_sim.argsort()
+
+            # loop patterns of the same cluster as the design problem, skipping
+            # the problem document itself (its self-similarity ranks highest)
+            for i in range(len(txts) - 1):
+                # pattern predicted description
+                patternDesc = txts.iloc[sim_sorted_doc_idx[-1][len(txts) - (i + 2)]]
+                # pattern name matching the description
+                patternName = (df["name"][(df["overview"] == patternDesc)]).to_string(
+                    index=False
+                )
+
+                # percent cos sim match
+                percentMatch = int(
+                    (cos_sim[0][sim_sorted_doc_idx[-1][len(txts) - (i + 2)]]) * 100
+                )
+
+                # count how often each pattern is predicted across the clustering runs
+                if patternName in PredictedPattern.allPatternsPredicted:
+                    patternIndex = PredictedPattern.allPatternsPredicted.index(
+                        patternName
+                    )
+                    predictedPatterns[patternIndex].timesPredicted += 1
+                else:
+                    predictedPatterns.append(
+                        PredictedPattern(patternName, 1, percentMatch)
+                    )
+                    PredictedPattern.allPatternsPredicted.append(patternName)
+
+        do_output(f"\n{algo}\n")
+        for predictedPattern in predictedPatterns:
+            clusterPercent, cosSimPercent, totalPercent = CalculatePercent(
+                predictedPattern,
+                clusterWeight,
+                cosSimWeight,
+                num_of_times_clustered,
+            )
+            predictedPattern.assignPercentages(clusterPercent, totalPercent)
+        predictedPatterns = sorted(
+            predictedPatterns, key=lambda x: x.totalPercent, reverse=True
+        )
+        for k, predictedPattern in enumerate(predictedPatterns, start=1):
+            do_output(
+                "{} pattern: {:<25}{}% CosSim match {}% Clustering Match {}% Total Match".format(
+                    ordinal(k),
+                    predictedPattern.name,
+                    predictedPattern.cosineSimPercent,
+                    predictedPattern.clusterPercent,
+                    predictedPattern.totalPercent,
+                )
+            )
+
+        # clean up: drop this algorithm's label column before the next run
+        df = df.drop(columns=[algo])
+
+    # drop the design problem row that was appended for clustering
+    df = df.drop(index=df.index[-1])
 
     return output
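+
+
+# A hypothetical smoke test of the new flow (the sample problem text is made up):
+#
+#     if __name__ == "__main__":
+#         for line in main("Notify dependent objects when the subject's state changes"):
+#             print(line)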