328 changes: 244 additions & 84 deletions ml/predict.py
@@ -7,14 +7,16 @@
import numpy as np
import pandas as pd
from fcmeans import FCM
from nltk import PorterStemmer
from nltk import PorterStemmer, word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from sklearn import cluster
from sklearn.cluster import AgglomerativeClustering, BisectingKMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn_extra.cluster import KMedoids
from unidecode import unidecode
import re

try:
nltk.find("corpora/stopwords")
@@ -76,43 +78,112 @@ def preprocess(series: pd.Series) -> pd.Series:
return series


def do_cluster(df_weighted: pd.DataFrame) -> pd.DataFrame:
def do_cluster(df_weighted: pd.DataFrame, algo: str) -> pd.DataFrame:
# This is the DataFrame we will return that contains all the labels.
df = pd.DataFrame()

# Agglomerative (hierarchical)
agg = AgglomerativeClustering(n_clusters=3)
df["agglomerative"] = agg.fit_predict(df_weighted)
if algo == "agglomerative":
# Agglomerative (hierarchical)
agg = AgglomerativeClustering(n_clusters=3)
df["agglomerative"] = agg.fit_predict(df_weighted)
elif algo == "fuzzy_cmeans":
# Fuzzy c-means
final_df_np = df_weighted.to_numpy()
fcm = FCM(n_clusters=3, m=1.67, max_iter=270)
fcm.fit(final_df_np)
df["fuzzy_cmeans"] = fcm.predict(final_df_np)
elif algo == "kmeans":
# K-means
km = cluster.KMeans(n_clusters=3, n_init="auto")
df["kmeans"] = km.fit_predict(df_weighted)
elif algo == "bi_kmeans_inertia":
# Bisecting k-means (inertia)
bisect_inertia = BisectingKMeans(n_clusters=3)
df["bi_kmeans_inertia"] = bisect_inertia.fit_predict(df_weighted)
elif algo == "bi_kmeans_lg_cluster":
# Bisecting k-means (largest cluster)
bisect_lg_cluster = BisectingKMeans(
n_clusters=3, bisecting_strategy="largest_cluster"
)
df["bi_kmeans_lg_cluster"] = bisect_lg_cluster.fit_predict(df_weighted)
elif algo == "pam_euclidean":
# K-medoids (Euclidean distance)
kmed_euclidean = KMedoids(n_clusters=3)
df["pam_euclidean"] = kmed_euclidean.fit_predict(df_weighted)
elif algo == "pam_manhattan":
# K-medoids (Manhattan distance)
kmed_manhattan = KMedoids(n_clusters=3, metric="manhattan")
df["pam_manhattan"] = kmed_manhattan.fit_predict(df_weighted)
else:
print("ERROR: Invalid algorithm!")

# Bisecting k-means
bisect_inertia = BisectingKMeans(n_clusters=3)
bisect_lg_cluster = BisectingKMeans(
n_clusters=3, bisecting_strategy="largest_cluster"
)
df["bi_kmeans_inertia"] = bisect_inertia.fit_predict(df_weighted)
df["bi_kmeans_lg_cluster"] = bisect_lg_cluster.fit_predict(df_weighted)
return df
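# Illustrative usage (names follow main() below):
#   df_labels = do_cluster(tfidf_matrix, "kmeans")
#   df_labels["kmeans"]  # one cluster id in {0, 1, 2} per document
# An unrecognised `algo` only prints the error and returns an empty DataFrame.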


# Better for this to be an enum, but the syntax is a bit tricky.
weighting_methods = {"Binary", "Count", "Tfidf"}

# Fuzzy c-means
final_df_np = df_weighted.to_numpy()
fcm = FCM(n_clusters=3, random_state=9)
fcm.fit(final_df_np)
df["fuzzy_cmeans"] = fcm.predict(final_df_np)
# removes a list of words (e.g. stopwords) from a tokenized list.
def removeWords(listOfTokens, listOfWords):
return [token for token in listOfTokens if token not in listOfWords]

# K-means
km = cluster.KMeans(n_clusters=3, n_init=10, random_state=9)
df["kmeans"] = km.fit_predict(df_weighted)

# K-medoids
kmed_euclidean = KMedoids(n_clusters=3)
kmed_manhattan = KMedoids(n_clusters=3, metric="manhattan")
df["pam_euclidean"] = kmed_euclidean.fit_predict(df_weighted)
df["pam_manhattan"] = kmed_manhattan.fit_predict(df_weighted)
# applies stemming to a list of tokenized words
def applyStemming(listOfTokens, stemmer):
return [stemmer.stem(token) for token in listOfTokens]
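# e.g. applyStemming(["running", "cats"], PorterStemmer()) -> ["run", "cat"]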

return df

# applies lemmatization to a list of tokenized words
def applyLemmatization(listOfTokens, lemmatizer):
return [lemmatizer.lemmatize(token) for token in listOfTokens]

# Better for this to be an enum, but the syntax is a bit tricky.
weighting_methods = {"Binary", "Count", "Tfidf"}

# collects words of fewer than 3 or more than 20 letters so they can be removed later
def twoLetters(listOfTokens):
twoLetterWord = []
for token in listOfTokens:
if len(token) <= 2 or len(token) >= 21:
twoLetterWord.append(token)
return twoLetterWord
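# e.g. twoLetters(["a", "observer", "go"]) -> ["a", "go"]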


def processCorpus(corpus, language, stemmer):
stopwords = nltk.corpus.stopwords.words(language)
param_stemmer = stemmer

for index, document in enumerate(corpus):
corpus[index] = str(document).replace(
"\ufffd", "8"
) # Replaces the Unicode replacement character '�' with '8'
corpus[index] = corpus[index].replace(",", "") # Removes commas
corpus[index] = corpus[index].rstrip("\n") # Removes line breaks
corpus[index] = corpus[index].casefold() # Makes all letters lowercase

corpus[index] = re.sub(
r"\W_", " ", corpus[index]
) # removes special characters and leaves only words
corpus[index] = re.sub(
r"\S*\d\S*", " ", corpus[index]
) # removes numbers and words concatenated with numbers, e.g. h4ck3r, and road names such as BR-381
corpus[index] = re.sub(
r"\S*@\S*\s?", " ", corpus[index]
) # removes emails and mentions (words with @)
corpus[index] = re.sub(r"http\S+", "", corpus[index]) # removes URLs with http
corpus[index] = re.sub(r"www\S+", "", corpus[index]) # removes URLs with www

listOfTokens = word_tokenize(corpus[index])
twoLetterWord = twoLetters(listOfTokens)

listOfTokens = removeWords(listOfTokens, stopwords)
listOfTokens = removeWords(listOfTokens, twoLetterWord)

listOfTokens = applyStemming(listOfTokens, param_stemmer)

corpus[index] = " ".join(listOfTokens)
corpus[index] = unidecode(corpus[index])

return corpus
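# Illustrative usage, assuming `stemmer` is a PorterStemmer instance as the import suggests:
#   corpus = processCorpus(df["overview"].tolist(), language="english", stemmer=stemmer)
# The list is modified in place and also returned.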


# Output: DataFrame with dense values
@@ -122,7 +193,7 @@ def do_weighting(method: str, series: pd.Series) -> pd.DataFrame:
elif method == "Count":
vectorizer = CountVectorizer()
elif method == "Tfidf":
vectorizer = TfidfVectorizer()
vectorizer = TfidfVectorizer(sublinear_tf=True)
else:
print("Error. Did not pass valid weighting method")
return
@@ -150,7 +221,61 @@ def ordinal(n: int) -> str:
return str(n) + suffix


# Source: https://danielcaraway.github.io/html/sklearn_cosine_similarity.html
def cosine_sim(df, df_col, class_no, pos_to_last, predicted_labels):
unigram_count = CountVectorizer(encoding="latin-1", binary=False)
unigram_count_stop_remove = CountVectorizer(
encoding="latin-1", binary=False, stop_words="english"
)

# get the list of candidate patterns
txts = df_col.loc[predicted_labels == class_no] # where label == class_no
vecs = unigram_count.fit_transform(txts)

cos_sim = cosine_similarity(vecs[-pos_to_last], vecs)

return cos_sim, txts
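# Note: as called from main(), pos_to_last=1 makes vecs[-pos_to_last] the design-problem
# row (the last document in txts), so cos_sim is a single 1 x len(txts) row of
# similarities between the problem and every description in its cluster.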


# Written by Akash
class PredictedPattern:
allPatternsPredicted = []
# "Stores name and reccurence "

def __init__(self, name, timesPredicted, cosineSimPercent):
self.name = name
self.timesPredicted = timesPredicted
self.cosineSimPercent = cosineSimPercent
self.clusterPercent = 0
self.totalPercent = 0

def assignPercentages(self, clusterPercent, totalPercent):
self.clusterPercent = clusterPercent
self.totalPercent = totalPercent


# Calculates three different percent match metrics between a design problem and a design pattern
# Written by Akash
def CalculatePercent(predictedPattern, clusterWeight, cosSimWeight, times_clustered):
totalPercent = 0
clusterPercent = 0
cosSimPercent = 0

clusterPercent = int((predictedPattern.timesPredicted / times_clustered) * 100)
cosSimPercent = predictedPattern.cosineSimPercent
totalPercent = int(
(((clusterPercent * clusterWeight) + (cosSimPercent * cosSimWeight)) / 2)
+ ((2 - clusterWeight - cosSimWeight) * 70)
)

return clusterPercent, cosSimPercent, totalPercent
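# Worked example with the weights used in main() (clusterWeight=0.9, cosSimWeight=0.8):
# a pattern predicted in 7 of 10 runs with an 85% cosine match gives
#   clusterPercent = int(7 / 10 * 100) = 70
#   totalPercent   = int(((70 * 0.9) + (85 * 0.8)) / 2 + (2 - 0.9 - 0.8) * 70)
#                  = int(65.5 + 21.0) = 86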


def main(design_problem: str = ""):
num_of_times_clustered = 10
clusterWeight = 0.9
cossimWeight = 0.8

# Reset output
output.clear()

Expand All @@ -176,66 +301,101 @@ def main(design_problem: str = ""):
)
df = pd.concat([df, new_row.to_frame().T], ignore_index=True)

# Preprocess
cleaned_text = preprocess(df["overview"])
# Create a dense tfidf matrix
tfidf_matrix = do_weighting("Tfidf", cleaned_text)
# # Preprocess
# cleaned_text = preprocess(df["overview"])
# # Create a dense tfidf matrix
# tfidf_matrix = do_weighting("Tfidf", cleaned_text)

# preprocess the corpus
corpus = df["overview"].tolist()
corpus = processCorpus(corpus, language="english", stemmer=stemmer)

# vectorize data
vectorizer = TfidfVectorizer(sublinear_tf=True)
X = vectorizer.fit_transform(corpus)
tfidf_matrix = pd.DataFrame(
data=X.toarray(), columns=vectorizer.get_feature_names_out()
)
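# tfidf_matrix: one row per pattern description (plus the design problem as the
# last row), one column per vocabulary term, with sublinear tf scaling applied.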
# Perform clustering
df_labels = do_cluster(tfidf_matrix)
# Append (horizontally) the cluster labels to the original DF
df = pd.concat([df, df_labels], axis=1)

# Used for cosine similarity
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["overview"])

# Used for output formatting
max_len_pattern = df["name"].str.len().max()
max_len = len(max(algorithms_pretty, key=len))

do_output()
for i, algorithm in enumerate(algorithms):
do_output(f"{algorithms_pretty[i]}")
do_output("-" * max_len)

# Get the cluster number of the design problem
predicted_cluster = df[algorithm].iloc[-1]

"""
1. Filter all rows where the cluster number matches
2. Select the name, category, and `algorithm` rows
3. Remove the design problem row
4. Make a copy to make the logic more clear
"""
df_problem_cluster = (
df.loc[df[algorithm] == predicted_cluster][["name", "category", algorithm]]
.iloc[:-1]
.copy()
)
for a, algo in enumerate(algorithms):
predictedPatterns = []
PredictedPattern.allPatternsPredicted.clear()

# Calculate cosine similarity for all patterns in the cluster vs. design problem
df_problem_cluster["match"] = cosine_similarity(
X[df_problem_cluster.index], X[-1]
).flatten()
for j in range(num_of_times_clustered):
df_labels = do_cluster(tfidf_matrix, algo)
df[algo] = df_labels[algo]

# Calculate the most likely category
predicted_category = (
df_problem_cluster.groupby("category")["match"].mean().idxmax()
)
do_output(f"Category is most likely {predicted_category}\n")

# Display the matching patterns by match % in descending order
for index, (name, percent) in enumerate(
sorted(
zip(df_problem_cluster["name"], df_problem_cluster["match"]),
key=lambda x: x[1],
reverse=True,
cos_sim, txts = cosine_sim(
df, df["overview"], df[algo].iloc[df.index[-1]], 1, df[algo]
)
):
do_output(
f"{ordinal(index + 1).ljust(4)} pattern: {name.ljust(max_len_pattern)} {percent:.0%}"

# get problem row
n = df.index[-1]
problemRow = df.iloc[[n]]

sim_sorted_doc_idx = cos_sim.argsort()
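# argsort() orders column indices by ascending similarity, so the most similar
# documents sit at the end of the row; len(txts) - (i + 2) below walks backwards
# from there, skipping the last entry (the design problem matched against itself).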

# loop over the patterns that fall in the same cluster as the design problem
for i in range(len(txts) - 1):
# description of the predicted pattern
patternDesc = txts.iloc[sim_sorted_doc_idx[-1][len(txts) - (i + 2)]]
# pattern name matching the description
patternName = (df["name"][(df["overview"] == patternDesc)]).to_string(
index=False
)

# percent cos sim match
percentMatch = int(
(cos_sim[0][sim_sorted_doc_idx[-1][len(txts) - (i + 2)]]) * 100
)

# tally how often this pattern is predicted across the repeated clustering runs
if patternName in PredictedPattern.allPatternsPredicted:
# print(patternName)
patternIndex = PredictedPattern.allPatternsPredicted.index(
patternName
)
# print(predictedPatterns[patternIndex].timesPredicted)

# print(patternIndex)
predictedPatterns[patternIndex].timesPredicted += 1
else:
predictedPatterns.append(
PredictedPattern(patternName, 1, percentMatch)
)
PredictedPattern.allPatternsPredicted.append(patternName)

print("\n", algo, "\n")
for predictedPattern in predictedPatterns:
clusterPercent, cosSimPercent, totalPercent = CalculatePercent(
predictedPattern,
clusterWeight,
cossimWeight,
num_of_times_clustered,
)
do_output()
predictedPattern.assignPercentages(clusterPercent, totalPercent)
predictedPatterns = sorted(
predictedPatterns, key=lambda x: x.totalPercent, reverse=True
)
for k, predictedPattern in enumerate(predictedPatterns, start=1):
print(
"{} pattern: {:<25}{}% CosSim match {}% Clustering Match {}% Total Match".format(
ordinal(k),
predictedPattern.name,
predictedPattern.cosineSimPercent,
predictedPattern.clusterPercent,
predictedPattern.totalPercent,
)
)

# clean up: drop the design-problem row and this algorithm's label column
df = df.drop(index=n)
df = df.drop(columns=[algo])

return output
