From 2ee591b828adb0f0a5606e6441809b0484ff5a36 Mon Sep 17 00:00:00 2001
From: Tony_Tian_1122
Date: Thu, 2 Jun 2022 21:41:48 -0400
Subject: [PATCH 01/12] add Kmeans module

---
 KMeans.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 KMeans.py

diff --git a/KMeans.py b/KMeans.py
new file mode 100644
index 0000000..b1024df
--- /dev/null
+++ b/KMeans.py
@@ -0,0 +1,49 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_blobs
+
+'''
+Function that gets data points and cluster number(centroids), returns coordinates
+of cluster centers
+Default values: number of runs on different centroid seeds = 10, max runs = 300
+'''
+def run_kmeans(data, centroids, n_init=10, max_iter=300):
+    KM = KMeans(n_clusters = centroids, n_init=n_init, max_iter=max_iter)
+    y_KM = KM.fit_predict(data)
+    return KM.cluster_centers_
+
+'''
+Function that helps to determine how many clusters to use by using trials of K clusters
+The idea is to find the cluster number that gives the maximum reduction in inertia
+'''
+def elbow_method(data, num_k, n_init=10, max_iter=300):
+    inertia = []
+    for i in range(1, num_k):
+        KM = KMeans(
+            n_clusters=i,
+            n_init=n_init, max_iter=max_iter
+        )
+        KM.fit_predict(data)
+        inertia.append(KM.inertia_)
+
+    plt.plot(range(1, num_k), inertia, marker='o')
+    plt.xlabel('Number of clusters')
+    plt.ylabel('Inertia')
+    plt.show()
+
+
+'''Generate random sample (write another method to get data later?), just to show an example'''
+data, y = make_blobs(n_samples = 400, centers = 6, cluster_std = 0.60, random_state = 0)
+# plt.scatter(data[:, 0], data[:, 1])
+# plt.show()
+elbow_method(data, 10)
+# print(run_kmeans(data, 6))
+
+
+
+
+
+
\ No newline at end of file
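A quick illustrative sketch of how the inertia curve that `elbow_method` plots can also be used to pick a cluster count programmatically. This is not part of the patch above: the data are the same synthetic blobs the module generates for its example, and the "largest bend" rule is an assumed heuristic for reading the elbow, not something KMeans.py prescribes.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Same synthetic sample the module uses for its example
data, _ = make_blobs(n_samples=400, centers=6, cluster_std=0.60, random_state=0)

# Rebuild the inertia-vs-k curve that elbow_method() plots
ks = range(1, 10)
inertia = [KMeans(n_clusters=k, n_init=10, max_iter=300).fit(data).inertia_ for k in ks]

# Heuristic elbow pick: the k where the curve bends the most
# (largest second difference of the inertia values, defined for ks[1] .. ks[-2]).
second_diff = np.diff(inertia, n=2)
best_k = ks[int(np.argmax(second_diff)) + 1]
print("suggested k:", best_k)

# run_kmeans(data, best_k) from KMeans.py would then return the fitted cluster
# centers; the equivalent scikit-learn call is:
centers = KMeans(n_clusters=best_k, n_init=10, max_iter=300).fit(data).cluster_centers_
print(centers.shape)
```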
From 5c84ba9d320f35754755b6acf767e645fec24b Mon Sep 17 00:00:00 2001
From: Tony_Tian_1122
Date: Wed, 22 Jun 2022 00:12:36 -0400
Subject: [PATCH 02/12] Finish with T-test

---
 ANOVA.py            |  0
 KMeans.py           |  2 ++
 LinearRegression.py | 35 +++++++++++++++++++++++++++++++++++
 T-tests.py          | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+)
 create mode 100644 ANOVA.py
 create mode 100644 LinearRegression.py
 create mode 100644 T-tests.py

diff --git a/ANOVA.py b/ANOVA.py
new file mode 100644
index 0000000..e69de29
diff --git a/KMeans.py b/KMeans.py
index b1024df..edbaa01 100644
--- a/KMeans.py
+++ b/KMeans.py
@@ -35,7 +35,9 @@ def elbow_method(data, num_k, n_init=10, max_iter=300):
 
 
 '''Generate random sample (write another method to get data later?), just to show an example'''
+# Assume we get this from the pre-processed data?
 data, y = make_blobs(n_samples = 400, centers = 6, cluster_std = 0.60, random_state = 0)
+
 # plt.scatter(data[:, 0], data[:, 1])
 # plt.show()
 elbow_method(data, 10)
diff --git a/LinearRegression.py b/LinearRegression.py
new file mode 100644
index 0000000..9f0b669
--- /dev/null
+++ b/LinearRegression.py
@@ -0,0 +1,35 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+'''
+Regression class takes in a dataframe of values with two columns, which are respectively x and y
+User can call respective functions to get regression analysis outputs
+'''
+class LinearRegression():
+
+    def __init__(self, data) -> None:
+        self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]})
+        self.beta = None
+        self.alpha = None
+
+    def get_alpha_beta(self):
+        '''return a tuple (paried values) of beta and alpha, with beta first, alpha second'''
+        x_mean = np.mean(self.df['x'])
+        y_mean = np.mean(self.df['y'])
+        self.df['xy_cov'] = (self.df['x'] - x_mean)* (self.df['y'] - y_mean)
+        self.df['x_var'] = (self.df['x'] - x_mean)**2
+        beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()
+        alpha = y_mean - (beta * x_mean)
+        self.beta, self.alpha = beta, alpha
+
+        return beta, alpha
+
+    def predict_y(self):
+        '''Obtain regression results, store into data frame, and return as an output'''
+        self.get_alpha_beta()
+        self.df['y_pred'] = self.alpha + self.beta*self.df['x']
+        return self.df['y_pred']
+
+
+
\ No newline at end of file
diff --git a/T-tests.py b/T-tests.py
new file mode 100644
index 0000000..45d198a
--- /dev/null
+++ b/T-tests.py
@@ -0,0 +1,32 @@
+import pandas as pd
+import scipy.stats as stats
+import numpy as np
+
+'''
+GUIDELINE: pass data as an array(s) into T-test class
+Then use functions in this class to get desired results
+'''
+
+class t_test():
+
+    def __init__(self, data1, data2=None) -> None:
+        self.data1 = data1
+        self.data2 = data2
+
+    def one_sample_t_test(self, population_mean, side):
+        if side not in ['two-sided', 'less', 'greater']:
+            raise Exception("Only accept 'two-sided', 'less', or 'greater' for parameter 'side'")
+        return stats.ttest_1samp(self.data1, population_mean, alternative=side)
+
+    def two_sample_t_test(self, side):
+        if side not in ['two-sided', 'less', 'greater']:
+            raise Exception("Only accept 'two-sided', 'less', or 'greater' as a parameter")
+        return stats.ttest_ind(self.data1, self.data2, alternative=side)
+
+    def paired_sample_t_test(self):
+        return stats.ttest_rel(self.data1, self.data2)
+
+
+
+
+
\ No newline at end of file

From cd7967f27c465f8f8d2e4768746624d51f7a634b Mon Sep 17 00:00:00 2001
From: MuhangTian
Date: Wed, 22 Jun 2022 00:42:27 -0400
Subject: [PATCH 03/12] Update README.md

---
 README.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/README.md b/README.md
index eb857b5..5420804 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,22 @@
 # Stats Models
+## T-Test Tutorial
+1. user get array (or numpy array) of data from pre-processed module, then pass into t_test() class, t_test() can either 1 data or 2 data. For instance, t_test(data1) and t_test(data1, data2) both works depending on whether user want to test one sample or two samples
+2. call functions on t_test() class to get desired values
+
+```python
+# For one sample t-test, call below function to get t-test statistic based on that user wants to test
+t_test(data1).one_sample_t_test(mean, 'two-sided') # For two-sided test
+t_test(data1).one_sample_t_test(mean, 'less') # For one-sided, less than
+t_test(data1).one_sample_t_test(mean, 'greater') # For one-sided, greater than
+
+# For two sample t-test, call below function to get t-test statistic based on side of the test
+t_test(data1, data2).two_sample_t_test('two-sided') # For two-sided test
+t_test(data1, data2).two_sample_t_test('less') # For one-sided, less than
+t_test(data1, data2).two_sample_t_test('greater') # For one-sided, greater than
+
+# For paired sample t-test, simply call below function to get t-test statistic
+t_test(data1, data2).paired_sample_t_test()
+```
+
 # ML Models
 # DL Models
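The tutorial added above shows the call patterns; a short self-contained illustration of what those calls actually return may also help. The arrays and the 0.05 cutoff below are invented for the example, and the scipy functions are exactly the ones the t_test class wraps (ttest_1samp, ttest_ind, ttest_rel), so the result objects are the same.

```python
import numpy as np
import scipy.stats as stats

# Hypothetical pre-processed samples
data1 = np.array([5.1, 4.9, 5.4, 5.0, 5.3, 4.8, 5.2])
data2 = np.array([5.6, 5.4, 5.9, 5.5, 5.8, 5.3, 5.7])

# Equivalent of t_test(data1).one_sample_t_test(5.0, 'two-sided')
res = stats.ttest_1samp(data1, 5.0, alternative='two-sided')
print(res.statistic, res.pvalue)

# Equivalent of t_test(data1, data2).two_sample_t_test('less')
res = stats.ttest_ind(data1, data2, alternative='less')
if res.pvalue < 0.05:
    print("reject H0: the mean of data1 is smaller than the mean of data2")

# Equivalent of t_test(data1, data2).paired_sample_t_test()
res = stats.ttest_rel(data1, data2)
print(res.statistic, res.pvalue)
```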
From 30b3582dd4bec4e2cf0f21a44e248da0c7387ea3 Mon Sep 17 00:00:00 2001
From: MuhangTian
Date: Wed, 22 Jun 2022 00:42:55 -0400
Subject: [PATCH 04/12] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5420804..0b70b38 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Stats Models
-## T-Test Tutorial
-1. user get array (or numpy array) of data from pre-processed module, then pass into t_test() class, t_test() can either 1 data or 2 data. For instance, t_test(data1) and t_test(data1, data2) both works depending on whether user want to test one sample or two samples
-2. call functions on t_test() class to get desired values
+1. User get array (or numpy array) of data from pre-processed module, then pass into t_test() class, t_test() can either 1 data or 2 data. For instance, t_test(data1) and t_test(data1, data2) both works depending on whether user want to test one sample or two samples
+2. 
Call functions on t_test() class to get desired values ```python -# For one sample t-test, call below function to get t-test statistic based on that user wants to test +# For one sample t-test, call below function to get t-test statistic based on a population mean that user wants to test t_test(data1).one_sample_t_test(mean, 'two-sided') # For two-sided test t_test(data1).one_sample_t_test(mean, 'less') # For one-sided, less than t_test(data1).one_sample_t_test(mean, 'greater') # For one-sided, greater than From 9896cd72244b2308a62851642fd21228c5d217ad Mon Sep 17 00:00:00 2001 From: Tony_Tian_1122 Date: Thu, 7 Jul 2022 22:02:30 -0400 Subject: [PATCH 07/12] Finish with clustering module, edited docstring --- ANOVA.py | 0 Clustering.py | 139 ++++++++++++++++++++++++++++++++++++++++++++ LinearRegression.py | 28 +++++++-- T-tests.py | 46 +++++++++++---- 4 files changed, 197 insertions(+), 16 deletions(-) delete mode 100644 ANOVA.py create mode 100644 Clustering.py diff --git a/ANOVA.py b/ANOVA.py deleted file mode 100644 index e69de29..0000000 diff --git a/Clustering.py b/Clustering.py new file mode 100644 index 0000000..968765a --- /dev/null +++ b/Clustering.py @@ -0,0 +1,139 @@ +from array import array +import numpy as np +from sklearn.cluster import MeanShift +from sklearn.cluster import DBSCAN +from sklearn.cluster import AgglomerativeClustering +from sklearn.mixture import GaussianMixture + +def mean_shift(centers, predict_data=None): + """Function that perform mean shift clustering, can also predict values if predict_data is passed + + Parameters + ---------- + centers : 2D array like + centers of data to perform clustering on + predict_data : 2D array like, optional + data to be predicted by the clustering, by default None + + Returns + ------- + cluster_centers, labels, num_features, predict + cluster_centers: centers after clustering + labels: labels of each point + num_features: number of features seen during fit + predict: predicted values by the clustering for predict_data + + Raises + ------ + Exception + raise exception when normal array (non 2D array) is passed in as predict data + """ + ms = MeanShift() + clustering = ms.fit(centers) + cluster_centers = clustering.cluster_centers_ + labels = clustering.labels_ + num_features = clustering.n_features_in_ + if type(predict_data) == type(array) or type(np.array): + try: predicted = clustering.predict(predict_data) + except: raise Exception ('Use 2D array for predict_data') + else: + predicted = None + return cluster_centers, labels, num_features, predicted + +def perform_DBSCAN(data, eps, min_samples): + """Perform DBSCAN algorithm on a given set of data + + Parameters + ---------- + data : 2D array-like + array of data of interest to perform DBSCAN + eps : float + The maximum distance between two samples for one to be considered as in the neighborhood of the other. + This is not a maximum bound on the distances of points within a cluster. + This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. + min_samples : int + The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. + This includes the point itself. + + Returns + ------- + labels, num_features, core_sample_indices, components + labels: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1. + num_features: Number of features seen during fit. + core_sample_indices: Indices of core samples. 
+ components: Copy of each core sample found by training. + """ + clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(data) + labels = clustering.labels_ + num_features = clustering.n_features_in_ + core_sample_indices = clustering.core_sample_indices_ + components = clustering.components_ + return labels, num_features, core_sample_indices, components + +def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None): + """Perform unsupervised learning with gaussian mixture model for a given data, and make prediction if needed + + Parameters + ---------- + data : 2D array + Array of data to be fitted with Gaussian Mixture Model + num_components : int + number of underlying Gaussian distributions + num_random_state : int + random seed for initialization, by default 0 + predict_data : 2D array, optional + array of data to be predicted from the model, by default None + + Returns + ------- + predicted + predicted is the predicted data of data passed into the model, which is predict_data + """ + GMM = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data) + if type(predict_data) == type(array) or type(np.array): + predicted = GMM.predict(predict_data) + else: predicted = None + return predicted + +def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None): + """Function that performs hiearchical clustering and fit to an array of data + + Parameters + ---------- + data : 2D array + data to be fitted + n_clusters : int, default=2 + number of clusters to find + linkage : {'ward', 'complete', 'average', 'single'}, default='ward' + Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. + The algorithm will merge the pairs of cluster that minimize this criterion. + + 'ward' minimizes the variance of the clusters being merged. + 'average' uses the average of the distances of each observation of the two sets. + 'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets. + 'single' uses the minimum of the distances between all observations of the two sets. + distance_threshold : float, default=None + The linkage distance threshold above which, clusters will not be merged. + If not None, n_clusters must be None and compute_full_tree must be True. + + Returns + ------- + num_clusters : int + The number of clusters found by the algorithm + labels : ndarray of shape (n_samples) + Cluster labels for each point. 
+ num_leaves : int + Number of leaves in the hierarchical tree + num_connected_components : int + The estimated number of connected components in the graph + num_features : int + number of features seen during fit + """ + model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters, distance_threshold=distance_threshold) + model.fit(data) + num_clusters = model.n_clusters_ + labels = model.labels_ + num_leaves = model.n_leaves_ + num_connected_components = model.n_connected_components_ + num_features = model.n_features_in_ + return num_clusters, labels, num_leaves, num_connected_components, num_features \ No newline at end of file diff --git a/LinearRegression.py b/LinearRegression.py index 9f0b669..b0631b4 100644 --- a/LinearRegression.py +++ b/LinearRegression.py @@ -2,11 +2,16 @@ import numpy as np import matplotlib.pyplot as plt -''' -Regression class takes in a dataframe of values with two columns, which are respectively x and y -User can call respective functions to get regression analysis outputs -''' class LinearRegression(): + """ + Regression class takes in a dataframe of values with two columns, which are respectively x and y + User can call respective functions to get regression analysis outputs + + Parameters + ---------- + df : (pandas.DataFrame) a pandas dataframe containing two columns, first being x-values, second + being y-values + """ def __init__(self, data) -> None: self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]}) @@ -14,7 +19,12 @@ def __init__(self, data) -> None: self.alpha = None def get_alpha_beta(self): - '''return a tuple (paried values) of beta and alpha, with beta first, alpha second''' + """ + Function that gets alpha and beta of the data in DataFrame + + Returns + ------- + a tuple (paried values) of beta and alpha, with beta first, alpha second""" x_mean = np.mean(self.df['x']) y_mean = np.mean(self.df['y']) self.df['xy_cov'] = (self.df['x'] - x_mean)* (self.df['y'] - y_mean) @@ -26,7 +36,13 @@ def get_alpha_beta(self): return beta, alpha def predict_y(self): - '''Obtain regression results, store into data frame, and return as an output''' + """ + Obtain regression results, store into data frame, and return as an output + + Returns + ------- + A column of DataFrame of predicted y-values + """ self.get_alpha_beta() self.df['y_pred'] = self.alpha + self.beta*self.df['x'] return self.df['y_pred'] diff --git a/T-tests.py b/T-tests.py index 45d198a..0f6ce66 100644 --- a/T-tests.py +++ b/T-tests.py @@ -2,31 +2,57 @@ import scipy.stats as stats import numpy as np -''' -GUIDELINE: pass data as an array(s) into T-test class -Then use functions in this class to get desired results -''' - class t_test(): + """ + A class containing methods that perform various t-tests + Parameters + ---------- + data1 : (array) array of data of interest + data2 : (array) [optional] array of data of interest, only need to pass it for two sample test + """ def __init__(self, data1, data2=None) -> None: self.data1 = data1 self.data2 = data2 def one_sample_t_test(self, population_mean, side): + """ + Perform one sample t-test with a side and population mean + + Parameters + ---------- + population_mean : (float) population mean to be tested + side : (str) only allows 'two-sided', 'less', 'greater', side of the test to perform + + Returns + ------- + t-statistic (float) + """ if side not in ['two-sided', 'less', 'greater']: raise Exception("Only accept 'two-sided', 'less', or 'greater' for parameter 'side'") return stats.ttest_1samp(self.data1, population_mean, 
alternative=side) def two_sample_t_test(self, side): + """ + Perform two sample t-test between data1 and data2 + + Parameters + ---------- + side : (str) only allows 'two-sided', 'less', 'greater', side of the test to perform + + Returns + ------- + t-statistic (float) + """ if side not in ['two-sided', 'less', 'greater']: raise Exception("Only accept 'two-sided', 'less', or 'greater' as a parameter") return stats.ttest_ind(self.data1, self.data2, alternative=side) def paired_sample_t_test(self): - return stats.ttest_rel(self.data1, self.data2) - - - + """Perform paired sample t-test between data1 and data2 - \ No newline at end of file + Returns + ------- + t-statistic (float) + """ + return stats.ttest_rel(self.data1, self.data2) \ No newline at end of file From ff4d832ebeeae8d82e617970f3b3b5a45628896d Mon Sep 17 00:00:00 2001 From: Tony_Tian_1122 Date: Thu, 7 Jul 2022 22:03:38 -0400 Subject: [PATCH 08/12] Include K-means --- KMeans.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/KMeans.py b/KMeans.py index edbaa01..57dadce 100644 --- a/KMeans.py +++ b/KMeans.py @@ -31,21 +31,4 @@ def elbow_method(data, num_k, n_init=10, max_iter=300): plt.plot(range(1, num_k), inertia, marker='o') plt.xlabel('Number of clusters') plt.ylabel('Inertia') - plt.show() - - -'''Generate random sample (write another method to get data later?), just to show an example''' -# Assume we get this from the pre-processed data? -data, y = make_blobs(n_samples = 400, centers = 6, cluster_std = 0.60, random_state = 0) - -# plt.scatter(data[:, 0], data[:, 1]) -# plt.show() -elbow_method(data, 10) -# print(run_kmeans(data, 6)) - - - - - - - \ No newline at end of file + plt.show() \ No newline at end of file From d59e0c2e3fdd5ddcb2257e2676e3f2ed67364f41 Mon Sep 17 00:00:00 2001 From: Tony_Tian_1122 Date: Thu, 7 Jul 2022 22:04:50 -0400 Subject: [PATCH 09/12] Edited Changes --- KMeans.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 KMeans.py diff --git a/KMeans.py b/KMeans.py deleted file mode 100644 index 57dadce..0000000 --- a/KMeans.py +++ /dev/null @@ -1,34 +0,0 @@ -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -from sklearn.cluster import KMeans -from sklearn.datasets import make_blobs - -''' -Function that gets data points and cluster number(centroids), returns coordinates -of cluster centers -Default values: number of runs on different centroid seeds = 10, max runs = 300 -''' -def run_kmeans(data, centroids, n_init=10, max_iter=300): - KM = KMeans(n_clusters = centroids, n_init=n_init, max_iter=max_iter) - y_KM = KM.fit_predict(data) - return KM.cluster_centers_ - -''' -Function that helps to determine how many clusters to use by using trials of K clusters -The idea is to find the cluster number that gives the maximum reduction in inertia -''' -def elbow_method(data, num_k, n_init=10, max_iter=300): - inertia = [] - for i in range(1, num_k): - KM = KMeans( - n_clusters=i, - n_init=n_init, max_iter=max_iter - ) - KM.fit_predict(data) - inertia.append(KM.inertia_) - - plt.plot(range(1, num_k), inertia, marker='o') - plt.xlabel('Number of clusters') - plt.ylabel('Inertia') - plt.show() \ No newline at end of file From 2f8abfc035165c6ae4a1b95b4eae1cf619fbdf21 Mon Sep 17 00:00:00 2001 From: Tony_Tian_1122 Date: Thu, 14 Jul 2022 23:46:53 -0400 Subject: [PATCH 10/12] Finish jupyter notebooks --- ml_regression.ipynb | 347 ++++++++++++++++++++++++++++++++++ unsupervised_clustering.ipynb | 187 ++++++++++++++++++ 2 
files changed, 534 insertions(+) create mode 100644 ml_regression.ipynb create mode 100644 unsupervised_clustering.ipynb diff --git a/ml_regression.ipynb b/ml_regression.ipynb new file mode 100644 index 0000000..58f5fe0 --- /dev/null +++ b/ml_regression.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Linear Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "class LinearRegression():\n", + " \"\"\"\n", + " Regression class takes in a dataframe of values with two columns, which are respectively x and y\n", + " User can call respective functions to get regression analysis outputs\n", + " \n", + " Parameters\n", + " ----------\n", + " df : (pandas.DataFrame) a pandas dataframe containing two columns, first being x-values, second\n", + " being y-values\n", + " \"\"\"\n", + " \n", + " def __init__(self, data) -> None:\n", + " self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]})\n", + " self.beta = None\n", + " self.alpha = None\n", + " \n", + " def get_alpha_beta(self):\n", + " \"\"\"\n", + " Function that gets alpha and beta of the data in DataFrame\n", + " \n", + " Returns\n", + " -------\n", + " a tuple (paried values) of beta and alpha, with beta first, alpha second\"\"\"\n", + " x_mean = np.mean(self.df['x'])\n", + " y_mean = np.mean(self.df['y'])\n", + " self.df['xy_cov'] = (self.df['x'] - x_mean)* (self.df['y'] - y_mean)\n", + " self.df['x_var'] = (self.df['x'] - x_mean)**2\n", + " beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()\n", + " alpha = y_mean - (beta * x_mean)\n", + " self.beta, self.alpha = beta, alpha\n", + " \n", + " return beta, alpha\n", + "\n", + " def predict_y(self):\n", + " \"\"\"\n", + " Obtain regression results, store into data frame, and return as an output\n", + " \n", + " Returns\n", + " -------\n", + " A column of DataFrame of predicted y-values\n", + " \"\"\"\n", + " self.get_alpha_beta()\n", + " self.df['y_pred'] = self.alpha + self.beta*self.df['x']\n", + " return self.df['y_pred']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Support Vector Regression from Sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVR\n", + "def run_svr(data_in, x_data, y_data, kernel='rbf', degree=3, gamma='scale', tol=1e-3, c=1.0, epsilon=0.1, cache_size=200, verbose=False):\n", + " \"\"\"\n", + " run support vector regression using library from scikit learn\n", + "\n", + " Parameters\n", + " ----------\n", + " data_in : array or float\n", + " data to be analyzed and predicted based on model\n", + " x_data : array\n", + " x values of data\n", + " y_data : array\n", + " y values of data\n", + " kernel : {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} , optional\n", + " Specifies the kernel type to be used in the algorithm. If none is given, ‘rbf’ will be used. \n", + " If a callable is given it is used to precompute the kernel matrix., by default 'rbf'\n", + " degree : int, optional\n", + " Degree of the polynomial kernel function (‘poly’). 
Ignored by all other kernels., by default 3\n", + " gamma : {‘scale’, ‘auto’} or float, optional\n", + " Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’., by default 'scale'\n", + " tol : float, optional\n", + " tolerance for stopping criterion, by default 1e-3\n", + " c : float, optional\n", + " Regularization parameter. The strength of the regularization is inversely proportional to C. \n", + " Must be strictly positive. The penalty is a squared l2 penalty., by default 1.0\n", + " epsilon : float, optional\n", + " Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in \n", + " the training loss function with points predicted within a distance epsilon from the actual value., by default 0.1\n", + " cache_size : int, optional\n", + " Specify the size of the kernel cache (in MB)., by default 200\n", + " verbose : bool, optional\n", + " Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm \n", + " that, if enabled, may not work properly in a multithreaded context., by default False\n", + "\n", + " Returns\n", + " -------\n", + " array or float\n", + " predicted values from data_in\n", + " \"\"\"\n", + " svr = SVR(kernel, degree, gamma, tol, c, epsilon, cache_size, verbose)\n", + " svr.fit(x_data, y_data)\n", + " y_pred = svr.predict(data_in)\n", + " return y_pred\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Decision Tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.tree import DecisionTreeRegressor\n", + "def run_decision_tree(data_in, x_data, y_data, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1):\n", + " \"\"\"\n", + " Run regression with decision tree from scikit learn\n", + "\n", + " Parameters\n", + " ----------\n", + " data_in : array or float\n", + " data to be predicted from fitted model\n", + " x_data : array\n", + " x values for the regression\n", + " y_data : array\n", + " y values for the regression\n", + " criterion : {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, optional\n", + " The function to measure the quality of a split. \n", + " Supported criteria are “squared_error” for the mean squared error, which is equal to variance reduction as \n", + " feature selection criterion and minimizes the L2 loss using the mean of each terminal node, “friedman_mse”, \n", + " which uses mean squared error with Friedman’s improvement score for potential splits, “absolute_error” for \n", + " the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and “poisson” \n", + " which uses reduction in Poisson deviance to find splits., by default 'squared_error'\n", + " \n", + " splitter : {“best”, “random”}, optional\n", + " The strategy used to choose the split at each node. \n", + " Supported strategies are “best” to choose the best split and “random” to choose the best random split., by default 'best'\n", + " \n", + " max_depth : int, optional\n", + " The maximum depth of the tree. 
If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", + " \n", + " min_samples_split : int or float, optional\n", + " The minimum number of samples required to split an internal node:\n", + "\n", + " If int, then consider min_samples_split as the minimum number.\n", + " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", + " \n", + " min_samples_leaf : int or float, optional\n", + " The minimum number of samples required to be at a leaf node. \n", + " A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples \n", + " in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1\n", + "\n", + " Returns\n", + " -------\n", + " array or float\n", + " predicted values from data_in\n", + " \"\"\"\n", + " regressor = DecisionTreeRegressor(criterion, splitter, max_depth, min_samples_split, min_samples_leaf)\n", + " regressor.fit(x_data, y_data)\n", + " y_predict = regressor.predict(data_in)\n", + " return y_predict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Random Forest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "def run_random_foreset(data_in, x_data, y_data, n_estimators=100, criterion='squared error', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=1.0):\n", + " \"\"\"\n", + " run random forest regression with fitted data and data_in\n", + "\n", + " Parameters\n", + " ----------\n", + " data_in : array or float\n", + " data to be predicted from the learned models\n", + " x_data : array\n", + " array of x values of data to be fitted\n", + " y_data : array\n", + " array of y values of data to be fitted\n", + " n_estimators : int, optional\n", + " number of trees in the forest, by default 100\n", + " criterion : {“squared_error”, “absolute_error”, “poisson”}, optional\n", + " The function to measure the quality of a split. Supported criteria are “squared_error” for the mean squared error, \n", + " which is equal to variance reduction as feature selection criterion, “absolute_error” for the mean absolute error, \n", + " and “poisson” which uses reduction in Poisson deviance to find splits. \n", + "\n", + " Training using “absolute_error” is significantly slower than when using “squared_error”., by default 'squared error'\n", + " \n", + " max_depth : int, optional\n", + " The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", + " \n", + " min_samples_split : int or float, optional\n", + " The minimum number of samples required to split an internal node:\n", + "\n", + " If int, then consider min_samples_split as the minimum number.\n", + " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", + " \n", + " min_samples_leaf : int or float, optional\n", + " The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. 
This may have the effect of smoothing the model, especially in regression., by default 1\n", + " \n", + " max_features : {“sqrt”, “log2”, None} int or float, optional\n", + " The number of features to consider when looking for the best split:\n", + "\n", + " If int, then consider max_features features at each split.\n", + " If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.\n", + " If “auto”, then max_features=n_features.\n", + " If “sqrt”, then max_features=sqrt(n_features).\n", + " If “log2”, then max_features=log2(n_features).\n", + " If None or 1.0, then max_features=n_features.\n", + " \n", + " , by default 1.0\n", + "\n", + " Returns\n", + " -------\n", + " array or float\n", + " predicted data from random forest regressor using data_in passed by user\n", + " \"\"\"\n", + " regressor = RandomForestRegressor(n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, max_features)\n", + " regressor.fit(x_data, y_data)\n", + " y_predict = regressor.predict(data_in)\n", + " return y_predict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "XGBoost" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "def run_xgboost(data_in, x_data, y_data, n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma):\n", + " \"\"\"\n", + " Run xgboost regression fitted with x_data and y_data, and predict using data_in\n", + "\n", + " Parameters\n", + " ----------\n", + " data_in : array or float\n", + " data to be predicted from regression\n", + " x_data : array\n", + " x values of data for regression\n", + " y_data : array\n", + " y values of data for regression\n", + " n_estimators : int\n", + " Number of gradient boosted trees. Equivalent to number of boosting rounds.\n", + " max_depth : int\n", + " maximum tree depth\n", + " max_leaves : int\n", + " Maximum number of leaves; 0 indicates no limit.\n", + " max_bin : int\n", + " If using histogram-based algorithm, maximum number of bins per feature\n", + " grow_policy : 0 or 1\n", + " Tree growing policy. \n", + " 0: favor splitting at nodes closest to the node, i.e. grow depth-wise. \n", + " 1: favor splitting at nodes with highest loss change.\n", + " learning_rate : float\n", + " boosting learning rate\n", + " verbosity : int\n", + " The degree of verbosity. 
Valid values are 0 (silent) - 3 (debug).\n", + " gamma : float\n", + " Minimum loss reduction required to make a further partition on a leaf node of the tree.\n", + "\n", + " Returns\n", + " -------\n", + " array or float\n", + " predicted values from data_in after regression\n", + " \"\"\"\n", + " regressor = xgb.XGBRegressor(n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma=gamma)\n", + " regressor.fit(x_data, y_data)\n", + " pred = regressor.predict(data_in)\n", + " return pred" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "7e1998ff7f8aa20ada591c520b972326324e5ea05489af9e422744c7c09f6dad" + }, + "kernelspec": { + "display_name": "Python 3.10.1 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.1" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/unsupervised_clustering.ipynb b/unsupervised_clustering.ipynb new file mode 100644 index 0000000..1161472 --- /dev/null +++ b/unsupervised_clustering.ipynb @@ -0,0 +1,187 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from array import array\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import MeanShift\n", + "def mean_shift(centers, predict_data=None):\n", + " \"\"\"Function that perform mean shift clustering, can also predict values if predict_data is passed\n", + "\n", + " Parameters\n", + " ----------\n", + " centers : 2D array like\n", + " centers of data to perform clustering on\n", + " predict_data : 2D array like, optional\n", + " data to be predicted by the clustering, by default None\n", + "\n", + " Returns\n", + " -------\n", + " cluster_centers, labels, num_features, predict\n", + " cluster_centers: centers after clustering\n", + " labels: labels of each point\n", + " num_features: number of features seen during fit\n", + " predict: predicted values by the clustering for predict_data\n", + "\n", + " Raises\n", + " ------\n", + " Exception\n", + " raise exception when normal array (non 2D array) is passed in as predict data\n", + " \"\"\"\n", + " ms = MeanShift()\n", + " clustering = ms.fit(centers)\n", + " cluster_centers = clustering.cluster_centers_\n", + " labels = clustering.labels_\n", + " num_features = clustering.n_features_in_\n", + " if type(predict_data) == type(array) or type(np.array):\n", + " try: predicted = clustering.predict(predict_data)\n", + " except: raise Exception ('Use 2D array for predict_data')\n", + " else:\n", + " predicted = None\n", + " return cluster_centers, labels, num_features, predicted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import DBSCAN\n", + "def perform_DBSCAN(data, eps, min_samples):\n", + " \"\"\"Perform DBSCAN algorithm on a given set of data\n", + "\n", + " Parameters\n", + " ----------\n", + " data : 2D array-like\n", + " array of data of interest to perform DBSCAN\n", + " eps : float\n", + " The maximum distance between two samples for one to be considered as in the neighborhood of the other. 
\n", + " This is not a maximum bound on the distances of points within a cluster. \n", + " This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.\n", + " min_samples : int\n", + " The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. \n", + " This includes the point itself.\n", + "\n", + " Returns\n", + " -------\n", + " labels, num_features, core_sample_indices, components\n", + " labels: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.\n", + " num_features: Number of features seen during fit.\n", + " core_sample_indices: Indices of core samples.\n", + " components: Copy of each core sample found by training.\n", + " \"\"\"\n", + " clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(data)\n", + " labels = clustering.labels_\n", + " num_features = clustering.n_features_in_\n", + " core_sample_indices = clustering.core_sample_indices_\n", + " components = clustering.components_\n", + " return labels, num_features, core_sample_indices, components\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import AgglomerativeClustering\n", + "def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None):\n", + " \"\"\"Function that performs hiearchical clustering and fit to an array of data\n", + "\n", + " Parameters\n", + " ----------\n", + " data : 2D array\n", + " data to be fitted\n", + " n_clusters : int, default=2\n", + " number of clusters to find\n", + " linkage : {'ward', 'complete', 'average', 'single'}, default='ward'\n", + " Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. \n", + " The algorithm will merge the pairs of cluster that minimize this criterion.\n", + " \n", + " 'ward' minimizes the variance of the clusters being merged.\n", + " 'average' uses the average of the distances of each observation of the two sets.\n", + " 'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets.\n", + " 'single' uses the minimum of the distances between all observations of the two sets.\n", + " distance_threshold : float, default=None\n", + " The linkage distance threshold above which, clusters will not be merged. 
\n", + " If not None, n_clusters must be None and compute_full_tree must be True.\n", + "\n", + " Returns\n", + " -------\n", + " num_clusters : int\n", + " The number of clusters found by the algorithm\n", + " labels : ndarray of shape (n_samples)\n", + " Cluster labels for each point.\n", + " num_leaves : int\n", + " Number of leaves in the hierarchical tree\n", + " num_connected_components : int\n", + " The estimated number of connected components in the graph\n", + " num_features : int\n", + " number of features seen during fit\n", + " \"\"\"\n", + " model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters, distance_threshold=distance_threshold)\n", + " model.fit(data)\n", + " num_clusters = model.n_clusters_\n", + " labels = model.labels_\n", + " num_leaves = model.n_leaves_\n", + " num_connected_components = model.n_connected_components_\n", + " num_features = model.n_features_in_\n", + " return num_clusters, labels, num_leaves, num_connected_components, num_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.mixture import GaussianMixture \n", + "def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None):\n", + " \"\"\"Perform unsupervised learning with gaussian mixture model for a given data, and make prediction if needed\n", + "\n", + " Parameters\n", + " ----------\n", + " data : 2D array\n", + " Array of data to be fitted with Gaussian Mixture Model\n", + " num_components : int\n", + " number of underlying Gaussian distributions\n", + " num_random_state : int\n", + " random seed for initialization, by default 0\n", + " predict_data : 2D array, optional\n", + " array of data to be predicted from the model, by default None\n", + "\n", + " Returns\n", + " -------\n", + " predicted\n", + " predicted is the predicted data of data passed into the model, which is predict_data\n", + " \"\"\"\n", + " GMM = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data)\n", + " if type(predict_data) == type(array) or type(np.array):\n", + " predicted = GMM.predict(predict_data)\n", + " else: predicted = None\n", + " return predicted" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 85cdb383dd369c59a7570c27bd4fc612cc2b4ce8 Mon Sep 17 00:00:00 2001 From: Tony_Tian_1122 Date: Fri, 5 Aug 2022 01:58:18 -0400 Subject: [PATCH 11/12] Change README, change ipynb to py --- README.md | 2 + ml_regression.ipynb | 347 ---------------------------------- ml_regression.py | 238 +++++++++++++++++++++++ unsupervised_clustering.ipynb | 187 ------------------ unsupervised_clustering.py | 139 ++++++++++++++ 5 files changed, 379 insertions(+), 534 deletions(-) delete mode 100644 ml_regression.ipynb create mode 100644 ml_regression.py delete mode 100644 unsupervised_clustering.ipynb create mode 100644 unsupervised_clustering.py diff --git a/README.md b/README.md index 83cb74e..a8f07c3 100644 --- a/README.md +++ b/README.md @@ -19,4 +19,6 @@ t_test(data1, data2).paired_sample_t_test() ``` # ML Models +ml_regression.py: contain 5 most popular machine learning regression functions, implemented using scikit-learn standard library +unsupervised_clustering.py: contain most popular unsupervised learning clustering functions, implemented using scikit-learn standard library # DL Models diff --git a/ml_regression.ipynb b/ml_regression.ipynb deleted file mode 100644 index 
58f5fe0..0000000 --- a/ml_regression.ipynb +++ /dev/null @@ -1,347 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Linear Regression" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "class LinearRegression():\n", - " \"\"\"\n", - " Regression class takes in a dataframe of values with two columns, which are respectively x and y\n", - " User can call respective functions to get regression analysis outputs\n", - " \n", - " Parameters\n", - " ----------\n", - " df : (pandas.DataFrame) a pandas dataframe containing two columns, first being x-values, second\n", - " being y-values\n", - " \"\"\"\n", - " \n", - " def __init__(self, data) -> None:\n", - " self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]})\n", - " self.beta = None\n", - " self.alpha = None\n", - " \n", - " def get_alpha_beta(self):\n", - " \"\"\"\n", - " Function that gets alpha and beta of the data in DataFrame\n", - " \n", - " Returns\n", - " -------\n", - " a tuple (paried values) of beta and alpha, with beta first, alpha second\"\"\"\n", - " x_mean = np.mean(self.df['x'])\n", - " y_mean = np.mean(self.df['y'])\n", - " self.df['xy_cov'] = (self.df['x'] - x_mean)* (self.df['y'] - y_mean)\n", - " self.df['x_var'] = (self.df['x'] - x_mean)**2\n", - " beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()\n", - " alpha = y_mean - (beta * x_mean)\n", - " self.beta, self.alpha = beta, alpha\n", - " \n", - " return beta, alpha\n", - "\n", - " def predict_y(self):\n", - " \"\"\"\n", - " Obtain regression results, store into data frame, and return as an output\n", - " \n", - " Returns\n", - " -------\n", - " A column of DataFrame of predicted y-values\n", - " \"\"\"\n", - " self.get_alpha_beta()\n", - " self.df['y_pred'] = self.alpha + self.beta*self.df['x']\n", - " return self.df['y_pred']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Support Vector Regression from Sklearn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.svm import SVR\n", - "def run_svr(data_in, x_data, y_data, kernel='rbf', degree=3, gamma='scale', tol=1e-3, c=1.0, epsilon=0.1, cache_size=200, verbose=False):\n", - " \"\"\"\n", - " run support vector regression using library from scikit learn\n", - "\n", - " Parameters\n", - " ----------\n", - " data_in : array or float\n", - " data to be analyzed and predicted based on model\n", - " x_data : array\n", - " x values of data\n", - " y_data : array\n", - " y values of data\n", - " kernel : {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} , optional\n", - " Specifies the kernel type to be used in the algorithm. If none is given, ‘rbf’ will be used. \n", - " If a callable is given it is used to precompute the kernel matrix., by default 'rbf'\n", - " degree : int, optional\n", - " Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels., by default 3\n", - " gamma : {‘scale’, ‘auto’} or float, optional\n", - " Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’., by default 'scale'\n", - " tol : float, optional\n", - " tolerance for stopping criterion, by default 1e-3\n", - " c : float, optional\n", - " Regularization parameter. The strength of the regularization is inversely proportional to C. 
\n", - " Must be strictly positive. The penalty is a squared l2 penalty., by default 1.0\n", - " epsilon : float, optional\n", - " Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in \n", - " the training loss function with points predicted within a distance epsilon from the actual value., by default 0.1\n", - " cache_size : int, optional\n", - " Specify the size of the kernel cache (in MB)., by default 200\n", - " verbose : bool, optional\n", - " Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm \n", - " that, if enabled, may not work properly in a multithreaded context., by default False\n", - "\n", - " Returns\n", - " -------\n", - " array or float\n", - " predicted values from data_in\n", - " \"\"\"\n", - " svr = SVR(kernel, degree, gamma, tol, c, epsilon, cache_size, verbose)\n", - " svr.fit(x_data, y_data)\n", - " y_pred = svr.predict(data_in)\n", - " return y_pred\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Decision Tree" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.tree import DecisionTreeRegressor\n", - "def run_decision_tree(data_in, x_data, y_data, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1):\n", - " \"\"\"\n", - " Run regression with decision tree from scikit learn\n", - "\n", - " Parameters\n", - " ----------\n", - " data_in : array or float\n", - " data to be predicted from fitted model\n", - " x_data : array\n", - " x values for the regression\n", - " y_data : array\n", - " y values for the regression\n", - " criterion : {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, optional\n", - " The function to measure the quality of a split. \n", - " Supported criteria are “squared_error” for the mean squared error, which is equal to variance reduction as \n", - " feature selection criterion and minimizes the L2 loss using the mean of each terminal node, “friedman_mse”, \n", - " which uses mean squared error with Friedman’s improvement score for potential splits, “absolute_error” for \n", - " the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and “poisson” \n", - " which uses reduction in Poisson deviance to find splits., by default 'squared_error'\n", - " \n", - " splitter : {“best”, “random”}, optional\n", - " The strategy used to choose the split at each node. \n", - " Supported strategies are “best” to choose the best split and “random” to choose the best random split., by default 'best'\n", - " \n", - " max_depth : int, optional\n", - " The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", - " \n", - " min_samples_split : int or float, optional\n", - " The minimum number of samples required to split an internal node:\n", - "\n", - " If int, then consider min_samples_split as the minimum number.\n", - " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", - " \n", - " min_samples_leaf : int or float, optional\n", - " The minimum number of samples required to be at a leaf node. 
\n", - " A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples \n", - " in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1\n", - "\n", - " Returns\n", - " -------\n", - " array or float\n", - " predicted values from data_in\n", - " \"\"\"\n", - " regressor = DecisionTreeRegressor(criterion, splitter, max_depth, min_samples_split, min_samples_leaf)\n", - " regressor.fit(x_data, y_data)\n", - " y_predict = regressor.predict(data_in)\n", - " return y_predict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Random Forest" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.ensemble import RandomForestRegressor\n", - "def run_random_foreset(data_in, x_data, y_data, n_estimators=100, criterion='squared error', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=1.0):\n", - " \"\"\"\n", - " run random forest regression with fitted data and data_in\n", - "\n", - " Parameters\n", - " ----------\n", - " data_in : array or float\n", - " data to be predicted from the learned models\n", - " x_data : array\n", - " array of x values of data to be fitted\n", - " y_data : array\n", - " array of y values of data to be fitted\n", - " n_estimators : int, optional\n", - " number of trees in the forest, by default 100\n", - " criterion : {“squared_error”, “absolute_error”, “poisson”}, optional\n", - " The function to measure the quality of a split. Supported criteria are “squared_error” for the mean squared error, \n", - " which is equal to variance reduction as feature selection criterion, “absolute_error” for the mean absolute error, \n", - " and “poisson” which uses reduction in Poisson deviance to find splits. \n", - "\n", - " Training using “absolute_error” is significantly slower than when using “squared_error”., by default 'squared error'\n", - " \n", - " max_depth : int, optional\n", - " The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", - " \n", - " min_samples_split : int or float, optional\n", - " The minimum number of samples required to split an internal node:\n", - "\n", - " If int, then consider min_samples_split as the minimum number.\n", - " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", - " \n", - " min_samples_leaf : int or float, optional\n", - " The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. 
This may have the effect of smoothing the model, especially in regression., by default 1\n", - " \n", - " max_features : {“sqrt”, “log2”, None} int or float, optional\n", - " The number of features to consider when looking for the best split:\n", - "\n", - " If int, then consider max_features features at each split.\n", - " If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.\n", - " If “auto”, then max_features=n_features.\n", - " If “sqrt”, then max_features=sqrt(n_features).\n", - " If “log2”, then max_features=log2(n_features).\n", - " If None or 1.0, then max_features=n_features.\n", - " \n", - " , by default 1.0\n", - "\n", - " Returns\n", - " -------\n", - " array or float\n", - " predicted data from random forest regressor using data_in passed by user\n", - " \"\"\"\n", - " regressor = RandomForestRegressor(n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, max_features)\n", - " regressor.fit(x_data, y_data)\n", - " y_predict = regressor.predict(data_in)\n", - " return y_predict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "XGBoost" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import xgboost as xgb\n", - "def run_xgboost(data_in, x_data, y_data, n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma):\n", - " \"\"\"\n", - " Run xgboost regression fitted with x_data and y_data, and predict using data_in\n", - "\n", - " Parameters\n", - " ----------\n", - " data_in : array or float\n", - " data to be predicted from regression\n", - " x_data : array\n", - " x values of data for regression\n", - " y_data : array\n", - " y values of data for regression\n", - " n_estimators : int\n", - " Number of gradient boosted trees. Equivalent to number of boosting rounds.\n", - " max_depth : int\n", - " maximum tree depth\n", - " max_leaves : int\n", - " Maximum number of leaves; 0 indicates no limit.\n", - " max_bin : int\n", - " If using histogram-based algorithm, maximum number of bins per feature\n", - " grow_policy : 0 or 1\n", - " Tree growing policy. \n", - " 0: favor splitting at nodes closest to the node, i.e. grow depth-wise. \n", - " 1: favor splitting at nodes with highest loss change.\n", - " learning_rate : float\n", - " boosting learning rate\n", - " verbosity : int\n", - " The degree of verbosity. 
Valid values are 0 (silent) - 3 (debug).\n", - " gamma : float\n", - " Minimum loss reduction required to make a further partition on a leaf node of the tree.\n", - "\n", - " Returns\n", - " -------\n", - " array or float\n", - " predicted values from data_in after regression\n", - " \"\"\"\n", - " regressor = xgb.XGBRegressor(n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma=gamma)\n", - " regressor.fit(x_data, y_data)\n", - " pred = regressor.predict(data_in)\n", - " return pred" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "7e1998ff7f8aa20ada591c520b972326324e5ea05489af9e422744c7c09f6dad" - }, - "kernelspec": { - "display_name": "Python 3.10.1 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.1" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/ml_regression.py b/ml_regression.py new file mode 100644 index 0000000..a756193 --- /dev/null +++ b/ml_regression.py @@ -0,0 +1,238 @@ +import pandas as pd +import numpy as np + +class LinearRegression(): + """ + Regression class takes in a dataframe of values with two columns, which are respectively x and y + User can call respective functions to get regression analysis outputs + + Parameters + ---------- + df : (pandas.DataFrame) a pandas dataframe containing two columns, first being x-values, second + being y-values + """ + + def __init__(self, data) -> None: + self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]}) + self.beta = None + self.alpha = None + + def get_alpha_beta(self): + """ + Function that gets alpha and beta of the data in DataFrame + + Returns + ------- + a tuple (paried values) of beta and alpha, with beta first, alpha second""" + x_mean = np.mean(self.df['x']) + y_mean = np.mean(self.df['y']) + self.df['xy_cov'] = (self.df['x'] - x_mean)* (self.df['y'] - y_mean) + self.df['x_var'] = (self.df['x'] - x_mean)**2 + beta = self.df['xy_cov'].sum() / self.df['x_var'].sum() + alpha = y_mean - (beta * x_mean) + self.beta, self.alpha = beta, alpha + + return beta, alpha + + def predict_y(self): + """ + Obtain regression results, store into data frame, and return as an output + + Returns + ------- + A column of DataFrame of predicted y-values + """ + self.get_alpha_beta() + self.df['y_pred'] = self.alpha + self.beta*self.df['x'] + return self.df['y_pred'] + +from sklearn.svm import SVR +def run_svr(data_in, x_data, y_data, kernel='rbf', degree=3, gamma='scale', tol=1e-3, c=1.0, epsilon=0.1, cache_size=200, verbose=False): + """ + run support vector regression using library from scikit learn + + Parameters + ---------- + data_in : array or float + data to be analyzed and predicted based on model + x_data : array + x values of data + y_data : array + y values of data + kernel : {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} , optional + Specifies the kernel type to be used in the algorithm. If none is given, ‘rbf’ will be used. + If a callable is given it is used to precompute the kernel matrix., by default 'rbf' + degree : int, optional + Degree of the polynomial kernel function (‘poly’). 
Ignored by all other kernels., by default 3
+    gamma : {‘scale’, ‘auto’} or float, optional
+        Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’., by default 'scale'
+    tol : float, optional
+        tolerance for stopping criterion, by default 1e-3
+    c : float, optional
+        Regularization parameter. The strength of the regularization is inversely proportional to C.
+        Must be strictly positive. The penalty is a squared l2 penalty., by default 1.0
+    epsilon : float, optional
+        Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in
+        the training loss function with points predicted within a distance epsilon from the actual value., by default 0.1
+    cache_size : int, optional
+        Specify the size of the kernel cache (in MB)., by default 200
+    verbose : bool, optional
+        Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm
+        that, if enabled, may not work properly in a multithreaded context., by default False
+
+    Returns
+    -------
+    array or float
+        predicted values from data_in
+    """
+    svr = SVR(kernel=kernel, degree=degree, gamma=gamma, tol=tol, C=c, epsilon=epsilon, cache_size=cache_size, verbose=verbose)
+    svr.fit(x_data, y_data)
+    y_pred = svr.predict(data_in)
+    return y_pred
+
+from sklearn.tree import DecisionTreeRegressor
+def run_decision_tree(data_in, x_data, y_data, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1):
+    """
+    Run regression with a decision tree from scikit-learn
+
+    Parameters
+    ----------
+    data_in : array or float
+        data to be predicted from fitted model
+    x_data : array
+        x values for the regression
+    y_data : array
+        y values for the regression
+    criterion : {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, optional
+        The function to measure the quality of a split.
+        Supported criteria are “squared_error” for the mean squared error, which is equal to variance reduction as
+        feature selection criterion and minimizes the L2 loss using the mean of each terminal node, “friedman_mse”,
+        which uses mean squared error with Friedman’s improvement score for potential splits, “absolute_error” for
+        the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and “poisson”
+        which uses reduction in Poisson deviance to find splits., by default 'squared_error'
+
+    splitter : {“best”, “random”}, optional
+        The strategy used to choose the split at each node.
+        Supported strategies are “best” to choose the best split and “random” to choose the best random split., by default 'best'
+
+    max_depth : int, optional
+        The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None
+
+    min_samples_split : int or float, optional
+        The minimum number of samples required to split an internal node:
+
+        If int, then consider min_samples_split as the minimum number.
+        If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2
+
+    min_samples_leaf : int or float, optional
+        The minimum number of samples required to be at a leaf node.
+        A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples
+        in each of the left and right branches.
This may have the effect of smoothing the model, especially in regression., by default 1
+
+    Returns
+    -------
+    array or float
+        predicted values from data_in
+    """
+    regressor = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
+    regressor.fit(x_data, y_data)
+    y_predict = regressor.predict(data_in)
+    return y_predict
+
+from sklearn.ensemble import RandomForestRegressor
+def run_random_forest(data_in, x_data, y_data, n_estimators=100, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=1.0):
+    """
+    run random forest regression with fitted data and data_in
+
+    Parameters
+    ----------
+    data_in : array or float
+        data to be predicted from the learned models
+    x_data : array
+        array of x values of data to be fitted
+    y_data : array
+        array of y values of data to be fitted
+    n_estimators : int, optional
+        number of trees in the forest, by default 100
+    criterion : {“squared_error”, “absolute_error”, “poisson”}, optional
+        The function to measure the quality of a split. Supported criteria are “squared_error” for the mean squared error,
+        which is equal to variance reduction as feature selection criterion, “absolute_error” for the mean absolute error,
+        and “poisson” which uses reduction in Poisson deviance to find splits.
+
+        Training using “absolute_error” is significantly slower than when using “squared_error”., by default 'squared_error'
+
+    max_depth : int, optional
+        The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None
+
+    min_samples_split : int or float, optional
+        The minimum number of samples required to split an internal node:
+
+        If int, then consider min_samples_split as the minimum number.
+        If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2
+
+    min_samples_leaf : int or float, optional
+        The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1
+
+    max_features : {“sqrt”, “log2”, None}, int or float, optional
+        The number of features to consider when looking for the best split:
+
+        If int, then consider max_features features at each split.
+        If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
+        If “auto”, then max_features=n_features.
+        If “sqrt”, then max_features=sqrt(n_features).
+        If “log2”, then max_features=log2(n_features).
+        If None or 1.0, then max_features=n_features.
+
+        , by default 1.0
+
+    Returns
+    -------
+    array or float
+        predicted data from random forest regressor using data_in passed by user
+    """
+    regressor = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, max_features=max_features)
+    regressor.fit(x_data, y_data)
+    y_predict = regressor.predict(data_in)
+    return y_predict
+
+import xgboost as xgb
+def run_xgboost(data_in, x_data, y_data, n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma):
+    """
+    Run xgboost regression fitted with x_data and y_data, and predict using data_in
+
+    Parameters
+    ----------
+    data_in : array or float
+        data to be predicted from regression
+    x_data : array
+        x values of data for regression
+    y_data : array
+        y values of data for regression
+    n_estimators : int
+        Number of gradient boosted trees. Equivalent to number of boosting rounds.
+    max_depth : int
+        maximum tree depth
+    max_leaves : int
+        Maximum number of leaves; 0 indicates no limit.
+    max_bin : int
+        If using histogram-based algorithm, maximum number of bins per feature
+    grow_policy : 0 or 1
+        Tree growing policy.
+        0: favor splitting at nodes closest to the node, i.e. grow depth-wise.
+        1: favor splitting at nodes with highest loss change.
+    learning_rate : float
+        boosting learning rate
+    verbosity : int
+        The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
+    gamma : float
+        Minimum loss reduction required to make a further partition on a leaf node of the tree.
+
+    Returns
+    -------
+    array or float
+        predicted values from data_in after regression
+    """
+    regressor = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, max_leaves=max_leaves, max_bin=max_bin, grow_policy=grow_policy, learning_rate=learning_rate, verbosity=verbosity, gamma=gamma)
+    regressor.fit(x_data, y_data)
+    pred = regressor.predict(data_in)
+    return pred
\ No newline at end of file
diff --git a/unsupervised_clustering.ipynb b/unsupervised_clustering.ipynb
deleted file mode 100644
index 1161472..0000000
--- a/unsupervised_clustering.ipynb
+++ /dev/null
@@ -1,187 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from array import array\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.cluster import MeanShift\n",
-    "def mean_shift(centers, predict_data=None):\n",
-    "    \"\"\"Function that perform mean shift clustering, can also predict values if predict_data is passed\n",
-    "\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    centers : 2D array like\n",
-    "        centers of data to perform clustering on\n",
-    "    predict_data : 2D array like, optional\n",
-    "        data to be predicted by the clustering, by default None\n",
-    "\n",
-    "    Returns\n",
-    "    -------\n",
-    "    cluster_centers, labels, num_features, predict\n",
-    "        cluster_centers: centers after clustering\n",
-    "        labels: labels of each point\n",
-    "        num_features: number of features seen during fit\n",
-    "        predict: predicted values by the clustering for predict_data\n",
-    "\n",
-    "    Raises\n",
-    "    ------\n",
-    "    Exception\n",
-    "        raise exception when normal array (non 2D array) is passed in as predict data\n",
-    "    \"\"\"\n",
-    "    ms = MeanShift()\n",
-    "    clustering = ms.fit(centers)\n",
-    "    cluster_centers = clustering.cluster_centers_\n",
-    "    labels = clustering.labels_\n",
-    "    num_features = clustering.n_features_in_\n",
-    "    if type(predict_data) == type(array) or type(np.array):\n",
-    "        try: predicted = clustering.predict(predict_data)\n",
-    "
except: raise Exception ('Use 2D array for predict_data')\n", - " else:\n", - " predicted = None\n", - " return cluster_centers, labels, num_features, predicted" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.cluster import DBSCAN\n", - "def perform_DBSCAN(data, eps, min_samples):\n", - " \"\"\"Perform DBSCAN algorithm on a given set of data\n", - "\n", - " Parameters\n", - " ----------\n", - " data : 2D array-like\n", - " array of data of interest to perform DBSCAN\n", - " eps : float\n", - " The maximum distance between two samples for one to be considered as in the neighborhood of the other. \n", - " This is not a maximum bound on the distances of points within a cluster. \n", - " This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.\n", - " min_samples : int\n", - " The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. \n", - " This includes the point itself.\n", - "\n", - " Returns\n", - " -------\n", - " labels, num_features, core_sample_indices, components\n", - " labels: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.\n", - " num_features: Number of features seen during fit.\n", - " core_sample_indices: Indices of core samples.\n", - " components: Copy of each core sample found by training.\n", - " \"\"\"\n", - " clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(data)\n", - " labels = clustering.labels_\n", - " num_features = clustering.n_features_in_\n", - " core_sample_indices = clustering.core_sample_indices_\n", - " components = clustering.components_\n", - " return labels, num_features, core_sample_indices, components\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.cluster import AgglomerativeClustering\n", - "def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None):\n", - " \"\"\"Function that performs hiearchical clustering and fit to an array of data\n", - "\n", - " Parameters\n", - " ----------\n", - " data : 2D array\n", - " data to be fitted\n", - " n_clusters : int, default=2\n", - " number of clusters to find\n", - " linkage : {'ward', 'complete', 'average', 'single'}, default='ward'\n", - " Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. \n", - " The algorithm will merge the pairs of cluster that minimize this criterion.\n", - " \n", - " 'ward' minimizes the variance of the clusters being merged.\n", - " 'average' uses the average of the distances of each observation of the two sets.\n", - " 'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets.\n", - " 'single' uses the minimum of the distances between all observations of the two sets.\n", - " distance_threshold : float, default=None\n", - " The linkage distance threshold above which, clusters will not be merged. 
\n", - " If not None, n_clusters must be None and compute_full_tree must be True.\n", - "\n", - " Returns\n", - " -------\n", - " num_clusters : int\n", - " The number of clusters found by the algorithm\n", - " labels : ndarray of shape (n_samples)\n", - " Cluster labels for each point.\n", - " num_leaves : int\n", - " Number of leaves in the hierarchical tree\n", - " num_connected_components : int\n", - " The estimated number of connected components in the graph\n", - " num_features : int\n", - " number of features seen during fit\n", - " \"\"\"\n", - " model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters, distance_threshold=distance_threshold)\n", - " model.fit(data)\n", - " num_clusters = model.n_clusters_\n", - " labels = model.labels_\n", - " num_leaves = model.n_leaves_\n", - " num_connected_components = model.n_connected_components_\n", - " num_features = model.n_features_in_\n", - " return num_clusters, labels, num_leaves, num_connected_components, num_features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.mixture import GaussianMixture \n", - "def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None):\n", - " \"\"\"Perform unsupervised learning with gaussian mixture model for a given data, and make prediction if needed\n", - "\n", - " Parameters\n", - " ----------\n", - " data : 2D array\n", - " Array of data to be fitted with Gaussian Mixture Model\n", - " num_components : int\n", - " number of underlying Gaussian distributions\n", - " num_random_state : int\n", - " random seed for initialization, by default 0\n", - " predict_data : 2D array, optional\n", - " array of data to be predicted from the model, by default None\n", - "\n", - " Returns\n", - " -------\n", - " predicted\n", - " predicted is the predicted data of data passed into the model, which is predict_data\n", - " \"\"\"\n", - " GMM = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data)\n", - " if type(predict_data) == type(array) or type(np.array):\n", - " predicted = GMM.predict(predict_data)\n", - " else: predicted = None\n", - " return predicted" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/unsupervised_clustering.py b/unsupervised_clustering.py new file mode 100644 index 0000000..39be74a --- /dev/null +++ b/unsupervised_clustering.py @@ -0,0 +1,139 @@ +from array import array +import numpy as np + +from sklearn.cluster import MeanShift +def mean_shift(centers, predict_data=None): + """Function that perform mean shift clustering, can also predict values if predict_data is passed + + Parameters + ---------- + centers : 2D array like + centers of data to perform clustering on + predict_data : 2D array like, optional + data to be predicted by the clustering, by default None + + Returns + ------- + cluster_centers, labels, num_features, predict + cluster_centers: centers after clustering + labels: labels of each point + num_features: number of features seen during fit + predict: predicted values by the clustering for predict_data + + Raises + ------ + Exception + raise exception when normal array (non 2D array) is passed in as predict data + """ + ms = MeanShift() + clustering = ms.fit(centers) + cluster_centers = clustering.cluster_centers_ + labels = clustering.labels_ + num_features = clustering.n_features_in_ + if type(predict_data) == 
type(None):  # no prediction data was supplied
+        predicted = None
+    else:
+        try: predicted = clustering.predict(predict_data)
+        except Exception: raise Exception('Use 2D array for predict_data')
+    return cluster_centers, labels, num_features, predicted
+
+from sklearn.cluster import DBSCAN
+def perform_DBSCAN(data, eps, min_samples):
+    """Perform DBSCAN algorithm on a given set of data
+
+    Parameters
+    ----------
+    data : 2D array-like
+        array of data of interest to perform DBSCAN
+    eps : float
+        The maximum distance between two samples for one to be considered as in the neighborhood of the other.
+        This is not a maximum bound on the distances of points within a cluster.
+        This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
+    min_samples : int
+        The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
+        This includes the point itself.
+
+    Returns
+    -------
+    labels, num_features, core_sample_indices, components
+        labels: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.
+        num_features: Number of features seen during fit.
+        core_sample_indices: Indices of core samples.
+        components: Copy of each core sample found by training.
+    """
+    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(data)
+    labels = clustering.labels_
+    num_features = clustering.n_features_in_
+    core_sample_indices = clustering.core_sample_indices_
+    components = clustering.components_
+    return labels, num_features, core_sample_indices, components
+
+from sklearn.cluster import AgglomerativeClustering
+def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None):
+    """Function that performs hierarchical clustering and fits it to an array of data
+
+    Parameters
+    ----------
+    data : 2D array
+        data to be fitted
+    n_clusters : int, default=2
+        number of clusters to find
+    linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
+        Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation.
+        The algorithm will merge the pairs of cluster that minimize this criterion.
+
+        'ward' minimizes the variance of the clusters being merged.
+        'average' uses the average of the distances of each observation of the two sets.
+        'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets.
+        'single' uses the minimum of the distances between all observations of the two sets.
+    distance_threshold : float, default=None
+        The linkage distance threshold above which clusters will not be merged.
+        If not None, n_clusters must be None and compute_full_tree must be True.
+
+    Returns
+    -------
+    num_clusters : int
+        The number of clusters found by the algorithm
+    labels : ndarray of shape (n_samples)
+        Cluster labels for each point.
+    num_leaves : int
+        Number of leaves in the hierarchical tree
+    num_connected_components : int
+        The estimated number of connected components in the graph
+    num_features : int
+        number of features seen during fit
+    """
+    model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters, distance_threshold=distance_threshold)
+    model.fit(data)
+    num_clusters = model.n_clusters_
+    labels = model.labels_
+    num_leaves = model.n_leaves_
+    num_connected_components = model.n_connected_components_
+    num_features = model.n_features_in_
+    return num_clusters, labels, num_leaves, num_connected_components, num_features
+
+from sklearn.mixture import GaussianMixture
+def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None):
+    """Perform unsupervised learning with a Gaussian mixture model on the given data, and make predictions if needed
+
+    Parameters
+    ----------
+    data : 2D array
+        Array of data to be fitted with Gaussian Mixture Model
+    num_components : int
+        number of underlying Gaussian distributions
+    num_random_state : int
+        random seed for initialization, by default 0
+    predict_data : 2D array, optional
+        array of data to be predicted from the model, by default None
+
+    Returns
+    -------
+    predicted
+        predicted component labels for predict_data, or None if no predict_data was given
+    """
+    GMM = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data)
+    if predict_data is not None:
+        predicted = GMM.predict(predict_data)
+    else: predicted = None
+    return predicted
\ No newline at end of file
From 2df95a6198b56039242f9f53091c167d9243f573 Mon Sep 17 00:00:00 2001
From: MuhangTian
Date: Fri, 5 Aug 2022 02:02:28 -0400
Subject: [PATCH 12/12] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index a8f07c3..25f1b2c 100644
--- a/README.md
+++ b/README.md
@@ -20,5 +20,6 @@ t_test(data1, data2).paired_sample_t_test()
 # ML Models
 ml_regression.py: contain 5 most popular machine learning regression functions, implemented using scikit-learn standard library
+
 unsupervised_clustering.py: contain most popular unsupervised learning clustering functions, implemented using scikit-learn standard library
 # DL Models
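
A minimal usage sketch for the regression helpers introduced by the ml_regression.py patch above, assuming scikit-learn-style 2D feature arrays. The toy data, variable names, and parameter values here are illustrative assumptions, not values prescribed by the module.

```python
import numpy as np
import pandas as pd
from ml_regression import LinearRegression, run_decision_tree, run_random_forest

# toy data: y is roughly linear in x (illustrative only)
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=50)
y = 2.0 * x + 1.0 + rng.normal(scale=0.5, size=50)

# closed-form simple linear regression on a two-column DataFrame (x first, y second)
lr = LinearRegression(pd.DataFrame({'x': x, 'y': y}))
beta, alpha = lr.get_alpha_beta()   # slope and intercept
fitted = lr.predict_y()             # fitted values for the training x

# the scikit-learn wrappers expect 2D feature arrays, as in scikit-learn itself
X = x.reshape(-1, 1)
tree_preds = run_decision_tree(X[:5], X, y, max_depth=3)
forest_preds = run_random_forest(X[:5], X, y, n_estimators=50)
```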
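Likewise, a short sketch of calling the clustering helpers in unsupervised_clustering.py; the blob data and the eps, min_samples, and n_clusters values are assumptions chosen only for illustration.

```python
import numpy as np
from sklearn.datasets import make_blobs
from unsupervised_clustering import mean_shift, perform_DBSCAN, hierarchical_clustering, gaussian_mixture_model

# toy 2D data with three blobs (illustrative only)
data, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.6, random_state=0)

# mean shift: returns centers, labels, feature count, and (optionally) predictions
centers, ms_labels, n_features, ms_pred = mean_shift(data, predict_data=data[:10])

# DBSCAN: eps and min_samples define the density-based neighborhood
db_labels, _, core_idx, components = perform_DBSCAN(data, eps=0.5, min_samples=5)

# agglomerative clustering into three clusters with the default Ward linkage
n_clusters, h_labels, n_leaves, n_connected, _ = hierarchical_clustering(data, n_clusters=3)

# Gaussian mixture model: fit on data, then predict component labels for new points
gmm_labels = gaussian_mixture_model(data, num_components=3, predict_data=data[:10])
```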