From 2ee591b828adb0f0a5606e6441809b0484ff5a36 Mon Sep 17 00:00:00 2001
From: Tony_Tian_1122
Date: Thu, 2 Jun 2022 21:41:48 -0400
Subject: [PATCH 01/12] add Kmeans module

---
 KMeans.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 KMeans.py

diff --git a/KMeans.py b/KMeans.py
new file mode 100644
index 0000000..b1024df
--- /dev/null
+++ b/KMeans.py
@@ -0,0 +1,49 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_blobs
+
+'''
+Function that gets data points and cluster number(centroids), returns coordinates
+of cluster centers
+Default values: number of runs on different centroid seeds = 10, max runs = 300
+'''
+def run_kmeans(data, centroids, n_init=10, max_iter=300):
+    KM = KMeans(n_clusters = centroids, n_init=n_init, max_iter=max_iter)
+    y_KM = KM.fit_predict(data)
+    return KM.cluster_centers_
+
+'''
+Function that helps to determine how many clusters to use by using trials of K clusters
+The idea is to find the cluster number that gives the maximum reduction in inertia
+'''
+def elbow_method(data, num_k, n_init=10, max_iter=300):
+    inertia = []
+    for i in range(1, num_k):
+        KM = KMeans(
+            n_clusters=i,
+            n_init=n_init, max_iter=max_iter
+        )
+        KM.fit_predict(data)
+        inertia.append(KM.inertia_)
+
+    plt.plot(range(1, num_k), inertia, marker='o')
+    plt.xlabel('Number of clusters')
+    plt.ylabel('Inertia')
+    plt.show()
+
+
+'''Generate random sample (write another method to get data later?), just to show an example'''
+data, y = make_blobs(n_samples = 400, centers = 6, cluster_std = 0.60, random_state = 0)
+# plt.scatter(data[:, 0], data[:, 1])
+# plt.show()
+elbow_method(data, 10)
+# print(run_kmeans(data, 6))
+
+
+
+
+
+
\ No newline at end of file
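A quick illustrative sketch of how the inertia curve that `elbow_method` plots can also be used to pick a cluster count programmatically. This is not part of the patch above: the data are the same synthetic blobs the module generates for its example, and the "largest bend" rule is an assumed heuristic for reading the elbow, not something KMeans.py prescribes.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Same synthetic sample the module uses for its example
data, _ = make_blobs(n_samples=400, centers=6, cluster_std=0.60, random_state=0)

# Rebuild the inertia-vs-k curve that elbow_method() plots
ks = range(1, 10)
inertia = [KMeans(n_clusters=k, n_init=10, max_iter=300).fit(data).inertia_ for k in ks]

# Heuristic elbow pick: the k where the curve bends the most
# (largest second difference of the inertia values, defined for ks[1] .. ks[-2]).
second_diff = np.diff(inertia, n=2)
best_k = ks[int(np.argmax(second_diff)) + 1]
print("suggested k:", best_k)

# run_kmeans(data, best_k) from KMeans.py would then return the fitted cluster
# centers; the equivalent scikit-learn call is:
centers = KMeans(n_clusters=best_k, n_init=10, max_iter=300).fit(data).cluster_centers_
print(centers.shape)
```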
From 5c84ba9d320f35754755b6acf767e645fec24b Mon Sep 17 00:00:00 2001
From: Tony_Tian_1122
Date: Wed, 22 Jun 2022 00:12:36 -0400
Subject: [PATCH 02/12] Finish with T-test

---
 ANOVA.py            |  0
 KMeans.py           |  2 ++
 LinearRegression.py | 35 +++++++++++++++++++++++++++++++++++
 T-tests.py          | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+)
 create mode 100644 ANOVA.py
 create mode 100644 LinearRegression.py
 create mode 100644 T-tests.py

diff --git a/ANOVA.py b/ANOVA.py
new file mode 100644
index 0000000..e69de29
diff --git a/KMeans.py b/KMeans.py
index b1024df..edbaa01 100644
--- a/KMeans.py
+++ b/KMeans.py
@@ -35,7 +35,9 @@ def elbow_method(data, num_k, n_init=10, max_iter=300):
 
 
 '''Generate random sample (write another method to get data later?), just to show an example'''
+# Assume we get this from the pre-processed data?
 data, y = make_blobs(n_samples = 400, centers = 6, cluster_std = 0.60, random_state = 0)
+
 # plt.scatter(data[:, 0], data[:, 1])
 # plt.show()
 elbow_method(data, 10)
diff --git a/LinearRegression.py b/LinearRegression.py
new file mode 100644
index 0000000..9f0b669
--- /dev/null
+++ b/LinearRegression.py
@@ -0,0 +1,35 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+'''
+Regression class takes in a dataframe of values with two columns, which are respectively x and y
+User can call respective functions to get regression analysis outputs
+'''
+class LinearRegression():
+
+    def __init__(self, data) -> None:
+        self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]})
+        self.beta = None
+        self.alpha = None
+
+    def get_alpha_beta(self):
+        '''return a tuple (paried values) of beta and alpha, with beta first, alpha second'''
+        x_mean = np.mean(self.df['x'])
+        y_mean = np.mean(self.df['y'])
+        self.df['xy_cov'] = (self.df['x'] - x_mean)* (self.df['y'] - y_mean)
+        self.df['x_var'] = (self.df['x'] - x_mean)**2
+        beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()
+        alpha = y_mean - (beta * x_mean)
+        self.beta, self.alpha = beta, alpha
+
+        return beta, alpha
+
+    def predict_y(self):
+        '''Obtain regression results, store into data frame, and return as an output'''
+        self.get_alpha_beta()
+        self.df['y_pred'] = self.alpha + self.beta*self.df['x']
+        return self.df['y_pred']
+
+
+
\ No newline at end of file
diff --git a/T-tests.py b/T-tests.py
new file mode 100644
index 0000000..45d198a
--- /dev/null
+++ b/T-tests.py
@@ -0,0 +1,32 @@
+import pandas as pd
+import scipy.stats as stats
+import numpy as np
+
+'''
+GUIDELINE: pass data as an array(s) into T-test class
+Then use functions in this class to get desired results
+'''
+
+class t_test():
+
+    def __init__(self, data1, data2=None) -> None:
+        self.data1 = data1
+        self.data2 = data2
+
+    def one_sample_t_test(self, population_mean, side):
+        if side not in ['two-sided', 'less', 'greater']:
+            raise Exception("Only accept 'two-sided', 'less', or 'greater' for parameter 'side'")
+        return stats.ttest_1samp(self.data1, population_mean, alternative=side)
+
+    def two_sample_t_test(self, side):
+        if side not in ['two-sided', 'less', 'greater']:
+            raise Exception("Only accept 'two-sided', 'less', or 'greater' as a parameter")
+        return stats.ttest_ind(self.data1, self.data2, alternative=side)
+
+    def paired_sample_t_test(self):
+        return stats.ttest_rel(self.data1, self.data2)
+
+
+
+
+
\ No newline at end of file

From cd7967f27c465f8f8d2e4768746624d51f7a634b Mon Sep 17 00:00:00 2001
From: MuhangTian
Date: Wed, 22 Jun 2022 00:42:27 -0400
Subject: [PATCH 03/12] Update README.md

---
 README.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/README.md b/README.md
index eb857b5..5420804 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,22 @@
 # Stats Models
+## T-Test Tutorial
+1. user get array (or numpy array) of data from pre-processed module, then pass into t_test() class, t_test() can either 1 data or 2 data. For instance, t_test(data1) and t_test(data1, data2) both works depending on whether user want to test one sample or two samples
+2. call functions on t_test() class to get desired values
+
+```python
+# For one sample t-test, call below function to get t-test statistic based on that user wants to test
+t_test(data1).one_sample_t_test(mean, 'two-sided') # For two-sided test
+t_test(data1).one_sample_t_test(mean, 'less') # For one-sided, less than
+t_test(data1).one_sample_t_test(mean, 'greater') # For one-sided, greater than
+
+# For two sample t-test, call below function to get t-test statistic based on side of the test
+t_test(data1, data2).two_sample_t_test('two-sided') # For two-sided test
+t_test(data1, data2).two_sample_t_test('less') # For one-sided, less than
+t_test(data1, data2).two_sample_t_test('greater') # For one-sided, greater than
+
+# For paired sample t-test, simply call below function to get t-test statistic
+t_test(data1, data2).paired_sample_t_test()
+```
+
 # ML Models
 # DL Models
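The tutorial added above shows the call patterns; a short self-contained illustration of what those calls actually return may also help. The arrays and the 0.05 cutoff below are invented for the example, and the scipy functions are exactly the ones the t_test class wraps (ttest_1samp, ttest_ind, ttest_rel), so the result objects are the same.

```python
import numpy as np
import scipy.stats as stats

# Hypothetical pre-processed samples
data1 = np.array([5.1, 4.9, 5.4, 5.0, 5.3, 4.8, 5.2])
data2 = np.array([5.6, 5.4, 5.9, 5.5, 5.8, 5.3, 5.7])

# Equivalent of t_test(data1).one_sample_t_test(5.0, 'two-sided')
res = stats.ttest_1samp(data1, 5.0, alternative='two-sided')
print(res.statistic, res.pvalue)

# Equivalent of t_test(data1, data2).two_sample_t_test('less')
res = stats.ttest_ind(data1, data2, alternative='less')
if res.pvalue < 0.05:
    print("reject H0: the mean of data1 is smaller than the mean of data2")

# Equivalent of t_test(data1, data2).paired_sample_t_test()
res = stats.ttest_rel(data1, data2)
print(res.statistic, res.pvalue)
```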
From 30b3582dd4bec4e2cf0f21a44e248da0c7387ea3 Mon Sep 17 00:00:00 2001
From: MuhangTian
Date: Wed, 22 Jun 2022 00:42:55 -0400
Subject: [PATCH 04/12] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5420804..0b70b38 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Stats Models
-## T-Test Tutorial
-1. user get array (or numpy array) of data from pre-processed module, then pass into t_test() class, t_test() can either 1 data or 2 data. For instance, t_test(data1) and t_test(data1, data2) both works depending on whether user want to test one sample or two samples
-2. call functions on t_test() class to get desired values
+1. User get array (or numpy array) of data from pre-processed module, then pass into t_test() class, t_test() can either 1 data or 2 data. For instance, t_test(data1) and t_test(data1, data2) both works depending on whether user want to test one sample or two samples
+2. 
Call functions on t_test() class to get desired values ```python -# For one sample t-test, call below function to get t-test statistic based on that user wants to test +# For one sample t-test, call below function to get t-test statistic based on a population mean that user wants to test t_test(data1).one_sample_t_test(mean, 'two-sided') # For two-sided test t_test(data1).one_sample_t_test(mean, 'less') # For one-sided, less than t_test(data1).one_sample_t_test(mean, 'greater') # For one-sided, greater than From 9896cd72244b2308a62851642fd21228c5d217ad Mon Sep 17 00:00:00 2001 From: Tony_Tian_1122 Date: Thu, 7 Jul 2022 22:02:30 -0400 Subject: [PATCH 07/12] Finish with clustering module, edited docstring --- ANOVA.py | 0 Clustering.py | 139 ++++++++++++++++++++++++++++++++++++++++++++ LinearRegression.py | 28 +++++++-- T-tests.py | 46 +++++++++++---- 4 files changed, 197 insertions(+), 16 deletions(-) delete mode 100644 ANOVA.py create mode 100644 Clustering.py diff --git a/ANOVA.py b/ANOVA.py deleted file mode 100644 index e69de29..0000000 diff --git a/Clustering.py b/Clustering.py new file mode 100644 index 0000000..968765a --- /dev/null +++ b/Clustering.py @@ -0,0 +1,139 @@ +from array import array +import numpy as np +from sklearn.cluster import MeanShift +from sklearn.cluster import DBSCAN +from sklearn.cluster import AgglomerativeClustering +from sklearn.mixture import GaussianMixture + +def mean_shift(centers, predict_data=None): + """Function that perform mean shift clustering, can also predict values if predict_data is passed + + Parameters + ---------- + centers : 2D array like + centers of data to perform clustering on + predict_data : 2D array like, optional + data to be predicted by the clustering, by default None + + Returns + ------- + cluster_centers, labels, num_features, predict + cluster_centers: centers after clustering + labels: labels of each point + num_features: number of features seen during fit + predict: predicted values by the clustering for predict_data + + Raises + ------ + Exception + raise exception when normal array (non 2D array) is passed in as predict data + """ + ms = MeanShift() + clustering = ms.fit(centers) + cluster_centers = clustering.cluster_centers_ + labels = clustering.labels_ + num_features = clustering.n_features_in_ + if type(predict_data) == type(array) or type(np.array): + try: predicted = clustering.predict(predict_data) + except: raise Exception ('Use 2D array for predict_data') + else: + predicted = None + return cluster_centers, labels, num_features, predicted + +def perform_DBSCAN(data, eps, min_samples): + """Perform DBSCAN algorithm on a given set of data + + Parameters + ---------- + data : 2D array-like + array of data of interest to perform DBSCAN + eps : float + The maximum distance between two samples for one to be considered as in the neighborhood of the other. + This is not a maximum bound on the distances of points within a cluster. + This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. + min_samples : int + The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. + This includes the point itself. + + Returns + ------- + labels, num_features, core_sample_indices, components + labels: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1. + num_features: Number of features seen during fit. + core_sample_indices: Indices of core samples. 
+ components: Copy of each core sample found by training. + """ + clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(data) + labels = clustering.labels_ + num_features = clustering.n_features_in_ + core_sample_indices = clustering.core_sample_indices_ + components = clustering.components_ + return labels, num_features, core_sample_indices, components + +def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None): + """Perform unsupervised learning with gaussian mixture model for a given data, and make prediction if needed + + Parameters + ---------- + data : 2D array + Array of data to be fitted with Gaussian Mixture Model + num_components : int + number of underlying Gaussian distributions + num_random_state : int + random seed for initialization, by default 0 + predict_data : 2D array, optional + array of data to be predicted from the model, by default None + + Returns + ------- + predicted + predicted is the predicted data of data passed into the model, which is predict_data + """ + GMM = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data) + if type(predict_data) == type(array) or type(np.array): + predicted = GMM.predict(predict_data) + else: predicted = None + return predicted + +def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None): + """Function that performs hiearchical clustering and fit to an array of data + + Parameters + ---------- + data : 2D array + data to be fitted + n_clusters : int, default=2 + number of clusters to find + linkage : {'ward', 'complete', 'average', 'single'}, default='ward' + Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. + The algorithm will merge the pairs of cluster that minimize this criterion. + + 'ward' minimizes the variance of the clusters being merged. + 'average' uses the average of the distances of each observation of the two sets. + 'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets. + 'single' uses the minimum of the distances between all observations of the two sets. + distance_threshold : float, default=None + The linkage distance threshold above which, clusters will not be merged. + If not None, n_clusters must be None and compute_full_tree must be True. + + Returns + ------- + num_clusters : int + The number of clusters found by the algorithm + labels : ndarray of shape (n_samples) + Cluster labels for each point. 
+ num_leaves : int + Number of leaves in the hierarchical tree + num_connected_components : int + The estimated number of connected components in the graph + num_features : int + number of features seen during fit + """ + model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters, distance_threshold=distance_threshold) + model.fit(data) + num_clusters = model.n_clusters_ + labels = model.labels_ + num_leaves = model.n_leaves_ + num_connected_components = model.n_connected_components_ + num_features = model.n_features_in_ + return num_clusters, labels, num_leaves, num_connected_components, num_features \ No newline at end of file diff --git a/LinearRegression.py b/LinearRegression.py index 9f0b669..b0631b4 100644 --- a/LinearRegression.py +++ b/LinearRegression.py @@ -2,11 +2,16 @@ import numpy as np import matplotlib.pyplot as plt -''' -Regression class takes in a dataframe of values with two columns, which are respectively x and y -User can call respective functions to get regression analysis outputs -''' class LinearRegression(): + """ + Regression class takes in a dataframe of values with two columns, which are respectively x and y + User can call respective functions to get regression analysis outputs + + Parameters + ---------- + df : (pandas.DataFrame) a pandas dataframe containing two columns, first being x-values, second + being y-values + """ def __init__(self, data) -> None: self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]}) @@ -14,7 +19,12 @@ def __init__(self, data) -> None: self.alpha = None def get_alpha_beta(self): - '''return a tuple (paried values) of beta and alpha, with beta first, alpha second''' + """ + Function that gets alpha and beta of the data in DataFrame + + Returns + ------- + a tuple (paried values) of beta and alpha, with beta first, alpha second""" x_mean = np.mean(self.df['x']) y_mean = np.mean(self.df['y']) self.df['xy_cov'] = (self.df['x'] - x_mean)* (self.df['y'] - y_mean) @@ -26,7 +36,13 @@ def get_alpha_beta(self): return beta, alpha def predict_y(self): - '''Obtain regression results, store into data frame, and return as an output''' + """ + Obtain regression results, store into data frame, and return as an output + + Returns + ------- + A column of DataFrame of predicted y-values + """ self.get_alpha_beta() self.df['y_pred'] = self.alpha + self.beta*self.df['x'] return self.df['y_pred'] diff --git a/T-tests.py b/T-tests.py index 45d198a..0f6ce66 100644 --- a/T-tests.py +++ b/T-tests.py @@ -2,31 +2,57 @@ import scipy.stats as stats import numpy as np -''' -GUIDELINE: pass data as an array(s) into T-test class -Then use functions in this class to get desired results -''' - class t_test(): + """ + A class containing methods that perform various t-tests + Parameters + ---------- + data1 : (array) array of data of interest + data2 : (array) [optional] array of data of interest, only need to pass it for two sample test + """ def __init__(self, data1, data2=None) -> None: self.data1 = data1 self.data2 = data2 def one_sample_t_test(self, population_mean, side): + """ + Perform one sample t-test with a side and population mean + + Parameters + ---------- + population_mean : (float) population mean to be tested + side : (str) only allows 'two-sided', 'less', 'greater', side of the test to perform + + Returns + ------- + t-statistic (float) + """ if side not in ['two-sided', 'less', 'greater']: raise Exception("Only accept 'two-sided', 'less', or 'greater' for parameter 'side'") return stats.ttest_1samp(self.data1, population_mean, 
alternative=side) def two_sample_t_test(self, side): + """ + Perform two sample t-test between data1 and data2 + + Parameters + ---------- + side : (str) only allows 'two-sided', 'less', 'greater', side of the test to perform + + Returns + ------- + t-statistic (float) + """ if side not in ['two-sided', 'less', 'greater']: raise Exception("Only accept 'two-sided', 'less', or 'greater' as a parameter") return stats.ttest_ind(self.data1, self.data2, alternative=side) def paired_sample_t_test(self): - return stats.ttest_rel(self.data1, self.data2) - - - + """Perform paired sample t-test between data1 and data2 - \ No newline at end of file + Returns + ------- + t-statistic (float) + """ + return stats.ttest_rel(self.data1, self.data2) \ No newline at end of file From ff4d832ebeeae8d82e617970f3b3b5a45628896d Mon Sep 17 00:00:00 2001 From: Tony_Tian_1122 Date: Thu, 7 Jul 2022 22:03:38 -0400 Subject: [PATCH 08/12] Include K-means --- KMeans.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/KMeans.py b/KMeans.py index edbaa01..57dadce 100644 --- a/KMeans.py +++ b/KMeans.py @@ -31,21 +31,4 @@ def elbow_method(data, num_k, n_init=10, max_iter=300): plt.plot(range(1, num_k), inertia, marker='o') plt.xlabel('Number of clusters') plt.ylabel('Inertia') - plt.show() - - -'''Generate random sample (write another method to get data later?), just to show an example''' -# Assume we get this from the pre-processed data? -data, y = make_blobs(n_samples = 400, centers = 6, cluster_std = 0.60, random_state = 0) - -# plt.scatter(data[:, 0], data[:, 1]) -# plt.show() -elbow_method(data, 10) -# print(run_kmeans(data, 6)) - - - - - - - \ No newline at end of file + plt.show() \ No newline at end of file From d59e0c2e3fdd5ddcb2257e2676e3f2ed67364f41 Mon Sep 17 00:00:00 2001 From: Tony_Tian_1122 Date: Thu, 7 Jul 2022 22:04:50 -0400 Subject: [PATCH 09/12] Edited Changes --- KMeans.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 KMeans.py diff --git a/KMeans.py b/KMeans.py deleted file mode 100644 index 57dadce..0000000 --- a/KMeans.py +++ /dev/null @@ -1,34 +0,0 @@ -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -from sklearn.cluster import KMeans -from sklearn.datasets import make_blobs - -''' -Function that gets data points and cluster number(centroids), returns coordinates -of cluster centers -Default values: number of runs on different centroid seeds = 10, max runs = 300 -''' -def run_kmeans(data, centroids, n_init=10, max_iter=300): - KM = KMeans(n_clusters = centroids, n_init=n_init, max_iter=max_iter) - y_KM = KM.fit_predict(data) - return KM.cluster_centers_ - -''' -Function that helps to determine how many clusters to use by using trials of K clusters -The idea is to find the cluster number that gives the maximum reduction in inertia -''' -def elbow_method(data, num_k, n_init=10, max_iter=300): - inertia = [] - for i in range(1, num_k): - KM = KMeans( - n_clusters=i, - n_init=n_init, max_iter=max_iter - ) - KM.fit_predict(data) - inertia.append(KM.inertia_) - - plt.plot(range(1, num_k), inertia, marker='o') - plt.xlabel('Number of clusters') - plt.ylabel('Inertia') - plt.show() \ No newline at end of file From 2f8abfc035165c6ae4a1b95b4eae1cf619fbdf21 Mon Sep 17 00:00:00 2001 From: Tony_Tian_1122 Date: Thu, 14 Jul 2022 23:46:53 -0400 Subject: [PATCH 10/12] Finish jupyter notebooks --- ml_regression.ipynb | 347 ++++++++++++++++++++++++++++++++++ unsupervised_clustering.ipynb | 187 ++++++++++++++++++ 2 
files changed, 534 insertions(+) create mode 100644 ml_regression.ipynb create mode 100644 unsupervised_clustering.ipynb diff --git a/ml_regression.ipynb b/ml_regression.ipynb new file mode 100644 index 0000000..58f5fe0 --- /dev/null +++ b/ml_regression.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Linear Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "class LinearRegression():\n", + " \"\"\"\n", + " Regression class takes in a dataframe of values with two columns, which are respectively x and y\n", + " User can call respective functions to get regression analysis outputs\n", + " \n", + " Parameters\n", + " ----------\n", + " df : (pandas.DataFrame) a pandas dataframe containing two columns, first being x-values, second\n", + " being y-values\n", + " \"\"\"\n", + " \n", + " def __init__(self, data) -> None:\n", + " self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]})\n", + " self.beta = None\n", + " self.alpha = None\n", + " \n", + " def get_alpha_beta(self):\n", + " \"\"\"\n", + " Function that gets alpha and beta of the data in DataFrame\n", + " \n", + " Returns\n", + " -------\n", + " a tuple (paried values) of beta and alpha, with beta first, alpha second\"\"\"\n", + " x_mean = np.mean(self.df['x'])\n", + " y_mean = np.mean(self.df['y'])\n", + " self.df['xy_cov'] = (self.df['x'] - x_mean)* (self.df['y'] - y_mean)\n", + " self.df['x_var'] = (self.df['x'] - x_mean)**2\n", + " beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()\n", + " alpha = y_mean - (beta * x_mean)\n", + " self.beta, self.alpha = beta, alpha\n", + " \n", + " return beta, alpha\n", + "\n", + " def predict_y(self):\n", + " \"\"\"\n", + " Obtain regression results, store into data frame, and return as an output\n", + " \n", + " Returns\n", + " -------\n", + " A column of DataFrame of predicted y-values\n", + " \"\"\"\n", + " self.get_alpha_beta()\n", + " self.df['y_pred'] = self.alpha + self.beta*self.df['x']\n", + " return self.df['y_pred']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Support Vector Regression from Sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVR\n", + "def run_svr(data_in, x_data, y_data, kernel='rbf', degree=3, gamma='scale', tol=1e-3, c=1.0, epsilon=0.1, cache_size=200, verbose=False):\n", + " \"\"\"\n", + " run support vector regression using library from scikit learn\n", + "\n", + " Parameters\n", + " ----------\n", + " data_in : array or float\n", + " data to be analyzed and predicted based on model\n", + " x_data : array\n", + " x values of data\n", + " y_data : array\n", + " y values of data\n", + " kernel : {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} , optional\n", + " Specifies the kernel type to be used in the algorithm. If none is given, ‘rbf’ will be used. \n", + " If a callable is given it is used to precompute the kernel matrix., by default 'rbf'\n", + " degree : int, optional\n", + " Degree of the polynomial kernel function (‘poly’). 
Ignored by all other kernels., by default 3\n", + " gamma : {‘scale’, ‘auto’} or float, optional\n", + " Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’., by default 'scale'\n", + " tol : float, optional\n", + " tolerance for stopping criterion, by default 1e-3\n", + " c : float, optional\n", + " Regularization parameter. The strength of the regularization is inversely proportional to C. \n", + " Must be strictly positive. The penalty is a squared l2 penalty., by default 1.0\n", + " epsilon : float, optional\n", + " Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in \n", + " the training loss function with points predicted within a distance epsilon from the actual value., by default 0.1\n", + " cache_size : int, optional\n", + " Specify the size of the kernel cache (in MB)., by default 200\n", + " verbose : bool, optional\n", + " Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm \n", + " that, if enabled, may not work properly in a multithreaded context., by default False\n", + "\n", + " Returns\n", + " -------\n", + " array or float\n", + " predicted values from data_in\n", + " \"\"\"\n", + " svr = SVR(kernel, degree, gamma, tol, c, epsilon, cache_size, verbose)\n", + " svr.fit(x_data, y_data)\n", + " y_pred = svr.predict(data_in)\n", + " return y_pred\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Decision Tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.tree import DecisionTreeRegressor\n", + "def run_decision_tree(data_in, x_data, y_data, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1):\n", + " \"\"\"\n", + " Run regression with decision tree from scikit learn\n", + "\n", + " Parameters\n", + " ----------\n", + " data_in : array or float\n", + " data to be predicted from fitted model\n", + " x_data : array\n", + " x values for the regression\n", + " y_data : array\n", + " y values for the regression\n", + " criterion : {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, optional\n", + " The function to measure the quality of a split. \n", + " Supported criteria are “squared_error” for the mean squared error, which is equal to variance reduction as \n", + " feature selection criterion and minimizes the L2 loss using the mean of each terminal node, “friedman_mse”, \n", + " which uses mean squared error with Friedman’s improvement score for potential splits, “absolute_error” for \n", + " the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and “poisson” \n", + " which uses reduction in Poisson deviance to find splits., by default 'squared_error'\n", + " \n", + " splitter : {“best”, “random”}, optional\n", + " The strategy used to choose the split at each node. \n", + " Supported strategies are “best” to choose the best split and “random” to choose the best random split., by default 'best'\n", + " \n", + " max_depth : int, optional\n", + " The maximum depth of the tree. 
If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", + " \n", + " min_samples_split : int or float, optional\n", + " The minimum number of samples required to split an internal node:\n", + "\n", + " If int, then consider min_samples_split as the minimum number.\n", + " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", + " \n", + " min_samples_leaf : int or float, optional\n", + " The minimum number of samples required to be at a leaf node. \n", + " A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples \n", + " in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1\n", + "\n", + " Returns\n", + " -------\n", + " array or float\n", + " predicted values from data_in\n", + " \"\"\"\n", + " regressor = DecisionTreeRegressor(criterion, splitter, max_depth, min_samples_split, min_samples_leaf)\n", + " regressor.fit(x_data, y_data)\n", + " y_predict = regressor.predict(data_in)\n", + " return y_predict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Random Forest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "def run_random_foreset(data_in, x_data, y_data, n_estimators=100, criterion='squared error', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=1.0):\n", + " \"\"\"\n", + " run random forest regression with fitted data and data_in\n", + "\n", + " Parameters\n", + " ----------\n", + " data_in : array or float\n", + " data to be predicted from the learned models\n", + " x_data : array\n", + " array of x values of data to be fitted\n", + " y_data : array\n", + " array of y values of data to be fitted\n", + " n_estimators : int, optional\n", + " number of trees in the forest, by default 100\n", + " criterion : {“squared_error”, “absolute_error”, “poisson”}, optional\n", + " The function to measure the quality of a split. Supported criteria are “squared_error” for the mean squared error, \n", + " which is equal to variance reduction as feature selection criterion, “absolute_error” for the mean absolute error, \n", + " and “poisson” which uses reduction in Poisson deviance to find splits. \n", + "\n", + " Training using “absolute_error” is significantly slower than when using “squared_error”., by default 'squared error'\n", + " \n", + " max_depth : int, optional\n", + " The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", + " \n", + " min_samples_split : int or float, optional\n", + " The minimum number of samples required to split an internal node:\n", + "\n", + " If int, then consider min_samples_split as the minimum number.\n", + " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", + " \n", + " min_samples_leaf : int or float, optional\n", + " The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. 
This may have the effect of smoothing the model, especially in regression., by default 1\n", + " \n", + " max_features : {“sqrt”, “log2”, None} int or float, optional\n", + " The number of features to consider when looking for the best split:\n", + "\n", + " If int, then consider max_features features at each split.\n", + " If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.\n", + " If “auto”, then max_features=n_features.\n", + " If “sqrt”, then max_features=sqrt(n_features).\n", + " If “log2”, then max_features=log2(n_features).\n", + " If None or 1.0, then max_features=n_features.\n", + " \n", + " , by default 1.0\n", + "\n", + " Returns\n", + " -------\n", + " array or float\n", + " predicted data from random forest regressor using data_in passed by user\n", + " \"\"\"\n", + " regressor = RandomForestRegressor(n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, max_features)\n", + " regressor.fit(x_data, y_data)\n", + " y_predict = regressor.predict(data_in)\n", + " return y_predict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "XGBoost" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "def run_xgboost(data_in, x_data, y_data, n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma):\n", + " \"\"\"\n", + " Run xgboost regression fitted with x_data and y_data, and predict using data_in\n", + "\n", + " Parameters\n", + " ----------\n", + " data_in : array or float\n", + " data to be predicted from regression\n", + " x_data : array\n", + " x values of data for regression\n", + " y_data : array\n", + " y values of data for regression\n", + " n_estimators : int\n", + " Number of gradient boosted trees. Equivalent to number of boosting rounds.\n", + " max_depth : int\n", + " maximum tree depth\n", + " max_leaves : int\n", + " Maximum number of leaves; 0 indicates no limit.\n", + " max_bin : int\n", + " If using histogram-based algorithm, maximum number of bins per feature\n", + " grow_policy : 0 or 1\n", + " Tree growing policy. \n", + " 0: favor splitting at nodes closest to the node, i.e. grow depth-wise. \n", + " 1: favor splitting at nodes with highest loss change.\n", + " learning_rate : float\n", + " boosting learning rate\n", + " verbosity : int\n", + " The degree of verbosity. 
Valid values are 0 (silent) - 3 (debug).\n", + " gamma : float\n", + " Minimum loss reduction required to make a further partition on a leaf node of the tree.\n", + "\n", + " Returns\n", + " -------\n", + " array or float\n", + " predicted values from data_in after regression\n", + " \"\"\"\n", + " regressor = xgb.XGBRegressor(n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma=gamma)\n", + " regressor.fit(x_data, y_data)\n", + " pred = regressor.predict(data_in)\n", + " return pred" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "7e1998ff7f8aa20ada591c520b972326324e5ea05489af9e422744c7c09f6dad" + }, + "kernelspec": { + "display_name": "Python 3.10.1 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.1" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/unsupervised_clustering.ipynb b/unsupervised_clustering.ipynb new file mode 100644 index 0000000..1161472 --- /dev/null +++ b/unsupervised_clustering.ipynb @@ -0,0 +1,187 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from array import array\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import MeanShift\n", + "def mean_shift(centers, predict_data=None):\n", + " \"\"\"Function that perform mean shift clustering, can also predict values if predict_data is passed\n", + "\n", + " Parameters\n", + " ----------\n", + " centers : 2D array like\n", + " centers of data to perform clustering on\n", + " predict_data : 2D array like, optional\n", + " data to be predicted by the clustering, by default None\n", + "\n", + " Returns\n", + " -------\n", + " cluster_centers, labels, num_features, predict\n", + " cluster_centers: centers after clustering\n", + " labels: labels of each point\n", + " num_features: number of features seen during fit\n", + " predict: predicted values by the clustering for predict_data\n", + "\n", + " Raises\n", + " ------\n", + " Exception\n", + " raise exception when normal array (non 2D array) is passed in as predict data\n", + " \"\"\"\n", + " ms = MeanShift()\n", + " clustering = ms.fit(centers)\n", + " cluster_centers = clustering.cluster_centers_\n", + " labels = clustering.labels_\n", + " num_features = clustering.n_features_in_\n", + " if type(predict_data) == type(array) or type(np.array):\n", + " try: predicted = clustering.predict(predict_data)\n", + " except: raise Exception ('Use 2D array for predict_data')\n", + " else:\n", + " predicted = None\n", + " return cluster_centers, labels, num_features, predicted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import DBSCAN\n", + "def perform_DBSCAN(data, eps, min_samples):\n", + " \"\"\"Perform DBSCAN algorithm on a given set of data\n", + "\n", + " Parameters\n", + " ----------\n", + " data : 2D array-like\n", + " array of data of interest to perform DBSCAN\n", + " eps : float\n", + " The maximum distance between two samples for one to be considered as in the neighborhood of the other. 
\n", + " This is not a maximum bound on the distances of points within a cluster. \n", + " This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.\n", + " min_samples : int\n", + " The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. \n", + " This includes the point itself.\n", + "\n", + " Returns\n", + " -------\n", + " labels, num_features, core_sample_indices, components\n", + " labels: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.\n", + " num_features: Number of features seen during fit.\n", + " core_sample_indices: Indices of core samples.\n", + " components: Copy of each core sample found by training.\n", + " \"\"\"\n", + " clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(data)\n", + " labels = clustering.labels_\n", + " num_features = clustering.n_features_in_\n", + " core_sample_indices = clustering.core_sample_indices_\n", + " components = clustering.components_\n", + " return labels, num_features, core_sample_indices, components\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import AgglomerativeClustering\n", + "def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None):\n", + " \"\"\"Function that performs hiearchical clustering and fit to an array of data\n", + "\n", + " Parameters\n", + " ----------\n", + " data : 2D array\n", + " data to be fitted\n", + " n_clusters : int, default=2\n", + " number of clusters to find\n", + " linkage : {'ward', 'complete', 'average', 'single'}, default='ward'\n", + " Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. \n", + " The algorithm will merge the pairs of cluster that minimize this criterion.\n", + " \n", + " 'ward' minimizes the variance of the clusters being merged.\n", + " 'average' uses the average of the distances of each observation of the two sets.\n", + " 'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets.\n", + " 'single' uses the minimum of the distances between all observations of the two sets.\n", + " distance_threshold : float, default=None\n", + " The linkage distance threshold above which, clusters will not be merged. 
\n", + " If not None, n_clusters must be None and compute_full_tree must be True.\n", + "\n", + " Returns\n", + " -------\n", + " num_clusters : int\n", + " The number of clusters found by the algorithm\n", + " labels : ndarray of shape (n_samples)\n", + " Cluster labels for each point.\n", + " num_leaves : int\n", + " Number of leaves in the hierarchical tree\n", + " num_connected_components : int\n", + " The estimated number of connected components in the graph\n", + " num_features : int\n", + " number of features seen during fit\n", + " \"\"\"\n", + " model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters, distance_threshold=distance_threshold)\n", + " model.fit(data)\n", + " num_clusters = model.n_clusters_\n", + " labels = model.labels_\n", + " num_leaves = model.n_leaves_\n", + " num_connected_components = model.n_connected_components_\n", + " num_features = model.n_features_in_\n", + " return num_clusters, labels, num_leaves, num_connected_components, num_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.mixture import GaussianMixture \n", + "def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None):\n", + " \"\"\"Perform unsupervised learning with gaussian mixture model for a given data, and make prediction if needed\n", + "\n", + " Parameters\n", + " ----------\n", + " data : 2D array\n", + " Array of data to be fitted with Gaussian Mixture Model\n", + " num_components : int\n", + " number of underlying Gaussian distributions\n", + " num_random_state : int\n", + " random seed for initialization, by default 0\n", + " predict_data : 2D array, optional\n", + " array of data to be predicted from the model, by default None\n", + "\n", + " Returns\n", + " -------\n", + " predicted\n", + " predicted is the predicted data of data passed into the model, which is predict_data\n", + " \"\"\"\n", + " GMM = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data)\n", + " if type(predict_data) == type(array) or type(np.array):\n", + " predicted = GMM.predict(predict_data)\n", + " else: predicted = None\n", + " return predicted" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 85cdb383dd369c59a7570c27bd4fc612cc2b4ce8 Mon Sep 17 00:00:00 2001 From: Tony_Tian_1122 Date: Fri, 5 Aug 2022 01:58:18 -0400 Subject: [PATCH 11/12] Change README, change ipynb to py --- README.md | 2 + ml_regression.ipynb | 347 ---------------------------------- ml_regression.py | 238 +++++++++++++++++++++++ unsupervised_clustering.ipynb | 187 ------------------ unsupervised_clustering.py | 139 ++++++++++++++ 5 files changed, 379 insertions(+), 534 deletions(-) delete mode 100644 ml_regression.ipynb create mode 100644 ml_regression.py delete mode 100644 unsupervised_clustering.ipynb create mode 100644 unsupervised_clustering.py diff --git a/README.md b/README.md index 83cb74e..a8f07c3 100644 --- a/README.md +++ b/README.md @@ -19,4 +19,6 @@ t_test(data1, data2).paired_sample_t_test() ``` # ML Models +ml_regression.py: contain 5 most popular machine learning regression functions, implemented using scikit-learn standard library +unsupervised_clustering.py: contain most popular unsupervised learning clustering functions, implemented using scikit-learn standard library # DL Models diff --git a/ml_regression.ipynb b/ml_regression.ipynb deleted file mode 100644 index 
58f5fe0..0000000 --- a/ml_regression.ipynb +++ /dev/null @@ -1,347 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Linear Regression" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "class LinearRegression():\n", - " \"\"\"\n", - " Regression class takes in a dataframe of values with two columns, which are respectively x and y\n", - " User can call respective functions to get regression analysis outputs\n", - " \n", - " Parameters\n", - " ----------\n", - " df : (pandas.DataFrame) a pandas dataframe containing two columns, first being x-values, second\n", - " being y-values\n", - " \"\"\"\n", - " \n", - " def __init__(self, data) -> None:\n", - " self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]})\n", - " self.beta = None\n", - " self.alpha = None\n", - " \n", - " def get_alpha_beta(self):\n", - " \"\"\"\n", - " Function that gets alpha and beta of the data in DataFrame\n", - " \n", - " Returns\n", - " -------\n", - " a tuple (paried values) of beta and alpha, with beta first, alpha second\"\"\"\n", - " x_mean = np.mean(self.df['x'])\n", - " y_mean = np.mean(self.df['y'])\n", - " self.df['xy_cov'] = (self.df['x'] - x_mean)* (self.df['y'] - y_mean)\n", - " self.df['x_var'] = (self.df['x'] - x_mean)**2\n", - " beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()\n", - " alpha = y_mean - (beta * x_mean)\n", - " self.beta, self.alpha = beta, alpha\n", - " \n", - " return beta, alpha\n", - "\n", - " def predict_y(self):\n", - " \"\"\"\n", - " Obtain regression results, store into data frame, and return as an output\n", - " \n", - " Returns\n", - " -------\n", - " A column of DataFrame of predicted y-values\n", - " \"\"\"\n", - " self.get_alpha_beta()\n", - " self.df['y_pred'] = self.alpha + self.beta*self.df['x']\n", - " return self.df['y_pred']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Support Vector Regression from Sklearn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.svm import SVR\n", - "def run_svr(data_in, x_data, y_data, kernel='rbf', degree=3, gamma='scale', tol=1e-3, c=1.0, epsilon=0.1, cache_size=200, verbose=False):\n", - " \"\"\"\n", - " run support vector regression using library from scikit learn\n", - "\n", - " Parameters\n", - " ----------\n", - " data_in : array or float\n", - " data to be analyzed and predicted based on model\n", - " x_data : array\n", - " x values of data\n", - " y_data : array\n", - " y values of data\n", - " kernel : {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} , optional\n", - " Specifies the kernel type to be used in the algorithm. If none is given, ‘rbf’ will be used. \n", - " If a callable is given it is used to precompute the kernel matrix., by default 'rbf'\n", - " degree : int, optional\n", - " Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels., by default 3\n", - " gamma : {‘scale’, ‘auto’} or float, optional\n", - " Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’., by default 'scale'\n", - " tol : float, optional\n", - " tolerance for stopping criterion, by default 1e-3\n", - " c : float, optional\n", - " Regularization parameter. The strength of the regularization is inversely proportional to C. 
\n", - " Must be strictly positive. The penalty is a squared l2 penalty., by default 1.0\n", - " epsilon : float, optional\n", - " Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in \n", - " the training loss function with points predicted within a distance epsilon from the actual value., by default 0.1\n", - " cache_size : int, optional\n", - " Specify the size of the kernel cache (in MB)., by default 200\n", - " verbose : bool, optional\n", - " Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm \n", - " that, if enabled, may not work properly in a multithreaded context., by default False\n", - "\n", - " Returns\n", - " -------\n", - " array or float\n", - " predicted values from data_in\n", - " \"\"\"\n", - " svr = SVR(kernel, degree, gamma, tol, c, epsilon, cache_size, verbose)\n", - " svr.fit(x_data, y_data)\n", - " y_pred = svr.predict(data_in)\n", - " return y_pred\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Decision Tree" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.tree import DecisionTreeRegressor\n", - "def run_decision_tree(data_in, x_data, y_data, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1):\n", - " \"\"\"\n", - " Run regression with decision tree from scikit learn\n", - "\n", - " Parameters\n", - " ----------\n", - " data_in : array or float\n", - " data to be predicted from fitted model\n", - " x_data : array\n", - " x values for the regression\n", - " y_data : array\n", - " y values for the regression\n", - " criterion : {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, optional\n", - " The function to measure the quality of a split. \n", - " Supported criteria are “squared_error” for the mean squared error, which is equal to variance reduction as \n", - " feature selection criterion and minimizes the L2 loss using the mean of each terminal node, “friedman_mse”, \n", - " which uses mean squared error with Friedman’s improvement score for potential splits, “absolute_error” for \n", - " the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and “poisson” \n", - " which uses reduction in Poisson deviance to find splits., by default 'squared_error'\n", - " \n", - " splitter : {“best”, “random”}, optional\n", - " The strategy used to choose the split at each node. \n", - " Supported strategies are “best” to choose the best split and “random” to choose the best random split., by default 'best'\n", - " \n", - " max_depth : int, optional\n", - " The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", - " \n", - " min_samples_split : int or float, optional\n", - " The minimum number of samples required to split an internal node:\n", - "\n", - " If int, then consider min_samples_split as the minimum number.\n", - " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", - " \n", - " min_samples_leaf : int or float, optional\n", - " The minimum number of samples required to be at a leaf node. 
\n", - " A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples \n", - " in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1\n", - "\n", - " Returns\n", - " -------\n", - " array or float\n", - " predicted values from data_in\n", - " \"\"\"\n", - " regressor = DecisionTreeRegressor(criterion, splitter, max_depth, min_samples_split, min_samples_leaf)\n", - " regressor.fit(x_data, y_data)\n", - " y_predict = regressor.predict(data_in)\n", - " return y_predict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Random Forest" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.ensemble import RandomForestRegressor\n", - "def run_random_foreset(data_in, x_data, y_data, n_estimators=100, criterion='squared error', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=1.0):\n", - " \"\"\"\n", - " run random forest regression with fitted data and data_in\n", - "\n", - " Parameters\n", - " ----------\n", - " data_in : array or float\n", - " data to be predicted from the learned models\n", - " x_data : array\n", - " array of x values of data to be fitted\n", - " y_data : array\n", - " array of y values of data to be fitted\n", - " n_estimators : int, optional\n", - " number of trees in the forest, by default 100\n", - " criterion : {“squared_error”, “absolute_error”, “poisson”}, optional\n", - " The function to measure the quality of a split. Supported criteria are “squared_error” for the mean squared error, \n", - " which is equal to variance reduction as feature selection criterion, “absolute_error” for the mean absolute error, \n", - " and “poisson” which uses reduction in Poisson deviance to find splits. \n", - "\n", - " Training using “absolute_error” is significantly slower than when using “squared_error”., by default 'squared error'\n", - " \n", - " max_depth : int, optional\n", - " The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", - " \n", - " min_samples_split : int or float, optional\n", - " The minimum number of samples required to split an internal node:\n", - "\n", - " If int, then consider min_samples_split as the minimum number.\n", - " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", - " \n", - " min_samples_leaf : int or float, optional\n", - " The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. 
This may have the effect of smoothing the model, especially in regression., by default 1\n", - " \n", - " max_features : {“sqrt”, “log2”, None} int or float, optional\n", - " The number of features to consider when looking for the best split:\n", - "\n", - " If int, then consider max_features features at each split.\n", - " If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.\n", - " If “auto”, then max_features=n_features.\n", - " If “sqrt”, then max_features=sqrt(n_features).\n", - " If “log2”, then max_features=log2(n_features).\n", - " If None or 1.0, then max_features=n_features.\n", - " \n", - " , by default 1.0\n", - "\n", - " Returns\n", - " -------\n", - " array or float\n", - " predicted data from random forest regressor using data_in passed by user\n", - " \"\"\"\n", - " regressor = RandomForestRegressor(n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, max_features)\n", - " regressor.fit(x_data, y_data)\n", - " y_predict = regressor.predict(data_in)\n", - " return y_predict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "XGBoost" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import xgboost as xgb\n", - "def run_xgboost(data_in, x_data, y_data, n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma):\n", - " \"\"\"\n", - " Run xgboost regression fitted with x_data and y_data, and predict using data_in\n", - "\n", - " Parameters\n", - " ----------\n", - " data_in : array or float\n", - " data to be predicted from regression\n", - " x_data : array\n", - " x values of data for regression\n", - " y_data : array\n", - " y values of data for regression\n", - " n_estimators : int\n", - " Number of gradient boosted trees. Equivalent to number of boosting rounds.\n", - " max_depth : int\n", - " maximum tree depth\n", - " max_leaves : int\n", - " Maximum number of leaves; 0 indicates no limit.\n", - " max_bin : int\n", - " If using histogram-based algorithm, maximum number of bins per feature\n", - " grow_policy : 0 or 1\n", - " Tree growing policy. \n", - " 0: favor splitting at nodes closest to the node, i.e. grow depth-wise. \n", - " 1: favor splitting at nodes with highest loss change.\n", - " learning_rate : float\n", - " boosting learning rate\n", - " verbosity : int\n", - " The degree of verbosity. 
Valid values are 0 (silent) - 3 (debug).\n", - " gamma : float\n", - " Minimum loss reduction required to make a further partition on a leaf node of the tree.\n", - "\n", - " Returns\n", - " -------\n", - " array or float\n", - " predicted values from data_in after regression\n", - " \"\"\"\n", - " regressor = xgb.XGBRegressor(n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma=gamma)\n", - " regressor.fit(x_data, y_data)\n", - " pred = regressor.predict(data_in)\n", - " return pred" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "7e1998ff7f8aa20ada591c520b972326324e5ea05489af9e422744c7c09f6dad" - }, - "kernelspec": { - "display_name": "Python 3.10.1 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.1" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/ml_regression.py b/ml_regression.py new file mode 100644 index 0000000..a756193 --- /dev/null +++ b/ml_regression.py @@ -0,0 +1,238 @@ +import pandas as pd +import numpy as np + +class LinearRegression(): + """ + Regression class takes in a dataframe of values with two columns, which are respectively x and y + User can call respective functions to get regression analysis outputs + + Parameters + ---------- + df : (pandas.DataFrame) a pandas dataframe containing two columns, first being x-values, second + being y-values + """ + + def __init__(self, data) -> None: + self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]}) + self.beta = None + self.alpha = None + + def get_alpha_beta(self): + """ + Function that gets alpha and beta of the data in DataFrame + + Returns + ------- + a tuple (paried values) of beta and alpha, with beta first, alpha second""" + x_mean = np.mean(self.df['x']) + y_mean = np.mean(self.df['y']) + self.df['xy_cov'] = (self.df['x'] - x_mean)* (self.df['y'] - y_mean) + self.df['x_var'] = (self.df['x'] - x_mean)**2 + beta = self.df['xy_cov'].sum() / self.df['x_var'].sum() + alpha = y_mean - (beta * x_mean) + self.beta, self.alpha = beta, alpha + + return beta, alpha + + def predict_y(self): + """ + Obtain regression results, store into data frame, and return as an output + + Returns + ------- + A column of DataFrame of predicted y-values + """ + self.get_alpha_beta() + self.df['y_pred'] = self.alpha + self.beta*self.df['x'] + return self.df['y_pred'] + +from sklearn.svm import SVR +def run_svr(data_in, x_data, y_data, kernel='rbf', degree=3, gamma='scale', tol=1e-3, c=1.0, epsilon=0.1, cache_size=200, verbose=False): + """ + run support vector regression using library from scikit learn + + Parameters + ---------- + data_in : array or float + data to be analyzed and predicted based on model + x_data : array + x values of data + y_data : array + y values of data + kernel : {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} , optional + Specifies the kernel type to be used in the algorithm. If none is given, ‘rbf’ will be used. + If a callable is given it is used to precompute the kernel matrix., by default 'rbf' + degree : int, optional + Degree of the polynomial kernel function (‘poly’). 
Ignored by all other kernels., by default 3
+    gamma : {‘scale’, ‘auto’} or float, optional
+        Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’., by default 'scale'
+    tol : float, optional
+        tolerance for stopping criterion, by default 1e-3
+    c : float, optional
+        Regularization parameter. The strength of the regularization is inversely proportional to C.
+        Must be strictly positive. The penalty is a squared l2 penalty., by default 1.0
+    epsilon : float, optional
+        Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in
+        the training loss function with points predicted within a distance epsilon from the actual value., by default 0.1
+    cache_size : int, optional
+        Specify the size of the kernel cache (in MB)., by default 200
+    verbose : bool, optional
+        Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm
+        that, if enabled, may not work properly in a multithreaded context., by default False
+
+    Returns
+    -------
+    array or float
+        predicted values from data_in
+    """
+    svr = SVR(kernel=kernel, degree=degree, gamma=gamma, tol=tol, C=c, epsilon=epsilon, cache_size=cache_size, verbose=verbose)
+    svr.fit(x_data, y_data)
+    y_pred = svr.predict(data_in)
+    return y_pred
+
+from sklearn.tree import DecisionTreeRegressor
+def run_decision_tree(data_in, x_data, y_data, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1):
+    """
+    Run regression with a decision tree from scikit-learn
+
+    Parameters
+    ----------
+    data_in : array or float
+        data to be predicted from fitted model
+    x_data : array
+        x values for the regression
+    y_data : array
+        y values for the regression
+    criterion : {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, optional
+        The function to measure the quality of a split.
+        Supported criteria are “squared_error” for the mean squared error, which is equal to variance reduction as
+        feature selection criterion and minimizes the L2 loss using the mean of each terminal node, “friedman_mse”,
+        which uses mean squared error with Friedman’s improvement score for potential splits, “absolute_error” for
+        the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and “poisson”
+        which uses reduction in Poisson deviance to find splits., by default 'squared_error'
+
+    splitter : {“best”, “random”}, optional
+        The strategy used to choose the split at each node.
+        Supported strategies are “best” to choose the best split and “random” to choose the best random split., by default 'best'
+
+    max_depth : int, optional
+        The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None
+
+    min_samples_split : int or float, optional
+        The minimum number of samples required to split an internal node:
+
+        If int, then consider min_samples_split as the minimum number.
+        If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2
+
+    min_samples_leaf : int or float, optional
+        The minimum number of samples required to be at a leaf node.
+        A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples
+        in each of the left and right branches.
This may have the effect of smoothing the model, especially in regression., by default 1
+
+    Returns
+    -------
+    array or float
+        predicted values from data_in
+    """
+    regressor = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
+    regressor.fit(x_data, y_data)
+    y_predict = regressor.predict(data_in)
+    return y_predict
+
+from sklearn.ensemble import RandomForestRegressor
+def run_random_forest(data_in, x_data, y_data, n_estimators=100, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=1.0):
+    """
+    run random forest regression with fitted data and data_in
+
+    Parameters
+    ----------
+    data_in : array or float
+        data to be predicted from the learned models
+    x_data : array
+        array of x values of data to be fitted
+    y_data : array
+        array of y values of data to be fitted
+    n_estimators : int, optional
+        number of trees in the forest, by default 100
+    criterion : {“squared_error”, “absolute_error”, “poisson”}, optional
+        The function to measure the quality of a split. Supported criteria are “squared_error” for the mean squared error,
+        which is equal to variance reduction as feature selection criterion, “absolute_error” for the mean absolute error,
+        and “poisson” which uses reduction in Poisson deviance to find splits.
+
+        Training using “absolute_error” is significantly slower than when using “squared_error”., by default 'squared_error'
+
+    max_depth : int, optional
+        The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None
+
+    min_samples_split : int or float, optional
+        The minimum number of samples required to split an internal node:
+
+        If int, then consider min_samples_split as the minimum number.
+        If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2
+
+    min_samples_leaf : int or float, optional
+        The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1
+
+    max_features : {“sqrt”, “log2”, None}, int or float, optional
+        The number of features to consider when looking for the best split:
+
+        If int, then consider max_features features at each split.
+        If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
+        If “auto”, then max_features=n_features.
+        If “sqrt”, then max_features=sqrt(n_features).
+        If “log2”, then max_features=log2(n_features).
+        If None or 1.0, then max_features=n_features.
+
+        , by default 1.0
+
+    Returns
+    -------
+    array or float
+        predicted data from random forest regressor using data_in passed by user
+    """
+    regressor = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, max_features=max_features)
+    regressor.fit(x_data, y_data)
+    y_predict = regressor.predict(data_in)
+    return y_predict
+
+import xgboost as xgb
+def run_xgboost(data_in, x_data, y_data, n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma):
+    """
+    Run xgboost regression fitted with x_data and y_data, and predict using data_in
+
+    Parameters
+    ----------
+    data_in : array or float
+        data to be predicted from regression
+    x_data : array
+        x values of data for regression
+    y_data : array
+        y values of data for regression
+    n_estimators : int
+        Number of gradient boosted trees. Equivalent to number of boosting rounds.
+    max_depth : int
+        maximum tree depth
+    max_leaves : int
+        Maximum number of leaves; 0 indicates no limit.
+    max_bin : int
+        If using histogram-based algorithm, maximum number of bins per feature
+    grow_policy : 0 or 1
+        Tree growing policy.
+        0: favor splitting at nodes closest to the node, i.e. grow depth-wise.
+        1: favor splitting at nodes with highest loss change.
+    learning_rate : float
+        boosting learning rate
+    verbosity : int
+        The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
+    gamma : float
+        Minimum loss reduction required to make a further partition on a leaf node of the tree.
+
+    Returns
+    -------
+    array or float
+        predicted values from data_in after regression
+    """
+    regressor = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, max_leaves=max_leaves, max_bin=max_bin, grow_policy=grow_policy, learning_rate=learning_rate, verbosity=verbosity, gamma=gamma)
+    regressor.fit(x_data, y_data)
+    pred = regressor.predict(data_in)
+    return pred
\ No newline at end of file
diff --git a/unsupervised_clustering.ipynb b/unsupervised_clustering.ipynb
deleted file mode 100644
index 1161472..0000000
--- a/unsupervised_clustering.ipynb
+++ /dev/null
@@ -1,187 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from array import array\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.cluster import MeanShift\n",
-    "def mean_shift(centers, predict_data=None):\n",
-    "    \"\"\"Function that perform mean shift clustering, can also predict values if predict_data is passed\n",
-    "\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    centers : 2D array like\n",
-    "        centers of data to perform clustering on\n",
-    "    predict_data : 2D array like, optional\n",
-    "        data to be predicted by the clustering, by default None\n",
-    "\n",
-    "    Returns\n",
-    "    -------\n",
-    "    cluster_centers, labels, num_features, predict\n",
-    "        cluster_centers: centers after clustering\n",
-    "        labels: labels of each point\n",
-    "        num_features: number of features seen during fit\n",
-    "        predict: predicted values by the clustering for predict_data\n",
-    "\n",
-    "    Raises\n",
-    "    ------\n",
-    "    Exception\n",
-    "        raise exception when normal array (non 2D array) is passed in as predict data\n",
-    "    \"\"\"\n",
-    "    ms = MeanShift()\n",
-    "    clustering = ms.fit(centers)\n",
-    "    cluster_centers = clustering.cluster_centers_\n",
-    "    labels = clustering.labels_\n",
-    "    num_features = clustering.n_features_in_\n",
-    "    if type(predict_data) == type(array) or type(np.array):\n",
-    "        try: predicted = clustering.predict(predict_data)\n",
-    "
except: raise Exception ('Use 2D array for predict_data')\n", - " else:\n", - " predicted = None\n", - " return cluster_centers, labels, num_features, predicted" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.cluster import DBSCAN\n", - "def perform_DBSCAN(data, eps, min_samples):\n", - " \"\"\"Perform DBSCAN algorithm on a given set of data\n", - "\n", - " Parameters\n", - " ----------\n", - " data : 2D array-like\n", - " array of data of interest to perform DBSCAN\n", - " eps : float\n", - " The maximum distance between two samples for one to be considered as in the neighborhood of the other. \n", - " This is not a maximum bound on the distances of points within a cluster. \n", - " This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.\n", - " min_samples : int\n", - " The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. \n", - " This includes the point itself.\n", - "\n", - " Returns\n", - " -------\n", - " labels, num_features, core_sample_indices, components\n", - " labels: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.\n", - " num_features: Number of features seen during fit.\n", - " core_sample_indices: Indices of core samples.\n", - " components: Copy of each core sample found by training.\n", - " \"\"\"\n", - " clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(data)\n", - " labels = clustering.labels_\n", - " num_features = clustering.n_features_in_\n", - " core_sample_indices = clustering.core_sample_indices_\n", - " components = clustering.components_\n", - " return labels, num_features, core_sample_indices, components\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.cluster import AgglomerativeClustering\n", - "def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None):\n", - " \"\"\"Function that performs hiearchical clustering and fit to an array of data\n", - "\n", - " Parameters\n", - " ----------\n", - " data : 2D array\n", - " data to be fitted\n", - " n_clusters : int, default=2\n", - " number of clusters to find\n", - " linkage : {'ward', 'complete', 'average', 'single'}, default='ward'\n", - " Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. \n", - " The algorithm will merge the pairs of cluster that minimize this criterion.\n", - " \n", - " 'ward' minimizes the variance of the clusters being merged.\n", - " 'average' uses the average of the distances of each observation of the two sets.\n", - " 'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets.\n", - " 'single' uses the minimum of the distances between all observations of the two sets.\n", - " distance_threshold : float, default=None\n", - " The linkage distance threshold above which, clusters will not be merged. 
\n", - " If not None, n_clusters must be None and compute_full_tree must be True.\n", - "\n", - " Returns\n", - " -------\n", - " num_clusters : int\n", - " The number of clusters found by the algorithm\n", - " labels : ndarray of shape (n_samples)\n", - " Cluster labels for each point.\n", - " num_leaves : int\n", - " Number of leaves in the hierarchical tree\n", - " num_connected_components : int\n", - " The estimated number of connected components in the graph\n", - " num_features : int\n", - " number of features seen during fit\n", - " \"\"\"\n", - " model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters, distance_threshold=distance_threshold)\n", - " model.fit(data)\n", - " num_clusters = model.n_clusters_\n", - " labels = model.labels_\n", - " num_leaves = model.n_leaves_\n", - " num_connected_components = model.n_connected_components_\n", - " num_features = model.n_features_in_\n", - " return num_clusters, labels, num_leaves, num_connected_components, num_features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.mixture import GaussianMixture \n", - "def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None):\n", - " \"\"\"Perform unsupervised learning with gaussian mixture model for a given data, and make prediction if needed\n", - "\n", - " Parameters\n", - " ----------\n", - " data : 2D array\n", - " Array of data to be fitted with Gaussian Mixture Model\n", - " num_components : int\n", - " number of underlying Gaussian distributions\n", - " num_random_state : int\n", - " random seed for initialization, by default 0\n", - " predict_data : 2D array, optional\n", - " array of data to be predicted from the model, by default None\n", - "\n", - " Returns\n", - " -------\n", - " predicted\n", - " predicted is the predicted data of data passed into the model, which is predict_data\n", - " \"\"\"\n", - " GMM = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data)\n", - " if type(predict_data) == type(array) or type(np.array):\n", - " predicted = GMM.predict(predict_data)\n", - " else: predicted = None\n", - " return predicted" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/unsupervised_clustering.py b/unsupervised_clustering.py new file mode 100644 index 0000000..39be74a --- /dev/null +++ b/unsupervised_clustering.py @@ -0,0 +1,139 @@ +from array import array +import numpy as np + +from sklearn.cluster import MeanShift +def mean_shift(centers, predict_data=None): + """Function that perform mean shift clustering, can also predict values if predict_data is passed + + Parameters + ---------- + centers : 2D array like + centers of data to perform clustering on + predict_data : 2D array like, optional + data to be predicted by the clustering, by default None + + Returns + ------- + cluster_centers, labels, num_features, predict + cluster_centers: centers after clustering + labels: labels of each point + num_features: number of features seen during fit + predict: predicted values by the clustering for predict_data + + Raises + ------ + Exception + raise exception when normal array (non 2D array) is passed in as predict data + """ + ms = MeanShift() + clustering = ms.fit(centers) + cluster_centers = clustering.cluster_centers_ + labels = clustering.labels_ + num_features = clustering.n_features_in_ + if type(predict_data) == 
type(None):  # no prediction data was supplied
+        predicted = None
+    else:
+        try: predicted = clustering.predict(predict_data)
+        except Exception: raise Exception('Use 2D array for predict_data')
+    return cluster_centers, labels, num_features, predicted
+
+from sklearn.cluster import DBSCAN
+def perform_DBSCAN(data, eps, min_samples):
+    """Perform DBSCAN algorithm on a given set of data
+
+    Parameters
+    ----------
+    data : 2D array-like
+        array of data of interest to perform DBSCAN
+    eps : float
+        The maximum distance between two samples for one to be considered as in the neighborhood of the other.
+        This is not a maximum bound on the distances of points within a cluster.
+        This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
+    min_samples : int
+        The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
+        This includes the point itself.
+
+    Returns
+    -------
+    labels, num_features, core_sample_indices, components
+        labels: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.
+        num_features: Number of features seen during fit.
+        core_sample_indices: Indices of core samples.
+        components: Copy of each core sample found by training.
+    """
+    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(data)
+    labels = clustering.labels_
+    num_features = clustering.n_features_in_
+    core_sample_indices = clustering.core_sample_indices_
+    components = clustering.components_
+    return labels, num_features, core_sample_indices, components
+
+from sklearn.cluster import AgglomerativeClustering
+def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None):
+    """Function that performs hierarchical clustering and fits it to an array of data
+
+    Parameters
+    ----------
+    data : 2D array
+        data to be fitted
+    n_clusters : int, default=2
+        number of clusters to find
+    linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
+        Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation.
+        The algorithm will merge the pairs of cluster that minimize this criterion.
+
+        'ward' minimizes the variance of the clusters being merged.
+        'average' uses the average of the distances of each observation of the two sets.
+        'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets.
+        'single' uses the minimum of the distances between all observations of the two sets.
+    distance_threshold : float, default=None
+        The linkage distance threshold above which clusters will not be merged.
+        If not None, n_clusters must be None and compute_full_tree must be True.
+
+    Returns
+    -------
+    num_clusters : int
+        The number of clusters found by the algorithm
+    labels : ndarray of shape (n_samples)
+        Cluster labels for each point.
+    num_leaves : int
+        Number of leaves in the hierarchical tree
+    num_connected_components : int
+        The estimated number of connected components in the graph
+    num_features : int
+        number of features seen during fit
+    """
+    model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters, distance_threshold=distance_threshold)
+    model.fit(data)
+    num_clusters = model.n_clusters_
+    labels = model.labels_
+    num_leaves = model.n_leaves_
+    num_connected_components = model.n_connected_components_
+    num_features = model.n_features_in_
+    return num_clusters, labels, num_leaves, num_connected_components, num_features
+
+from sklearn.mixture import GaussianMixture
+def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None):
+    """Perform unsupervised learning with a Gaussian mixture model on the given data, and make predictions if needed
+
+    Parameters
+    ----------
+    data : 2D array
+        Array of data to be fitted with Gaussian Mixture Model
+    num_components : int
+        number of underlying Gaussian distributions
+    num_random_state : int
+        random seed for initialization, by default 0
+    predict_data : 2D array, optional
+        array of data to be predicted from the model, by default None
+
+    Returns
+    -------
+    predicted
+        predicted component labels for predict_data, or None if no predict_data was given
+    """
+    GMM = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data)
+    if predict_data is not None:
+        predicted = GMM.predict(predict_data)
+    else: predicted = None
+    return predicted
\ No newline at end of file
From 2df95a6198b56039242f9f53091c167d9243f573 Mon Sep 17 00:00:00 2001
From: MuhangTian
Date: Fri, 5 Aug 2022 02:02:28 -0400
Subject: [PATCH 12/12] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index a8f07c3..25f1b2c 100644
--- a/README.md
+++ b/README.md
@@ -20,5 +20,6 @@ t_test(data1, data2).paired_sample_t_test()
 # ML Models
 ml_regression.py: contain 5 most popular machine learning regression functions, implemented using scikit-learn standard library
+
 unsupervised_clustering.py: contain most popular unsupervised learning clustering functions, implemented using scikit-learn standard library
 # DL Models
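
A minimal usage sketch for the regression helpers introduced by the ml_regression.py patch above, assuming scikit-learn-style 2D feature arrays. The toy data, variable names, and parameter values here are illustrative assumptions, not values prescribed by the module.

```python
import numpy as np
import pandas as pd
from ml_regression import LinearRegression, run_decision_tree, run_random_forest

# toy data: y is roughly linear in x (illustrative only)
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=50)
y = 2.0 * x + 1.0 + rng.normal(scale=0.5, size=50)

# closed-form simple linear regression on a two-column DataFrame (x first, y second)
lr = LinearRegression(pd.DataFrame({'x': x, 'y': y}))
beta, alpha = lr.get_alpha_beta()   # slope and intercept
fitted = lr.predict_y()             # fitted values for the training x

# the scikit-learn wrappers expect 2D feature arrays, as in scikit-learn itself
X = x.reshape(-1, 1)
tree_preds = run_decision_tree(X[:5], X, y, max_depth=3)
forest_preds = run_random_forest(X[:5], X, y, n_estimators=50)
```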
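Likewise, a short sketch of calling the clustering helpers in unsupervised_clustering.py; the blob data and the eps, min_samples, and n_clusters values are assumptions chosen only for illustration.

```python
import numpy as np
from sklearn.datasets import make_blobs
from unsupervised_clustering import mean_shift, perform_DBSCAN, hierarchical_clustering, gaussian_mixture_model

# toy 2D data with three blobs (illustrative only)
data, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.6, random_state=0)

# mean shift: returns centers, labels, feature count, and (optionally) predictions
centers, ms_labels, n_features, ms_pred = mean_shift(data, predict_data=data[:10])

# DBSCAN: eps and min_samples define the density-based neighborhood
db_labels, _, core_idx, components = perform_DBSCAN(data, eps=0.5, min_samples=5)

# agglomerative clustering into three clusters with the default Ward linkage
n_clusters, h_labels, n_leaves, n_connected, _ = hierarchical_clustering(data, n_clusters=3)

# Gaussian mixture model: fit on data, then predict component labels for new points
gmm_labels = gaussian_mixture_model(data, num_components=3, predict_data=data[:10])
```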