diff --git a/Clustering.py b/Clustering.py
new file mode 100644
index 0000000..968765a
--- /dev/null
+++ b/Clustering.py
@@ -0,0 +1,139 @@
+import numpy as np
+from sklearn.cluster import MeanShift
+from sklearn.cluster import DBSCAN
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.mixture import GaussianMixture
+
+def mean_shift(centers, predict_data=None):
+    """Perform mean shift clustering; optionally predict labels for predict_data if it is passed
+
+    Parameters
+    ----------
+    centers : 2D array-like
+        data to perform clustering on
+    predict_data : 2D array-like, optional
+        data whose cluster labels should be predicted, by default None
+
+    Returns
+    -------
+    cluster_centers, labels, num_features, predicted
+        cluster_centers: cluster centers found by the algorithm
+        labels: label of each point
+        num_features: number of features seen during fit
+        predicted: labels predicted for predict_data (None when predict_data is not given)
+
+    Raises
+    ------
+    ValueError
+        raised when predict_data is not a 2D array-like
+    """
+    ms = MeanShift()
+    clustering = ms.fit(centers)
+    cluster_centers = clustering.cluster_centers_
+    labels = clustering.labels_
+    num_features = clustering.n_features_in_
+    # Only attempt prediction when predict_data was actually supplied
+    if predict_data is not None:
+        try:
+            predicted = clustering.predict(predict_data)
+        except ValueError as err:
+            raise ValueError('Use a 2D array for predict_data') from err
+    else:
+        predicted = None
+    return cluster_centers, labels, num_features, predicted
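+
+# Illustrative usage sketch (made-up points, not part of the module):
+#     import numpy as np
+#     pts = np.array([[1.0, 1.0], [1.1, 0.9], [8.0, 8.1], [7.9, 8.0]])
+#     centers, labels, n_feat, pred = mean_shift(pts, predict_data=np.array([[1.05, 0.95]]))
+#     # pred holds the cluster index of the query point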
+ """ + clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(data) + labels = clustering.labels_ + num_features = clustering.n_features_in_ + core_sample_indices = clustering.core_sample_indices_ + components = clustering.components_ + return labels, num_features, core_sample_indices, components + +def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None): + """Perform unsupervised learning with gaussian mixture model for a given data, and make prediction if needed + + Parameters + ---------- + data : 2D array + Array of data to be fitted with Gaussian Mixture Model + num_components : int + number of underlying Gaussian distributions + num_random_state : int + random seed for initialization, by default 0 + predict_data : 2D array, optional + array of data to be predicted from the model, by default None + + Returns + ------- + predicted + predicted is the predicted data of data passed into the model, which is predict_data + """ + GMM = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data) + if type(predict_data) == type(array) or type(np.array): + predicted = GMM.predict(predict_data) + else: predicted = None + return predicted + +def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None): + """Function that performs hiearchical clustering and fit to an array of data + + Parameters + ---------- + data : 2D array + data to be fitted + n_clusters : int, default=2 + number of clusters to find + linkage : {'ward', 'complete', 'average', 'single'}, default='ward' + Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. + The algorithm will merge the pairs of cluster that minimize this criterion. + + 'ward' minimizes the variance of the clusters being merged. + 'average' uses the average of the distances of each observation of the two sets. + 'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets. + 'single' uses the minimum of the distances between all observations of the two sets. + distance_threshold : float, default=None + The linkage distance threshold above which, clusters will not be merged. + If not None, n_clusters must be None and compute_full_tree must be True. + + Returns + ------- + num_clusters : int + The number of clusters found by the algorithm + labels : ndarray of shape (n_samples) + Cluster labels for each point. 
+
+def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None):
+    """Perform hierarchical (agglomerative) clustering on an array of data
+
+    Parameters
+    ----------
+    data : 2D array
+        data to be fitted
+    n_clusters : int, default=2
+        number of clusters to find
+    linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
+        Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observations.
+        The algorithm will merge the pairs of clusters that minimize this criterion.
+
+        'ward' minimizes the variance of the clusters being merged.
+        'average' uses the average of the distances of each observation of the two sets.
+        'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets.
+        'single' uses the minimum of the distances between all observations of the two sets.
+    distance_threshold : float, default=None
+        The linkage distance threshold above which clusters will not be merged.
+        If not None, n_clusters must be None and compute_full_tree must be True.
+
+    Returns
+    -------
+    num_clusters : int
+        The number of clusters found by the algorithm
+    labels : ndarray of shape (n_samples,)
+        Cluster labels for each point.
+    num_leaves : int
+        Number of leaves in the hierarchical tree
+    num_connected_components : int
+        The estimated number of connected components in the graph
+    num_features : int
+        number of features seen during fit
+    """
+    model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters, distance_threshold=distance_threshold)
+    model.fit(data)
+    num_clusters = model.n_clusters_
+    labels = model.labels_
+    num_leaves = model.n_leaves_
+    num_connected_components = model.n_connected_components_
+    num_features = model.n_features_in_
+    return num_clusters, labels, num_leaves, num_connected_components, num_features
diff --git a/LinearRegression.py b/LinearRegression.py
new file mode 100644
index 0000000..b0631b4
--- /dev/null
+++ b/LinearRegression.py
@@ -0,0 +1,51 @@
+import pandas as pd
+import numpy as np
+
+class LinearRegression():
+    """
+    Simple linear regression on a dataframe with two columns, which are respectively x and y.
+    The user can call the respective methods to get regression analysis outputs.
+
+    Parameters
+    ----------
+    data : (pandas.DataFrame) a pandas dataframe containing two columns, the first being x-values,
+        the second being y-values
+    """
+
+    def __init__(self, data) -> None:
+        self.df = pd.DataFrame({'x': data.iloc[:, 0], 'y': data.iloc[:, 1]})
+        self.beta = None
+        self.alpha = None
+
+    def get_alpha_beta(self):
+        """
+        Compute the intercept (alpha) and slope (beta) of the data in the DataFrame
+
+        Returns
+        -------
+        a tuple (paired values) of beta and alpha, with beta first, alpha second
+        """
+        x_mean = np.mean(self.df['x'])
+        y_mean = np.mean(self.df['y'])
+        self.df['xy_cov'] = (self.df['x'] - x_mean) * (self.df['y'] - y_mean)
+        self.df['x_var'] = (self.df['x'] - x_mean)**2
+        beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()
+        alpha = y_mean - (beta * x_mean)
+        self.beta, self.alpha = beta, alpha
+        return beta, alpha
+
+    def predict_y(self):
+        """
+        Compute regression predictions, store them in the data frame, and return them
+
+        Returns
+        -------
+        A DataFrame column of predicted y-values
+        """
+        self.get_alpha_beta()
+        self.df['y_pred'] = self.alpha + self.beta * self.df['x']
+        return self.df['y_pred']
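+
+# A minimal, illustrative demo (made-up numbers, not from any repo data):
+if __name__ == '__main__':
+    demo = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0], 'y': [2.1, 3.9, 6.2, 8.1]})
+    model = LinearRegression(demo)
+    print(model.get_alpha_beta())  # (beta, alpha): slope near 2, intercept near 0
+    print(model.predict_y())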
diff --git a/README.md b/README.md
index eb857b5..25f1b2c 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,25 @@
 # Stats Models
+## T-Test Tutorial
+1. Get an array (or numpy array) of data from the pre-processing module, then pass it into the t_test() class. t_test() accepts either one or two data sets: t_test(data1) and t_test(data1, data2) both work, depending on whether you want to test one sample or two.
+2. Call methods on the t_test() class to get the desired values:
+
+```python
+# For a one-sample t-test, call the function below to get the t-test statistic for the population mean you want to test
+t_test(data1).one_sample_t_test(mean, 'two-sided')  # Two-sided test
+t_test(data1).one_sample_t_test(mean, 'less')       # One-sided test, less than
+t_test(data1).one_sample_t_test(mean, 'greater')    # One-sided test, greater than
+
+# For a two-sample t-test, call the function below to get the t-test statistic for the chosen side of the test
+t_test(data1, data2).two_sample_t_test('two-sided')  # Two-sided test
+t_test(data1, data2).two_sample_t_test('less')       # One-sided test, less than
+t_test(data1, data2).two_sample_t_test('greater')    # One-sided test, greater than
+
+# For a paired-sample t-test, simply call the function below to get the t-test statistic
+t_test(data1, data2).paired_sample_t_test()
+```
+
 # ML Models
+ml_regression.py: contains the 5 most popular machine learning regression functions, implemented with the scikit-learn standard library
+
+unsupervised_clustering.py: contains the most popular unsupervised learning clustering functions, implemented with the scikit-learn standard library
 # DL Models
diff --git a/T-tests.py b/T-tests.py
new file mode 100644
index 0000000..0f6ce66
--- /dev/null
+++ b/T-tests.py
@@ -0,0 +1,58 @@
+import scipy.stats as stats
+
+class t_test():
+    """
+    A class containing methods that perform various t-tests
+
+    Parameters
+    ----------
+    data1 : (array) array of data of interest
+    data2 : (array) [optional] second array of data of interest; only needed for two-sample and paired tests
+    """
+    def __init__(self, data1, data2=None) -> None:
+        self.data1 = data1
+        self.data2 = data2
+
+    def one_sample_t_test(self, population_mean, side):
+        """
+        Perform a one-sample t-test against a population mean, with the given side
+
+        Parameters
+        ----------
+        population_mean : (float) population mean to be tested
+        side : (str) side of the test to perform; only 'two-sided', 'less', and 'greater' are allowed
+
+        Returns
+        -------
+        test result containing the t-statistic (float) and the p-value (float)
+        """
+        if side not in ['two-sided', 'less', 'greater']:
+            raise ValueError("Only 'two-sided', 'less', or 'greater' is accepted for parameter 'side'")
+        return stats.ttest_1samp(self.data1, population_mean, alternative=side)
+
+    def two_sample_t_test(self, side):
+        """
+        Perform a two-sample t-test between data1 and data2
+
+        Parameters
+        ----------
+        side : (str) side of the test to perform; only 'two-sided', 'less', and 'greater' are allowed
+
+        Returns
+        -------
+        test result containing the t-statistic (float) and the p-value (float)
+        """
+        if side not in ['two-sided', 'less', 'greater']:
+            raise ValueError("Only 'two-sided', 'less', or 'greater' is accepted for parameter 'side'")
+        return stats.ttest_ind(self.data1, self.data2, alternative=side)
+
+    def paired_sample_t_test(self):
+        """Perform a paired-sample t-test between data1 and data2
+
+        Returns
+        -------
+        test result containing the t-statistic (float) and the p-value (float)
+        """
+        return stats.ttest_rel(self.data1, self.data2)
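+
+# A minimal, illustrative demo (made-up samples):
+if __name__ == '__main__':
+    a = [5.1, 4.9, 5.3, 5.0, 5.2]
+    b = [4.8, 4.7, 5.0, 4.9, 4.6]
+    print(t_test(a).one_sample_t_test(5.0, 'two-sided'))
+    print(t_test(a, b).two_sample_t_test('greater'))
+    print(t_test(a, b).paired_sample_t_test())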
diff --git a/ml_regression.py b/ml_regression.py
new file mode 100644
index 0000000..a756193
--- /dev/null
+++ b/ml_regression.py
@@ -0,0 +1,238 @@
+import pandas as pd
+import numpy as np
+
+class LinearRegression():
+    """
+    Simple linear regression on a dataframe with two columns, which are respectively x and y.
+    The user can call the respective methods to get regression analysis outputs.
+
+    Parameters
+    ----------
+    data : (pandas.DataFrame) a pandas dataframe containing two columns, the first being x-values,
+        the second being y-values
+    """
+
+    def __init__(self, data) -> None:
+        self.df = pd.DataFrame({'x': data.iloc[:, 0], 'y': data.iloc[:, 1]})
+        self.beta = None
+        self.alpha = None
+
+    def get_alpha_beta(self):
+        """
+        Compute the intercept (alpha) and slope (beta) of the data in the DataFrame
+
+        Returns
+        -------
+        a tuple (paired values) of beta and alpha, with beta first, alpha second
+        """
+        x_mean = np.mean(self.df['x'])
+        y_mean = np.mean(self.df['y'])
+        self.df['xy_cov'] = (self.df['x'] - x_mean) * (self.df['y'] - y_mean)
+        self.df['x_var'] = (self.df['x'] - x_mean)**2
+        beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()
+        alpha = y_mean - (beta * x_mean)
+        self.beta, self.alpha = beta, alpha
+        return beta, alpha
+
+    def predict_y(self):
+        """
+        Compute regression predictions, store them in the data frame, and return them
+
+        Returns
+        -------
+        A DataFrame column of predicted y-values
+        """
+        self.get_alpha_beta()
+        self.df['y_pred'] = self.alpha + self.beta * self.df['x']
+        return self.df['y_pred']
+
+from sklearn.svm import SVR
+def run_svr(data_in, x_data, y_data, kernel='rbf', degree=3, gamma='scale', tol=1e-3, c=1.0, epsilon=0.1, cache_size=200, verbose=False):
+    """
+    Run support vector regression using scikit-learn
+
+    Parameters
+    ----------
+    data_in : array or float
+        data to be predicted from the fitted model
+    x_data : array
+        x values of the data
+    y_data : array
+        y values of the data
+    kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, optional
+        Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used.
+        If a callable is given, it is used to precompute the kernel matrix., by default 'rbf'
+    degree : int, optional
+        Degree of the polynomial kernel function ('poly'). Ignored by all other kernels., by default 3
+    gamma : {'scale', 'auto'} or float, optional
+        Kernel coefficient for 'rbf', 'poly' and 'sigmoid'., by default 'scale'
+    tol : float, optional
+        tolerance for the stopping criterion, by default 1e-3
+    c : float, optional
+        Regularization parameter. The strength of the regularization is inversely proportional to C.
+        Must be strictly positive. The penalty is a squared l2 penalty., by default 1.0
+    epsilon : float, optional
+        Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in
+        the training loss function with points predicted within a distance epsilon from the actual value., by default 0.1
+    cache_size : int, optional
+        Specify the size of the kernel cache (in MB)., by default 200
+    verbose : bool, optional
+        Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm
+        that, if enabled, may not work properly in a multithreaded context., by default False
+
+    Returns
+    -------
+    array or float
+        predicted values for data_in
+    """
+    # SVR's constructor arguments are keyword-only, so pass them by name
+    svr = SVR(kernel=kernel, degree=degree, gamma=gamma, tol=tol, C=c, epsilon=epsilon, cache_size=cache_size, verbose=verbose)
+    svr.fit(x_data, y_data)
+    y_pred = svr.predict(data_in)
+    return y_pred
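+
+# Illustrative usage sketch (made-up arrays; x_data must be 2D, y_data 1D):
+#     import numpy as np
+#     X = np.array([[0.0], [1.0], [2.0], [3.0]])
+#     y = np.array([0.1, 0.9, 2.1, 2.9])
+#     y_hat = run_svr(np.array([[1.5]]), X, y)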
+
+from sklearn.tree import DecisionTreeRegressor
+def run_decision_tree(data_in, x_data, y_data, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1):
+    """
+    Run regression with a decision tree from scikit-learn
+
+    Parameters
+    ----------
+    data_in : array or float
+        data to be predicted from the fitted model
+    x_data : array
+        x values for the regression
+    y_data : array
+        y values for the regression
+    criterion : {"squared_error", "friedman_mse", "absolute_error", "poisson"}, optional
+        The function to measure the quality of a split.
+        Supported criteria are "squared_error" for the mean squared error, which is equal to variance reduction as
+        feature selection criterion and minimizes the L2 loss using the mean of each terminal node, "friedman_mse",
+        which uses mean squared error with Friedman's improvement score for potential splits, "absolute_error" for
+        the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and "poisson",
+        which uses reduction in Poisson deviance to find splits., by default 'squared_error'
+
+    splitter : {"best", "random"}, optional
+        The strategy used to choose the split at each node.
+        Supported strategies are "best" to choose the best split and "random" to choose the best random split., by default 'best'
+
+    max_depth : int, optional
+        The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None
+
+    min_samples_split : int or float, optional
+        The minimum number of samples required to split an internal node:
+
+        If int, then consider min_samples_split as the minimum number.
+        If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2
+
+    min_samples_leaf : int or float, optional
+        The minimum number of samples required to be at a leaf node.
+        A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples
+        in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1
+
+    Returns
+    -------
+    array or float
+        predicted values for data_in
+    """
+    # Pass the parameters by keyword rather than positionally
+    regressor = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
+    regressor.fit(x_data, y_data)
+    y_predict = regressor.predict(data_in)
+    return y_predict
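+
+# Illustrative usage sketch (X, y as in the run_svr sketch above):
+#     y_hat = run_decision_tree(np.array([[1.5]]), X, y, max_depth=3)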
+
+from sklearn.ensemble import RandomForestRegressor
+def run_random_forest(data_in, x_data, y_data, n_estimators=100, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=1.0):
+    """
+    Run random forest regression: fit on x_data and y_data, then predict for data_in
+
+    Parameters
+    ----------
+    data_in : array or float
+        data to be predicted from the fitted model
+    x_data : array
+        array of x values of the data to be fitted
+    y_data : array
+        array of y values of the data to be fitted
+    n_estimators : int, optional
+        number of trees in the forest, by default 100
+    criterion : {"squared_error", "absolute_error", "poisson"}, optional
+        The function to measure the quality of a split. Supported criteria are "squared_error" for the mean squared error,
+        which is equal to variance reduction as feature selection criterion, "absolute_error" for the mean absolute error,
+        and "poisson", which uses reduction in Poisson deviance to find splits.
+
+        Training using "absolute_error" is significantly slower than when using "squared_error"., by default 'squared_error'
+
+    max_depth : int, optional
+        The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None
+
+    min_samples_split : int or float, optional
+        The minimum number of samples required to split an internal node:
+
+        If int, then consider min_samples_split as the minimum number.
+        If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2
+
+    min_samples_leaf : int or float, optional
+        The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1
+
+    max_features : {"sqrt", "log2", None}, int or float, optional
+        The number of features to consider when looking for the best split:
+
+        If int, then consider max_features features at each split.
+        If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
+        If "sqrt", then max_features=sqrt(n_features).
+        If "log2", then max_features=log2(n_features).
+        If None or 1.0, then max_features=n_features.
+
+        , by default 1.0
+
+    Returns
+    -------
+    array or float
+        predicted values for the data_in passed by the user
+    """
+    # Note that criterion must be 'squared_error' (with an underscore); pass the parameters by keyword
+    regressor = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, max_features=max_features)
+    regressor.fit(x_data, y_data)
+    y_predict = regressor.predict(data_in)
+    return y_predict
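+
+# Illustrative usage sketch (X, y as in the run_svr sketch above):
+#     y_hat = run_random_forest(np.array([[1.5]]), X, y, n_estimators=50)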
+
+import xgboost as xgb
+def run_xgboost(data_in, x_data, y_data, n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma):
+    """
+    Run XGBoost regression fitted with x_data and y_data, and predict using data_in
+
+    Parameters
+    ----------
+    data_in : array or float
+        data to be predicted from the regression
+    x_data : array
+        x values of the data for the regression
+    y_data : array
+        y values of the data for the regression
+    n_estimators : int
+        Number of gradient boosted trees. Equivalent to the number of boosting rounds.
+    max_depth : int
+        maximum tree depth
+    max_leaves : int
+        Maximum number of leaves; 0 indicates no limit.
+    max_bin : int
+        If using the histogram-based algorithm, maximum number of bins per feature
+    grow_policy : 0 or 1
+        Tree growing policy.
+        0 ('depthwise'): favor splitting at nodes closest to the root, i.e. grow depth-wise.
+        1 ('lossguide'): favor splitting at nodes with the highest loss change.
+    learning_rate : float
+        boosting learning rate
+    verbosity : int
+        The degree of verbosity. Valid values are 0 (silent) to 3 (debug).
+    gamma : float
+        Minimum loss reduction required to make a further partition on a leaf node of the tree.
+
+    Returns
+    -------
+    array or float
+        predicted values for data_in after regression
+    """
+    # XGBRegressor's constructor arguments are keyword-only, so pass them by name
+    regressor = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, max_leaves=max_leaves, max_bin=max_bin, grow_policy=grow_policy, learning_rate=learning_rate, verbosity=verbosity, gamma=gamma)
+    regressor.fit(x_data, y_data)
+    pred = regressor.predict(data_in)
+    return pred
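+
+# A minimal end-to-end demo with made-up data; the parameter values are arbitrary
+# illustrations, not tuned recommendations.
+if __name__ == '__main__':
+    X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0]])
+    y = np.array([0.1, 1.1, 1.9, 3.2, 3.8])
+    query = np.array([[2.5]])
+    print(run_svr(query, X, y))
+    print(run_decision_tree(query, X, y, max_depth=2))
+    print(run_random_forest(query, X, y, n_estimators=20))
+    # 'depthwise' is the string form of grow_policy 0; recent xgboost versions expect the string
+    print(run_xgboost(query, X, y, n_estimators=20, max_depth=3, max_leaves=0,
+                      max_bin=256, grow_policy='depthwise', learning_rate=0.3,
+                      verbosity=0, gamma=0.0))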
diff --git a/unsupervised_clustering.py b/unsupervised_clustering.py
new file mode 100644
index 0000000..39be74a
--- /dev/null
+++ b/unsupervised_clustering.py
@@ -0,0 +1,139 @@
+import numpy as np
+
+from sklearn.cluster import MeanShift
+def mean_shift(centers, predict_data=None):
+    """Perform mean shift clustering; optionally predict labels for predict_data if it is passed
+
+    Parameters
+    ----------
+    centers : 2D array-like
+        data to perform clustering on
+    predict_data : 2D array-like, optional
+        data whose cluster labels should be predicted, by default None
+
+    Returns
+    -------
+    cluster_centers, labels, num_features, predicted
+        cluster_centers: cluster centers found by the algorithm
+        labels: label of each point
+        num_features: number of features seen during fit
+        predicted: labels predicted for predict_data (None when predict_data is not given)
+
+    Raises
+    ------
+    ValueError
+        raised when predict_data is not a 2D array-like
+    """
+    ms = MeanShift()
+    clustering = ms.fit(centers)
+    cluster_centers = clustering.cluster_centers_
+    labels = clustering.labels_
+    num_features = clustering.n_features_in_
+    # Only attempt prediction when predict_data was actually supplied
+    if predict_data is not None:
+        try:
+            predicted = clustering.predict(predict_data)
+        except ValueError as err:
+            raise ValueError('Use a 2D array for predict_data') from err
+    else:
+        predicted = None
+    return cluster_centers, labels, num_features, predicted
+
+from sklearn.cluster import DBSCAN
+def perform_DBSCAN(data, eps, min_samples):
+    """Perform the DBSCAN algorithm on a given set of data
+
+    Parameters
+    ----------
+    data : 2D array-like
+        array of data of interest on which to perform DBSCAN
+    eps : float
+        The maximum distance between two samples for one to be considered as in the neighborhood of the other.
+        This is not a maximum bound on the distances of points within a cluster.
+        This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
+    min_samples : int
+        The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
+        This includes the point itself.
+
+    Returns
+    -------
+    labels, num_features, core_sample_indices, components
+        labels: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.
+        num_features: Number of features seen during fit.
+        core_sample_indices: Indices of core samples.
+        components: Copy of each core sample found by training.
+    """
+    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(data)
+    labels = clustering.labels_
+    num_features = clustering.n_features_in_
+    core_sample_indices = clustering.core_sample_indices_
+    components = clustering.components_
+    return labels, num_features, core_sample_indices, components
+
+from sklearn.cluster import AgglomerativeClustering
+def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None):
+    """Perform hierarchical (agglomerative) clustering on an array of data
+
+    Parameters
+    ----------
+    data : 2D array
+        data to be fitted
+    n_clusters : int, default=2
+        number of clusters to find
+    linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
+        Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observations.
+        The algorithm will merge the pairs of clusters that minimize this criterion.
+
+        'ward' minimizes the variance of the clusters being merged.
+        'average' uses the average of the distances of each observation of the two sets.
+        'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets.
+        'single' uses the minimum of the distances between all observations of the two sets.
+    distance_threshold : float, default=None
+        The linkage distance threshold above which clusters will not be merged.
+        If not None, n_clusters must be None and compute_full_tree must be True.
+
+    Returns
+    -------
+    num_clusters : int
+        The number of clusters found by the algorithm
+    labels : ndarray of shape (n_samples,)
+        Cluster labels for each point.
+    num_leaves : int
+        Number of leaves in the hierarchical tree
+    num_connected_components : int
+        The estimated number of connected components in the graph
+    num_features : int
+        number of features seen during fit
+    """
+    model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters, distance_threshold=distance_threshold)
+    model.fit(data)
+    num_clusters = model.n_clusters_
+    labels = model.labels_
+    num_leaves = model.n_leaves_
+    num_connected_components = model.n_connected_components_
+    num_features = model.n_features_in_
+    return num_clusters, labels, num_leaves, num_connected_components, num_features
+
+from sklearn.mixture import GaussianMixture
+def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None):
+    """Fit a Gaussian mixture model to the given data and, optionally, predict labels for predict_data
+
+    Parameters
+    ----------
+    data : 2D array
+        Array of data to be fitted with the Gaussian mixture model
+    num_components : int
+        number of underlying Gaussian distributions
+    num_random_state : int
+        random seed for initialization, by default 0
+    predict_data : 2D array, optional
+        array of data whose labels should be predicted from the model, by default None
+
+    Returns
+    -------
+    predicted
+        component labels predicted for predict_data (None when predict_data is not given)
+    """
+    GMM = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data)
+    # Only attempt prediction when predict_data was actually supplied
+    if predict_data is not None:
+        predicted = GMM.predict(predict_data)
+    else:
+        predicted = None
+    return predicted
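+
+# A minimal demo with made-up 2D points; values are purely illustrative.
+if __name__ == '__main__':
+    pts = np.array([[1.0, 1.0], [1.1, 0.9], [0.9, 1.1],
+                    [8.0, 8.0], [8.1, 7.9], [7.9, 8.1]])
+    print(mean_shift(pts, predict_data=np.array([[1.05, 0.95]])))
+    print(perform_DBSCAN(pts, eps=0.5, min_samples=2))
+    print(hierarchical_clustering(pts, n_clusters=2))
+    print(gaussian_mixture_model(pts, num_components=2, predict_data=pts[:2]))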