|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 4, |
| 6 | + "metadata": {}, |
| 7 | + "outputs": [], |
| 8 | + "source": [ |
| 10 | + "import pandas as pd\n", |
| 11 | + "import numpy as np" |
| 12 | + ] |
| 13 | + }, |
| 14 | + { |
| 15 | + "cell_type": "markdown", |
| 16 | + "metadata": {}, |
| 17 | + "source": [ |
| 18 | + "Linear Regression" |
| 19 | + ] |
| 20 | + }, |
| 21 | + { |
| 22 | + "cell_type": "code", |
| 23 | + "execution_count": 5, |
| 24 | + "metadata": {}, |
| 25 | + "outputs": [], |
| 26 | + "source": [ |
| 27 | +    "class LinearRegression:\n",
| 28 | + " \"\"\"\n", |
| 29 | +    "    Simple linear regression on a two-column dataset, where the first column holds x-values and the second holds y-values.\n",
| 30 | +    "    Call get_alpha_beta() for the fitted coefficients and predict_y() for the fitted values.\n",
| 31 | + " \n", |
| 32 | + " Parameters\n", |
| 33 | + " ----------\n", |
| 34 | +    "    data : pandas.DataFrame\n",
| 35 | +    "        a dataframe containing two columns, the first being x-values, the second being y-values\n",
| 36 | + " \"\"\"\n", |
| 37 | + " \n", |
| 38 | + " def __init__(self, data) -> None:\n", |
| 39 | + " self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]})\n", |
| 40 | + " self.beta = None\n", |
| 41 | + " self.alpha = None\n", |
| 42 | + " \n", |
| 43 | + " def get_alpha_beta(self):\n", |
| 44 | + " \"\"\"\n", |
| 45 | +    "        Compute the least-squares estimates of beta (slope) and alpha (intercept) for the data\n",
| 46 | + " \n", |
| 47 | + " Returns\n", |
| 48 | + " -------\n", |
| 49 | +    "        a tuple of paired values (beta, alpha), with beta first and alpha second\n",
| 50 | +    "        \"\"\"\n",
| 50 | + " x_mean = np.mean(self.df['x'])\n", |
| 51 | + " y_mean = np.mean(self.df['y'])\n", |
| 52 | +    "        self.df['xy_cov'] = (self.df['x'] - x_mean) * (self.df['y'] - y_mean)\n",
| 53 | + " self.df['x_var'] = (self.df['x'] - x_mean)**2\n", |
| 54 | + " beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()\n", |
| 55 | + " alpha = y_mean - (beta * x_mean)\n", |
| 56 | + " self.beta, self.alpha = beta, alpha\n", |
| 57 | + " \n", |
| 58 | + " return beta, alpha\n", |
| 59 | + "\n", |
| 60 | + " def predict_y(self):\n", |
| 61 | + " \"\"\"\n", |
| 62 | +    "        Fit the model, store the predictions in the dataframe, and return them\n",
| 63 | + " \n", |
| 64 | + " Returns\n", |
| 65 | + " -------\n", |
| 66 | +    "        pandas.Series of predicted y-values\n",
| 67 | + " \"\"\"\n", |
| 68 | + " self.get_alpha_beta()\n", |
| 69 | +    "        self.df['y_pred'] = self.alpha + self.beta * self.df['x']\n",
| 70 | + " return self.df['y_pred']" |
| 71 | + ] |
| 72 | + }, |
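| 73 | +  {
| 74 | +   "cell_type": "markdown",
| 75 | +   "metadata": {},
| 76 | +   "source": [
| 77 | +    "A minimal usage sketch of the `LinearRegression` class above, on a small synthetic dataset (the values are illustrative only):"
| 78 | +   ]
| 79 | +  },
| 80 | +  {
| 81 | +   "cell_type": "code",
| 82 | +   "execution_count": null,
| 83 | +   "metadata": {},
| 84 | +   "outputs": [],
| 85 | +   "source": [
| 86 | +    "# Illustrative only: fit the hand-rolled LinearRegression on synthetic data\n",
| 87 | +    "demo_df = pd.DataFrame({'x': [1, 2, 3, 4, 5], 'y': [2.1, 3.9, 6.2, 8.1, 9.8]})\n",
| 88 | +    "model = LinearRegression(demo_df)\n",
| 89 | +    "beta, alpha = model.get_alpha_beta()\n",
| 90 | +    "print(f'beta (slope): {beta:.3f}, alpha (intercept): {alpha:.3f}')\n",
| 91 | +    "model.predict_y()"
| 92 | +   ]
| 93 | +  },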
| 73 | + { |
| 74 | + "cell_type": "markdown", |
| 75 | + "metadata": {}, |
| 76 | + "source": [ |
| 77 | +    "Support Vector Regression from scikit-learn"
| 78 | + ] |
| 79 | + }, |
| 80 | + { |
| 81 | + "cell_type": "code", |
| 82 | + "execution_count": null, |
| 83 | + "metadata": {}, |
| 84 | + "outputs": [], |
| 85 | + "source": [ |
| 86 | + "from sklearn.svm import SVR\n", |
| 87 | + "def run_svr(data_in, x_data, y_data, kernel='rbf', degree=3, gamma='scale', tol=1e-3, c=1.0, epsilon=0.1, cache_size=200, verbose=False):\n", |
| 88 | + " \"\"\"\n", |
| 89 | +    "    Run support vector regression (SVR) from scikit-learn\n",
| 90 | + "\n", |
| 91 | + " Parameters\n", |
| 92 | + " ----------\n", |
| 93 | + " data_in : array or float\n", |
| 94 | + " data to be analyzed and predicted based on model\n", |
| 95 | + " x_data : array\n", |
| 96 | + " x values of data\n", |
| 97 | + " y_data : array\n", |
| 98 | + " y values of data\n", |
| 99 | + " kernel : {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} , optional\n", |
| 100 | + " Specifies the kernel type to be used in the algorithm. If none is given, ‘rbf’ will be used. \n", |
| 101 | + " If a callable is given it is used to precompute the kernel matrix., by default 'rbf'\n", |
| 102 | + " degree : int, optional\n", |
| 103 | + " Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels., by default 3\n", |
| 104 | + " gamma : {‘scale’, ‘auto’} or float, optional\n", |
| 105 | + " Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’., by default 'scale'\n", |
| 106 | + " tol : float, optional\n", |
| 107 | + " tolerance for stopping criterion, by default 1e-3\n", |
| 108 | + " c : float, optional\n", |
| 109 | + " Regularization parameter. The strength of the regularization is inversely proportional to C. \n", |
| 110 | + " Must be strictly positive. The penalty is a squared l2 penalty., by default 1.0\n", |
| 111 | + " epsilon : float, optional\n", |
| 112 | + " Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in \n", |
| 113 | + " the training loss function with points predicted within a distance epsilon from the actual value., by default 0.1\n", |
| 114 | + " cache_size : int, optional\n", |
| 115 | + " Specify the size of the kernel cache (in MB)., by default 200\n", |
| 116 | + " verbose : bool, optional\n", |
| 117 | + " Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm \n", |
| 118 | + " that, if enabled, may not work properly in a multithreaded context., by default False\n", |
| 119 | + "\n", |
| 120 | + " Returns\n", |
| 121 | + " -------\n", |
| 122 | + " array or float\n", |
| 123 | + " predicted values from data_in\n", |
| 124 | + " \"\"\"\n", |
| 125 | +    "    # scikit-learn estimators take these as keyword arguments; note C is capitalized in SVR\n",
| 126 | +    "    svr = SVR(kernel=kernel, degree=degree, gamma=gamma, tol=tol, C=c,\n",
| 127 | +    "              epsilon=epsilon, cache_size=cache_size, verbose=verbose)\n",
| 128 | +    "    svr.fit(x_data, y_data)  # x_data must be 2-D: (n_samples, n_features)\n",
| 127 | + " y_pred = svr.predict(data_in)\n", |
| 129 | +    "    return y_pred"
| 130 | + ] |
| 131 | + }, |
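| 132 | +  {
| 133 | +   "cell_type": "markdown",
| 134 | +   "metadata": {},
| 135 | +   "source": [
| 136 | +    "A minimal usage sketch for `run_svr` (illustrative synthetic values; scikit-learn expects 2-D feature arrays, hence the reshape):"
| 137 | +   ]
| 138 | +  },
| 139 | +  {
| 140 | +   "cell_type": "code",
| 141 | +   "execution_count": null,
| 142 | +   "metadata": {},
| 143 | +   "outputs": [],
| 144 | +   "source": [
| 145 | +    "# Illustrative only: SVR on synthetic data; feature arrays must be 2-D\n",
| 146 | +    "x_train = np.arange(10, dtype=float).reshape(-1, 1)\n",
| 147 | +    "y_train = 2.0 * x_train.ravel() + 1.0\n",
| 148 | +    "print(run_svr(np.array([[2.5], [7.5]]), x_train, y_train))"
| 149 | +   ]
| 150 | +  },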
| 132 | + { |
| 133 | + "cell_type": "markdown", |
| 134 | + "metadata": {}, |
| 135 | + "source": [ |
| 136 | + "Decision Tree" |
| 137 | + ] |
| 138 | + }, |
| 139 | + { |
| 140 | + "cell_type": "code", |
| 141 | + "execution_count": null, |
| 142 | + "metadata": {}, |
| 143 | + "outputs": [], |
| 144 | + "source": [ |
| 145 | + "from sklearn.tree import DecisionTreeRegressor\n", |
| 146 | + "def run_decision_tree(data_in, x_data, y_data, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1):\n", |
| 147 | + " \"\"\"\n", |
| 148 | +    "    Run decision tree regression from scikit-learn\n",
| 149 | + "\n", |
| 150 | + " Parameters\n", |
| 151 | + " ----------\n", |
| 152 | + " data_in : array or float\n", |
| 153 | + " data to be predicted from fitted model\n", |
| 154 | + " x_data : array\n", |
| 155 | + " x values for the regression\n", |
| 156 | + " y_data : array\n", |
| 157 | + " y values for the regression\n", |
| 158 | + " criterion : {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, optional\n", |
| 159 | + " The function to measure the quality of a split. \n", |
| 160 | + " Supported criteria are “squared_error” for the mean squared error, which is equal to variance reduction as \n", |
| 161 | + " feature selection criterion and minimizes the L2 loss using the mean of each terminal node, “friedman_mse”, \n", |
| 162 | + " which uses mean squared error with Friedman’s improvement score for potential splits, “absolute_error” for \n", |
| 163 | + " the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and “poisson” \n", |
| 164 | + " which uses reduction in Poisson deviance to find splits., by default 'squared_error'\n", |
| 165 | + " \n", |
| 166 | + " splitter : {“best”, “random”}, optional\n", |
| 167 | + " The strategy used to choose the split at each node. \n", |
| 168 | + " Supported strategies are “best” to choose the best split and “random” to choose the best random split., by default 'best'\n", |
| 169 | + " \n", |
| 170 | + " max_depth : int, optional\n", |
| 171 | + " The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", |
| 172 | + " \n", |
| 173 | + " min_samples_split : int or float, optional\n", |
| 174 | + " The minimum number of samples required to split an internal node:\n", |
| 175 | + "\n", |
| 176 | + " If int, then consider min_samples_split as the minimum number.\n", |
| 177 | + " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", |
| 178 | + " \n", |
| 179 | + " min_samples_leaf : int or float, optional\n", |
| 180 | + " The minimum number of samples required to be at a leaf node. \n", |
| 181 | + " A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples \n", |
| 182 | + " in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1\n", |
| 183 | + "\n", |
| 184 | + " Returns\n", |
| 185 | + " -------\n", |
| 186 | + " array or float\n", |
| 187 | + " predicted values from data_in\n", |
| 188 | + " \"\"\"\n", |
| 189 | +    "    # recent scikit-learn versions require these as keyword arguments\n",
| 190 | +    "    regressor = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=max_depth,\n",
| 191 | +    "                                      min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)\n",
| 190 | + " regressor.fit(x_data, y_data)\n", |
| 191 | + " y_predict = regressor.predict(data_in)\n", |
| 192 | + " return y_predict" |
| 193 | + ] |
| 194 | + }, |
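| 195 | +  {
| 196 | +   "cell_type": "markdown",
| 197 | +   "metadata": {},
| 198 | +   "source": [
| 199 | +    "A minimal usage sketch for `run_decision_tree` on synthetic data (values are illustrative only):"
| 200 | +   ]
| 201 | +  },
| 202 | +  {
| 203 | +   "cell_type": "code",
| 204 | +   "execution_count": null,
| 205 | +   "metadata": {},
| 206 | +   "outputs": [],
| 207 | +   "source": [
| 208 | +    "# Illustrative only: decision tree regression on synthetic data\n",
| 209 | +    "x_train = np.arange(20, dtype=float).reshape(-1, 1)\n",
| 210 | +    "y_train = np.sin(x_train).ravel()\n",
| 211 | +    "print(run_decision_tree(np.array([[3.0], [12.0]]), x_train, y_train, max_depth=4))"
| 212 | +   ]
| 213 | +  },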
| 195 | + { |
| 196 | + "cell_type": "markdown", |
| 197 | + "metadata": {}, |
| 198 | + "source": [ |
| 199 | + "Random Forest" |
| 200 | + ] |
| 201 | + }, |
| 202 | + { |
| 203 | + "cell_type": "code", |
| 204 | + "execution_count": null, |
| 205 | + "metadata": {}, |
| 206 | + "outputs": [], |
| 207 | + "source": [ |
| 208 | + "from sklearn.ensemble import RandomForestRegressor\n", |
| 209 | +    "def run_random_forest(data_in, x_data, y_data, n_estimators=100, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=1.0):\n",
| 210 | + " \"\"\"\n", |
| 211 | +    "    Run random forest regression fitted on x_data and y_data, and predict for data_in\n",
| 212 | + "\n", |
| 213 | + " Parameters\n", |
| 214 | + " ----------\n", |
| 215 | + " data_in : array or float\n", |
| 216 | + " data to be predicted from the learned models\n", |
| 217 | + " x_data : array\n", |
| 218 | + " array of x values of data to be fitted\n", |
| 219 | + " y_data : array\n", |
| 220 | + " array of y values of data to be fitted\n", |
| 221 | + " n_estimators : int, optional\n", |
| 222 | + " number of trees in the forest, by default 100\n", |
| 223 | + " criterion : {“squared_error”, “absolute_error”, “poisson”}, optional\n", |
| 224 | + " The function to measure the quality of a split. Supported criteria are “squared_error” for the mean squared error, \n", |
| 225 | + " which is equal to variance reduction as feature selection criterion, “absolute_error” for the mean absolute error, \n", |
| 226 | + " and “poisson” which uses reduction in Poisson deviance to find splits. \n", |
| 227 | + "\n", |
| 228 | +    "        Training using “absolute_error” is significantly slower than when using “squared_error”., by default 'squared_error'\n",
| 229 | + " \n", |
| 230 | + " max_depth : int, optional\n", |
| 231 | + " The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", |
| 232 | + " \n", |
| 233 | + " min_samples_split : int or float, optional\n", |
| 234 | + " The minimum number of samples required to split an internal node:\n", |
| 235 | + "\n", |
| 236 | + " If int, then consider min_samples_split as the minimum number.\n", |
| 237 | + " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", |
| 238 | + " \n", |
| 239 | + " min_samples_leaf : int or float, optional\n", |
| 240 | + " The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1\n", |
| 241 | + " \n", |
| 242 | + " max_features : {“sqrt”, “log2”, None} int or float, optional\n", |
| 243 | + " The number of features to consider when looking for the best split:\n", |
| 244 | + "\n", |
| 245 | + " If int, then consider max_features features at each split.\n", |
| 246 | + " If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.\n", |
| 247 | +    "        If “auto”, then max_features=n_features (deprecated and removed in recent scikit-learn; equivalent to 1.0).\n",
| 248 | + " If “sqrt”, then max_features=sqrt(n_features).\n", |
| 249 | + " If “log2”, then max_features=log2(n_features).\n", |
| 250 | + " If None or 1.0, then max_features=n_features.\n", |
| 251 | +    "        \n",
| 252 | +    "        By default 1.0\n",
| 253 | + "\n", |
| 254 | + " Returns\n", |
| 255 | + " -------\n", |
| 256 | + " array or float\n", |
| 257 | + " predicted data from random forest regressor using data_in passed by user\n", |
| 258 | + " \"\"\"\n", |
| 259 | +    "    # recent scikit-learn versions require these as keyword arguments\n",
| 260 | +    "    regressor = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,\n",
| 261 | +    "                                      min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,\n",
| 262 | +    "                                      max_features=max_features)\n",
| 260 | + " regressor.fit(x_data, y_data)\n", |
| 261 | + " y_predict = regressor.predict(data_in)\n", |
| 262 | + " return y_predict" |
| 263 | + ] |
| 264 | + }, |
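| 265 | +  {
| 266 | +   "cell_type": "markdown",
| 267 | +   "metadata": {},
| 268 | +   "source": [
| 269 | +    "A minimal usage sketch for `run_random_forest` (illustrative synthetic data; predictions vary between runs since no random seed is fixed in the wrapper):"
| 270 | +   ]
| 271 | +  },
| 272 | +  {
| 273 | +   "cell_type": "code",
| 274 | +   "execution_count": null,
| 275 | +   "metadata": {},
| 276 | +   "outputs": [],
| 277 | +   "source": [
| 278 | +    "# Illustrative only: random forest regression on synthetic data\n",
| 279 | +    "x_train = np.arange(30, dtype=float).reshape(-1, 1)\n",
| 280 | +    "y_train = x_train.ravel() ** 0.5\n",
| 281 | +    "print(run_random_forest(np.array([[5.0], [25.0]]), x_train, y_train, n_estimators=50))"
| 282 | +   ]
| 283 | +  },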
| 265 | + { |
| 266 | + "cell_type": "markdown", |
| 267 | + "metadata": {}, |
| 268 | + "source": [ |
| 269 | + "XGBoost" |
| 270 | + ] |
| 271 | + }, |
| 272 | + { |
| 273 | + "cell_type": "code", |
| 274 | + "execution_count": 4, |
| 275 | + "metadata": {}, |
| 276 | + "outputs": [], |
| 277 | + "source": [ |
| 278 | + "import xgboost as xgb\n", |
| 279 | + "def run_xgboost(data_in, x_data, y_data, n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma):\n", |
| 280 | + " \"\"\"\n", |
| 281 | + " Run xgboost regression fitted with x_data and y_data, and predict using data_in\n", |
| 282 | + "\n", |
| 283 | + " Parameters\n", |
| 284 | + " ----------\n", |
| 285 | + " data_in : array or float\n", |
| 286 | + " data to be predicted from regression\n", |
| 287 | + " x_data : array\n", |
| 288 | + " x values of data for regression\n", |
| 289 | + " y_data : array\n", |
| 290 | + " y values of data for regression\n", |
| 291 | + " n_estimators : int\n", |
| 292 | + " Number of gradient boosted trees. Equivalent to number of boosting rounds.\n", |
| 293 | + " max_depth : int\n", |
| 294 | + " maximum tree depth\n", |
| 295 | + " max_leaves : int\n", |
| 296 | + " Maximum number of leaves; 0 indicates no limit.\n", |
| 297 | + " max_bin : int\n", |
| 298 | + " If using histogram-based algorithm, maximum number of bins per feature\n", |
| 299 | + " grow_policy : 0 or 1\n", |
| 300 | + " Tree growing policy. \n", |
| 301 | +    "        0: favor splitting at nodes closest to the root, i.e. grow depth-wise. \n",
| 302 | + " 1: favor splitting at nodes with highest loss change.\n", |
| 303 | + " learning_rate : float\n", |
| 304 | + " boosting learning rate\n", |
| 305 | + " verbosity : int\n", |
| 306 | + " The degree of verbosity. Valid values are 0 (silent) - 3 (debug).\n", |
| 307 | + " gamma : float\n", |
| 308 | + " Minimum loss reduction required to make a further partition on a leaf node of the tree.\n", |
| 309 | + "\n", |
| 310 | + " Returns\n", |
| 311 | + " -------\n", |
| 312 | + " array or float\n", |
| 313 | + " predicted values from data_in after regression\n", |
| 314 | + " \"\"\"\n", |
| 315 | +    "    # XGBRegressor takes its parameters as keyword arguments\n",
| 316 | +    "    regressor = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, max_leaves=max_leaves,\n",
| 317 | +    "                                 max_bin=max_bin, grow_policy=grow_policy, learning_rate=learning_rate,\n",
| 318 | +    "                                 verbosity=verbosity, gamma=gamma)\n",
| 316 | + " regressor.fit(x_data, y_data)\n", |
| 317 | + " pred = regressor.predict(data_in)\n", |
| 319 | +    "    return pred\n",
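| 320 | +    "\n",
| 321 | +    "# Minimal usage sketch (illustrative values; assumes the xgboost package is installed)\n",
| 322 | +    "x_demo = np.arange(20, dtype=float).reshape(-1, 1)\n",
| 323 | +    "y_demo = 3.0 * x_demo.ravel() + 1.0\n",
| 324 | +    "print(run_xgboost(x_demo[:3], x_demo, y_demo, n_estimators=50, max_depth=3,\n",
| 325 | +    "                  max_leaves=0, max_bin=256, grow_policy=0, learning_rate=0.1,\n",
| 326 | +    "                  verbosity=0, gamma=0.0))"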
| 319 | + ] |
| 320 | + } |
| 321 | + ], |
| 322 | + "metadata": { |
| 323 | + "interpreter": { |
| 324 | + "hash": "7e1998ff7f8aa20ada591c520b972326324e5ea05489af9e422744c7c09f6dad" |
| 325 | + }, |
| 326 | + "kernelspec": { |
| 327 | + "display_name": "Python 3.10.1 64-bit", |
| 328 | + "language": "python", |
| 329 | + "name": "python3" |
| 330 | + }, |
| 331 | + "language_info": { |
| 332 | + "codemirror_mode": { |
| 333 | + "name": "ipython", |
| 334 | + "version": 3 |
| 335 | + }, |
| 336 | + "file_extension": ".py", |
| 337 | + "mimetype": "text/x-python", |
| 338 | + "name": "python", |
| 339 | + "nbconvert_exporter": "python", |
| 340 | + "pygments_lexer": "ipython3", |
| 341 | + "version": "3.10.1" |
| 342 | + }, |
| 343 | + "orig_nbformat": 4 |
| 344 | + }, |
| 345 | + "nbformat": 4, |
| 346 | + "nbformat_minor": 2 |
| 347 | +} |