This repository was archived by the owner on Apr 3, 2025. It is now read-only.

Commit 2f8abfc

Tony_Tian_1122 authored and committed
Finish jupyter notebooks
1 parent d59e0c2 commit 2f8abfc

File tree: 2 files changed, +534 -0 lines changed


ml_regression.ipynb

Lines changed: 347 additions & 0 deletions
@@ -0,0 +1,347 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Linear Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class LinearRegression():\n",
    "    \"\"\"\n",
    "    Regression class that takes in a DataFrame with two columns, which are respectively x and y.\n",
    "    Users can call the respective methods to get regression analysis outputs.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        a pandas DataFrame containing two columns, the first being x-values, the second\n",
    "        being y-values\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, data) -> None:\n",
    "        self.df = pd.DataFrame({'x': data.iloc[:, 0], 'y': data.iloc[:, 1]})\n",
    "        self.beta = None\n",
    "        self.alpha = None\n",
    "\n",
    "    def get_alpha_beta(self):\n",
    "        \"\"\"\n",
    "        Compute alpha and beta of the data in the DataFrame.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        a tuple (paired values) of beta and alpha, with beta first, alpha second\n",
    "        \"\"\"\n",
    "        x_mean = np.mean(self.df['x'])\n",
    "        y_mean = np.mean(self.df['y'])\n",
    "        self.df['xy_cov'] = (self.df['x'] - x_mean) * (self.df['y'] - y_mean)\n",
    "        self.df['x_var'] = (self.df['x'] - x_mean)**2\n",
    "        beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()\n",
    "        alpha = y_mean - (beta * x_mean)\n",
    "        self.beta, self.alpha = beta, alpha\n",
    "\n",
    "        return beta, alpha\n",
    "\n",
    "    def predict_y(self):\n",
    "        \"\"\"\n",
    "        Obtain regression results, store them in the data frame, and return them as an output.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        A DataFrame column of predicted y-values\n",
    "        \"\"\"\n",
    "        self.get_alpha_beta()\n",
    "        self.df['y_pred'] = self.alpha + self.beta * self.df['x']\n",
    "        return self.df['y_pred']"
   ]
  },
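  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal usage sketch for the LinearRegression class above, using a small hypothetical two-column DataFrame; the values are illustrative only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical toy data: any two-column DataFrame (x first, y second) works\n",
    "toy = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0], 'y': [2.1, 3.9, 6.2, 8.1]})\n",
    "model = LinearRegression(toy)\n",
    "beta, alpha = model.get_alpha_beta()  # slope and intercept of the fitted line\n",
    "model.predict_y()  # predicted y for every x in the DataFrame"
   ]
  },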
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Support Vector Regression from Sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import SVR\n",
    "def run_svr(data_in, x_data, y_data, kernel='rbf', degree=3, gamma='scale', tol=1e-3, c=1.0, epsilon=0.1, cache_size=200, verbose=False):\n",
    "    \"\"\"\n",
    "    Run support vector regression using the SVR class from scikit-learn.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_in : array or float\n",
    "        data to be predicted from the fitted model\n",
    "    x_data : array\n",
    "        x values of the data\n",
    "    y_data : array\n",
    "        y values of the data\n",
    "    kernel : {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}, optional\n",
    "        Specifies the kernel type to be used in the algorithm. If none is given, ‘rbf’ will be used.\n",
    "        If a callable is given it is used to precompute the kernel matrix., by default 'rbf'\n",
    "    degree : int, optional\n",
    "        Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels., by default 3\n",
    "    gamma : {‘scale’, ‘auto’} or float, optional\n",
    "        Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’., by default 'scale'\n",
    "    tol : float, optional\n",
    "        tolerance for the stopping criterion, by default 1e-3\n",
    "    c : float, optional\n",
    "        Regularization parameter. The strength of the regularization is inversely proportional to C.\n",
    "        Must be strictly positive. The penalty is a squared l2 penalty., by default 1.0\n",
    "    epsilon : float, optional\n",
    "        Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in\n",
    "        the training loss function with points predicted within a distance epsilon from the actual value., by default 0.1\n",
    "    cache_size : int, optional\n",
    "        Specify the size of the kernel cache (in MB)., by default 200\n",
    "    verbose : bool, optional\n",
    "        Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm\n",
    "        that, if enabled, may not work properly in a multithreaded context., by default False\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    array or float\n",
    "        predicted values from data_in\n",
    "    \"\"\"\n",
    "    # Pass the arguments by keyword so they map onto the correct SVR parameters\n",
    "    svr = SVR(kernel=kernel, degree=degree, gamma=gamma, tol=tol, C=c, epsilon=epsilon,\n",
    "              cache_size=cache_size, verbose=verbose)\n",
    "    svr.fit(x_data, y_data)\n",
    "    y_pred = svr.predict(data_in)\n",
    "    return y_pred"
   ]
  },
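  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of how run_svr might be called, assuming x_data is a 2-D array of shape (n_samples, n_features) and y_data is a 1-D array, as scikit-learn estimators expect; the numbers are purely illustrative."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical example inputs; SVR.fit expects a 2-D X and a 1-D y\n",
    "x_train = np.arange(10, dtype=float).reshape(-1, 1)\n",
    "y_train = 2.0 * x_train.ravel() + 1.0\n",
    "x_new = np.array([[2.5], [7.5]])\n",
    "run_svr(x_new, x_train, y_train, kernel='rbf', c=10.0)"
   ]
  },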
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Decision Tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeRegressor\n",
    "def run_decision_tree(data_in, x_data, y_data, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1):\n",
    "    \"\"\"\n",
    "    Run regression with a decision tree from scikit-learn.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_in : array or float\n",
    "        data to be predicted from the fitted model\n",
    "    x_data : array\n",
    "        x values for the regression\n",
    "    y_data : array\n",
    "        y values for the regression\n",
    "    criterion : {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, optional\n",
    "        The function to measure the quality of a split.\n",
    "        Supported criteria are “squared_error” for the mean squared error, which is equal to variance reduction as\n",
    "        feature selection criterion and minimizes the L2 loss using the mean of each terminal node, “friedman_mse”,\n",
    "        which uses mean squared error with Friedman’s improvement score for potential splits, “absolute_error” for\n",
    "        the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and “poisson”\n",
    "        which uses reduction in Poisson deviance to find splits., by default 'squared_error'\n",
    "    splitter : {“best”, “random”}, optional\n",
    "        The strategy used to choose the split at each node.\n",
    "        Supported strategies are “best” to choose the best split and “random” to choose the best random split., by default 'best'\n",
    "    max_depth : int, optional\n",
    "        The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n",
    "    min_samples_split : int or float, optional\n",
    "        The minimum number of samples required to split an internal node:\n",
    "        If int, then consider min_samples_split as the minimum number.\n",
    "        If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n",
    "    min_samples_leaf : int or float, optional\n",
    "        The minimum number of samples required to be at a leaf node.\n",
    "        A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples\n",
    "        in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    array or float\n",
    "        predicted values from data_in\n",
    "    \"\"\"\n",
    "    # These parameters are keyword-only in recent scikit-learn releases\n",
    "    regressor = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=max_depth,\n",
    "                                      min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)\n",
    "    regressor.fit(x_data, y_data)\n",
    "    y_predict = regressor.predict(data_in)\n",
    "    return y_predict"
   ]
  },
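  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of calling run_decision_tree on the same kind of 2-D X / 1-D y arrays; the data is illustrative only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical example inputs for the decision-tree wrapper\n",
    "x_train = np.arange(20, dtype=float).reshape(-1, 1)\n",
    "y_train = np.sin(x_train).ravel()\n",
    "x_new = np.array([[1.5], [12.0]])\n",
    "run_decision_tree(x_new, x_train, y_train, max_depth=3)"
   ]
  },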
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "def run_random_forest(data_in, x_data, y_data, n_estimators=100, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=1.0):\n",
    "    \"\"\"\n",
    "    Run random forest regression fitted on x_data and y_data, and predict for data_in.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_in : array or float\n",
    "        data to be predicted from the fitted model\n",
    "    x_data : array\n",
    "        array of x values of the data to be fitted\n",
    "    y_data : array\n",
    "        array of y values of the data to be fitted\n",
    "    n_estimators : int, optional\n",
    "        number of trees in the forest, by default 100\n",
    "    criterion : {“squared_error”, “absolute_error”, “poisson”}, optional\n",
    "        The function to measure the quality of a split. Supported criteria are “squared_error” for the mean squared error,\n",
    "        which is equal to variance reduction as feature selection criterion, “absolute_error” for the mean absolute error,\n",
    "        and “poisson” which uses reduction in Poisson deviance to find splits.\n",
    "        Training using “absolute_error” is significantly slower than when using “squared_error”., by default 'squared_error'\n",
    "    max_depth : int, optional\n",
    "        The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n",
    "    min_samples_split : int or float, optional\n",
    "        The minimum number of samples required to split an internal node:\n",
    "        If int, then consider min_samples_split as the minimum number.\n",
    "        If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n",
    "    min_samples_leaf : int or float, optional\n",
    "        The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1\n",
    "    max_features : {“sqrt”, “log2”, None}, int or float, optional\n",
    "        The number of features to consider when looking for the best split:\n",
    "        If int, then consider max_features features at each split.\n",
    "        If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.\n",
    "        If “sqrt”, then max_features=sqrt(n_features).\n",
    "        If “log2”, then max_features=log2(n_features).\n",
    "        If None or 1.0, then max_features=n_features., by default 1.0\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    array or float\n",
    "        predicted data from the random forest regressor using the data_in passed by the user\n",
    "    \"\"\"\n",
    "    # Only n_estimators may be passed positionally; the rest are keyword-only in recent scikit-learn releases\n",
    "    regressor = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,\n",
    "                                      min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,\n",
    "                                      max_features=max_features)\n",
    "    regressor.fit(x_data, y_data)\n",
    "    y_predict = regressor.predict(data_in)\n",
    "    return y_predict"
   ]
  },
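  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of calling run_random_forest; the feature matrix here has two columns to show that multi-feature input also works, and all values are illustrative."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical example inputs for the random-forest wrapper\n",
    "rng = np.random.default_rng(0)\n",
    "x_train = rng.random((50, 2))\n",
    "y_train = x_train[:, 0] + 2.0 * x_train[:, 1]\n",
    "x_new = np.array([[0.2, 0.8], [0.5, 0.5]])\n",
    "run_random_forest(x_new, x_train, y_train, n_estimators=50, max_depth=4)"
   ]
  },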
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "def run_xgboost(data_in, x_data, y_data, n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma):\n",
    "    \"\"\"\n",
    "    Run XGBoost regression fitted with x_data and y_data, and predict using data_in.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_in : array or float\n",
    "        data to be predicted from the regression\n",
    "    x_data : array\n",
    "        x values of the data for the regression\n",
    "    y_data : array\n",
    "        y values of the data for the regression\n",
    "    n_estimators : int\n",
    "        Number of gradient boosted trees. Equivalent to number of boosting rounds.\n",
    "    max_depth : int\n",
    "        maximum tree depth\n",
    "    max_leaves : int\n",
    "        Maximum number of leaves; 0 indicates no limit.\n",
    "    max_bin : int\n",
    "        If using a histogram-based algorithm, maximum number of bins per feature\n",
    "    grow_policy : 0 or 1\n",
    "        Tree growing policy.\n",
    "        0: favor splitting at nodes closest to the root, i.e. grow depth-wise.\n",
    "        1: favor splitting at nodes with highest loss change.\n",
    "    learning_rate : float\n",
    "        boosting learning rate\n",
    "    verbosity : int\n",
    "        The degree of verbosity. Valid values are 0 (silent) - 3 (debug).\n",
    "    gamma : float\n",
    "        Minimum loss reduction required to make a further partition on a leaf node of the tree.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    array or float\n",
    "        predicted values from data_in after the regression\n",
    "    \"\"\"\n",
    "    # The XGBRegressor scikit-learn interface only accepts keyword arguments\n",
    "    regressor = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, max_leaves=max_leaves,\n",
    "                                 max_bin=max_bin, grow_policy=grow_policy, learning_rate=learning_rate,\n",
    "                                 verbosity=verbosity, gamma=gamma)\n",
    "    regressor.fit(x_data, y_data)\n",
    "    pred = regressor.predict(data_in)\n",
    "    return pred"
   ]
  },
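  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of calling run_xgboost; because the wrapper defines no defaults, every hyperparameter is passed explicitly with commonly used illustrative values (requires the xgboost package to be installed)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical example inputs for the XGBoost wrapper; grow_policy=0 follows the 0/1 convention in the docstring\n",
    "rng = np.random.default_rng(1)\n",
    "x_train = rng.random((100, 3))\n",
    "y_train = x_train @ np.array([1.0, -2.0, 0.5])\n",
    "x_new = rng.random((5, 3))\n",
    "run_xgboost(x_new, x_train, y_train, n_estimators=50, max_depth=4, max_leaves=0,\n",
    "            max_bin=256, grow_policy=0, learning_rate=0.1, verbosity=0, gamma=0.0)"
   ]
  }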
 ],
 "metadata": {
  "interpreter": {
   "hash": "7e1998ff7f8aa20ada591c520b972326324e5ea05489af9e422744c7c09f6dad"
  },
  "kernelspec": {
   "display_name": "Python 3.10.1 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.1"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
