|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 4, |
| 6 | + "metadata": {}, |
| 7 | + "outputs": [], |
| 8 | + "source": [ |
| 10 | + "import pandas as pd\n", |
| 11 | + "import numpy as np" |
| 12 | + ] |
| 13 | + }, |
| 14 | + { |
| 15 | + "cell_type": "markdown", |
| 16 | + "metadata": {}, |
| 17 | + "source": [ |
| 18 | + "Linear Regression" |
| 19 | + ] |
| 20 | + }, |
| 21 | + { |
| 22 | + "cell_type": "code", |
| 23 | + "execution_count": 5, |
| 24 | + "metadata": {}, |
| 25 | + "outputs": [], |
| 26 | + "source": [ |
| 27 | +    "class LinearRegression:\n",
| 28 | + " \"\"\"\n", |
| 29 | +    "    Simple linear regression on a two-column dataset, where the first column holds x-values and the second holds y-values.\n",
| 30 | +    "    Call get_alpha_beta() for the fitted coefficients and predict_y() for the fitted values.\n",
| 31 | + " \n", |
| 32 | + " Parameters\n", |
| 33 | + " ----------\n", |
| 34 | +    "    data : pandas.DataFrame\n",
| 35 | +    "        a dataframe containing two columns, the first being x-values, the second being y-values\n",
| 36 | + " \"\"\"\n", |
| 37 | + " \n", |
| 38 | + " def __init__(self, data) -> None:\n", |
| 39 | + " self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]})\n", |
| 40 | + " self.beta = None\n", |
| 41 | + " self.alpha = None\n", |
| 42 | + " \n", |
| 43 | + " def get_alpha_beta(self):\n", |
| 44 | + " \"\"\"\n", |
| 45 | +    "        Compute the least-squares estimates of beta (slope) and alpha (intercept) for the data\n",
| 46 | + " \n", |
| 47 | + " Returns\n", |
| 48 | + " -------\n", |
| 49 | +    "        a tuple of paired values (beta, alpha), with beta first and alpha second\n",
| 50 | +    "        \"\"\"\n",
| 50 | + " x_mean = np.mean(self.df['x'])\n", |
| 51 | + " y_mean = np.mean(self.df['y'])\n", |
| 52 | +    "        self.df['xy_cov'] = (self.df['x'] - x_mean) * (self.df['y'] - y_mean)\n",
| 53 | + " self.df['x_var'] = (self.df['x'] - x_mean)**2\n", |
| 54 | + " beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()\n", |
| 55 | + " alpha = y_mean - (beta * x_mean)\n", |
| 56 | + " self.beta, self.alpha = beta, alpha\n", |
| 57 | + " \n", |
| 58 | + " return beta, alpha\n", |
| 59 | + "\n", |
| 60 | + " def predict_y(self):\n", |
| 61 | + " \"\"\"\n", |
| 62 | +    "        Fit the model, store the predictions in the dataframe, and return them\n",
| 63 | + " \n", |
| 64 | + " Returns\n", |
| 65 | + " -------\n", |
| 66 | +    "        pandas.Series of predicted y-values\n",
| 67 | + " \"\"\"\n", |
| 68 | + " self.get_alpha_beta()\n", |
| 69 | +    "        self.df['y_pred'] = self.alpha + self.beta * self.df['x']\n",
| 70 | + " return self.df['y_pred']" |
| 71 | + ] |
| 72 | + }, |
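| 73 | +  {
| 74 | +   "cell_type": "markdown",
| 75 | +   "metadata": {},
| 76 | +   "source": [
| 77 | +    "A minimal usage sketch of the `LinearRegression` class above, on a small synthetic dataset (the values are illustrative only):"
| 78 | +   ]
| 79 | +  },
| 80 | +  {
| 81 | +   "cell_type": "code",
| 82 | +   "execution_count": null,
| 83 | +   "metadata": {},
| 84 | +   "outputs": [],
| 85 | +   "source": [
| 86 | +    "# Illustrative only: fit the hand-rolled LinearRegression on synthetic data\n",
| 87 | +    "demo_df = pd.DataFrame({'x': [1, 2, 3, 4, 5], 'y': [2.1, 3.9, 6.2, 8.1, 9.8]})\n",
| 88 | +    "model = LinearRegression(demo_df)\n",
| 89 | +    "beta, alpha = model.get_alpha_beta()\n",
| 90 | +    "print(f'beta (slope): {beta:.3f}, alpha (intercept): {alpha:.3f}')\n",
| 91 | +    "model.predict_y()"
| 92 | +   ]
| 93 | +  },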
| 73 | + { |
| 74 | + "cell_type": "markdown", |
| 75 | + "metadata": {}, |
| 76 | + "source": [ |
| 77 | +    "Support Vector Regression from scikit-learn"
| 78 | + ] |
| 79 | + }, |
| 80 | + { |
| 81 | + "cell_type": "code", |
| 82 | + "execution_count": null, |
| 83 | + "metadata": {}, |
| 84 | + "outputs": [], |
| 85 | + "source": [ |
| 86 | + "from sklearn.svm import SVR\n", |
| 87 | + "def run_svr(data_in, x_data, y_data, kernel='rbf', degree=3, gamma='scale', tol=1e-3, c=1.0, epsilon=0.1, cache_size=200, verbose=False):\n", |
| 88 | + " \"\"\"\n", |
| 89 | +    "    Run support vector regression (SVR) from scikit-learn\n",
| 90 | + "\n", |
| 91 | + " Parameters\n", |
| 92 | + " ----------\n", |
| 93 | + " data_in : array or float\n", |
| 94 | + " data to be analyzed and predicted based on model\n", |
| 95 | + " x_data : array\n", |
| 96 | + " x values of data\n", |
| 97 | + " y_data : array\n", |
| 98 | + " y values of data\n", |
| 99 | + " kernel : {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} , optional\n", |
| 100 | + " Specifies the kernel type to be used in the algorithm. If none is given, ‘rbf’ will be used. \n", |
| 101 | + " If a callable is given it is used to precompute the kernel matrix., by default 'rbf'\n", |
| 102 | + " degree : int, optional\n", |
| 103 | + " Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels., by default 3\n", |
| 104 | + " gamma : {‘scale’, ‘auto’} or float, optional\n", |
| 105 | + " Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’., by default 'scale'\n", |
| 106 | + " tol : float, optional\n", |
| 107 | + " tolerance for stopping criterion, by default 1e-3\n", |
| 108 | + " c : float, optional\n", |
| 109 | + " Regularization parameter. The strength of the regularization is inversely proportional to C. \n", |
| 110 | + " Must be strictly positive. The penalty is a squared l2 penalty., by default 1.0\n", |
| 111 | + " epsilon : float, optional\n", |
| 112 | + " Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in \n", |
| 113 | + " the training loss function with points predicted within a distance epsilon from the actual value., by default 0.1\n", |
| 114 | + " cache_size : int, optional\n", |
| 115 | + " Specify the size of the kernel cache (in MB)., by default 200\n", |
| 116 | + " verbose : bool, optional\n", |
| 117 | + " Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm \n", |
| 118 | + " that, if enabled, may not work properly in a multithreaded context., by default False\n", |
| 119 | + "\n", |
| 120 | + " Returns\n", |
| 121 | + " -------\n", |
| 122 | + " array or float\n", |
| 123 | + " predicted values from data_in\n", |
| 124 | + " \"\"\"\n", |
| 125 | +    "    # scikit-learn estimators take these as keyword arguments; note C is capitalized in SVR\n",
| 126 | +    "    svr = SVR(kernel=kernel, degree=degree, gamma=gamma, tol=tol, C=c,\n",
| 127 | +    "              epsilon=epsilon, cache_size=cache_size, verbose=verbose)\n",
| 128 | +    "    svr.fit(x_data, y_data)  # x_data must be 2-D: (n_samples, n_features)\n",
| 127 | + " y_pred = svr.predict(data_in)\n", |
| 129 | +    "    return y_pred"
| 130 | + ] |
| 131 | + }, |
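| 132 | +  {
| 133 | +   "cell_type": "markdown",
| 134 | +   "metadata": {},
| 135 | +   "source": [
| 136 | +    "A minimal usage sketch for `run_svr` (illustrative synthetic values; scikit-learn expects 2-D feature arrays, hence the reshape):"
| 137 | +   ]
| 138 | +  },
| 139 | +  {
| 140 | +   "cell_type": "code",
| 141 | +   "execution_count": null,
| 142 | +   "metadata": {},
| 143 | +   "outputs": [],
| 144 | +   "source": [
| 145 | +    "# Illustrative only: SVR on synthetic data; feature arrays must be 2-D\n",
| 146 | +    "x_train = np.arange(10, dtype=float).reshape(-1, 1)\n",
| 147 | +    "y_train = 2.0 * x_train.ravel() + 1.0\n",
| 148 | +    "print(run_svr(np.array([[2.5], [7.5]]), x_train, y_train))"
| 149 | +   ]
| 150 | +  },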
| 132 | + { |
| 133 | + "cell_type": "markdown", |
| 134 | + "metadata": {}, |
| 135 | + "source": [ |
| 136 | + "Decision Tree" |
| 137 | + ] |
| 138 | + }, |
| 139 | + { |
| 140 | + "cell_type": "code", |
| 141 | + "execution_count": null, |
| 142 | + "metadata": {}, |
| 143 | + "outputs": [], |
| 144 | + "source": [ |
| 145 | + "from sklearn.tree import DecisionTreeRegressor\n", |
| 146 | + "def run_decision_tree(data_in, x_data, y_data, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1):\n", |
| 147 | + " \"\"\"\n", |
| 148 | +    "    Run decision tree regression from scikit-learn\n",
| 149 | + "\n", |
| 150 | + " Parameters\n", |
| 151 | + " ----------\n", |
| 152 | + " data_in : array or float\n", |
| 153 | + " data to be predicted from fitted model\n", |
| 154 | + " x_data : array\n", |
| 155 | + " x values for the regression\n", |
| 156 | + " y_data : array\n", |
| 157 | + " y values for the regression\n", |
| 158 | + " criterion : {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, optional\n", |
| 159 | + " The function to measure the quality of a split. \n", |
| 160 | + " Supported criteria are “squared_error” for the mean squared error, which is equal to variance reduction as \n", |
| 161 | + " feature selection criterion and minimizes the L2 loss using the mean of each terminal node, “friedman_mse”, \n", |
| 162 | + " which uses mean squared error with Friedman’s improvement score for potential splits, “absolute_error” for \n", |
| 163 | + " the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and “poisson” \n", |
| 164 | + " which uses reduction in Poisson deviance to find splits., by default 'squared_error'\n", |
| 165 | + " \n", |
| 166 | + " splitter : {“best”, “random”}, optional\n", |
| 167 | + " The strategy used to choose the split at each node. \n", |
| 168 | + " Supported strategies are “best” to choose the best split and “random” to choose the best random split., by default 'best'\n", |
| 169 | + " \n", |
| 170 | + " max_depth : int, optional\n", |
| 171 | + " The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", |
| 172 | + " \n", |
| 173 | + " min_samples_split : int or float, optional\n", |
| 174 | + " The minimum number of samples required to split an internal node:\n", |
| 175 | + "\n", |
| 176 | + " If int, then consider min_samples_split as the minimum number.\n", |
| 177 | + " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", |
| 178 | + " \n", |
| 179 | + " min_samples_leaf : int or float, optional\n", |
| 180 | + " The minimum number of samples required to be at a leaf node. \n", |
| 181 | + " A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples \n", |
| 182 | + " in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1\n", |
| 183 | + "\n", |
| 184 | + " Returns\n", |
| 185 | + " -------\n", |
| 186 | + " array or float\n", |
| 187 | + " predicted values from data_in\n", |
| 188 | + " \"\"\"\n", |
| 189 | +    "    # recent scikit-learn versions require these as keyword arguments\n",
| 190 | +    "    regressor = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=max_depth,\n",
| 191 | +    "                                      min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)\n",
| 190 | + " regressor.fit(x_data, y_data)\n", |
| 191 | + " y_predict = regressor.predict(data_in)\n", |
| 192 | + " return y_predict" |
| 193 | + ] |
| 194 | + }, |
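| 195 | +  {
| 196 | +   "cell_type": "markdown",
| 197 | +   "metadata": {},
| 198 | +   "source": [
| 199 | +    "A minimal usage sketch for `run_decision_tree` on synthetic data (values are illustrative only):"
| 200 | +   ]
| 201 | +  },
| 202 | +  {
| 203 | +   "cell_type": "code",
| 204 | +   "execution_count": null,
| 205 | +   "metadata": {},
| 206 | +   "outputs": [],
| 207 | +   "source": [
| 208 | +    "# Illustrative only: decision tree regression on synthetic data\n",
| 209 | +    "x_train = np.arange(20, dtype=float).reshape(-1, 1)\n",
| 210 | +    "y_train = np.sin(x_train).ravel()\n",
| 211 | +    "print(run_decision_tree(np.array([[3.0], [12.0]]), x_train, y_train, max_depth=4))"
| 212 | +   ]
| 213 | +  },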
| 195 | + { |
| 196 | + "cell_type": "markdown", |
| 197 | + "metadata": {}, |
| 198 | + "source": [ |
| 199 | + "Random Forest" |
| 200 | + ] |
| 201 | + }, |
| 202 | + { |
| 203 | + "cell_type": "code", |
| 204 | + "execution_count": null, |
| 205 | + "metadata": {}, |
| 206 | + "outputs": [], |
| 207 | + "source": [ |
| 208 | + "from sklearn.ensemble import RandomForestRegressor\n", |
| 209 | +    "def run_random_forest(data_in, x_data, y_data, n_estimators=100, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=1.0):\n",
| 210 | + " \"\"\"\n", |
| 211 | +    "    Run random forest regression fitted on x_data and y_data, and predict for data_in\n",
| 212 | + "\n", |
| 213 | + " Parameters\n", |
| 214 | + " ----------\n", |
| 215 | + " data_in : array or float\n", |
| 216 | + " data to be predicted from the learned models\n", |
| 217 | + " x_data : array\n", |
| 218 | + " array of x values of data to be fitted\n", |
| 219 | + " y_data : array\n", |
| 220 | + " array of y values of data to be fitted\n", |
| 221 | + " n_estimators : int, optional\n", |
| 222 | + " number of trees in the forest, by default 100\n", |
| 223 | + " criterion : {“squared_error”, “absolute_error”, “poisson”}, optional\n", |
| 224 | + " The function to measure the quality of a split. Supported criteria are “squared_error” for the mean squared error, \n", |
| 225 | + " which is equal to variance reduction as feature selection criterion, “absolute_error” for the mean absolute error, \n", |
| 226 | + " and “poisson” which uses reduction in Poisson deviance to find splits. \n", |
| 227 | + "\n", |
| 228 | +    "        Training using “absolute_error” is significantly slower than when using “squared_error”., by default 'squared_error'\n",
| 229 | + " \n", |
| 230 | + " max_depth : int, optional\n", |
| 231 | + " The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples., by default None\n", |
| 232 | + " \n", |
| 233 | + " min_samples_split : int or float, optional\n", |
| 234 | + " The minimum number of samples required to split an internal node:\n", |
| 235 | + "\n", |
| 236 | + " If int, then consider min_samples_split as the minimum number.\n", |
| 237 | + " If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split., by default 2\n", |
| 238 | + " \n", |
| 239 | + " min_samples_leaf : int or float, optional\n", |
| 240 | + " The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression., by default 1\n", |
| 241 | + " \n", |
| 242 | + " max_features : {“sqrt”, “log2”, None} int or float, optional\n", |
| 243 | + " The number of features to consider when looking for the best split:\n", |
| 244 | + "\n", |
| 245 | + " If int, then consider max_features features at each split.\n", |
| 246 | + " If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.\n", |
| 247 | +    "        If “auto”, then max_features=n_features (deprecated and removed in recent scikit-learn; equivalent to 1.0).\n",
| 248 | + " If “sqrt”, then max_features=sqrt(n_features).\n", |
| 249 | + " If “log2”, then max_features=log2(n_features).\n", |
| 250 | + " If None or 1.0, then max_features=n_features.\n", |
| 251 | +    "        \n",
| 252 | +    "        By default 1.0\n",
| 253 | + "\n", |
| 254 | + " Returns\n", |
| 255 | + " -------\n", |
| 256 | + " array or float\n", |
| 257 | + " predicted data from random forest regressor using data_in passed by user\n", |
| 258 | + " \"\"\"\n", |
| 259 | +    "    # recent scikit-learn versions require these as keyword arguments\n",
| 260 | +    "    regressor = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,\n",
| 261 | +    "                                      min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,\n",
| 262 | +    "                                      max_features=max_features)\n",
| 260 | + " regressor.fit(x_data, y_data)\n", |
| 261 | + " y_predict = regressor.predict(data_in)\n", |
| 262 | + " return y_predict" |
| 263 | + ] |
| 264 | + }, |
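| 265 | +  {
| 266 | +   "cell_type": "markdown",
| 267 | +   "metadata": {},
| 268 | +   "source": [
| 269 | +    "A minimal usage sketch for `run_random_forest` (illustrative synthetic data; predictions vary between runs since no random seed is fixed in the wrapper):"
| 270 | +   ]
| 271 | +  },
| 272 | +  {
| 273 | +   "cell_type": "code",
| 274 | +   "execution_count": null,
| 275 | +   "metadata": {},
| 276 | +   "outputs": [],
| 277 | +   "source": [
| 278 | +    "# Illustrative only: random forest regression on synthetic data\n",
| 279 | +    "x_train = np.arange(30, dtype=float).reshape(-1, 1)\n",
| 280 | +    "y_train = x_train.ravel() ** 0.5\n",
| 281 | +    "print(run_random_forest(np.array([[5.0], [25.0]]), x_train, y_train, n_estimators=50))"
| 282 | +   ]
| 283 | +  },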
| 265 | + { |
| 266 | + "cell_type": "markdown", |
| 267 | + "metadata": {}, |
| 268 | + "source": [ |
| 269 | + "XGBoost" |
| 270 | + ] |
| 271 | + }, |
| 272 | + { |
| 273 | + "cell_type": "code", |
| 274 | + "execution_count": 4, |
| 275 | + "metadata": {}, |
| 276 | + "outputs": [], |
| 277 | + "source": [ |
| 278 | + "import xgboost as xgb\n", |
| 279 | + "def run_xgboost(data_in, x_data, y_data, n_estimators, max_depth, max_leaves, max_bin, grow_policy, learning_rate, verbosity, gamma):\n", |
| 280 | + " \"\"\"\n", |
| 281 | + " Run xgboost regression fitted with x_data and y_data, and predict using data_in\n", |
| 282 | + "\n", |
| 283 | + " Parameters\n", |
| 284 | + " ----------\n", |
| 285 | + " data_in : array or float\n", |
| 286 | + " data to be predicted from regression\n", |
| 287 | + " x_data : array\n", |
| 288 | + " x values of data for regression\n", |
| 289 | + " y_data : array\n", |
| 290 | + " y values of data for regression\n", |
| 291 | + " n_estimators : int\n", |
| 292 | + " Number of gradient boosted trees. Equivalent to number of boosting rounds.\n", |
| 293 | + " max_depth : int\n", |
| 294 | + " maximum tree depth\n", |
| 295 | + " max_leaves : int\n", |
| 296 | + " Maximum number of leaves; 0 indicates no limit.\n", |
| 297 | + " max_bin : int\n", |
| 298 | + " If using histogram-based algorithm, maximum number of bins per feature\n", |
| 299 | + " grow_policy : 0 or 1\n", |
| 300 | + " Tree growing policy. \n", |
| 301 | +    "        0: favor splitting at nodes closest to the root, i.e. grow depth-wise. \n",
| 302 | + " 1: favor splitting at nodes with highest loss change.\n", |
| 303 | + " learning_rate : float\n", |
| 304 | + " boosting learning rate\n", |
| 305 | + " verbosity : int\n", |
| 306 | + " The degree of verbosity. Valid values are 0 (silent) - 3 (debug).\n", |
| 307 | + " gamma : float\n", |
| 308 | + " Minimum loss reduction required to make a further partition on a leaf node of the tree.\n", |
| 309 | + "\n", |
| 310 | + " Returns\n", |
| 311 | + " -------\n", |
| 312 | + " array or float\n", |
| 313 | + " predicted values from data_in after regression\n", |
| 314 | + " \"\"\"\n", |
| 315 | +    "    # XGBRegressor takes its parameters as keyword arguments\n",
| 316 | +    "    regressor = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, max_leaves=max_leaves,\n",
| 317 | +    "                                 max_bin=max_bin, grow_policy=grow_policy, learning_rate=learning_rate,\n",
| 318 | +    "                                 verbosity=verbosity, gamma=gamma)\n",
| 316 | + " regressor.fit(x_data, y_data)\n", |
| 317 | + " pred = regressor.predict(data_in)\n", |
| 319 | +    "    return pred\n",
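| 320 | +    "\n",
| 321 | +    "# Minimal usage sketch (illustrative values; assumes the xgboost package is installed)\n",
| 322 | +    "x_demo = np.arange(20, dtype=float).reshape(-1, 1)\n",
| 323 | +    "y_demo = 3.0 * x_demo.ravel() + 1.0\n",
| 324 | +    "print(run_xgboost(x_demo[:3], x_demo, y_demo, n_estimators=50, max_depth=3,\n",
| 325 | +    "                  max_leaves=0, max_bin=256, grow_policy=0, learning_rate=0.1,\n",
| 326 | +    "                  verbosity=0, gamma=0.0))"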
| 319 | + ] |
| 320 | + } |
| 321 | + ], |
| 322 | + "metadata": { |
| 323 | + "interpreter": { |
| 324 | + "hash": "7e1998ff7f8aa20ada591c520b972326324e5ea05489af9e422744c7c09f6dad" |
| 325 | + }, |
| 326 | + "kernelspec": { |
| 327 | + "display_name": "Python 3.10.1 64-bit", |
| 328 | + "language": "python", |
| 329 | + "name": "python3" |
| 330 | + }, |
| 331 | + "language_info": { |
| 332 | + "codemirror_mode": { |
| 333 | + "name": "ipython", |
| 334 | + "version": 3 |
| 335 | + }, |
| 336 | + "file_extension": ".py", |
| 337 | + "mimetype": "text/x-python", |
| 338 | + "name": "python", |
| 339 | + "nbconvert_exporter": "python", |
| 340 | + "pygments_lexer": "ipython3", |
| 341 | + "version": "3.10.1" |
| 342 | + }, |
| 343 | + "orig_nbformat": 4 |
| 344 | + }, |
| 345 | + "nbformat": 4, |
| 346 | + "nbformat_minor": 2 |
| 347 | +} |