From e2d743b4026ffc3f9bcf860550967deab43847cd Mon Sep 17 00:00:00 2001 From: Jasper Tielmann Date: Sun, 26 Nov 2023 22:46:14 +0000 Subject: [PATCH] Lab done --- auto-mpg.csv => your-code/auto-mpg.csv | 0 your-code/main.ipynb | 881 ++++++++++++++++++++++--- 2 files changed, 784 insertions(+), 97 deletions(-) rename auto-mpg.csv => your-code/auto-mpg.csv (100%) diff --git a/auto-mpg.csv b/your-code/auto-mpg.csv similarity index 100% rename from auto-mpg.csv rename to your-code/auto-mpg.csv diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 8a9fa9e..388b417 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -12,11 +12,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "import pyforest\n", + "from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error" ] }, { @@ -37,11 +38,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "from sklearn.datasets import load_diabetes\n", + "diabetes = load_diabetes()" ] }, { @@ -53,11 +55,90 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data': array([[ 0.03807591, 0.05068012, 0.06169621, ..., -0.00259226,\n", + " 0.01990749, -0.01764613],\n", + " [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,\n", + " -0.06833155, -0.09220405],\n", + " [ 0.08529891, 0.05068012, 0.04445121, ..., -0.00259226,\n", + " 0.00286131, -0.02593034],\n", + " ...,\n", + " [ 0.04170844, 0.05068012, -0.01590626, ..., -0.01107952,\n", + " -0.04688253, 0.01549073],\n", + " [-0.04547248, -0.04464164, 0.03906215, ..., 0.02655962,\n", + " 0.04452873, -0.02593034],\n", + " [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,\n", + " -0.00422151, 0.00306441]]),\n", + " 'target': array([151., 75., 141., 206., 135., 97., 138., 63., 110., 310., 101.,\n", + " 69., 179., 185., 118., 171., 166., 144., 97., 168., 68., 49.,\n", + " 68., 245., 184., 202., 137., 85., 131., 283., 129., 59., 341.,\n", + " 87., 65., 102., 265., 276., 252., 90., 100., 55., 61., 92.,\n", + " 259., 53., 190., 142., 75., 142., 155., 225., 59., 104., 182.,\n", + " 128., 52., 37., 170., 170., 61., 144., 52., 128., 71., 163.,\n", + " 150., 97., 160., 178., 48., 270., 202., 111., 85., 42., 170.,\n", + " 200., 252., 113., 143., 51., 52., 210., 65., 141., 55., 134.,\n", + " 42., 111., 98., 164., 48., 96., 90., 162., 150., 279., 92.,\n", + " 83., 128., 102., 302., 198., 95., 53., 134., 144., 232., 81.,\n", + " 104., 59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,\n", + " 173., 180., 84., 121., 161., 99., 109., 115., 268., 274., 158.,\n", + " 107., 83., 103., 272., 85., 280., 336., 281., 118., 317., 235.,\n", + " 60., 174., 259., 178., 128., 96., 126., 288., 88., 292., 71.,\n", + " 197., 186., 25., 84., 96., 195., 53., 217., 172., 131., 214.,\n", + " 59., 70., 220., 268., 152., 47., 74., 295., 101., 151., 127.,\n", + " 237., 225., 81., 151., 107., 64., 138., 185., 265., 101., 137.,\n", + " 143., 141., 79., 292., 178., 91., 116., 86., 122., 72., 129.,\n", + " 142., 90., 158., 39., 196., 222., 277., 99., 196., 202., 155.,\n", + " 77., 191., 70., 73., 49., 65., 263., 248., 296., 214., 185.,\n", + " 78., 93., 252., 150., 77., 208., 77., 108., 160., 53., 220.,\n", + " 154., 259., 90., 246., 124., 67., 72., 257., 262., 275., 177.,\n", + " 71., 47., 187., 125., 78., 51., 258., 215., 303., 243., 91.,\n", + " 150., 310., 153., 346., 63., 89., 50., 39., 103., 308., 116.,\n", + " 145., 74., 45., 115., 264., 87., 202., 127., 182., 241., 66.,\n", + " 94., 283., 64., 102., 200., 265., 94., 230., 181., 156., 233.,\n", + " 60., 219., 80., 68., 332., 248., 84., 200., 55., 85., 89.,\n", + " 31., 129., 83., 275., 65., 198., 236., 253., 124., 44., 172.,\n", + " 114., 142., 109., 180., 144., 163., 147., 97., 220., 190., 109.,\n", + " 191., 122., 230., 242., 248., 249., 192., 131., 237., 78., 135.,\n", + " 244., 199., 270., 164., 72., 96., 306., 91., 214., 95., 216.,\n", + " 263., 178., 113., 200., 139., 139., 88., 148., 88., 243., 71.,\n", + " 77., 109., 272., 60., 54., 221., 90., 311., 281., 182., 321.,\n", + " 58., 262., 206., 233., 242., 123., 167., 63., 197., 71., 168.,\n", + " 140., 217., 121., 235., 245., 40., 52., 104., 132., 88., 69.,\n", + " 219., 72., 201., 110., 51., 277., 63., 118., 69., 273., 258.,\n", + " 43., 198., 242., 232., 175., 93., 168., 275., 293., 281., 72.,\n", + " 140., 189., 181., 209., 136., 261., 113., 131., 174., 257., 55.,\n", + " 84., 42., 146., 212., 233., 91., 111., 152., 120., 67., 310.,\n", + " 94., 183., 66., 173., 72., 49., 64., 48., 178., 104., 132.,\n", + " 220., 57.]),\n", + " 'frame': None,\n", + " 'DESCR': '.. _diabetes_dataset:\\n\\nDiabetes dataset\\n----------------\\n\\nTen baseline variables, age, sex, body mass index, average blood\\npressure, and six blood serum measurements were obtained for each of n =\\n442 diabetes patients, as well as the response of interest, a\\nquantitative measure of disease progression one year after baseline.\\n\\n**Data Set Characteristics:**\\n\\n :Number of Instances: 442\\n\\n :Number of Attributes: First 10 columns are numeric predictive values\\n\\n :Target: Column 11 is a quantitative measure of disease progression one year after baseline\\n\\n :Attribute Information:\\n - age age in years\\n - sex\\n - bmi body mass index\\n - bp average blood pressure\\n - s1 tc, total serum cholesterol\\n - s2 ldl, low-density lipoproteins\\n - s3 hdl, high-density lipoproteins\\n - s4 tch, total cholesterol / HDL\\n - s5 ltg, possibly log of serum triglycerides level\\n - s6 glu, blood sugar level\\n\\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).\\n\\nSource URL:\\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\\n\\nFor more information see:\\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\\n',\n", + " 'feature_names': ['age',\n", + " 'sex',\n", + " 'bmi',\n", + " 'bp',\n", + " 's1',\n", + " 's2',\n", + " 's3',\n", + " 's4',\n", + " 's5',\n", + " 's6'],\n", + " 'data_filename': 'diabetes_data_raw.csv.gz',\n", + " 'target_filename': 'diabetes_target.csv.gz',\n", + " 'data_module': 'sklearn.datasets.data'}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diabetes" ] }, { @@ -73,13 +154,60 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "scrolled": false }, - "outputs": [], - "source": [ - "# Your code here:\n" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. _diabetes_dataset:\n", + "\n", + "Diabetes dataset\n", + "----------------\n", + "\n", + "Ten baseline variables, age, sex, body mass index, average blood\n", + "pressure, and six blood serum measurements were obtained for each of n =\n", + "442 diabetes patients, as well as the response of interest, a\n", + "quantitative measure of disease progression one year after baseline.\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 442\n", + "\n", + " :Number of Attributes: First 10 columns are numeric predictive values\n", + "\n", + " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n", + "\n", + " :Attribute Information:\n", + " - age age in years\n", + " - sex\n", + " - bmi body mass index\n", + " - bp average blood pressure\n", + " - s1 tc, total serum cholesterol\n", + " - s2 ldl, low-density lipoproteins\n", + " - s3 hdl, high-density lipoproteins\n", + " - s4 tch, total cholesterol / HDL\n", + " - s5 ltg, possibly log of serum triglycerides level\n", + " - s6 glu, blood sugar level\n", + "\n", + "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).\n", + "\n", + "Source URL:\n", + "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n", + "\n", + "For more information see:\n", + "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n", + "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n", + "\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes[\"DESCR\"])" ] }, { @@ -97,11 +225,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# Enter your answer here:\n" + "# Enter your answer here:\n", + "# there are 10 attributes in the data. They're numeric and needed to predict the target.\n", + "# data shows the feautes, target the target the features are supposed to predict\n", + "# there are 442 instances in the data" ] }, { @@ -115,11 +246,53 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(442,)\n", + "(442, 10)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "features = pd.DataFrame(diabetes[\"data\"], columns=diabetes[\"feature_names\"])\n", + "target = pd.Series(diabetes[\"target\"], name=\"target\")\n", + "\n", + "print(target.shape)\n", + "print(features.shape)" ] }, { @@ -156,11 +329,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "from sklearn.linear_model import LinearRegression" ] }, { @@ -172,11 +345,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "diabetes_model = LinearRegression()" ] }, { @@ -190,11 +363,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd\\nfrom sklearn.model_selection import train_test_split'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# Your code here:\n" + "diabetes_data_train, diabetes_data_test, diabetes_target_train, diabetes_target_test = train_test_split(features, target)" ] }, { @@ -206,11 +394,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Interception: 153.29025971764074\n", + "Coefficents: [ 14.19144904 -287.81159219 485.90819878 370.64689156\n", + " -1230.15768889 752.49586282 213.31951642 324.96723948\n", + " 882.76864042 44.73301878]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model.fit(diabetes_data_train, diabetes_target_train)\n", + "print(\"Interception:\", diabetes_model.intercept_)\n", + "print(\"Coefficents: \", diabetes_model.coef_)" ] }, { @@ -231,11 +433,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "pred = diabetes_model.predict(diabetes_data_test)" ] }, { @@ -247,11 +450,168 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import pandas as pd\\nfrom sklearn.model_selection import train_test_split'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
target
131.0165.9
55.041.8
341.0282.1
90.050.6
103.0124.5
......
262.0170.7
77.057.1
264.0253.2
151.0173.2
150.0206.5
\n", + "

111 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " 0\n", + "target \n", + "131.0 165.9\n", + "55.0 41.8\n", + "341.0 282.1\n", + "90.0 50.6\n", + "103.0 124.5\n", + "... ...\n", + "262.0 170.7\n", + "77.0 57.1\n", + "264.0 253.2\n", + "151.0 173.2\n", + "150.0 206.5\n", + "\n", + "[111 rows x 1 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Your code here:\n", + "pred = [round(i,1) for i in pred]\n", + "display(pd.DataFrame(pred, diabetes_target_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "r^2: 0.3626947436331116\n" + ] + }, + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import numpy as np\\nimport pandas as pd\\nfrom sklearn.model_selection import train_test_split'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rmse: 61.253116853955476\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "\n", + "print(\"r^2: \", diabetes_model.score(diabetes_data_test, diabetes_target_test))\n", + "print(\"rmse: \", np.sqrt(mean_squared_error(pred, diabetes_target_test)))" ] }, { @@ -263,11 +623,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "# Your explanation here:\n" + "# Your explanation here:\n", + "# no its not there is actually quite a big variation between both.\n", + "# it is very rare that LinearRegression predict the exact accurate data because of its fragmentency\n", + "# the goal is to get as close as possible to make meaningful prediction and insides" ] }, { @@ -351,11 +714,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import numpy as np\\nimport pandas as pd\\nfrom sklearn.model_selection import train_test_split'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Your code here:\n", + "auto = pd.read_csv(\"auto-mpg.csv\")" ] }, { @@ -367,11 +746,124 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_yearcar_name
018.08307.0130.0350412.070\\t\"chevrolet chevelle malibu\"
115.08350.0165.0369311.570\\t\"buick skylark 320\"
218.08318.0150.0343611.070\\t\"plymouth satellite\"
316.08304.0150.0343312.070\\t\"amc rebel sst\"
417.08302.0140.0344910.570\\t\"ford torino\"
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight acceleration \\\n", + "0 18.0 8 307.0 130.0 3504 12.0 \n", + "1 15.0 8 350.0 165.0 3693 11.5 \n", + "2 18.0 8 318.0 150.0 3436 11.0 \n", + "3 16.0 8 304.0 150.0 3433 12.0 \n", + "4 17.0 8 302.0 140.0 3449 10.5 \n", + "\n", + " model_year car_name \n", + "0 70 \\t\"chevrolet chevelle malibu\" \n", + "1 70 \\t\"buick skylark 320\" \n", + "2 70 \\t\"plymouth satellite\" \n", + "3 70 \\t\"amc rebel sst\" \n", + "4 70 \\t\"ford torino\" " + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.head()" ] }, { @@ -383,11 +875,31 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg float64\n", + "cylinders int64\n", + "displacement float64\n", + "horse_power float64\n", + "weight int64\n", + "acceleration float64\n", + "model_year int64\n", + "car_name object\n", + "dtype: object" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.dtypes" ] }, { @@ -399,11 +911,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "oldest: 70\n", + "newest: 82\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(\"oldest: \", auto[\"model_year\"].min())\n", + "print(\"newest: \", auto[\"model_year\"].max())" ] }, { @@ -415,11 +938,31 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horse_power 0\n", + "weight 0\n", + "acceleration 0\n", + "model_year 0\n", + "dtype: int64" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.dropna(inplace=True)\n", + "auto.isnull().sum()" ] }, { @@ -431,11 +974,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4 204\n", + "8 103\n", + "6 84\n", + "3 4\n", + "5 3\n", + "Name: cylinders, dtype: int64" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto[\"cylinders\"].value_counts()" ] }, { @@ -451,11 +1011,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.drop(\"car_name\", axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import numpy as np\\nimport pandas as pd\\nfrom sklearn.model_selection import train_test_split'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "features = auto.drop(\"mpg\", axis=1)\n", + "label = auto[\"mpg\"]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.80)" ] }, { @@ -469,11 +1057,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto_model = LinearRegression()\n", + "\n", + "auto_model.fit(X_train, y_train)" ] }, { @@ -493,11 +1098,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "r^2: 0.8322920653782885\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.metrics import r2_score\n", + "y_pred = auto_model.predict(X_train)\n", + "print(\"r^2: \", r2_score(y_pred, y_train))" ] }, { @@ -513,11 +1129,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "r^2: 0.7992202382383298\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_test_data = auto_model.predict(X_test)\n", + "print(\"r^2: \", r2_score(y_test_data, y_test))" ] }, { @@ -542,11 +1168,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (window._pyforest_update_imports_cell) { window._pyforest_update_imports_cell('import numpy as np\\nimport pandas as pd\\nfrom sklearn.model_selection import train_test_split'); }\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Your code here:\n", + "X_train09, X_test09, y_train09, y_test09 = train_test_split(features, label, test_size=0.10)" ] }, { @@ -558,11 +1200,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto_model09 = LinearRegression()\n", + "auto_model09.fit(X_train09, y_train09)" ] }, { @@ -574,11 +1232,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "r^2: 0.7538710984983046\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_pred = auto_model09.predict(X_train09)\n", + "print(\"r^2: \", r2_score(y_pred, y_train09))" ] }, { @@ -590,11 +1258,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "r^2: 0.8718352161114574\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "y_pred = auto_model09.predict(X_test09)\n", + "print(\"r^2: \", r2_score(y_pred, y_test09))" + ] + }, + { + "cell_type": "code", + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# while the r^2 for the training data actually got worse, the r^2 for the test data improved a lot to 0.87" ] }, { @@ -703,7 +1390,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -717,7 +1404,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.11.3" } }, "nbformat": 4,