From 838743a358d2c8a85ad381dafb1913523de7787a Mon Sep 17 00:00:00 2001 From: Desmond Date: Fri, 18 Apr 2025 01:18:52 -0500 Subject: [PATCH 1/4] add ignore new --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 15201ac..88432a6 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ __pycache__/ # C extensions *.so - # Distribution / packaging .Python build/ @@ -169,3 +168,4 @@ cython_debug/ # PyPI configuration file .pypirc +data/Extracted_values_4yearys.csv From 25c41a6c2d5b70b4e685985ab06533bdfd52ed61 Mon Sep 17 00:00:00 2001 From: Desmond Date: Fri, 18 Apr 2025 01:19:11 -0500 Subject: [PATCH 2/4] add data --- .pre-commit-config.yaml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..84af8e5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,32 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-toml + - id: check-yaml + - id: end-of-file-fixer + types: [python] + - id: trailing-whitespace + - id: requirements-txt-fixer + - id: check-added-large-files + args: ["--maxkb=500"] + + - repo: https://github.com/psf/black + rev: 25.1.0 + hooks: + - id: black-jupyter + + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + args: + [ + "--ignore-words-list=aci,acount,acounts,fallow,ges,hart,hist,nd,ned,ois,wqs,watermask,tre,mape", + "--skip=*.csv,*.geojson,*.json,*.yml,*.js,*.html,*cff,*.pdf", + ] + + - repo: https://github.com/kynan/nbstripout + rev: 0.8.1 + hooks: + - id: nbstripout From 915fe7f0ec8eb350444f9433de6854545ae649a8 Mon Sep 17 00:00:00 2001 From: Desmond Date: Sun, 20 Apr 2025 04:08:07 -0500 Subject: [PATCH 3/4] update code --- data/insar.ipynb | 502 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 502 insertions(+) create 
mode 100644 data/insar.ipynb diff --git a/data/insar.ipynb b/data/insar.ipynb new file mode 100644 index 0000000..9b051b2 --- /dev/null +++ b/data/insar.ipynb @@ -0,0 +1,502 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import matplotlib.dates as mdates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv(\n", + " \"/home/kangah/Desktop/GIS_programming/Geospatial/data/Extracted_values_4yearys.csv\"\n", + ")\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "Real = data.iloc[:, 3:13]\n", + "Real" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3", + "metadata": {}, + "outputs": [], + "source": [ + "missing_percent = (Real.isnull().sum() / len(Real)) * 100\n", + "print(missing_percent)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": {}, + "outputs": [], + "source": [ + "Real_clean = Real.dropna()\n", + "Real_clean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": {}, + "outputs": [], + "source": [ + "# Real_clean.columns = ['Velocity', 'Top_Wetness_Index', 'Precipitation', 'LULC', 'DistanceFromFault', 'DistanceFromRoad', 'DistanceFromRiver','DEM' 'Geology', 'Aspect']\n", + "Real_clean.rename(\n", + " columns={\n", + " \"velocity\": \"Velocity\",\n", + " \"TWI\": \"Top_Wetness_Index\",\n", + " \"extract_prec1\": \"Precipitation\",\n", + " \"extract_lulc1\": \"LULC\",\n", + " \"eucdist_faul1\": \"DistanceFromFault\",\n", + " \"distanceFromRoad\": \"DistanceFromRoad\",\n", + " \"distanceFromriver\": \"DistanceFromRiver\",\n", + " \"dem\": 
\"DEM\",\n", + " \"Geology_CONUS_Clip_PolygonToRaster1\": \"Geology\",\n", + " \"Aspect_DEM2\": \"Aspect\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "Real_clean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": {}, + "outputs": [], + "source": [ + "pearson_correlation_matrix = Real_clean.corr(method=\"pearson\")\n", + "\n", + "plt.figure(figsize=(12, 10))\n", + "sns.heatmap(\n", + " pearson_correlation_matrix, annot=True, fmt=\".2f\", cmap=\"coolwarm\", cbar=True\n", + ")\n", + "plt.title(\"Pearson Correlation Matrix Heatmap\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": {}, + "outputs": [], + "source": [ + "X = Real_clean.drop(columns=[\"Velocity\"])\n", + "y = Real_clean[\"Velocity\"]\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")\n", + "scaler = StandardScaler()\n", + "\n", + "## By Kangah (Surveyor, Civil and Geospatial Engineer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": {}, + "outputs": [], + "source": [ + "X_train.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": {}, + "outputs": [], + "source": [ + "y_train.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": {}, + "outputs": [], + "source": [ + "X_test.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [ + "y_test.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": {}, + "outputs": [], + "source": [ + "x_train = 
scaler.fit_transform(X_train)\n", + "x_test = scaler.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import mean_squared_error, r2_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": {}, + "outputs": [], + "source": [ + "RS_model = RandomForestRegressor(n_estimators=100, random_state=42)\n", + "RS_model.fit(x_train, y_train)\n", + "y_pred = RS_model.predict(x_test)\n", + "mse = mean_squared_error(y_test, y_pred)\n", + "r2 = r2_score(y_test, y_pred)\n", + "print(f\"Mean Squared Error: {mse}\")\n", + "print(f\"R^2 Score: {r2}\") # By Kangah" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": {}, + "outputs": [], + "source": [ + "# Plotting the training and testing curves\n", + "plt.figure(figsize=(12, 6))\n", + "\n", + "# Training data\n", + "plt.scatter(\n", + " y_train, RS_model.predict(x_train), color=\"blue\", alpha=0.5, label=\"Train Data\"\n", + ")\n", + "# Testing data\n", + "plt.scatter(y_test, y_pred, color=\"red\", alpha=0.5, label=\"Test Data\")\n", + "\n", + "# Plotting the ideal line\n", + "plt.plot([y.min(), y.max()], [y.min(), y.max()], \"k--\", lw=2, label=\"Ideal Fit\")\n", + "\n", + "# Labels and legend\n", + "plt.xlabel(\"True Values\")\n", + "plt.ylabel(\"Predicted Values\")\n", + "plt.title(\"Train and Test Curves\")\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17", + "metadata": {}, + "outputs": [], + "source": [ + "# Get feature importances from the model\n", + "importances = RS_model.feature_importances_\n", + "feature_names = X.columns\n", + "\n", + "# Create DataFrame\n", + "gini_df = pd.DataFrame(\n", + " {\"Feature\": feature_names, \"Importance\": importances}\n", + ").sort_values(by=\"Importance\", 
ascending=True)\n", + "\n", + "# Plot\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "bars = ax.barh(\n", + " gini_df[\"Feature\"], gini_df[\"Importance\"], color=\"coral\", alpha=0.8, height=0.4\n", + ")\n", + "\n", + "# Add central black dot\n", + "for i, imp in enumerate(gini_df[\"Importance\"]):\n", + " ax.plot(imp, i, \"ko\")\n", + "\n", + "# Add a box showing the method\n", + "ax.text(\n", + " 0.95,\n", + " 0.05,\n", + " \"■ Mean Decrease Gini\",\n", + " transform=ax.transAxes,\n", + " fontsize=12,\n", + " verticalalignment=\"bottom\",\n", + " horizontalalignment=\"right\",\n", + " color=\"OrangeRed\",\n", + ")\n", + "\n", + "# Labels\n", + "ax.set_xlabel(\"Mean Decrease in Gini (Feature Importance)\", fontsize=12)\n", + "ax.set_ylabel(\"Land Susceptibility Influencing Factors\", fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.inspection import permutation_importance\n", + "\n", + "# Evaluate permutation importance\n", + "result = permutation_importance(\n", + " RS_model, x_test, y_test, n_repeats=10, random_state=42, n_jobs=-1\n", + ")\n", + "\n", + "# Create DataFrame\n", + "perm_df = pd.DataFrame(\n", + " {\n", + " \"Feature\": X.columns,\n", + " \"Importance\": result.importances_mean,\n", + " \"Std\": result.importances_std,\n", + " }\n", + ").sort_values(by=\"Importance\", ascending=True)\n", + "\n", + "# Create plot\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "# Plot bars with error bars\n", + "ax.barh(\n", + " perm_df[\"Feature\"],\n", + " perm_df[\"Importance\"],\n", + " xerr=perm_df[\"Std\"],\n", + " alpha=0.7,\n", + " height=0.4,\n", + " color=\"coral\",\n", + ")\n", + "\n", + "# Add label with square bullet\n", + "ax.text(\n", + " 0.95,\n", + " 0.05,\n", + " \"■ Mean Decrease Accuracy\",\n", + " transform=ax.transAxes,\n", + " fontsize=12,\n", + " 
verticalalignment=\"bottom\",\n", + " horizontalalignment=\"right\",\n", + " color=\"OrangeRed\",\n", + ")\n", + "\n", + "# Labels\n", + "ax.set_xlabel(\"Mean Decrease in Accuracy (Permutation Importance)\")\n", + "ax.set_title(\"Permutation Feature Importance\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVR\n", + "\n", + "# Initialize the SVR model\n", + "svr_model = SVR(kernel=\"rbf\", C=1.0, epsilon=0.1)\n", + "\n", + "# Fit the model to the training data\n", + "svr_model.fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_svr = svr_model.predict(x_test)\n", + "mse_svr = mean_squared_error(y_test, y_pred_svr)\n", + "r2_svr = r2_score(y_test, y_pred_svr)\n", + "print(f\"SVR Mean Squared Error: {mse_svr}\")\n", + "print(f\"SVR R^2 Score: {r2_svr}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21", + "metadata": {}, + "outputs": [], + "source": [ + "from xgboost import XGBRegressor\n", + "\n", + "# Initialize the XGBoost Regressor\n", + "xgb_model = XGBRegressor(\n", + " objective=\"reg:squarederror\", n_estimators=100, random_state=42\n", + ")\n", + "\n", + "# Fit the model to the training data\n", + "xgb_model.fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_xgb = xgb_model.predict(x_test)\n", + "mse_xgb = mean_squared_error(y_test, y_pred_xgb)\n", + "r2_xgb = r2_score(y_test, y_pred_xgb)\n", + "print(f\"XGBoost Mean Squared Error: {mse_xgb}\")\n", + "print(f\"XGBoost R^2 Score: {r2_xgb}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import roc_curve, 
auc\n", + "from sklearn.preprocessing import Binarizer\n", + "\n", + "# Binarize the y_test and predictions\n", + "threshold = 0 # Define a threshold for binarization\n", + "binarizer = Binarizer(threshold=threshold)\n", + "\n", + "y_test_binary = binarizer.fit_transform(y_test.values.reshape(-1, 1)).ravel()\n", + "y_pred_binary = binarizer.transform(y_pred.reshape(-1, 1)).ravel()\n", + "y_pred_svr_binary = binarizer.transform(y_pred_svr.reshape(-1, 1)).ravel()\n", + "y_pred_xgb_binary = binarizer.transform(y_pred_xgb.reshape(-1, 1)).ravel()\n", + "\n", + "# Compute ROC curve and AUC for Random Forest\n", + "fpr_rf, tpr_rf, _ = roc_curve(y_test_binary, y_pred_binary)\n", + "roc_auc_rf = auc(fpr_rf, tpr_rf)\n", + "\n", + "# Compute ROC curve and AUC for SVR\n", + "fpr_svr, tpr_svr, _ = roc_curve(y_test_binary, y_pred_svr_binary)\n", + "roc_auc_svr = auc(fpr_svr, tpr_svr)\n", + "\n", + "# Compute ROC curve and AUC for XGBoost\n", + "fpr_xgb, tpr_xgb, _ = roc_curve(y_test_binary, y_pred_xgb_binary)\n", + "roc_auc_xgb = auc(fpr_xgb, tpr_xgb)\n", + "\n", + "# Plot the ROC curves\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(\n", + " fpr_rf, tpr_rf, color=\"blue\", lw=2, label=f\"Random Forest (AUC = {roc_auc_rf:.2f})\"\n", + ")\n", + "plt.plot(fpr_svr, tpr_svr, color=\"green\", lw=2, label=f\"SVR (AUC = {roc_auc_svr:.2f})\")\n", + "plt.plot(\n", + " fpr_xgb, tpr_xgb, color=\"red\", lw=2, label=f\"XGBoost (AUC = {roc_auc_xgb:.2f})\"\n", + ")\n", + "\n", + "# Plot the diagonal line\n", + "plt.plot([0, 1], [0, 1], \"k--\", lw=2)\n", + "\n", + "# Labels and legend\n", + "plt.xlabel(\"1 - False Positive Rate(Specificity)\")\n", + "plt.ylabel(\"Sensitivity (True Positive Rate)\")\n", + "plt.title(\"Receiver Operating Characteristic (ROC) Curves\")\n", + "plt.legend(loc=\"lower right\")\n", + "plt.legend(loc=\"lower right\")\n", + "plt.grid()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24", + "metadata": {}, + "outputs": [], 
+ "source": [ + "import shap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25", + "metadata": {}, + "outputs": [], + "source": [ + "explainer = shap.TreeExplainer(RS_model)\n", + "shap_values = explainer.shap_values(X_test)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "geo", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 2933ca837091c2678cd1a2af42835cf446b02d84 Mon Sep 17 00:00:00 2001 From: Desmond Date: Wed, 14 May 2025 12:52:58 -0500 Subject: [PATCH 4/4] add code new --- .gitignore | 4 + data/{insar.ipynb => insar_2.ipynb} | 277 +++++++++++----------------- 2 files changed, 110 insertions(+), 171 deletions(-) rename data/{insar.ipynb => insar_2.ipynb} (59%) diff --git a/.gitignore b/.gitignore index 88432a6..1d45c3f 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,7 @@ cython_debug/ # PyPI configuration file .pypirc data/Extracted_values_4yearys.csv +data/insar.ipynb +data/final_results.csv +data/data2.dbf-20250420T151602Z-001/data2.dbf +data/insar.ipynb diff --git a/data/insar.ipynb b/data/insar_2.ipynb similarity index 59% rename from data/insar.ipynb rename to data/insar_2.ipynb index 9b051b2..30c2a50 100644 --- a/data/insar.ipynb +++ b/data/insar_2.ipynb @@ -22,10 +22,10 @@ "metadata": {}, "outputs": [], "source": [ - "data = pd.read_csv(\n", - " \"/home/kangah/Desktop/GIS_programming/Geospatial/data/Extracted_values_4yearys.csv\"\n", + "data2 = gpd.read_file(\n", + " \"/home/kangah/Desktop/GIS_programming/Geospatial/data/data2.dbf-20250420T151602Z-001/data2.dbf\"\n", ")\n", - "data" + "data2" ] }, { @@ -35,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "Real = data.iloc[:, 3:13]\n", + "Real 
= data2.iloc[:, 2:13]\n", "Real" ] }, @@ -68,23 +68,23 @@ "metadata": {}, "outputs": [], "source": [ - "# Real_clean.columns = ['Velocity', 'Top_Wetness_Index', 'Precipitation', 'LULC', 'DistanceFromFault', 'DistanceFromRoad', 'DistanceFromRiver','DEM' 'Geology', 'Aspect']\n", - "Real_clean.rename(\n", - " columns={\n", - " \"velocity\": \"Velocity\",\n", - " \"TWI\": \"Top_Wetness_Index\",\n", - " \"extract_prec1\": \"Precipitation\",\n", - " \"extract_lulc1\": \"LULC\",\n", - " \"eucdist_faul1\": \"DistanceFromFault\",\n", - " \"distanceFromRoad\": \"DistanceFromRoad\",\n", - " \"distanceFromriver\": \"DistanceFromRiver\",\n", - " \"dem\": \"DEM\",\n", - " \"Geology_CONUS_Clip_PolygonToRaster1\": \"Geology\",\n", - " \"Aspect_DEM2\": \"Aspect\",\n", - " },\n", - " inplace=True,\n", - ")\n", - "Real_clean" + "# # Real_clean.columns = ['Velocity', 'Top_Wetness_Index', 'Precipitation', 'LULC', 'DistanceFromFault', 'DistanceFromRoad', 'DistanceFromRiver','DEM' 'Geology', 'Aspect']\n", + "# Real_clean.rename(\n", + "# columns={\n", + "# \"velocity\": \"Velocity\",\n", + "# \"TWI\": \"Top_Wetness_Index\",\n", + "# \"extract_prec1\": \"Precipitation\",\n", + "# \"extract_lulc1\": \"LULC\",\n", + "# \"eucdist_faul1\": \"DistanceFromFault\",\n", + "# \"distanceFromRoad\": \"DistanceFromRoad\",\n", + "# \"distanceFromriver\": \"DistanceFromRiver\",\n", + "# \"dem\": \"DEM\",\n", + "# \"Geology_CONUS_Clip_PolygonToRaster1\": \"Geology\",\n", + "# \"Aspect_DEM2\": \"Aspect\",\n", + "# },\n", + "# inplace=True,\n", + "# )\n", + "# Real_clean" ] }, { @@ -122,8 +122,8 @@ "metadata": {}, "outputs": [], "source": [ - "X = Real_clean.drop(columns=[\"Velocity\"])\n", - "y = Real_clean[\"Velocity\"]\n", + "X = Real_clean.drop(columns=[\"velocity\"])\n", + "y = Real_clean[\"velocity\"]\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.2, random_state=42\n", ")\n", @@ -216,34 +216,6 @@ "id": "16", "metadata": {}, "outputs": [], - "source": [ - "# 
Plotting the training and testing curves\n", - "plt.figure(figsize=(12, 6))\n", - "\n", - "# Training data\n", - "plt.scatter(\n", - " y_train, RS_model.predict(x_train), color=\"blue\", alpha=0.5, label=\"Train Data\"\n", - ")\n", - "# Testing data\n", - "plt.scatter(y_test, y_pred, color=\"red\", alpha=0.5, label=\"Test Data\")\n", - "\n", - "# Plotting the ideal line\n", - "plt.plot([y.min(), y.max()], [y.min(), y.max()], \"k--\", lw=2, label=\"Ideal Fit\")\n", - "\n", - "# Labels and legend\n", - "plt.xlabel(\"True Values\")\n", - "plt.ylabel(\"Predicted Values\")\n", - "plt.title(\"Train and Test Curves\")\n", - "plt.legend()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17", - "metadata": {}, - "outputs": [], "source": [ "# Get feature importances from the model\n", "importances = RS_model.feature_importances_\n", @@ -287,57 +259,32 @@ { "cell_type": "code", "execution_count": null, - "id": "18", + "id": "17", "metadata": {}, "outputs": [], "source": [ - "from sklearn.inspection import permutation_importance\n", - "\n", - "# Evaluate permutation importance\n", - "result = permutation_importance(\n", - " RS_model, x_test, y_test, n_repeats=10, random_state=42, n_jobs=-1\n", - ")\n", - "\n", - "# Create DataFrame\n", - "perm_df = pd.DataFrame(\n", + "# Combine the actual and predicted values with coordinates\n", + "results_df = pd.DataFrame(\n", " {\n", - " \"Feature\": X.columns,\n", - " \"Importance\": result.importances_mean,\n", - " \"Std\": result.importances_std,\n", + " \"Longitude\": data2.loc[y_test.index, \"long\"],\n", + " \"Latitude\": data2.loc[y_test.index, \"lat\"],\n", + " \"Actual Velocity\": y_test.values,\n", + " \"Predicted Velocity\": y_pred,\n", " }\n", - ").sort_values(by=\"Importance\", ascending=True)\n", - "\n", - "# Create plot\n", - "fig, ax = plt.subplots(figsize=(10, 6))\n", - "\n", - "# Plot bars with error bars\n", - "ax.barh(\n", - " perm_df[\"Feature\"],\n", - " 
perm_df[\"Importance\"],\n", - " xerr=perm_df[\"Std\"],\n", - " alpha=0.7,\n", - " height=0.4,\n", - " color=\"coral\",\n", - ")\n", - "\n", - "# Add label with square bullet\n", - "ax.text(\n", - " 0.95,\n", - " 0.05,\n", - " \"■ Mean Decrease Accuracy\",\n", - " transform=ax.transAxes,\n", - " fontsize=12,\n", - " verticalalignment=\"bottom\",\n", - " horizontalalignment=\"right\",\n", - " color=\"OrangeRed\",\n", ")\n", "\n", - "# Labels\n", - "ax.set_xlabel(\"Mean Decrease in Accuracy (Permutation Importance)\")\n", - "ax.set_title(\"Permutation Feature Importance\")\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" + "# Display the DataFrame\n", + "print(results_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": {}, + "outputs": [], + "source": [ + "results_df.to_csv(\"final_results.csv\", index=False)" ] }, { @@ -347,13 +294,53 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.svm import SVR\n", + "# from sklearn.inspection import permutation_importance\n", "\n", - "# Initialize the SVR model\n", - "svr_model = SVR(kernel=\"rbf\", C=1.0, epsilon=0.1)\n", + "# # Evaluate permutation importance\n", + "# result = permutation_importance(\n", + "# RS_model, x_test, y_test, n_repeats=10, random_state=42, n_jobs=-1\n", + "# )\n", "\n", - "# Fit the model to the training data\n", - "svr_model.fit(x_train, y_train)" + "# # Create DataFrame\n", + "# perm_df = pd.DataFrame(\n", + "# {\n", + "# \"Feature\": X.columns,\n", + "# \"Importance\": result.importances_mean,\n", + "# \"Std\": result.importances_std,\n", + "# }\n", + "# ).sort_values(by=\"Importance\", ascending=True)\n", + "\n", + "# # Create plot\n", + "# fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "# # Plot bars with error bars\n", + "# ax.barh(\n", + "# perm_df[\"Feature\"],\n", + "# perm_df[\"Importance\"],\n", + "# xerr=perm_df[\"Std\"],\n", + "# alpha=0.7,\n", + "# height=0.4,\n", + "# color=\"coral\",\n", + "# )\n", + "\n", + "# # Add 
label with square bullet\n", + "# ax.text(\n", + "# 0.95,\n", + "# 0.05,\n", + "# \"■ Mean Decrease Accuracy\",\n", + "# transform=ax.transAxes,\n", + "# fontsize=12,\n", + "# verticalalignment=\"bottom\",\n", + "# horizontalalignment=\"right\",\n", + "# color=\"OrangeRed\",\n", + "# )\n", + "\n", + "# # Labels\n", + "# ax.set_xlabel(\"Mean Decrease in Accuracy (Permutation Importance)\")\n", + "# ax.set_title(\"Permutation Feature Importance\")\n", + "\n", + "# plt.tight_layout()\n", + "# plt.show()" ] }, { @@ -363,11 +350,24 @@ "metadata": {}, "outputs": [], "source": [ - "y_pred_svr = svr_model.predict(x_test)\n", - "mse_svr = mean_squared_error(y_test, y_pred_svr)\n", - "r2_svr = r2_score(y_test, y_pred_svr)\n", - "print(f\"SVR Mean Squared Error: {mse_svr}\")\n", - "print(f\"SVR R^2 Score: {r2_svr}\")" + "from sklearn.inspection import permutation_importance\n", + "\n", + "# Evaluate permutation importance\n", + "result = permutation_importance(\n", + " RS_model, x_test, y_test, n_repeats=10, random_state=42, n_jobs=-1\n", + ")\n", + "\n", + "# Create DataFrame\n", + "perm_df = pd.DataFrame(\n", + " {\n", + " \"Feature\": X.columns,\n", + " \"Importance\": result.importances_mean,\n", + " \"Std\": result.importances_std,\n", + " }\n", + ").sort_values(by=\"Importance\", ascending=True)\n", + "\n", + "# Display the DataFrame\n", + "print(perm_df)" ] }, { @@ -409,72 +409,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.metrics import roc_curve, auc\n", - "from sklearn.preprocessing import Binarizer\n", - "\n", - "# Binarize the y_test and predictions\n", - "threshold = 0 # Define a threshold for binarization\n", - "binarizer = Binarizer(threshold=threshold)\n", - "\n", - "y_test_binary = binarizer.fit_transform(y_test.values.reshape(-1, 1)).ravel()\n", - "y_pred_binary = binarizer.transform(y_pred.reshape(-1, 1)).ravel()\n", - "y_pred_svr_binary = binarizer.transform(y_pred_svr.reshape(-1, 1)).ravel()\n", - "y_pred_xgb_binary = 
binarizer.transform(y_pred_xgb.reshape(-1, 1)).ravel()\n", - "\n", - "# Compute ROC curve and AUC for Random Forest\n", - "fpr_rf, tpr_rf, _ = roc_curve(y_test_binary, y_pred_binary)\n", - "roc_auc_rf = auc(fpr_rf, tpr_rf)\n", - "\n", - "# Compute ROC curve and AUC for SVR\n", - "fpr_svr, tpr_svr, _ = roc_curve(y_test_binary, y_pred_svr_binary)\n", - "roc_auc_svr = auc(fpr_svr, tpr_svr)\n", - "\n", - "# Compute ROC curve and AUC for XGBoost\n", - "fpr_xgb, tpr_xgb, _ = roc_curve(y_test_binary, y_pred_xgb_binary)\n", - "roc_auc_xgb = auc(fpr_xgb, tpr_xgb)\n", - "\n", - "# Plot the ROC curves\n", - "plt.figure(figsize=(10, 6))\n", - "plt.plot(\n", - " fpr_rf, tpr_rf, color=\"blue\", lw=2, label=f\"Random Forest (AUC = {roc_auc_rf:.2f})\"\n", - ")\n", - "plt.plot(fpr_svr, tpr_svr, color=\"green\", lw=2, label=f\"SVR (AUC = {roc_auc_svr:.2f})\")\n", - "plt.plot(\n", - " fpr_xgb, tpr_xgb, color=\"red\", lw=2, label=f\"XGBoost (AUC = {roc_auc_xgb:.2f})\"\n", - ")\n", - "\n", - "# Plot the diagonal line\n", - "plt.plot([0, 1], [0, 1], \"k--\", lw=2)\n", - "\n", - "# Labels and legend\n", - "plt.xlabel(\"1 - False Positive Rate(Specificity)\")\n", - "plt.ylabel(\"Sensitivity (True Positive Rate)\")\n", - "plt.title(\"Receiver Operating Characteristic (ROC) Curves\")\n", - "plt.legend(loc=\"lower right\")\n", - "plt.legend(loc=\"lower right\")\n", - "plt.grid()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24", - "metadata": {}, - "outputs": [], - "source": [ - "import shap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25", - "metadata": {}, - "outputs": [], - "source": [ - "explainer = shap.TreeExplainer(RS_model)\n", - "shap_values = explainer.shap_values(X_test)" + "y_train" ] } ],