diff --git a/.gitignore b/.gitignore index 15201ac..1d45c3f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ __pycache__/ # C extensions *.so - # Distribution / packaging .Python build/ @@ -169,3 +168,8 @@ cython_debug/ # PyPI configuration file .pypirc +data/Extracted_values_4yearys.csv +data/insar.ipynb +data/final_results.csv +data/data2.dbf-20250420T151602Z-001/data2.dbf +data/insar.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..84af8e5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,32 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-toml + - id: check-yaml + - id: end-of-file-fixer + types: [python] + - id: trailing-whitespace + - id: requirements-txt-fixer + - id: check-added-large-files + args: ["--maxkb=500"] + + - repo: https://github.com/psf/black + rev: 25.1.0 + hooks: + - id: black-jupyter + + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + args: + [ + "--ignore-words-list=aci,acount,acounts,fallow,ges,hart,hist,nd,ned,ois,wqs,watermask,tre,mape", + "--skip=*.csv,*.geojson,*.json,*.yml,*.js,*.html,*cff,*.pdf", + ] + + - repo: https://github.com/kynan/nbstripout + rev: 0.8.1 + hooks: + - id: nbstripout diff --git a/data/insar_2.ipynb b/data/insar_2.ipynb new file mode 100644 index 0000000..30c2a50 --- /dev/null +++ b/data/insar_2.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import geopandas as gpd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import matplotlib.dates as mdates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "data2 = gpd.read_file(\n", + " 
\"/home/kangah/Desktop/GIS_programming/Geospatial/data/data2.dbf-20250420T151602Z-001/data2.dbf\"\n", + ")\n", + "data2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "Real = data2.iloc[:, 2:13]\n", + "Real" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3", + "metadata": {}, + "outputs": [], + "source": [ + "missing_percent = (Real.isnull().sum() / len(Real)) * 100\n", + "print(missing_percent)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": {}, + "outputs": [], + "source": [ + "Real_clean = Real.dropna()\n", + "Real_clean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": {}, + "outputs": [], + "source": [ + "# # Real_clean.columns = ['Velocity', 'Top_Wetness_Index', 'Precipitation', 'LULC', 'DistanceFromFault', 'DistanceFromRoad', 'DistanceFromRiver','DEM' 'Geology', 'Aspect']\n", + "# Real_clean.rename(\n", + "# columns={\n", + "# \"velocity\": \"Velocity\",\n", + "# \"TWI\": \"Top_Wetness_Index\",\n", + "# \"extract_prec1\": \"Precipitation\",\n", + "# \"extract_lulc1\": \"LULC\",\n", + "# \"eucdist_faul1\": \"DistanceFromFault\",\n", + "# \"distanceFromRoad\": \"DistanceFromRoad\",\n", + "# \"distanceFromriver\": \"DistanceFromRiver\",\n", + "# \"dem\": \"DEM\",\n", + "# \"Geology_CONUS_Clip_PolygonToRaster1\": \"Geology\",\n", + "# \"Aspect_DEM2\": \"Aspect\",\n", + "# },\n", + "# inplace=True,\n", + "# )\n", + "# Real_clean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": {}, + "outputs": [], + "source": [ + "pearson_correlation_matrix = Real_clean.corr(method=\"pearson\")\n", + "\n", + "plt.figure(figsize=(12, 10))\n", + "sns.heatmap(\n", + " pearson_correlation_matrix, annot=True, fmt=\".2f\", cmap=\"coolwarm\", cbar=True\n", + ")\n", + "plt.title(\"Pearson Correlation Matrix Heatmap\")\n", + "plt.show()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": {}, + "outputs": [], + "source": [ + "X = Real_clean.drop(columns=[\"velocity\"])\n", + "y = Real_clean[\"velocity\"]\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")\n", + "scaler = StandardScaler()\n", + "\n", + "## By Kangah (Surveyor, Civil and Geospatial Engineer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": {}, + "outputs": [], + "source": [ + "X_train.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": {}, + "outputs": [], + "source": [ + "y_train.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": {}, + "outputs": [], + "source": [ + "X_test.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [ + "y_test.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13", + "metadata": {}, + "outputs": [], + "source": [ + "x_train = scaler.fit_transform(X_train)\n", + "x_test = scaler.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import mean_squared_error, r2_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": {}, + "outputs": [], + "source": [ + "RS_model = RandomForestRegressor(n_estimators=100, random_state=42)\n", + "RS_model.fit(x_train, y_train)\n", + "y_pred = RS_model.predict(x_test)\n", + "mse = 
mean_squared_error(y_test, y_pred)\n", + "r2 = r2_score(y_test, y_pred)\n", + "print(f\"Mean Squared Error: {mse}\")\n", + "print(f\"R^2 Score: {r2}\") # By Kangah" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": {}, + "outputs": [], + "source": [ + "# Get feature importances from the model\n", + "importances = RS_model.feature_importances_\n", + "feature_names = X.columns\n", + "\n", + "# Create DataFrame\n", + "gini_df = pd.DataFrame(\n", + " {\"Feature\": feature_names, \"Importance\": importances}\n", + ").sort_values(by=\"Importance\", ascending=True)\n", + "\n", + "# Plot\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "bars = ax.barh(\n", + " gini_df[\"Feature\"], gini_df[\"Importance\"], color=\"coral\", alpha=0.8, height=0.4\n", + ")\n", + "\n", + "# Add central black dot\n", + "for i, imp in enumerate(gini_df[\"Importance\"]):\n", + " ax.plot(imp, i, \"ko\")\n", + "\n", + "# Add a box showing the method\n", + "ax.text(\n", + " 0.95,\n", + " 0.05,\n", + " \"■ Mean Decrease Gini\",\n", + " transform=ax.transAxes,\n", + " fontsize=12,\n", + " verticalalignment=\"bottom\",\n", + " horizontalalignment=\"right\",\n", + " color=\"OrangeRed\",\n", + ")\n", + "\n", + "# Labels\n", + "ax.set_xlabel(\"Mean Decrease in Gini (Feature Importance)\", fontsize=12)\n", + "ax.set_ylabel(\"Land Susceptibility Influencing Factors\", fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17", + "metadata": {}, + "outputs": [], + "source": [ + "# Combine the actual and predicted values with coordinates\n", + "results_df = pd.DataFrame(\n", + " {\n", + " \"Longitude\": data2.loc[y_test.index, \"long\"],\n", + " \"Latitude\": data2.loc[y_test.index, \"lat\"],\n", + " \"Actual Velocity\": y_test.values,\n", + " \"Predicted Velocity\": y_pred,\n", + " }\n", + ")\n", + "\n", + "# Display the DataFrame\n", + "print(results_df)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": {}, + "outputs": [], + "source": [ + "results_df.to_csv(\"final_results.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19", + "metadata": {}, + "outputs": [], + "source": [ + "# from sklearn.inspection import permutation_importance\n", + "\n", + "# # Evaluate permutation importance\n", + "# result = permutation_importance(\n", + "# RS_model, x_test, y_test, n_repeats=10, random_state=42, n_jobs=-1\n", + "# )\n", + "\n", + "# # Create DataFrame\n", + "# perm_df = pd.DataFrame(\n", + "# {\n", + "# \"Feature\": X.columns,\n", + "# \"Importance\": result.importances_mean,\n", + "# \"Std\": result.importances_std,\n", + "# }\n", + "# ).sort_values(by=\"Importance\", ascending=True)\n", + "\n", + "# # Create plot\n", + "# fig, ax = plt.subplots(figsize=(10, 6))\n", + "\n", + "# # Plot bars with error bars\n", + "# ax.barh(\n", + "# perm_df[\"Feature\"],\n", + "# perm_df[\"Importance\"],\n", + "# xerr=perm_df[\"Std\"],\n", + "# alpha=0.7,\n", + "# height=0.4,\n", + "# color=\"coral\",\n", + "# )\n", + "\n", + "# # Add label with square bullet\n", + "# ax.text(\n", + "# 0.95,\n", + "# 0.05,\n", + "# \"■ Mean Decrease Accuracy\",\n", + "# transform=ax.transAxes,\n", + "# fontsize=12,\n", + "# verticalalignment=\"bottom\",\n", + "# horizontalalignment=\"right\",\n", + "# color=\"OrangeRed\",\n", + "# )\n", + "\n", + "# # Labels\n", + "# ax.set_xlabel(\"Mean Decrease in Accuracy (Permutation Importance)\")\n", + "# ax.set_title(\"Permutation Feature Importance\")\n", + "\n", + "# plt.tight_layout()\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.inspection import permutation_importance\n", + "\n", + "# Evaluate permutation importance\n", + "result = permutation_importance(\n", + " RS_model, x_test, y_test, n_repeats=10, random_state=42, 
n_jobs=-1\n", + ")\n", + "\n", + "# Create DataFrame\n", + "perm_df = pd.DataFrame(\n", + " {\n", + " \"Feature\": X.columns,\n", + " \"Importance\": result.importances_mean,\n", + " \"Std\": result.importances_std,\n", + " }\n", + ").sort_values(by=\"Importance\", ascending=True)\n", + "\n", + "# Display the DataFrame\n", + "print(perm_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21", + "metadata": {}, + "outputs": [], + "source": [ + "from xgboost import XGBRegressor\n", + "\n", + "# Initialize the XGBoost Regressor\n", + "xgb_model = XGBRegressor(\n", + " objective=\"reg:squarederror\", n_estimators=100, random_state=42\n", + ")\n", + "\n", + "# Fit the model to the training data\n", + "xgb_model.fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_xgb = xgb_model.predict(x_test)\n", + "mse_xgb = mean_squared_error(y_test, y_pred_xgb)\n", + "r2_xgb = r2_score(y_test, y_pred_xgb)\n", + "print(f\"XGBoost Mean Squared Error: {mse_xgb}\")\n", + "print(f\"XGBoost R^2 Score: {r2_xgb}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23", + "metadata": {}, + "outputs": [], + "source": [ + "y_train" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "geo", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}