From e7d1847167fc83561a3a4f8397f32f6415b85bec Mon Sep 17 00:00:00 2001
From: Aaryan-549
Date: Wed, 8 Oct 2025 17:25:51 +0530
Subject: [PATCH] Added detailed docstrings to all functions in
 src/model/models.py

---
 frontend/package.json |  32 ++--
 package-lock.json     |  16 ++
 src/model/models.py   | 357 ++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 381 insertions(+), 24 deletions(-)

diff --git a/frontend/package.json b/frontend/package.json
index 8a9c2088..37306d36 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -14,34 +14,36 @@
     "test:coverage": "vitest run --coverage"
   },
   "dependencies": {
-    "react": "^18.3.1",
-    "react-dom": "^18.3.1",
-    "react-router-dom": "^6.30.1",
-    "lucide-react": "^0.539.0",
+    "-": "^0.0.1",
+    "@radix-ui/react-slot": "^1.2.3",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
-    "tailwind-merge": "^2.6.0",
+    "i": "^0.3.7",
+    "leaflet": "^1.9.4",
+    "lucide-react": "^0.539.0",
     "next-themes": "^0.4.6",
-    "@radix-ui/react-slot": "^1.2.3",
-    "leaflet": "^1.9.4"
+    "react": "^18.3.1",
+    "react-dom": "^18.3.1",
+    "react-router-dom": "^6.30.1",
+    "tailwind-merge": "^2.6.0"
   },
   "devDependencies": {
+    "@testing-library/jest-dom": "^6.5.0",
+    "@testing-library/react": "^16.0.1",
+    "@testing-library/user-event": "^14.5.2",
+    "@types/leaflet": "^1.9.12",
     "@types/react": "^18.3.23",
     "@types/react-dom": "^18.3.7",
     "@vitejs/plugin-react": "^4.3.1",
+    "@vitest/coverage-v8": "^2.0.5",
+    "@vitest/ui": "^2.0.5",
     "autoprefixer": "^10.4.21",
+    "jsdom": "^25.0.1",
     "postcss": "^8.5.6",
     "tailwindcss": "^3.4.17",
     "tailwindcss-animate": "^1.0.7",
     "typescript": "^5.9.2",
     "vite": "^7.1.2",
-    "@types/leaflet": "^1.9.12",
-    "vitest": "^2.0.5",
-    "@testing-library/react": "^16.0.1",
-    "@testing-library/jest-dom": "^6.5.0",
-    "@testing-library/user-event": "^14.5.2",
-    "jsdom": "^25.0.1",
-    "@vitest/ui": "^2.0.5",
-    "@vitest/coverage-v8": "^2.0.5"
+    "vitest": "^2.0.5"
   }
 }
diff --git a/package-lock.json b/package-lock.json
index f93401ae..fb258838 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -46,9 +46,11 @@
       "name": "gopredict-frontend",
       "version": "1.0.0",
       "dependencies": {
+        "-": "^0.0.1",
         "@radix-ui/react-slot": "^1.2.3",
         "class-variance-authority": "^0.7.1",
         "clsx": "^2.1.1",
+        "i": "^0.3.7",
         "leaflet": "^1.9.4",
         "lucide-react": "^0.539.0",
         "next-themes": "^0.4.6",
@@ -2955,6 +2957,12 @@
         "node": ">= 14.6"
       }
     },
+    "node_modules/-": {
+      "version": "0.0.1",
+      "resolved": "https://registry.npmjs.org/-/-/--0.0.1.tgz",
+      "integrity": "sha512-3HfneK3DGAm05fpyj20sT3apkNcvPpCuccOThOPdzz8sY7GgQGe0l93XH9bt+YzibcTIgUAIMoyVJI740RtgyQ==",
+      "license": "UNLICENSED"
+    },
     "node_modules/@babel/code-frame": {
       "version": "7.27.1",
       "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz",
@@ -7714,6 +7722,14 @@
         "node": ">=10.17.0"
       }
     },
+    "node_modules/i": {
+      "version": "0.3.7",
+      "resolved": "https://registry.npmjs.org/i/-/i-0.3.7.tgz",
+      "integrity": "sha512-FYz4wlXgkQwIPqhzC5TdNMLSE5+GS1IIDJZY/1ZiEPCT2S3COUVZeT5OW4BmW4r5LHLQuOosSwsvnroG9GR59Q==",
+      "engines": {
+        "node": ">=0.4"
+      }
+    },
     "node_modules/iconv-lite": {
       "version": "0.4.24",
       "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
diff --git a/src/model/models.py b/src/model/models.py
index 1e758f6e..872526f8 100644
--- a/src/model/models.py
+++ b/src/model/models.py
@@ -29,7 +29,32 @@ def tqdm(iterable=None, desc=None, total=None, unit=None, leave=True):
 # Utilities and normalizers
 # ==========================
 def normalize_features(X):
-    """Normalize features into different ranges for training"""
+    """
+    Normalize features into different ranges for model training.
+
+    This function applies different normalization ranges to different feature groups:
+    coordinates, distances, precipitation, time features, and binary flags.
+
+    Args:
+        X (pd.DataFrame): Input features containing coordinates (start_lng, start_lat,
+            end_lng, end_lat), distances (manhattan, euclidean, gmaps_distance,
+            gmaps_duration), precipitation, time features (weekday, hour),
+            and flags (holiday, airport, citycenter, standalone, routing_error,
+            short_trip).
+
+    Returns:
+        pd.DataFrame: Normalized features with:
+            - Coordinates scaled to (-1, 1)
+            - Distances scaled to (0, 10)
+            - Precipitation scaled to (0, 1)
+            - Time features scaled to (0, 5)
+            - Flags left unchanged
+
+    Examples:
+        >>> X_normalized = normalize_features(train_df.drop('duration', axis=1))
+        >>> print(X_normalized.shape)
+        (10000, 18)
+    """
     features = []
 
     coords = X[['start_lng', 'start_lat', 'end_lng', 'end_lat']]
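
The group-wise scaling described in this docstring can be pictured with a short sketch. This is illustrative only: it assumes a DataFrame X holding the columns named in the docstring and scikit-learn's MinMaxScaler, not the module's actual implementation.

    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler

    def scale_group(X, cols, feature_range):
        # Scale one feature group into its own target range, preserving column names.
        scaled = MinMaxScaler(feature_range=feature_range).fit_transform(X[cols])
        return pd.DataFrame(scaled, columns=cols, index=X.index)

    # Coordinates to (-1, 1), distances to (0, 10), precipitation to (0, 1),
    # time features to (0, 5); binary flags are passed through unchanged.
    coords = scale_group(X, ['start_lng', 'start_lat', 'end_lng', 'end_lat'], (-1, 1))
    dists = scale_group(X, ['manhattan', 'euclidean', 'gmaps_distance',
                            'gmaps_duration'], (0, 10))
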
@@ -66,7 +91,25 @@
     return pd.concat(features, axis=1)
 
 def plot_feature_importance(model, X):
-    """Plot feature importance for tree based models"""
+    """
+    Plot feature importance for tree-based models.
+
+    Creates a horizontal bar chart showing the importance of each feature
+    as determined by the model's feature_importances_ attribute.
+
+    Args:
+        model: A trained tree-based model (e.g., RandomForestRegressor, XGBRegressor)
+            that has a feature_importances_ attribute.
+        X (pd.DataFrame): Input features DataFrame used to get column names.
+
+    Returns:
+        None: Displays the plot directly using matplotlib.
+
+    Examples:
+        >>> model = XGBRegressor()
+        >>> model.fit(X_train, y_train)
+        >>> plot_feature_importance(model, X_train)
+    """
     imp = pd.DataFrame(
         model.feature_importances_,
         index=X.columns,
@@ -76,7 +119,24 @@
     plt.show()
 
 def plot_loss_curve(history):
-    """Plot training vs validation loss for neural networks."""
+    """
+    Plot training vs validation loss curves for neural networks.
+
+    Creates a line plot showing how training and validation loss
+    evolved across epochs during model training.
+
+    Args:
+        history: A Keras History object returned by model.fit() containing
+            loss and val_loss in its history dictionary.
+
+    Returns:
+        None: Displays the plot directly using matplotlib.
+
+    Examples:
+        >>> model = Sequential([...])
+        >>> history = model.fit(X_train, y_train, validation_split=0.2, epochs=100)
+        >>> plot_loss_curve(history)
+    """
     plt.figure()
     plt.plot(history.history['loss'])
     plt.plot(history.history['val_loss'])
@@ -91,7 +151,27 @@
 # =========================================
 def predict_duration(model, test_df, model_name="Model"):
     """
-    Make predictions on test data
+    Make duration predictions on test data with automatic feature alignment.
+
+    This function handles feature alignment by ensuring the test data has
+    the same features in the same order as expected by the model.
+
+    Args:
+        model: A trained sklearn-compatible model with a predict() method.
+        test_df (pd.DataFrame): Test dataset, optionally containing a 'duration'
+            column, which will be dropped if present.
+        model_name (str, optional): Name of the model for logging purposes.
+            Defaults to "Model".
+
+    Returns:
+        np.ndarray: Array of predicted duration values.
+
+    Examples:
+        >>> model = XGBRegressor()
+        >>> model.fit(X_train, y_train)
+        >>> predictions = predict_duration(model, test_df, "XGBoost")
+        >>> print(predictions[:5])
+        [450.2, 523.1, 380.5, 612.3, 295.8]
     """
     logging.info(f"Making predictions with {model_name}...")
 
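
The "automatic feature alignment" this docstring promises can be implemented by reindexing the test columns against the names recorded on the fitted estimator. A hedged sketch, assuming a scikit-learn-style model fitted on a DataFrame (so that feature_names_in_ exists); this is not the module's exact code.

    def align_features(model, test_df):
        # Drop the target if present, then match the training-time column order.
        X = test_df.drop(columns=['duration'], errors='ignore')
        expected = getattr(model, 'feature_names_in_', None)
        if expected is not None:
            missing = set(expected) - set(X.columns)
            if missing:
                raise ValueError(f"test data is missing features: {sorted(missing)}")
            X = X[list(expected)]  # same features, same order as at fit time
        return X

    # predictions = model.predict(align_features(model, test_df))
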
@@ -135,7 +215,27 @@
     return predictions
 
 def compare_predictions(pred_1, pred_2, title="Prediction 1 vs Prediction 2", save_plot=True):
-    """Compare two sets of predictions using histograms"""
+    """
+    Compare two sets of predictions using overlaid histograms.
+
+    Creates a histogram visualization comparing the distributions of two
+    prediction sets, useful for analyzing model agreement or differences.
+
+    Args:
+        pred_1 (np.ndarray or list): First set of predictions.
+        pred_2 (np.ndarray or list): Second set of predictions.
+        title (str, optional): Plot title. Defaults to "Prediction 1 vs Prediction 2".
+        save_plot (bool, optional): Whether to save the plot to the output directory.
+            Defaults to True.
+
+    Returns:
+        None: Displays the plot and optionally saves it to output/prediction_comparison_TIMESTAMP.png.
+
+    Examples:
+        >>> pred_xgb = model_xgb.predict(X_test)
+        >>> pred_rf = model_rf.predict(X_test)
+        >>> compare_predictions(pred_xgb, pred_rf, "XGBoost vs Random Forest")
+    """
     bins = np.histogram(np.hstack((pred_1, pred_2)), bins=100)[1]  # get the bin edges
 
     plt.figure(figsize=(10, 6))
@@ -155,7 +255,26 @@
     plt.show()
 
 def to_submission(prediction, output_dir="output"):
-    """Create submission file from predictions"""
+    """
+    Create a CSV submission file from model predictions.
+
+    Generates a timestamped CSV file with predictions formatted for
+    competition submission, with row_id as the index and duration as the column.
+
+    Args:
+        prediction (np.ndarray or list): Array of predicted duration values.
+        output_dir (str, optional): Directory to save the submission file.
+            Defaults to "output".
+
+    Returns:
+        str: Path to the saved submission file.
+
+    Examples:
+        >>> predictions = model.predict(test_df)
+        >>> file_path = to_submission(predictions)
+        >>> print(file_path)
+        'output/test_prediction_20250930_103835.csv'
+    """
     os.makedirs(output_dir, exist_ok=True)
     date_string = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
     file_string = f"{output_dir}/test_prediction_{date_string}.csv"
@@ -175,6 +294,25 @@
 # Individual model trainers
 # ===========================
 def train_linear_regression(Xn_train, Yn_train, Xn_val, Yn_val):
+    """
+    Train a Linear Regression model and evaluate it on the validation set.
+
+    Fits a standard Linear Regression model using Ordinary Least Squares (OLS)
+    and logs the RMSE on the validation set along with training time.
+
+    Args:
+        Xn_train (pd.DataFrame or np.ndarray): Normalized training features.
+        Yn_train (pd.Series or np.ndarray): Training target values (durations).
+        Xn_val (pd.DataFrame or np.ndarray): Normalized validation features.
+        Yn_val (pd.Series or np.ndarray): Validation target values (durations).
+
+    Returns:
+        LinearRegression: Trained Linear Regression model.
+
+    Examples:
+        >>> model = train_linear_regression(Xn_train, y_train, Xn_val, y_val)
+        >>> predictions = model.predict(Xn_test)
+    """
     start_time = time.time()
     model = LinearRegression()
     model.fit(Xn_train, Yn_train)
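
Every trainer in this section repeats the same fit/evaluate/log skeleton the docstrings describe. A compact sketch of that shared pattern (illustrative; root_mean_squared_error ships with scikit-learn 1.4+, while older versions use mean_squared_error with squared=False):

    import logging
    import time

    from sklearn.metrics import root_mean_squared_error

    def fit_and_report(model, Xn_train, Yn_train, Xn_val, Yn_val, name):
        # Fit the model, then log validation RMSE and wall-clock training time.
        start = time.time()
        model.fit(Xn_train, Yn_train)
        rmse = root_mean_squared_error(Yn_val, model.predict(Xn_val))
        logging.info(f"{name}: validation RMSE {rmse:.4f} ({time.time() - start:.1f}s)")
        return model
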
@@ -188,6 +326,27 @@
     return model
 
 def train_ridge_regression(Xn_train, Yn_train, Xn_val, Yn_val, alpha=0.5):
+    """
+    Train a Ridge Regression model with L2 regularization.
+
+    Fits a Ridge Regression model with an L2 penalty to reduce overfitting
+    and logs the RMSE on the validation set along with training time.
+
+    Args:
+        Xn_train (pd.DataFrame or np.ndarray): Normalized training features.
+        Yn_train (pd.Series or np.ndarray): Training target values (durations).
+        Xn_val (pd.DataFrame or np.ndarray): Normalized validation features.
+        Yn_val (pd.Series or np.ndarray): Validation target values (durations).
+        alpha (float, optional): Regularization strength. Higher values mean
+            stronger regularization. Defaults to 0.5.
+
+    Returns:
+        Ridge: Trained Ridge Regression model.
+
+    Examples:
+        >>> model = train_ridge_regression(Xn_train, y_train, Xn_val, y_val, alpha=1.0)
+        >>> predictions = model.predict(Xn_test)
+    """
     start_time = time.time()
     model = Ridge(alpha=alpha)
     model.fit(Xn_train, Yn_train)
@@ -201,6 +360,28 @@
     return model
 
 def train_lasso_regression(Xn_train, Yn_train, Xn_val, Yn_val, alpha=0.1):
+    """
+    Train a Lasso Regression model with L1 regularization.
+
+    Fits a Lasso Regression model with an L1 penalty that can perform feature
+    selection by driving some coefficients to zero, and logs the RMSE
+    on the validation set along with training time.
+
+    Args:
+        Xn_train (pd.DataFrame or np.ndarray): Normalized training features.
+        Yn_train (pd.Series or np.ndarray): Training target values (durations).
+        Xn_val (pd.DataFrame or np.ndarray): Normalized validation features.
+        Yn_val (pd.Series or np.ndarray): Validation target values (durations).
+        alpha (float, optional): Regularization strength. Higher values mean
+            stronger regularization. Defaults to 0.1.
+
+    Returns:
+        Lasso: Trained Lasso Regression model.
+
+    Examples:
+        >>> model = train_lasso_regression(Xn_train, y_train, Xn_val, y_val, alpha=0.05)
+        >>> predictions = model.predict(Xn_test)
+    """
     start_time = time.time()
     model = Lasso(alpha=alpha, max_iter=5000)
     model.fit(Xn_train, Yn_train)
@@ -214,6 +395,26 @@
     return model
 
 def train_svr(X_train, Y_train, X_val, Y_val):
+    """
+    Train a Support Vector Regression (SVR) model.
+
+    Fits a Support Vector Regression model using the default RBF kernel
+    and logs the RMSE on the validation set along with training time.
+    Note: SVR can be computationally expensive on large datasets.
+
+    Args:
+        X_train (pd.DataFrame or np.ndarray): Training features (non-normalized).
+        Y_train (pd.Series or np.ndarray): Training target values (durations).
+        X_val (pd.DataFrame or np.ndarray): Validation features (non-normalized).
+        Y_val (pd.Series or np.ndarray): Validation target values (durations).
+
+    Returns:
+        SVR: Trained Support Vector Regression model.
+
+    Examples:
+        >>> model = train_svr(X_train, y_train, X_val, y_val)
+        >>> predictions = model.predict(X_test)
+    """
     start_time = time.time()
     model = SVR()
     model.fit(X_train, Y_train)
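
The alpha argument documented for the Ridge and Lasso trainers controls how hard coefficients are shrunk, and a quick validation sweep makes the trade-off visible. A runnable toy example on synthetic data (the real features and split come from the surrounding pipeline, not from this snippet):

    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.metrics import root_mean_squared_error

    rng = np.random.default_rng(0)
    X = rng.normal(size=(1000, 18))  # stand-in for 18 normalized features
    y = X @ rng.normal(size=18) + rng.normal(scale=0.5, size=1000)
    X_tr, X_va, y_tr, y_va = X[:800], X[800:], y[:800], y[800:]

    # Larger alpha = stronger L2 shrinkage; keep the value with the lowest RMSE.
    for alpha in (0.1, 0.5, 1.0, 5.0):
        model = Ridge(alpha=alpha).fit(X_tr, y_tr)
        print(alpha, root_mean_squared_error(y_va, model.predict(X_va)))
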
@@ -227,6 +428,27 @@
     return model
 
 def train_xgb(X_train, Y_train, X_val, Y_val):
+    """
+    Train an XGBoost Regression model with predefined hyperparameters.
+
+    Fits an XGBoost gradient boosting model with hyperparameters tuned
+    for this problem, logs the RMSE on the validation set,
+    and displays a feature importance plot.
+
+    Args:
+        X_train (pd.DataFrame or np.ndarray): Training features (non-normalized).
+        Y_train (pd.Series or np.ndarray): Training target values (durations).
+        X_val (pd.DataFrame or np.ndarray): Validation features (non-normalized).
+        Y_val (pd.Series or np.ndarray): Validation target values (durations).
+
+    Returns:
+        XGBRegressor: Trained XGBoost model with n_estimators=500,
+            learning_rate=0.045, max_depth=9, reg_lambda=0.5.
+
+    Examples:
+        >>> model = train_xgb(X_train, y_train, X_val, y_val)
+        >>> predictions = model.predict(X_test)
+    """
     start_time = time.time()
     model = XGBRegressor(n_estimators=500, learning_rate=0.045, max_depth=9, reg_lambda=0.5, verbosity=0)
     model.fit(X_train, Y_train)
@@ -241,6 +463,25 @@
     return model
 
 def train_random_forest(X_train, Y_train, X_val, Y_val):
+    """
+    Train a Random Forest Regression model.
+
+    Fits a Random Forest ensemble model with 500 trees, logs the RMSE
+    on the validation set, and displays a feature importance plot.
+
+    Args:
+        X_train (pd.DataFrame or np.ndarray): Training features (non-normalized).
+        Y_train (pd.Series or np.ndarray): Training target values (durations).
+        X_val (pd.DataFrame or np.ndarray): Validation features (non-normalized).
+        Y_val (pd.Series or np.ndarray): Validation target values (durations).
+
+    Returns:
+        RandomForestRegressor: Trained Random Forest model with 500 estimators.
+
+    Examples:
+        >>> model = train_random_forest(X_train, y_train, X_val, y_val)
+        >>> predictions = model.predict(X_test)
+    """
     start_time = time.time()
     model = RandomForestRegressor(n_estimators=500)
     model.fit(X_train, Y_train)
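
The network that train_neural_network's docstring describes (below) can be reconstructed as a short sketch. Layer sizes and the L2 factors come from the docstring; the input width of 18 and everything else here are assumptions, not the module's exact code.

    from tensorflow import keras
    from tensorflow.keras import layers, regularizers

    model = keras.Sequential([
        keras.Input(shape=(18,)),                            # assumed input width
        layers.Dense(20, activation='relu'),                 # hidden layer 1
        layers.Dense(150, activation='relu',
                     kernel_regularizer=regularizers.l2(0.2)),  # hidden layer 2
        layers.Dense(60, activation='relu',
                     kernel_regularizer=regularizers.l2(0.2)),  # hidden layer 3
        layers.Dense(1, activation='linear'),                # regression output
    ])
    model.compile(optimizer='adam', loss='mse')
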
@@ -255,6 +496,32 @@
     return model
 
 def train_neural_network(Xn_train, Yn_train, Xn_val, Yn_val):
+    """
+    Train a Deep Neural Network for regression using Keras.
+
+    Builds and trains a fully connected neural network with 3 hidden layers
+    and L2 regularization. Displays loss curves and logs the RMSE
+    on the validation set.
+
+    Architecture:
+        - Hidden layer 1: 20 neurons (ReLU)
+        - Hidden layer 2: 150 neurons (ReLU, L2=0.2)
+        - Hidden layer 3: 60 neurons (ReLU, L2=0.2)
+        - Output layer: 1 neuron (linear)
+
+    Args:
+        Xn_train (pd.DataFrame or np.ndarray): Normalized training features.
+        Yn_train (pd.Series or np.ndarray): Training target values (durations).
+        Xn_val (pd.DataFrame or np.ndarray): Normalized validation features.
+        Yn_val (pd.Series or np.ndarray): Validation target values (durations).
+
+    Returns:
+        Sequential: Trained Keras Sequential model with MSE loss and Adam optimizer.
+
+    Examples:
+        >>> model = train_neural_network(Xn_train, y_train, Xn_val, y_val)
+        >>> predictions = model.predict(Xn_test)
+    """
     start_time = time.time()
     model = Sequential()
     model.add(Dense(20, kernel_initializer='normal', input_dim=Xn_train.shape[1], activation='relu'))
@@ -277,7 +544,28 @@
 # Multi-model training (tqdm)
 # ===========================
 def run_regression_models(train_df, models_to_run=None):
-    """Train multiple models on train_df and return them as a dictionary"""
+    """
+    Train multiple regression models and return them as a dictionary.
+
+    This function orchestrates training of multiple models with progress tracking,
+    automatically handling data splitting and normalization where needed.
+
+    Args:
+        train_df (pd.DataFrame): Training dataset containing features and 'duration' column.
+        models_to_run (list of str, optional): List of model identifiers to train.
+            Available options: 'LINREG', 'RIDGE', 'LASSO',
+            'SVR', 'XGB', 'RF', 'NN'.
+            Defaults to ['XGB'].
+
+    Returns:
+        dict: Dictionary mapping model names to trained model objects.
+            Keys are descriptive names (e.g., 'XGBoost', 'Random Forest').
+
+    Examples:
+        >>> models = run_regression_models(train_df, ['XGB', 'RF', 'LINREG'])
+        >>> xgb_model = models['XGBoost']
+        >>> predictions = xgb_model.predict(test_features)
+    """
     if models_to_run is None:
         models_to_run = ['XGB']
 
@@ -322,7 +610,28 @@
 # =========================================
 def hyperparameter_tuning_xgb(train_df, test_size=0.2, random_state=1):
     """
-    Perform hyperparameter tuning for XGBoost
+    Perform grid-search hyperparameter tuning for the XGBoost model.
+
+    Searches over max_depth and learning_rate to find the combination
+    that minimizes RMSE on the validation set. Displays progress and
+    tracks the top 3 parameter combinations.
+
+    Args:
+        train_df (pd.DataFrame): Training dataset containing features and 'duration' column.
+        test_size (float, optional): Proportion of data to use for validation.
+            Defaults to 0.2.
+        random_state (int, optional): Random seed for reproducibility. Defaults to 1.
+
+    Returns:
+        tuple: A tuple containing:
+            - XGBRegressor: Best tuned model trained with optimal parameters
+            - dict: Dictionary of best hyperparameters
+            - float: RMSE of the best model on the validation set
+
+    Examples:
+        >>> best_model, best_params, rmse = hyperparameter_tuning_xgb(train_df)
+        >>> print(f"Best parameters: {best_params}")
+        >>> print(f"Best RMSE: {rmse:.4f}")
     """
     logging.info("Starting XGBoost hyperparameter tuning...")
     logging.info("=" * 50)
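
The grid search this docstring describes boils down to a double loop over max_depth and learning_rate with RMSE bookkeeping. A hedged sketch of that idea (the grid values are examples, not the module's actual search space; X_tr/X_va/y_tr/y_va stand for an assumed train/validation split):

    import itertools

    from sklearn.metrics import root_mean_squared_error
    from xgboost import XGBRegressor

    best = (float('inf'), None)
    for max_depth, lr in itertools.product([6, 9, 12], [0.03, 0.045, 0.1]):
        model = XGBRegressor(n_estimators=500, max_depth=max_depth,
                             learning_rate=lr, reg_lambda=0.5, verbosity=0)
        model.fit(X_tr, y_tr)
        rmse = root_mean_squared_error(y_va, model.predict(X_va))
        if rmse < best[0]:
            # Keep the combination with the lowest validation RMSE so far.
            best = (rmse, {'max_depth': max_depth, 'learning_rate': lr})
    best_rmse, best_params = best
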
@@ -409,7 +718,37 @@
 def run_complete_pipeline(train_df, test_df, models_to_run=None, tune_xgb=False,
                           create_submission=True):
     """
-    Run the complete ML pipeline including training and prediction
+    Run the complete machine learning pipeline from training to prediction.
+
+    This end-to-end function orchestrates model training, optional hyperparameter
+    tuning for XGBoost, and predictions on test data for all specified models.
+
+    Args:
+        train_df (pd.DataFrame): Training dataset containing features and 'duration' column.
+        test_df (pd.DataFrame): Test dataset for making predictions.
+        models_to_run (list of str, optional): List of model identifiers to train.
+            Available: 'LINREG', 'RIDGE', 'LASSO',
+            'SVR', 'XGB', 'RF', 'NN'.
+            Defaults to None (uses the default from run_regression_models).
+        tune_xgb (bool, optional): Whether to perform hyperparameter tuning for XGBoost.
+            If True, creates an additional 'XGBoost_Tuned' model.
+            Defaults to False.
+        create_submission (bool, optional): Reserved for future submission-file
+            creation; currently unused. Defaults to True.
+
+    Returns:
+        dict: Dictionary containing:
+            - 'models' (dict): Trained model objects keyed by model name
+            - 'predictions' (dict): Prediction arrays keyed by model name
+
+    Examples:
+        >>> results = run_complete_pipeline(
+        ...     train_df, test_df,
+        ...     models_to_run=['XGB', 'RF'],
+        ...     tune_xgb=True
+        ... )
+        >>> xgb_predictions = results['predictions']['XGBoost']
+        >>> tuned_xgb_model = results['models']['XGBoost_Tuned']
     """
     logging.info("Starting Complete ML Pipeline...")
     logging.info("=" * 60)