28 changes: 28 additions & 0 deletions BOOTSTRAPPING.py
@@ -0,0 +1,28 @@
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import numpy as np

# X_train_dense, X_test_dense, y_train, y_test and the LinearRegression
# instance `lr` are expected to be defined by the main script (ML_PROJECT 2.py).

n_bootstraps = 50                                       # number of bootstrap iterations
bootstrap_sample_size = int(0.5 * len(X_train_dense))   # use 50% of the training data per iteration
bootstrap_scores = []

for _ in range(n_bootstraps):
    # Sample training rows with replacement
    indices = np.random.choice(len(X_train_dense), size=bootstrap_sample_size, replace=True)
    X_bootstrap, y_bootstrap = X_train_dense[indices], y_train.iloc[indices]

    # Train on the bootstrap sample and predict on the held-out test set
    lr.fit(X_bootstrap, y_bootstrap)
    y_pred_bootstrap = lr.predict(X_test_dense)

    # R² from the residual and total sums of squares
    rss = np.sum((y_test - y_pred_bootstrap)**2)
    tss = np.sum((y_test - np.mean(y_test))**2)
    r2_bootstrap = 1 - (rss / tss)
    bootstrap_scores.append(r2_bootstrap)

# Calculate the mean and standard deviation of R²
bootstrap_mean_r2 = np.mean(bootstrap_scores)
bootstrap_std_r2 = np.std(bootstrap_scores)
print(f"Bootstrapping Mean R² Score: {bootstrap_mean_r2:.4f}")
print(f"Bootstrapping R² Std Dev: {bootstrap_std_r2:.4f}")

26 changes: 26 additions & 0 deletions KFOLD CROSS VALIDATION.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import numpy as np
from sklearn.model_selection import KFold

# X_train_dense, y_train and the LinearRegression instance `lr` are expected
# to be defined by the main script (ML_PROJECT 2.py).

kf = KFold(n_splits=10, shuffle=True, random_state=0)
cv_scores = []

for train_index, test_index in kf.split(X_train_dense):
    # Split the training data manually for each fold
    X_train_kf, X_test_kf = X_train_dense[train_index], X_train_dense[test_index]
    y_train_kf, y_test_kf = y_train.iloc[train_index], y_train.iloc[test_index]

    lr.fit(X_train_kf, y_train_kf)
    y_pred_kf = lr.predict(X_test_kf)

    # R² for this fold from the residual and total sums of squares
    rss = np.sum((y_test_kf - y_pred_kf)**2)
    tss = np.sum((y_test_kf - np.mean(y_test_kf))**2)
    r2_kf = 1 - (rss / tss)
    cv_scores.append(r2_kf)

# Calculate the mean R² score across folds
cv_mean_r2 = np.mean(cv_scores)
print(f"K-Fold Cross-Validation Mean R² Score: {cv_mean_r2:.4f}")

Binary file added ML_PROJECT 2.pdf
Binary file not shown.
208 changes: 208 additions & 0 deletions ML_PROJECT 2.py
@@ -0,0 +1,208 @@
#!/usr/bin/env python
# coding: utf-8

# Ekta Shukla - A20567127
#
# Rithika Kavitha Suresh - A20564346
#
# Roger Kewin Samson - A20563057
#
# Jude Rosun - A20564339

# # Model Selection: k-fold cross-validation and bootstrapping

# In[11]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.utils import resample


# # Load the dataset
#

# In[12]:


file_path = "flight prediction.csv" # Replace with the correct file path
data = pd.read_csv(file_path)

# Drop unnecessary columns and clean missing values
data = data.drop(columns=['Unnamed: 0']) # Drop index-like column
data = data.dropna() # Remove rows with missing values

X = data.drop(columns=['price']) # Features
y = data['price'] # Target variable



# In[13]:


categorical_features = ['airline', 'source_city', 'departure_time', 'stops',
                        'arrival_time', 'destination_city', 'class']
numerical_features = ['duration', 'days_left']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Transform the dataset
X_transformed = preprocessor.fit_transform(X)


# # Split the data into training and testing sets
#

# In[14]:


X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Convert the transformed feature matrix (sparse after one-hot encoding) to
# dense arrays for the manual R² and AIC computations below
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()


# # Initialize Linear Regression
#

# In[15]:


lr = LinearRegression()

# Implementing K-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True, random_state=0)
cv_scores = []

for train_index, test_index in kf.split(X_train_dense):
    # Split data manually for each fold
    X_train_kf, X_test_kf = X_train_dense[train_index], X_train_dense[test_index]
    y_train_kf, y_test_kf = y_train.iloc[train_index], y_train.iloc[test_index]

    lr.fit(X_train_kf, y_train_kf)
    y_pred_kf = lr.predict(X_test_kf)

    rss = np.sum((y_test_kf - y_pred_kf)**2)
    tss = np.sum((y_test_kf - np.mean(y_test_kf))**2)
    r2_kf = 1 - (rss / tss)
    cv_scores.append(r2_kf)

# Calculate the mean R² score across folds
cv_mean_r2 = np.mean(cv_scores)
print(f"K-Fold Cross-Validation Mean R² Score: {cv_mean_r2:.4f}")


# # Bootstrapping Implementation
#

# In[16]:


n_bootstraps = 50 # Number of bootstrap iterations
bootstrap_sample_size = int(0.5 * len(X_train_dense)) # Use 50% of the training data per iteration
bootstrap_scores = []

for _ in range(n_bootstraps):
    indices = np.random.choice(len(X_train_dense), size=bootstrap_sample_size, replace=True)
    X_bootstrap, y_bootstrap = X_train_dense[indices], y_train.iloc[indices]

    # Train and predict
    lr.fit(X_bootstrap, y_bootstrap)
    y_pred_bootstrap = lr.predict(X_test_dense)

    rss = np.sum((y_test - y_pred_bootstrap)**2)
    tss = np.sum((y_test - np.mean(y_test))**2)
    r2_bootstrap = 1 - (rss / tss)
    bootstrap_scores.append(r2_bootstrap)

# Calculate the mean and standard deviation of R²
bootstrap_mean_r2 = np.mean(bootstrap_scores)
bootstrap_std_r2 = np.std(bootstrap_scores)
print(f"Bootstrapping Mean R² Score: {bootstrap_mean_r2:.4f}")


# # Print final results
#

# In[24]:


print(f"Final Results:")
print(f" K-Fold Cross-Validation Mean R²: {cv_mean_r2:.4f}")
print(f" Bootstrapping Mean R²: {bootstrap_mean_r2:.4f}")
print(f" Bootstrapping R² Std Dev: {bootstrap_std_r2:.4f}")


# In[17]:


# AIC for a Gaussian linear model (up to an additive constant):
# n * ln(RSS / n) + 2k, where n is the number of observations and
# k the number of fitted parameters.
def calculate_aic(n, rss, k):
    return n * np.log(rss / n) + 2 * k
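
# Worked example with hypothetical numbers, just to illustrate the scale of the
# statistic: for n = 1000 test points, RSS = 2.5e9 and k = 30 parameters,
# AIC = 1000 * ln(2.5e6) + 60 ≈ 14791.8. Lower AIC is preferred when comparing
# models evaluated on the same data.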


# In[20]:


kf_aic_scores = []

for train_index, test_index in kf.split(X_train_dense):
    # Split data manually for each fold
    X_train_kf, X_test_kf = X_train_dense[train_index], X_train_dense[test_index]
    y_train_kf, y_test_kf = y_train.iloc[train_index], y_train.iloc[test_index]
    lr.fit(X_train_kf, y_train_kf)
    y_pred_kf = lr.predict(X_test_kf)

    # AIC for this fold (the fold R² scores were already collected above,
    # so we no longer append to cv_scores here)
    rss = np.sum((y_test_kf - y_pred_kf)**2)
    n = len(y_test_kf)
    k = X_train_kf.shape[1] + 1  # number of coefficients plus the intercept
    aic = calculate_aic(n, rss, k)
    kf_aic_scores.append(aic)

# Print average AIC for K-Fold
mean_aic_kf = np.mean(kf_aic_scores)
print(f"K-Fold Cross-Validation Mean AIC: {mean_aic_kf:.4f}")


# In[22]:


bootstrap_aic_scores = []

for _ in range(n_bootstraps):
    indices = np.random.choice(len(X_train_dense), size=bootstrap_sample_size, replace=True)
    X_bootstrap, y_bootstrap = X_train_dense[indices], y_train.iloc[indices]
    lr.fit(X_bootstrap, y_bootstrap)
    y_pred_bootstrap = lr.predict(X_test_dense)

    # AIC for this bootstrap fit (the bootstrap R² scores were already collected
    # above, so we no longer append to bootstrap_scores here)
    rss = np.sum((y_test - y_pred_bootstrap)**2)
    n = len(y_test)
    k = X_bootstrap.shape[1] + 1  # number of coefficients plus the intercept
    aic = calculate_aic(n, rss, k)
    bootstrap_aic_scores.append(aic)

# Print average AIC for Bootstrapping
mean_aic_bootstrap = np.mean(bootstrap_aic_scores)
print(f"Bootstrapping Mean AIC: {mean_aic_bootstrap:.4f}")


# In[ ]:




106 changes: 89 additions & 17 deletions README.md
@@ -1,29 +1,101 @@
# Project 2
------
# README - Flight Prediction

# Team Name : Machine Learning Crew
Roger Kewin Samson - A20563057

Ekta Shukla - A20567127

Rithika Kavitha Suresh - A20564346

Jude Rosun - A20564339

## Analysis Overview
This project evaluates different model selection techniques for predicting flight ticket prices. Specifically, we applied k-fold cross-validation, bootstrapping, and the Akaike Information Criterion (AIC) for model evaluation and selection in a linear regression setting.

## Results Summary
1. K-Fold Cross-Validation:
   - Mean R²: 0.9115

2. Bootstrapping:
   - Mean R²: 0.9113

3. AIC (Akaike Information Criterion):
   - AIC was calculated from the residual sum of squares (RSS) for model evaluation.
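
For reference, the AIC used in the scripts is computed from the residual sum of squares as $\mathrm{AIC} = n \ln(\mathrm{RSS}/n) + 2k$, where $n$ is the number of evaluated samples and $k$ the number of fitted parameters (coefficients plus the intercept).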


## Question 1: Do Cross-Validation and Bootstrapping Agree with Simpler Model Selectors Like AIC in Simple Cases?

### Answer -

Yes, in this linear regression case, cross-validation and bootstrapping provide similar estimates, both indicating a high degree of model fit (mean R² of about 0.911). This agreement suggests that all three methods (cross-validation, bootstrapping, and AIC) are consistent in simple regression scenarios.

**Observations from different models:**
We experimented with several model variants to evaluate their performance:
- Using the raw GPU data, accuracy was only about 60% due to poor data quality.
- After data cleaning, the model improved to about 76% accuracy.
- Lasso and Ridge regression gave results similar to plain linear regression (around 76%), showing that regularization was not necessary in this case.
- Importing pre-cleaned data with better features and proper preprocessing improved performance further and produced consistent results.

**Focus on K-Fold cross-validation and bootstrapping:**
- Cross-validation directly estimates generalization error by splitting the data into folds, giving unbiased performance estimates; it highlighted areas for improvement when the model was overfitting.
- Bootstrapping approximates the distribution of the scores, showing the robustness of the model and its reliability across resampling iterations.
- AIC (Akaike Information Criterion) provides a simpler criterion that penalizes overly complex models, confirming the linear regression model as an appropriate choice.

**Key takeaways:**
- Cross-validation and bootstrapping both agreed with AIC in this linear regression case, demonstrating consistency in their evaluation of the model's performance.
- The project therefore focuses not only on implementing these evaluation techniques but also on obtaining reliable results with them.
- By cleaning the data and testing these methods, we confirmed that cross-validation and bootstrapping align with AIC in simple regression scenarios, validating them as model selection techniques.

## Question 2: In What Cases Might These Methods Fail or Give Incorrect/Undesirable Results?

### Answer -

Cross-validation, bootstrapping, and AIC can fail or produce undesirable results in specific cases:

- **Cross-validation** may yield overly optimistic results if feature selection or preprocessing is performed before splitting the data, since this leaks information from the test folds into the training folds (see the sketch below). Small datasets also produce high variance across fold splits, making performance estimates less reliable, and unclean or unsuitable data exacerbates both issues, especially when proper preprocessing steps are skipped.
- **Bootstrapping** can fail on very small datasets or on samples that do not represent the population accurately, since resampling bias can skew the results. It is also computationally intensive for large datasets or many iterations; reducing the iteration count or sample size improves runtime at the cost of robustness.
- **AIC** assumes the model is correctly specified and does not estimate prediction error the way cross-validation does, so it may favor overly simple models that fail to capture the data's complexity, particularly when the model is misspecified or the data is intricate.
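
One way to guard against the leakage failure mode is to keep all preprocessing inside the cross-validation loop. A minimal sketch, assuming the `preprocessor`, `X`, and `y` objects defined in `ML_PROJECT 2.py`:

```python
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline

# Preprocessing lives inside the pipeline, so every fold re-fits the scaler and
# encoder on its own training portion only; nothing from the held-out fold
# leaks into preprocessing.
leak_free_model = Pipeline([
    ("prep", preprocessor),         # ColumnTransformer from the main script
    ("lr", LinearRegression()),
])

scores = cross_val_score(
    leak_free_model, X, y,
    cv=KFold(n_splits=10, shuffle=True, random_state=0),
    scoring="r2",
)
print(f"Leak-free K-Fold Mean R²: {scores.mean():.4f}")
```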

## Question 3: What Could Be Implemented to Mitigate These Cases or Help Users of These Methods?

### Answer -

Cross-validation, bootstrapping, and AIC are powerful tools for model evaluation, but there are ways to improve their robustness and usability. Nested cross-validation could ensure unbiased performance estimates by separating feature selection and evaluation processes, while stratified splits and bootstrapping would help maintain the balance and representativeness of the data, especially in imbalanced scenarios. To handle computational challenges, parallelizing bootstrapping would make it feasible for larger datasets. For AIC, adding complementary metrics like BIC or diagnostics such as residual plots could provide deeper insights into model performance and assumptions. These enhancements would make the methods more reliable and user-friendly in diverse scenarios.
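
As a sketch of two of these mitigations, the snippet below adds a BIC companion to the existing `calculate_aic` and parallelizes the bootstrap loop with joblib; it assumes the `X_train_dense`, `X_test_dense`, `y_train`, `y_test`, `n_bootstraps`, and `bootstrap_sample_size` objects from `ML_PROJECT 2.py`:

```python
import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import LinearRegression

def calculate_bic(n, rss, k):
    # BIC for Gaussian errors: same RSS term as AIC, but a log(n) complexity penalty
    return n * np.log(rss / n) + k * np.log(n)

def one_bootstrap_r2(seed):
    # One bootstrap iteration with its own seed, so runs are reproducible
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(X_train_dense), size=bootstrap_sample_size, replace=True)
    model = LinearRegression().fit(X_train_dense[idx], y_train.iloc[idx])
    pred = model.predict(X_test_dense)
    rss = np.sum((y_test - pred) ** 2)
    tss = np.sum((y_test - np.mean(y_test)) ** 2)
    return 1 - rss / tss

# Spread the bootstrap iterations across all available CPU cores
scores = Parallel(n_jobs=-1)(delayed(one_bootstrap_r2)(s) for s in range(n_bootstraps))
print(f"Parallel bootstrap mean R²: {np.mean(scores):.4f}")
```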

## Question 4: What Parameters Have You Exposed to Users for Model Selection?

### Answer -

Users have control over several parameters to customize how the model is evaluated:

- **K-Fold Cross-Validation**: the number of folds (`n_splits`) controls how the data is split into training and validation folds.
- **Bootstrapping**: the number of iterations (`n_bootstraps`) and the size of each resample (`bootstrap_sample_size`) trade accuracy against runtime.
- **Preprocessing**: users choose which features are treated as categorical (one-hot encoded) and which as numerical (scaled to zero mean and unit standard deviation).
- **Non-linear relationships**: polynomial features can be added by specifying the `degree` of the polynomial expansion.

These options let users fine-tune the evaluation for their dataset and requirements; the sketch below shows the same knobs in one place.
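
A minimal usage sketch of these knobs (the values mirror those used in the scripts, except the polynomial `degree`, which is illustrative; `X_train_dense` comes from `ML_PROJECT 2.py`):

```python
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures

kf = KFold(n_splits=10, shuffle=True, random_state=0)     # number of folds
n_bootstraps = 50                                         # bootstrap iterations
bootstrap_sample_size = int(0.5 * len(X_train_dense))     # rows sampled per iteration
poly = PolynomialFeatures(degree=2)                       # optional non-linear features
```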


## Explanation of Results

The high R² values from both cross-validation and bootstrapping indicate that the model explains over 91% of the variance in flight prices, demonstrating good performance. The near-zero standard deviation in bootstrapping implies stability in predictions across different subsets.

## Practical Implications

- Cross-validation and bootstrapping provide robust insights into model performance and generalization.
- AIC complements these methods by penalizing overfitting, ensuring a balance between complexity and fit.



## Project Division and Contributions
Our project was divided into four main parts, with each team member contributing their expertise to ensure a successful and comprehensive analysis:

### 1. Data Preprocessing and Exploration
Contributor: Ekta Shukla (A20567127)
Ekta took the lead in cleaning and preparing the dataset for analysis. She handled missing values, normalized numerical data, and encoded categorical variables to make the dataset ready for modeling. She also conducted exploratory data analysis (EDA) to uncover trends, identify correlations, and detect outliers. Her work laid the foundation for building a reliable model by ensuring that the data was in the best possible shape.

### 2. Model Implementation and AIC Evaluation
Contributor: Rithika Kavitha Suresh (A20564346)
Rithika was responsible for building the linear regression model and evaluating it using the Akaike Information Criterion (AIC). She calculated the residual sum of squares (RSS) and applied AIC to measure the trade-off between model complexity and fit. Her meticulous attention ensured that the model was both efficient and aligned with statistical standards.

### 3. Cross-Validation and Bootstrapping
Contributor: Roger Kewin Samson (A20563057)
Roger focused on validating the model using advanced techniques like K-fold cross-validation and bootstrapping. He evaluated the model's generalization performance by analyzing R² scores across folds and resampled datasets. His work provided robust evidence of the model's stability and predictive power, ensuring it was not overfitted to the data.

### 4. Analysis, Documentation, and Integration
Contributor: Jude Rosun (A20564339)
Jude took charge of tying everything together by analyzing the results and preparing the final documentation. He explained the findings clearly, highlighting what the metrics and evaluations meant in practical terms. Jude also ensured the documentation was comprehensive and user-friendly, making it easy for others to understand the process and conclusions.
