171 changes: 171 additions & 0 deletions ElasticNet.py
@@ -0,0 +1,171 @@
import numpy as np

class ElasticNetModel:
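    """Linear regression with combined L1 and L2 (ElasticNet) regularization.

    Features and targets are standardized internally, and the model is trained
    with batch or stochastic gradient descent, optionally with a decaying
    learning rate. The `early_stopping` and `patience` arguments are accepted
    but not yet used by `fit`.
    """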

def __init__(
self,
alpha=1.0,
l1_ratio=0.5,
fit_intercept=True,
max_iter=1000,
tolerance=1e-4,
learning_rate=0.01,
optimization='batch',
random_state=None,
early_stopping=False,
patience=10,
learning_rate_schedule=None
):
self.alpha = alpha
self.l1_ratio = l1_ratio
self.fit_intercept = fit_intercept
self.max_iter = max_iter
self.tolerance = tolerance
self.learning_rate = learning_rate
self.optimization = optimization.lower()
self.random_state = random_state
self.early_stopping = early_stopping
self.patience = patience
self.learning_rate_schedule = learning_rate_schedule
self.coef_ = None
self.intercept_ = 0.0
self.mean_ = None
self.std_dev_ = None
self.y_mean_ = None
self.y_std_dev_ = None

def _initialize_weights(self, n_features):
rng = np.random.default_rng(self.random_state)
self.coef_ = rng.normal(loc=0.0, scale=0.01, size=n_features)
if self.fit_intercept:
self.intercept_ = 0.0

    def _scale_features(self, X):
        self.mean_ = np.mean(X, axis=0)
        self.std_dev_ = np.std(X, axis=0)
        # Remember which columns are constant, then use a divisor of 1 for
        # them so the division below is always safe.
        self.zero_variance_mask_ = self.std_dev_ == 0
        self.std_dev_[self.zero_variance_mask_] = 1
        return (X - self.mean_) / self.std_dev_

def _compute_loss(self, X_scaled, y_scaled):
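        # ElasticNet objective on the standardized data:
        # MSE + alpha * l1_ratio * ||coef||_1 + alpha * (1 - l1_ratio) * ||coef||_2^2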
predictions = X_scaled.dot(self.coef_) + (self.intercept_ if self.fit_intercept else 0)
residuals = y_scaled - predictions
mse_loss = np.mean(residuals ** 2)
l1_penalty = self.alpha * self.l1_ratio * np.sum(np.abs(self.coef_))
l2_penalty = self.alpha * (1 - self.l1_ratio) * np.sum(self.coef_ ** 2)
return mse_loss + l1_penalty + l2_penalty

def _learning_rate_decay(self, iteration):
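        # 'time_decay' shrinks the rate as 1 / (1 + 0.001 * iteration);
        # 'step_decay' halves it every 500 iterations; any other value keeps
        # the base learning rate unchanged.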
if self.learning_rate_schedule == 'time_decay':
return self.learning_rate / (1 + iteration * 0.001)
elif self.learning_rate_schedule == 'step_decay':
return self.learning_rate * (0.5 ** (iteration // 500))
else:
return self.learning_rate

def fit(self, X, y):
print(f"Fitting model with X shape {X.shape}, y shape {y.shape}")

# Input validation
if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
raise ValueError("X and y must be NumPy arrays.")
if X.size == 0 or y.size == 0:
raise ValueError("Input data X and y must not be empty.")
if X.shape[0] != y.shape[0]:
raise ValueError("Number of samples in X and y must be equal.")
if not np.issubdtype(y.dtype, np.number) or not np.issubdtype(X.dtype, np.number):
raise ValueError("X and y must be numeric arrays.")
if self.optimization not in ['batch', 'stochastic']:
raise ValueError(f"Invalid optimization option: {self.optimization}")

X_scaled = self._scale_features(X)
self.y_mean_ = np.mean(y)
self.y_std_dev_ = np.std(y)
if self.y_std_dev_ == 0:
self.y_std_dev_ = 1
y_scaled = (y - self.y_mean_) / self.y_std_dev_

n_samples, n_features = X.shape
print(f"Number of samples: {n_samples}, Number of features: {n_features}")
self._initialize_weights(n_features)
print(f"Initialized coefficients with shape: {self.coef_.shape}")

previous_loss = self._compute_loss(X_scaled, y_scaled)

for iteration in range(1, self.max_iter + 1):
if self.optimization == 'batch':
predictions = X_scaled.dot(self.coef_) + (self.intercept_ if self.fit_intercept else 0)
errors = predictions - y_scaled
gradient_wrt_coef = (2 / n_samples) * X_scaled.T.dot(errors).flatten()
l1_grad = self.alpha * self.l1_ratio * np.sign(self.coef_)
l2_grad = 2 * self.alpha * (1 - self.l1_ratio) * self.coef_
total_grad_coef = gradient_wrt_coef + l1_grad + l2_grad
lr_adjusted = self._learning_rate_decay(iteration)

# Update coefficients
if total_grad_coef.shape == gradient_wrt_coef.shape:
self.coef_ -= lr_adjusted * total_grad_coef
else:
raise ValueError(f"Gradient shapes do not match: {gradient_wrt_coef.shape} vs {total_grad_coef.shape}")

if self.fit_intercept:
intercept_grad = (2 / n_samples) * np.sum(errors)
self.intercept_ -= lr_adjusted * intercept_grad

elif self.optimization == 'stochastic':
indices = np.random.permutation(n_samples)
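                # Note: this shuffle uses NumPy's global RNG and is therefore
                # not controlled by `random_state`.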
for i in indices:
xi_scaled = X_scaled[i].reshape(1, -1)
yi_scaled = y_scaled[i]
prediction_i = xi_scaled.dot(self.coef_) + (self.intercept_ if self.fit_intercept else 0)
error_i = prediction_i - yi_scaled
gradient_wrt_coef_i = 2 * xi_scaled.T.dot(error_i).flatten()
l1_grad_i = self.alpha * self.l1_ratio * np.sign(self.coef_)
l2_grad_i = 2 * self.alpha * (1 - self.l1_ratio) * self.coef_
total_grad_coef_i = gradient_wrt_coef_i + l1_grad_i + l2_grad_i
lr_adjusted_i = self._learning_rate_decay(iteration)

# Update coefficients
if total_grad_coef_i.shape == gradient_wrt_coef_i.shape:
self.coef_ -= lr_adjusted_i * total_grad_coef_i
else:
raise ValueError(f"Gradient shapes do not match: {gradient_wrt_coef_i.shape} vs {total_grad_coef_i.shape}")

if self.fit_intercept:
intercept_grad_i = 2 * error_i
self.intercept_ -= lr_adjusted_i * intercept_grad_i.item()

loss_value = self._compute_loss(X_scaled, y_scaled)
if iteration % 100 == 0 or iteration == 1:
print(f"Iteration {iteration}: Loss value: {loss_value}")

if np.isnan(loss_value) or np.isinf(loss_value):
print(f"Numerical issue detected at iteration {iteration}: Loss value: {loss_value}")
break

if abs(previous_loss - loss_value) < self.tolerance:
print(f"Convergence reached at iteration {iteration}: Loss value: {loss_value}")
break

previous_loss = loss_value

def predict(self, X):
# Ensure that the model is fitted before making predictions.
if self.coef_ is None:
raise ValueError("Model has not been fitted yet.")
if not isinstance(X, np.ndarray):
raise ValueError("X must be a NumPy array.")
# Check for empty input data.
if X.size == 0:
raise ValueError("Input data X must not be empty.")
# Ensure that the number of features in the input matches the trained model.
if X.shape[1] != len(self.coef_):
raise ValueError("Number of features in X must match number of coefficients.")
        # Scale features using the training data's scaling parameters.
        X_scaled = (X - self.mean_) / self.std_dev_
        # Zero out columns that had no variance during training.
        X_scaled[:, self.zero_variance_mask_] = 0
# Calculate predicted target values in scaled space.
y_pred_scaled = X_scaled.dot(self.coef_) + (self.intercept_ if self.fit_intercept else 0)
# Reverse scaling to obtain predictions in original target space.
y_pred = y_pred_scaled * self.y_std_dev_ + self.y_mean_
return y_pred
111 changes: 106 additions & 5 deletions README.md
@@ -1,8 +1,109 @@

# Project 1

Project Members:

1. Satwik Sinha (CWID: A20547790), ssinha20@hawk.iit.edu
2. Aditya Ramchandra Kutre (CWID: A20544809), akutre@hawk.iit.edu
3. Tejaswi Yerra (CWID: A20545536), tyerra@hawk.iit.edu

# ElasticNet Linear Regression Implementation

## Overview

This project implements **Linear Regression with ElasticNet Regularization** from first principles. ElasticNet combines both L1 (Lasso) and L2 (Ridge) regularization to enhance model performance, especially in scenarios with high-dimensional data or multicollinearity among features.

## **What does the model you have implemented do and when should it be used?**

The implemented **ElasticNet** model performs linear regression while applying a combination of L1 and L2 penalties to the loss function. This approach offers several advantages:

- **Feature Selection:** L1 regularization encourages sparsity, effectively selecting relevant features.
- **Handling Multicollinearity:** L2 regularization mitigates issues arising from highly correlated predictors.
- **Improving Generalization:** The combined regularization prevents overfitting, enhancing the model’s ability to generalize to unseen data.
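
Concretely, the objective minimized in `ElasticNet.py` (see `_compute_loss`, which works on standardized features and targets) is

$$
L(w, b) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - w^\top x_i - b\right)^2 + \alpha\,\rho\,\lVert w \rVert_1 + \alpha\,(1-\rho)\,\lVert w \rVert_2^2
$$

where $\rho$ is `l1_ratio` and $\alpha$ is `alpha`.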

**When to use ElasticNet:**

- When dealing with datasets that have a large number of predictors.
- When there is multicollinearity among features.
- When feature selection is desired alongside regression.
- When seeking a balance between L1 and L2 regularization benefits.

## **How did you test your model to determine if it is working reasonably correctly?**

Testing was conducted through the following approaches:

- **Synthetic Data Generation:** Used the provided `generate_regression_data.py` script to create synthetic datasets with known coefficients and noise levels, validating the model's ability to recover the underlying parameters.
- **Performance Metrics:** Evaluated prediction accuracy with Mean Squared Error (MSE) and R-squared (see the snippet after this list).
- **Edge Case Analysis:** Tested the model with various data conditions, including:
- High-dimensional data.
- Data with multicollinearity.
- Datasets with varying noise levels.
- **Comparison with Baselines:** Compared the results against standard linear regression without regularization to demonstrate the benefits of ElasticNet.
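
A minimal version of the metric computations used in these checks might look like this (the helper names are illustrative, not part of the repository):

```python
import numpy as np

def mean_squared_error(y_true, y_pred):
    # Average squared residual.
    return np.mean((y_true - y_pred) ** 2)

def r_squared(y_true, y_pred):
    # Coefficient of determination: 1 - SS_res / SS_tot.
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - ss_res / ss_tot
```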

## **What parameters have you exposed to users of your implementation in order to tune performance?**

The ElasticNet implementation exposes the following tunable parameters:

- **`alpha`**: Controls the overall strength of the regularization. Higher values impose more regularization.
- **`l1_ratio`**: Balances the contribution of L1 and L2 regularization. A value of 0 corresponds to pure L2 regularization and a value of 1 to pure L1.
- **`fit_intercept`**: Whether to fit an intercept term.
- **`max_iter`**: Maximum number of iterations for the optimization loop.
- **`tolerance`**: Convergence threshold on the change in loss between iterations.
- **`learning_rate`**: Step size for gradient descent updates.
- **`optimization`**: `'batch'` for full-batch gradient descent or `'stochastic'` for per-sample updates.
- **`learning_rate_schedule`**: Optional decay schedule, `'time_decay'` or `'step_decay'`.
- **`early_stopping`** / **`patience`**: Accepted by the constructor but not yet applied in the training loop.
- **`random_state`**: Seed for the random number generator, for reproducibility.

These parameters allow users to fine-tune the model to achieve optimal performance based on their specific dataset characteristics.
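
For example, a configuration that favors sparsity and uses per-sample updates with a step-decayed learning rate could be set up as follows (the values shown are illustrative):

```python
from ElasticNet import ElasticNetModel

model = ElasticNetModel(
    alpha=0.1,                            # moderate overall regularization
    l1_ratio=0.7,                         # lean toward L1 for sparser coefficients
    optimization='stochastic',            # per-sample gradient updates
    learning_rate=0.005,
    learning_rate_schedule='step_decay',  # halve the rate every 500 iterations
    max_iter=2000,
    random_state=0,
)
```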

## **Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental to the model?**

**Challenging Inputs:**

- **Highly Imbalanced Features:** Datasets where certain features dominate others in scale can affect the regularization effectiveness. Proper feature scaling is essential.
- **Non-linear Relationships:** The current implementation assumes linear relationships between predictors and the target variable. It may underperform on datasets with complex non-linear patterns.
- **Sparse Data with High Dimensionality:** While ElasticNet is suitable for high-dimensional data, extremely sparse datasets might require additional preprocessing or dimensionality reduction techniques.

**Potential Workarounds:**

- **Feature Scaling:** Implementing automatic feature scaling can mitigate issues with imbalanced feature scales.
- **Polynomial Features:** Extending the model to include polynomial or interaction terms can help capture non-linear relationships (a brief sketch follows this section).
- **Dimensionality Reduction:** Techniques like PCA can be integrated to handle extremely high-dimensional sparse data more effectively.

With additional time, these enhancements can be incorporated to improve the model's robustness and applicability to a wider range of datasets.
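
As one example of the non-linearity workaround, the inputs can be expanded with squared terms before fitting; the helper below is a sketch, not part of the repository:

```python
import numpy as np

def add_squared_terms(X):
    # Append each feature's square as an additional column.
    return np.hstack([X, X ** 2])

X = np.array([[1.0, 2.0], [3.0, 4.0]])
print(add_squared_terms(X))  # columns: x_0, x_1, x_0**2, x_1**2
```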

## **Usage Examples**

Below are examples demonstrating how to use the implemented ElasticNet model:

### **Training the Model**

```python
from ElasticNet import ElasticNetModel
import numpy as np

# Generate synthetic data
from generate_regression_data import linear_data_generator

# Parameters for synthetic data
m = np.array([1.5, -2.0, 3.0])
b = 4.0
rnge = [0, 10]
N = 100
scale = 1.0
seed = 42

# Generate data
X, y = linear_data_generator(m, b, rnge, N, scale, seed)

# Initialize the model with desired parameters
model = ElasticNetModel(alpha=1.0, l1_ratio=0.5, fit_intercept=True, max_iter=1000, tolerance=1e-4, learning_rate=0.01, random_state=42)

# Fit the model to the training data
model.fit(X, y)
```
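
### **Making Predictions**

Once fitted, `predict` applies the stored feature scaling and reverses the target scaling, so predictions are returned in the original units of `y`:

```python
# Predict on the training inputs and report the mean squared error
y_pred = model.predict(X)
mse = np.mean((y - y_pred) ** 2)
print(f"Training MSE: {mse:.4f}")
```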
285 changes: 285 additions & 0 deletions extracredit analysis.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions generate_regression_data.py
@@ -6,18 +6,18 @@
 def linear_data_generator(m, b, rnge, N, scale, seed):
     rng = numpy.random.default_rng(seed=seed)
     sample = rng.uniform(low=rnge[0], high=rnge[1], size=(N, m.shape[0]))
-    ys = numpy.dot(sample, numpy.reshape(m, (-1,1))) + b
+    ys = numpy.dot(sample, m) + b
     noise = rng.normal(loc=0., scale=scale, size=ys.shape)
     return (sample, ys+noise)
 
 def write_data(filename, X, y):
-    with open(filename, "w") as file:
+    with open(filename, "w", newline='') as file:
         # X column for every x
         xs = [f"x_{n}" for n in range(X.shape[1])]
         header = xs + ["y"]
         writer = csv.writer(file)
         writer.writerow(header)
-        for row in numpy.hstack((X, y)):
+        for row in numpy.hstack((X, y.reshape(-1, 1))):
             writer.writerow(row)
 
 def main():