171 changes: 171 additions & 0 deletions ElasticNet.py
@@ -0,0 +1,171 @@
import numpy as np

class ElasticNetModel:
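    """Linear regression with combined L1 and L2 (ElasticNet) regularization.

    Features and targets are standardized internally, and the model is trained
    with batch or stochastic gradient descent, optionally with a decaying
    learning rate. The `early_stopping` and `patience` arguments are accepted
    but not yet used by `fit`.
    """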

def __init__(
self,
alpha=1.0,
l1_ratio=0.5,
fit_intercept=True,
max_iter=1000,
tolerance=1e-4,
learning_rate=0.01,
optimization='batch',
random_state=None,
early_stopping=False,
patience=10,
learning_rate_schedule=None
):
self.alpha = alpha
self.l1_ratio = l1_ratio
self.fit_intercept = fit_intercept
self.max_iter = max_iter
self.tolerance = tolerance
self.learning_rate = learning_rate
self.optimization = optimization.lower()
self.random_state = random_state
self.early_stopping = early_stopping
self.patience = patience
self.learning_rate_schedule = learning_rate_schedule
self.coef_ = None
self.intercept_ = 0.0
self.mean_ = None
self.std_dev_ = None
self.y_mean_ = None
self.y_std_dev_ = None

def _initialize_weights(self, n_features):
rng = np.random.default_rng(self.random_state)
self.coef_ = rng.normal(loc=0.0, scale=0.01, size=n_features)
if self.fit_intercept:
self.intercept_ = 0.0

    def _scale_features(self, X):
        self.mean_ = np.mean(X, axis=0)
        self.std_dev_ = np.std(X, axis=0)
        # Remember which columns are constant, then use a divisor of 1 for
        # them so the division below is always safe.
        self.zero_variance_mask_ = self.std_dev_ == 0
        self.std_dev_[self.zero_variance_mask_] = 1
        return (X - self.mean_) / self.std_dev_

def _compute_loss(self, X_scaled, y_scaled):
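        # ElasticNet objective on the standardized data:
        # MSE + alpha * l1_ratio * ||coef||_1 + alpha * (1 - l1_ratio) * ||coef||_2^2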
predictions = X_scaled.dot(self.coef_) + (self.intercept_ if self.fit_intercept else 0)
residuals = y_scaled - predictions
mse_loss = np.mean(residuals ** 2)
l1_penalty = self.alpha * self.l1_ratio * np.sum(np.abs(self.coef_))
l2_penalty = self.alpha * (1 - self.l1_ratio) * np.sum(self.coef_ ** 2)
return mse_loss + l1_penalty + l2_penalty

def _learning_rate_decay(self, iteration):
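        # 'time_decay' shrinks the rate as 1 / (1 + 0.001 * iteration);
        # 'step_decay' halves it every 500 iterations; any other value keeps
        # the base learning rate unchanged.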
if self.learning_rate_schedule == 'time_decay':
return self.learning_rate / (1 + iteration * 0.001)
elif self.learning_rate_schedule == 'step_decay':
return self.learning_rate * (0.5 ** (iteration // 500))
else:
return self.learning_rate

def fit(self, X, y):
print(f"Fitting model with X shape {X.shape}, y shape {y.shape}")

# Input validation
if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
raise ValueError("X and y must be NumPy arrays.")
if X.size == 0 or y.size == 0:
raise ValueError("Input data X and y must not be empty.")
if X.shape[0] != y.shape[0]:
raise ValueError("Number of samples in X and y must be equal.")
if not np.issubdtype(y.dtype, np.number) or not np.issubdtype(X.dtype, np.number):
raise ValueError("X and y must be numeric arrays.")
if self.optimization not in ['batch', 'stochastic']:
raise ValueError(f"Invalid optimization option: {self.optimization}")

X_scaled = self._scale_features(X)
self.y_mean_ = np.mean(y)
self.y_std_dev_ = np.std(y)
if self.y_std_dev_ == 0:
self.y_std_dev_ = 1
y_scaled = (y - self.y_mean_) / self.y_std_dev_

n_samples, n_features = X.shape
print(f"Number of samples: {n_samples}, Number of features: {n_features}")
self._initialize_weights(n_features)
print(f"Initialized coefficients with shape: {self.coef_.shape}")

previous_loss = self._compute_loss(X_scaled, y_scaled)

for iteration in range(1, self.max_iter + 1):
if self.optimization == 'batch':
predictions = X_scaled.dot(self.coef_) + (self.intercept_ if self.fit_intercept else 0)
errors = predictions - y_scaled
gradient_wrt_coef = (2 / n_samples) * X_scaled.T.dot(errors).flatten()
l1_grad = self.alpha * self.l1_ratio * np.sign(self.coef_)
l2_grad = 2 * self.alpha * (1 - self.l1_ratio) * self.coef_
total_grad_coef = gradient_wrt_coef + l1_grad + l2_grad
lr_adjusted = self._learning_rate_decay(iteration)

# Update coefficients
if total_grad_coef.shape == gradient_wrt_coef.shape:
self.coef_ -= lr_adjusted * total_grad_coef
else:
raise ValueError(f"Gradient shapes do not match: {gradient_wrt_coef.shape} vs {total_grad_coef.shape}")

if self.fit_intercept:
intercept_grad = (2 / n_samples) * np.sum(errors)
self.intercept_ -= lr_adjusted * intercept_grad

elif self.optimization == 'stochastic':
indices = np.random.permutation(n_samples)
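                # Note: this shuffle uses NumPy's global RNG and is therefore
                # not controlled by `random_state`.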
for i in indices:
xi_scaled = X_scaled[i].reshape(1, -1)
yi_scaled = y_scaled[i]
prediction_i = xi_scaled.dot(self.coef_) + (self.intercept_ if self.fit_intercept else 0)
error_i = prediction_i - yi_scaled
gradient_wrt_coef_i = 2 * xi_scaled.T.dot(error_i).flatten()
l1_grad_i = self.alpha * self.l1_ratio * np.sign(self.coef_)
l2_grad_i = 2 * self.alpha * (1 - self.l1_ratio) * self.coef_
total_grad_coef_i = gradient_wrt_coef_i + l1_grad_i + l2_grad_i
lr_adjusted_i = self._learning_rate_decay(iteration)

# Update coefficients
if total_grad_coef_i.shape == gradient_wrt_coef_i.shape:
self.coef_ -= lr_adjusted_i * total_grad_coef_i
else:
raise ValueError(f"Gradient shapes do not match: {gradient_wrt_coef_i.shape} vs {total_grad_coef_i.shape}")

if self.fit_intercept:
intercept_grad_i = 2 * error_i
self.intercept_ -= lr_adjusted_i * intercept_grad_i.item()

loss_value = self._compute_loss(X_scaled, y_scaled)
if iteration % 100 == 0 or iteration == 1:
print(f"Iteration {iteration}: Loss value: {loss_value}")

if np.isnan(loss_value) or np.isinf(loss_value):
print(f"Numerical issue detected at iteration {iteration}: Loss value: {loss_value}")
break

if abs(previous_loss - loss_value) < self.tolerance:
print(f"Convergence reached at iteration {iteration}: Loss value: {loss_value}")
break

previous_loss = loss_value

def predict(self, X):
# Ensure that the model is fitted before making predictions.
if self.coef_ is None:
raise ValueError("Model has not been fitted yet.")
if not isinstance(X, np.ndarray):
raise ValueError("X must be a NumPy array.")
# Check for empty input data.
if X.size == 0:
raise ValueError("Input data X must not be empty.")
# Ensure that the number of features in the input matches the trained model.
if X.shape[1] != len(self.coef_):
raise ValueError("Number of features in X must match number of coefficients.")
        # Scale features using the training data's scaling parameters.
        X_scaled = (X - self.mean_) / self.std_dev_
        # Zero out columns that had no variance during training.
        X_scaled[:, self.zero_variance_mask_] = 0
# Calculate predicted target values in scaled space.
y_pred_scaled = X_scaled.dot(self.coef_) + (self.intercept_ if self.fit_intercept else 0)
# Reverse scaling to obtain predictions in original target space.
y_pred = y_pred_scaled * self.y_std_dev_ + self.y_mean_
return y_pred
111 changes: 106 additions & 5 deletions README.md
@@ -1,8 +1,109 @@

# Project 1

Project Members:

1. Satwik Sinha (CWID: A20547790), ssinha20@hawk.iit.edu
2. Aditya Ramchandra Kutre (CWID: A20544809), akutre@hawk.iit.edu
3. Tejaswi Yerra (CWID: A20545536), tyerra@hawk.iit.edu

# ElasticNet Linear Regression Implementation

## Overview

This project implements **Linear Regression with ElasticNet Regularization** from first principles. ElasticNet combines both L1 (Lasso) and L2 (Ridge) regularization to enhance model performance, especially in scenarios with high-dimensional data or multicollinearity among features.

## **What does the model you have implemented do and when should it be used?**

The implemented **ElasticNet** model performs linear regression while applying a combination of L1 and L2 penalties to the loss function. This approach offers several advantages:

- **Feature Selection:** L1 regularization encourages sparsity, effectively selecting relevant features.
- **Handling Multicollinearity:** L2 regularization mitigates issues arising from highly correlated predictors.
- **Improving Generalization:** The combined regularization prevents overfitting, enhancing the model’s ability to generalize to unseen data.
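
Concretely, the objective minimized in `ElasticNet.py` (see `_compute_loss`, which works on standardized features and targets) is

$$
L(w, b) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - w^\top x_i - b\right)^2 + \alpha\,\rho\,\lVert w \rVert_1 + \alpha\,(1-\rho)\,\lVert w \rVert_2^2
$$

where $\rho$ is `l1_ratio` and $\alpha$ is `alpha`.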

**When to use ElasticNet:**

- When dealing with datasets that have a large number of predictors.
- When there is multicollinearity among features.
- When feature selection is desired alongside regression.
- When seeking a balance between L1 and L2 regularization benefits.

## **How did you test your model to determine if it is working reasonably correctly?**

Testing was conducted through the following approaches:

- **Synthetic Data Generation:** Used the provided `generate_regression_data.py` script to create synthetic datasets with known coefficients and noise levels, validating the model's ability to recover the underlying parameters.
- **Performance Metrics:** Evaluated prediction accuracy with Mean Squared Error (MSE) and R-squared (see the snippet after this list).
- **Edge Case Analysis:** Tested the model with various data conditions, including:
- High-dimensional data.
- Data with multicollinearity.
- Datasets with varying noise levels.
- **Comparison with Baselines:** Compared the results against standard linear regression without regularization to demonstrate the benefits of ElasticNet.
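
A minimal version of the metric computations used in these checks might look like this (the helper names are illustrative, not part of the repository):

```python
import numpy as np

def mean_squared_error(y_true, y_pred):
    # Average squared residual.
    return np.mean((y_true - y_pred) ** 2)

def r_squared(y_true, y_pred):
    # Coefficient of determination: 1 - SS_res / SS_tot.
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - ss_res / ss_tot
```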

## **What parameters have you exposed to users of your implementation in order to tune performance?**

The ElasticNet implementation exposes the following tunable parameters:

- **`alpha`**: Controls the overall strength of the regularization. Higher values impose more regularization.
- **`l1_ratio`**: Balances the contribution of L1 and L2 regularization. A value of 0 corresponds to pure L2 regularization and a value of 1 to pure L1.
- **`fit_intercept`**: Whether to fit an intercept term.
- **`max_iter`**: Maximum number of iterations for the optimization loop.
- **`tolerance`**: Convergence threshold on the change in loss between iterations.
- **`learning_rate`**: Step size for gradient descent updates.
- **`optimization`**: `'batch'` for full-batch gradient descent or `'stochastic'` for per-sample updates.
- **`learning_rate_schedule`**: Optional decay schedule, `'time_decay'` or `'step_decay'`.
- **`early_stopping`** / **`patience`**: Accepted by the constructor but not yet applied in the training loop.
- **`random_state`**: Seed for the random number generator, for reproducibility.

These parameters allow users to fine-tune the model to achieve optimal performance based on their specific dataset characteristics.
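
For example, a configuration that favors sparsity and uses per-sample updates with a step-decayed learning rate could be set up as follows (the values shown are illustrative):

```python
from ElasticNet import ElasticNetModel

model = ElasticNetModel(
    alpha=0.1,                            # moderate overall regularization
    l1_ratio=0.7,                         # lean toward L1 for sparser coefficients
    optimization='stochastic',            # per-sample gradient updates
    learning_rate=0.005,
    learning_rate_schedule='step_decay',  # halve the rate every 500 iterations
    max_iter=2000,
    random_state=0,
)
```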

## **Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental to the model?**

**Challenging Inputs:**

- **Highly Imbalanced Features:** Datasets where certain features dominate others in scale can affect the regularization effectiveness. Proper feature scaling is essential.
- **Non-linear Relationships:** The current implementation assumes linear relationships between predictors and the target variable. It may underperform on datasets with complex non-linear patterns.
- **Sparse Data with High Dimensionality:** While ElasticNet is suitable for high-dimensional data, extremely sparse datasets might require additional preprocessing or dimensionality reduction techniques.

**Potential Workarounds:**

- **Feature Scaling:** Implementing automatic feature scaling can mitigate issues with imbalanced feature scales.
- **Polynomial Features:** Extending the model to include polynomial or interaction terms can help capture non-linear relationships (a brief sketch follows this section).
- **Dimensionality Reduction:** Techniques like PCA can be integrated to handle extremely high-dimensional sparse data more effectively.

With additional time, these enhancements can be incorporated to improve the model's robustness and applicability to a wider range of datasets.
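
As one example of the non-linearity workaround, the inputs can be expanded with squared terms before fitting; the helper below is a sketch, not part of the repository:

```python
import numpy as np

def add_squared_terms(X):
    # Append each feature's square as an additional column.
    return np.hstack([X, X ** 2])

X = np.array([[1.0, 2.0], [3.0, 4.0]])
print(add_squared_terms(X))  # columns: x_0, x_1, x_0**2, x_1**2
```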

## **Usage Examples**

Below are examples demonstrating how to use the implemented ElasticNet model:

### **Training the Model**

```python
from ElasticNet import ElasticNetModel
import numpy as np

# Generate synthetic data
from generate_regression_data import linear_data_generator

# Parameters for synthetic data
m = np.array([1.5, -2.0, 3.0])
b = 4.0
rnge = [0, 10]
N = 100
scale = 1.0
seed = 42

# Generate data
X, y = linear_data_generator(m, b, rnge, N, scale, seed)

# Initialize the model with desired parameters
model = ElasticNetModel(alpha=1.0, l1_ratio=0.5, fit_intercept=True, max_iter=1000, tolerance=1e-4, learning_rate=0.01, random_state=42)

# Fit the model to the training data
model.fit(X, y)
```
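
### **Making Predictions**

Once fitted, `predict` applies the stored feature scaling and reverses the target scaling, so predictions are returned in the original units of `y`:

```python
# Predict on the training inputs and report the mean squared error
y_pred = model.predict(X)
mse = np.mean((y - y_pred) ** 2)
print(f"Training MSE: {mse:.4f}")
```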
285 changes: 285 additions & 0 deletions extracredit analysis.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions generate_regression_data.py
@@ -6,18 +6,18 @@
 def linear_data_generator(m, b, rnge, N, scale, seed):
     rng = numpy.random.default_rng(seed=seed)
     sample = rng.uniform(low=rnge[0], high=rnge[1], size=(N, m.shape[0]))
-    ys = numpy.dot(sample, numpy.reshape(m, (-1,1))) + b
+    ys = numpy.dot(sample, m) + b
     noise = rng.normal(loc=0., scale=scale, size=ys.shape)
     return (sample, ys+noise)
 
 def write_data(filename, X, y):
-    with open(filename, "w") as file:
+    with open(filename, "w", newline='') as file:
         # X column for every x
         xs = [f"x_{n}" for n in range(X.shape[1])]
         header = xs + ["y"]
         writer = csv.writer(file)
         writer.writerow(header)
-        for row in numpy.hstack((X, y)):
+        for row in numpy.hstack((X, y.reshape(-1, 1))):
             writer.writerow(row)
 
 def main():