From 707298037b4fe1f6d572983e056f380609b6cfbb Mon Sep 17 00:00:00 2001 From: rutujdv Date: Thu, 10 Oct 2024 20:46:37 -0500 Subject: [PATCH] Final Submissions Implementation of Elastic Net Model and Test file --- README.md | 147 +++++++++++++++- elasticnet/models/ElasticNet.py | 205 +++++++++++++++++++++-- elasticnet/tests/test_ElasticNetModel.py | 107 ++++++++++-- 3 files changed, 428 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index c1e8359..ad708d5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,143 @@ -# Project 1 +ElasticNet: Custom Elastic Net Regression Model -Put your README here. Answer the following questions. -* What does the model you have implemented do and when should it be used? -* How did you test your model to determine if it is working reasonably correctly? -* What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples.) -* Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental? +Nidhi Shrivastav A20594009 nshrivastav@hawk.iit.edu +Rutuja Jadhav A20539073 rjadhav4@hawk.iit.edu +Pankaj Jagtap A20543260 pjagtap1@hawk.iit.edu + + +A. Overview: +This project implements a custom Elastic Net regression model in Python, combining L1 (Lasso) and L2 (Ridge) regularization techniques to balance feature selection and coefficient shrinkage. The Elastic Net model is particularly effective for handling multicollinearity (highly correlated features) and sparse features, making it well-suited for various regression tasks. The model is trained through iterative updates of its coefficients using both penalties. Additionally, the project includes methods for predicting target values, evaluating the model’s performance using metrics like MSE, MAE, and R-squared, and visualizing results. + +B. 
Files +• ElasticNet.py: Contains the implementation of the ElasticNet class for Elastic Net regression. +• test_ElasticNetModel.py: Contains unit tests for validating the functionality of the ElasticNet class using pytest. + +C. ElasticNet Class Details +The ElasticNet class implements an Elastic Net regression model that combines L1 and L2 regularization techniques. Here is a detailed breakdown of the functions and parameters involved: + +1. __init__(self, alpha=1.0, l1_ratio=0.5, max_iter=1000, tol=1e-4) +• Purpose: Sets up the Elastic Net model with specified regularization parameters. +• Parameters: + - alpha: Determines the overall strength of the regularization. + - l1_ratio: Relative weight of L1 (Lasso) versus L2 (Ridge) regularization. + - max_iter: Maximum number of iterations for the optimization process. + - tol: Stopping tolerance on the change in coefficients between iterations. + +2. fit(self, X, y) +• Purpose: Trains the Elastic Net model using the given features (X) and target values (y). +• Parameters: + - X: Input matrix consisting of data (2D array). + - y: The target values (1D array). +• Validation Checks: +- Ensure X is a 2D array and y is a 1D array. +- Check that the number of samples in X matches the number of samples in y. +- Confirm that no feature has zero variance (a ValueError is raised otherwise). +• Key Operations: + - Handles regularization through iterative coefficient updates. + - Incorporates both L1 and L2 penalties when updating coefficients. + - Stops when the change in coefficients is below a set tolerance. +3. predict(self, X) +• Purpose: Makes predictions on input data. +• Parameters: + - X: Input data for which predictions are to be made. +• Returns: Computes the dot product of the input features and the model's coefficients, adding the intercept to return the predicted values based on the model's learned coefficients. + +4. 
evaluate(self, X, y_true) +• Purpose: Assesses the model's performance by computing the following metrics: + - Mean Squared Error (MSE) + - Mean Absolute Error (MAE) + - R-squared (R²) +• Parameters: + - X: 2D array of input features for evaluation. + - y_true: 1D array with target values. +• Returns: The calculated MSE, MAE, and R-squared values. + +5. plot_predictions(self, y_true, y_pred) +• Parameters: +- y_true: 1D array of actual target values. +- y_pred: 1D array of predicted target values. + +• Purpose: Plots a scatter plot comparing the actual vs predicted values with a line representing a perfect fit. +6. plot_residuals(self, y_true, y_pred) +• Purpose: Plots residuals (differences between actual and predicted values). + + + +Linear Data Generator: + +1. linear_data_generator(m, b, rnge, N, scale, seed) +• Purpose: Generates random linear data for testing purposes, with noise added. +• Parameters: +- m: 1D array of slopes, one per feature (its length sets the number of features). +- b: Intercept added to every target value. +- rnge: (low, high) bounds of the uniform distribution the feature values are drawn from. +- N: Number of samples to generate. +- scale: Standard deviation of the Gaussian noise added to the target values. +- seed: Seed for the random number generator, for reproducibility. +• Returns: Generated sample data and noisy target values. +Usage +1. Testing: The test_ElasticNetModel.py file includes several unit tests to verify the correctness of the ElasticNet class. +2. Setup: pytest is used to test the model. +3. Install pytest via pip: pip install pytest +4. Run Tests +Go to elasticnet folder on command line/terminal +Use this command to run: +pytest tests/test_ElasticNetModel.py +5. Test Functions: + +a. test_fit_and_predict +• Purpose: Verifies that the model can correctly fit data and make accurate predictions. +• Test Input: Uses a simple dataset with known target values. +• Expected Output: Predictions should closely match the target values. + +b. test_invalid_input_shape +• Purpose: Ensures that a ValueError is raised when the number of samples in X and y do not match. +• Test Input: X with 2 samples and y with 1 sample. 
+• Expected Output: A `ValueError` with a specific error message. + +c. test_no_variance +• Purpose: Tests the model's behavior when the input features have no variance. +• Test Input: Data where all feature values are constant. +• Expected Output: fit raises a `ValueError` ("Feature variance cannot be 0."); the test then refits the model on valid data and checks that predictions are produced. + +d. test_single_feature +• Purpose: Ensures the model can handle cases where only one feature is provided. +• Test Input: An X with a single feature column and target y. +• Expected output: Ensure that predictions match the target values by checking with np.testing.assert_almost_equal. + +e. test_fit_convergence() +• Purpose: To test the model’s robustness and convergence on larger datasets. +• Test Input: random data for X and noisy y values. +• Expected output: Calculate metrics like Mean Squared Error (MSE), Mean Absolute Error (MAE), and R-squared (evaluate function). Check that MSE is below a reasonable threshold (less than 1.0), indicating successful convergence. + +f. test_multiple_coefficients() +• Purpose: To test the model's ability to fit and predict when multiple features are present. +• Test Input: data that has two features +• Expected output: Ensure that the predictions match the expected target values by comparing with np.testing.assert_almost_equal. + + + +Questions + +1. What does the model you have implemented do and when should it be used? +The model used is an ElasticNet regression, a type of linear regression that combines L1 regularization (Lasso) and L2 regularization (Ridge). This is helpful when there are many features involved, especially when many of them are correlated with each other. It helps prevent overfitting by introducing a penalty on the coefficients, making the model more generalizable. + Used for: +• High dimensional data: When the number of features (predictors) is large compared to the number of data points. 
+• Multicollinearity: When some features are highly correlated with each other, the ElasticNet regularization can handle this better than a plain linear regression. +• Feature selection and shrinkage: ElasticNet can help reduce the dimensionality of the problem by zeroing out some coefficients (feature selection) while keeping the others. + +2. How did you test your model to determine if it is working reasonably correctly? +To ensure the ElasticNet regression model works correctly, several unit tests were conducted using pytest. These tests evaluated the model's ability to fit data and make accurate predictions, checked for appropriate error handling when input shapes were mismatched, and assessed the model's behavior with edge cases like features with no variance or single-feature data. Additionally, the model's performance was tested on larger datasets to verify convergence and was evaluated using metrics such as mean squared error (MSE), mean absolute error (MAE), and R-squared. These tests confirmed the model's reliability across various scenarios. + +3. What parameters have you exposed to users of your implementation in order to tune performance? +The ElasticNet model, like other regularized linear models, has two important parameters that help tune performance: +i. Alpha (also known as regularization strength): This parameter controls the total amount of regularization applied to the model. A higher alpha increases the regularization effect, shrinking the model's coefficients more and helping to prevent overfitting. Lower alpha values make the model behave more like a regular linear regression without regularization. +ii. L1_ratio: This parameter determines the balance between Lasso (L1) and Ridge (L2) penalties. A value of 1 applies only Lasso (L1), 0 applies only Ridge (L2), and any value in between represents a combination of both regularizations. + +4. Are there specific inputs that your implementation has trouble with? 
import numpy as np
import pandas as pd


class ElasticNet:
    """Elastic Net linear regression fitted by cyclic coordinate descent.

    The model minimises

        0.5 * ||y - b - X w||^2
        + alpha * l1_ratio * ||w||_1
        + 0.5 * alpha * (1 - l1_ratio) * ||w||^2

    over the weights ``w`` and the (unpenalised) intercept ``b``.  Note that
    the squared-error term is *not* divided by the number of samples, so the
    effective regularisation strength of a given ``alpha`` depends on the
    size of the training set.
    """

    def __init__(self, alpha=1.0, l1_ratio=0.5, max_iter=1000, tol=1e-4):
        """Store the hyper-parameters; no work is done until ``fit``.

        alpha: Overall regularisation strength (must be >= 0 at fit time).
        l1_ratio: Share of the penalty given to L1; 1 is pure Lasso and
            0 is pure Ridge (must lie in [0, 1] at fit time).
        max_iter: Maximum number of full coordinate-descent sweeps.
        tol: Sweeps stop once the Euclidean norm of the change in
            (intercept, weights) over one sweep drops below this value.
        """
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.max_iter = max_iter
        self.tol = tol
        self.coef_ = None  # Feature weights, set by fit()
        self.intercept_ = None  # Bias term, set by fit()
        self.n_iter_ = 0  # Number of sweeps performed by the last fit()

    def fit(self, X, y):
        """Train the model on features ``X`` and targets ``y``.

        X: 2D array-like of shape (n_samples, n_features).
        y: 1D array-like of length n_samples.

        Returns ``self`` so calls can be chained.

        Raises ValueError if the shapes are wrong or inconsistent, if there
        are no samples, if a hyper-parameter is out of range, or if any
        feature has zero variance.
        """
        # Coerce first so plain lists work and integer data is not truncated.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        if X.ndim != 2 or y.ndim != 1:
            raise ValueError("X : 2DARRAY and y : 1DARRAY.")
        if X.shape[0] != y.shape[0]:
            raise ValueError("Number of samples in X must match y.")
        if X.shape[0] == 0:
            raise ValueError("X and y must contain at least one sample.")
        if self.alpha < 0:
            raise ValueError("alpha must be non-negative.")
        if not 0 <= self.l1_ratio <= 1:
            raise ValueError("l1_ratio must be between 0 and 1.")
        if np.any(np.var(X, axis=0) == 0):
            raise ValueError("Feature variance cannot be 0.")

        n_features = X.shape[1]
        l1_penalty = self.alpha * self.l1_ratio
        l2_penalty = self.alpha * (1 - self.l1_ratio)
        # Loop-invariant: squared norm of every column (strictly positive
        # because zero-variance columns were rejected above).
        col_sq_norms = np.sum(X * X, axis=0)

        intercept = 0.0
        weights = np.zeros(n_features)
        # Running residual y - intercept - X @ weights.  Updating it in place
        # after each coordinate step avoids recomputing X @ weights for every
        # feature, which would cost O(n_samples * n_features) per coordinate.
        residual = y.copy()

        self.n_iter_ = 0
        for sweep in range(1, self.max_iter + 1):
            self.n_iter_ = sweep
            prev_intercept = intercept
            prev_weights = weights.copy()

            # The intercept is unpenalised: its optimum is the mean of the
            # residual that excludes it, i.e. the current value plus the
            # mean of the running residual.
            shift = residual.mean()
            intercept = intercept + shift
            residual -= shift

            for j in range(n_features):
                column = X[:, j]
                # Correlation of feature j with the residual that excludes
                # feature j's own contribution.
                rho = column @ residual + col_sq_norms[j] * weights[j]
                # Soft-threshold for L1, extra shrinkage in the denominator for L2.
                updated = (
                    np.sign(rho) * max(0.0, abs(rho) - l1_penalty)
                    / (col_sq_norms[j] + l2_penalty)
                )
                if updated != weights[j]:
                    residual += column * (weights[j] - updated)
                    weights[j] = updated

            change = np.sqrt(
                (intercept - prev_intercept) ** 2
                + np.sum((weights - prev_weights) ** 2)
            )
            if change < self.tol:
                break

        self.intercept_ = intercept
        self.coef_ = weights
        return self

    def predict(self, X):
        """Return predictions ``X @ coef_ + intercept_`` for the rows of ``X``.

        Raises RuntimeError if the model has not been fitted yet.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("Model is not fitted yet; call fit() first.")
        return np.dot(np.asarray(X, dtype=float), self.coef_) + self.intercept_

    def evaluate(self, X, y_true):
        """Score the model on ``X`` against the true targets ``y_true``.

        Returns a tuple ``(mse, mae, r_squared)``.  When ``y_true`` is
        constant the usual R-squared ratio is undefined; in that case 1.0 is
        returned for a perfect fit and 0.0 otherwise.
        """
        y_true = np.asarray(y_true, dtype=float)
        errors = y_true - self.predict(X)

        mse = np.mean(errors ** 2)
        mae = np.mean(np.abs(errors))

        ss_res = np.sum(errors ** 2)
        ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
        if ss_tot == 0:
            # Avoid 0/0 or x/0 for a constant target.
            r_squared = 1.0 if ss_res == 0 else 0.0
        else:
            r_squared = 1 - ss_res / ss_tot

        return mse, mae, r_squared

    def plot_predictions(self, y_true, y_pred):
        """Scatter predicted against actual values with the ideal y = x line."""
        # Imported here so the model itself works without matplotlib installed.
        import matplotlib.pyplot as plt

        low, high = np.min(y_true), np.max(y_true)
        plt.figure(figsize=(10, 6))
        plt.scatter(y_true, y_pred, alpha=0.7, color='blue', label='Predicted vs Original')
        plt.plot([low, high], [low, high], 'r--', label='Ideal Prediction')
        plt.xlabel('Original Values')
        plt.ylabel('Predicted Values')
        plt.title('Original vs. Predicted Values')
        plt.legend()
        plt.grid()
        plt.show()

    def plot_residuals(self, y_true, y_pred):
        """Scatter the residuals ``y_true - y_pred`` against the predictions."""
        # Imported here so the model itself works without matplotlib installed.
        import matplotlib.pyplot as plt

        residuals = np.asarray(y_true) - np.asarray(y_pred)
        plt.figure(figsize=(10, 6))
        plt.scatter(y_pred, residuals, alpha=0.7, color='green')
        plt.axhline(0, color='red', linestyle='--', linewidth=2)
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.title('Residuals vs. Predicted Values')
        plt.grid()
        plt.show()
def linear_data_generator(m, b, rnge, N, scale, seed):
    """Generate noisy samples of the linear model ``y = X @ m + b``.

    m: 1D array-like of slopes; its length sets the number of features.
    b: Intercept added to every target.
    rnge: ``(low, high)`` bounds of the uniform distribution the features
        are drawn from.
    N: Number of samples to generate.
    scale: Standard deviation of the Gaussian noise added to the targets.
    seed: Seed for ``numpy.random.default_rng`` so results are reproducible.

    Returns ``(X, y)`` with ``X`` of shape ``(N, len(m))`` and ``y`` of
    shape ``(N,)``.
    """
    # Accept lists/tuples as well as arrays, and flatten column vectors.
    slopes = np.asarray(m, dtype=float).ravel()
    low, high = rnge

    rng = np.random.default_rng(seed=seed)
    # Draw features first and noise second so a given seed always yields
    # the same data.
    sample = rng.uniform(low=low, high=high, size=(N, slopes.shape[0]))
    noise = rng.normal(loc=0., scale=scale, size=N)
    return sample, sample @ slopes + b + noise


def run_elastic_net(X, y, alpha=0.1, l1_ratio=0.2, *, max_iter=1000, tol=1e-4, plot=True):
    """Fit an ElasticNet on ``(X, y)``, print its training metrics and return it.

    alpha, l1_ratio, max_iter, tol: Forwarded to the ``ElasticNet``
        constructor.
    plot: When true (the default) the prediction and residual plots are
        shown; pass ``False`` for non-interactive use.
    """
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter, tol=tol)
    model.fit(X, y)
    predictions = model.predict(X)
    mse, mae, r_squared = model.evaluate(X, y)

    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R^2: {r_squared:.4f}")

    if plot:
        model.plot_predictions(y, predictions)
        model.plot_residuals(y, predictions)
    return model
def _prompt_synthetic_data():
    """Ask for the generator settings on stdin and return synthetic ``(X, y)``."""
    n_samples = int(input("Number of samples: "))
    noise_scale = float(input("Noise scale: "))
    seed = int(input("Random seed: "))
    return linear_data_generator(np.array([1, 2]), 5, (0, 10), n_samples, noise_scale, seed)


def _prompt_csv_data():
    """Ask for a CSV path on stdin and return ``(X, y)`` using its 'y' column as target."""
    csv_path = input("Please enter CSV only file path: ")
    df = pd.read_csv(csv_path)
    if 'y' not in df.columns:
        raise ValueError("'y' column not found in CSV.")
    return df.drop('y', axis=1).values, df['y'].values


def main():
    """Run the interactive menu until the user exits or stdin is closed.

    Option 1 fits the model on synthetic data, option 2 on a CSV file with
    a 'y' target column, option 3 exits.  Errors while loading or fitting
    are reported and the menu is shown again.
    """
    loaders = {'1': _prompt_synthetic_data, '2': _prompt_csv_data}

    while True:
        print("\nSelect an option:")
        print("1. Generate synthetic data")
        print("2. Load data from CSV")
        print("3. Exit")
        try:
            choice = input("Enter choice: ").strip()
        except EOFError:
            # stdin was closed (e.g. piped input ran out): leave quietly
            # instead of dying with a traceback.
            print("Exiting.")
            break

        if choice == '3':
            print("Exiting.")
            break

        loader = loaders.get(choice)
        if loader is None:
            print("Invalid choice. Try again.")
            continue

        try:
            X, y = loader()
            run_elastic_net(X, y)
        except EOFError:
            # No more input can arrive, so re-prompting would only fail again.
            print("Exiting.")
            break
        except FileNotFoundError:
            print("File not found.")
        except ValueError as e:
            print(f"Input error: {e}")
        except Exception as e:
            # Top-level boundary of an interactive loop: report and keep going.
            print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()
import os
import sys

import numpy as np
import pytest

# Ensure that the models directory is in the sys.path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../models')))
from ElasticNet import ElasticNet


@pytest.fixture
def setup_model():
    """Return a fresh, unfitted ElasticNet with mild regularisation."""
    return ElasticNet(alpha=0.1, l1_ratio=0.5)


def test_fit_and_predict(setup_model):
    """Fitting a small exactly-linear dataset reproduces its targets."""
    model = setup_model
    X = np.array([[1, 2], [2, 3], [3, 4]])
    y = np.array([1, 2, 3])

    model.fit(X, y)
    predictions = model.predict(X)

    np.testing.assert_almost_equal(predictions, y, decimal=1)


def test_invalid_input_shape(setup_model):
    """fit raises ValueError when X and y disagree on the number of samples."""
    model = setup_model
    X = np.array([[1, 2], [2, 3]])  # two samples
    y = np.array([1])  # only one target

    with pytest.raises(ValueError, match="Number of samples in X must match y."):
        model.fit(X, y)


def test_no_variance(setup_model):
    """fit rejects constant features, and the model is still usable afterwards."""
    model = setup_model
    X = np.array([[1, 1], [1, 1], [1, 1]])  # every feature is constant
    y = np.array([1, 2, 3])

    with pytest.raises(ValueError, match="Feature variance cannot be 0."):
        model.fit(X, y)

    # The failed fit must not leave the model unusable: refit on valid data.
    X_valid = np.array([[1, 2], [2, 3], [3, 4]])
    y_valid = np.array([1, 2, 3])

    model.fit(X_valid, y_valid)
    predictions = model.predict(X_valid)

    assert predictions is not None
    assert predictions.shape == y_valid.shape


def test_single_feature(setup_model):
    """The model handles an X with a single feature column."""
    model = setup_model
    X = np.array([[1], [2], [3]])
    y = np.array([1, 2, 3])

    model.fit(X, y)
    predictions = model.predict(X)

    np.testing.assert_almost_equal(predictions, y, decimal=1)


def test_fit_convergence(setup_model):
    """The model fits a larger noisy linear dataset with a low training MSE."""
    model = setup_model
    # Seeded generator so a failure is reproducible from run to run.
    rng = np.random.default_rng(0)
    X = rng.random((100, 2))
    y = X @ np.array([1.5, -2.0]) + 0.5 + rng.normal(scale=0.1, size=100)

    model.fit(X, y)
    mse, mae, r_squared = model.evaluate(X, y)

    assert mse < 1.0


def test_multiple_coefficients(setup_model):
    """The model fits and predicts with more than one feature."""
    model = setup_model
    X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
    y = np.array([2, 3, 4, 5])

    model.fit(X, y)
    predictions = model.predict(X)

    np.testing.assert_almost_equal(predictions, y, decimal=1)


if __name__ == "__main__":
    # Pass this file explicitly; with no arguments pytest would collect from
    # the current working directory instead.
    pytest.main([__file__])