362 changes: 356 additions & 6 deletions README.md

Large diffs are not rendered by default.

Binary file added elasticnet/images/10_model_input.png
Binary file added elasticnet/images/10_model_output.png
Binary file added elasticnet/images/11_numpy_input.png
Binary file added elasticnet/images/11_numpy_output.png
Binary file added elasticnet/images/12_nan_input.png
Binary file added elasticnet/images/12_nan_output.png
Binary file added elasticnet/images/1_alpha_input.png
Binary file added elasticnet/images/1_alpha_output.png
Binary file added elasticnet/images/2_l1_ratio_input.png
Binary file added elasticnet/images/2_l1_ratio_output.png
Binary file added elasticnet/images/3_learning_rate_input.png
Binary file added elasticnet/images/3_learning_rate_output.png
Binary file added elasticnet/images/4_max_iter_input.png
Binary file added elasticnet/images/4_max_iter_output.png
Binary file added elasticnet/images/5_tol_input.png
Binary file added elasticnet/images/5_tol_output.png
Binary file added elasticnet/images/6_X_input.png
Binary file added elasticnet/images/6_X_output.png
Binary file added elasticnet/images/7_sample_input.png
Binary file added elasticnet/images/7_sample_output.png
Binary file added elasticnet/images/8_nan_input.png
Binary file added elasticnet/images/8_nan_output.png
Binary file added elasticnet/images/9_y_input.png
Binary file added elasticnet/images/9_y_output.png
Binary file added elasticnet/images/california_evaluation.png
Binary file added elasticnet/images/california_plot_1.png
Binary file added elasticnet/images/california_plot_2.jpeg
Binary file added elasticnet/images/netflix_evaluation_metrics.jpeg
Binary file added elasticnet/images/netflix_plot_1.jpeg
Binary file added elasticnet/images/netflix_plot_2.jpeg
Binary file added elasticnet/images/small_test_evaluation.jpeg
Binary file added elasticnet/images/small_test_plot.jpeg
Binary file added elasticnet/images/synthetic_evaluation.jpeg
Binary file added elasticnet/images/synthetic_plot.jpeg
96 changes: 88 additions & 8 deletions elasticnet/models/ElasticNet.py
@@ -1,17 +1,97 @@

import numpy as np

class ElasticNetModel():
def __init__(self):
pass
def __init__(self, alpha=1.0, l1_ratio=0.5, learning_rate=0.01, max_iter=1000, tol=1e-4):
if not isinstance(alpha, (float, int)) or alpha < 0:
raise ValueError("alpha must be a non-negative float or integer.")
if not isinstance(l1_ratio, (float, int)) or not (0 <= l1_ratio <= 1):
raise ValueError("l1_ratio must be a float between 0 and 1.")
if not isinstance(learning_rate, (float, int)) or learning_rate <= 0:
raise ValueError("learning_rate must be a positive float.")
if not isinstance(max_iter, int) or max_iter <= 0:
raise ValueError("max_iter must be a positive integer.")
if not isinstance(tol, (float, int)) or tol <= 0:
raise ValueError("tol must be a positive float.")

self.alpha = alpha # Combined regularization strength
self.l1_ratio = l1_ratio # L1:L2 ratio (0 for Ridge, 1 for Lasso, between for ElasticNet)
self.learning_rate = learning_rate # Step size for gradient descent
self.max_iter = max_iter # Maximum iterations for gradient descent
self.tol = tol # Tolerance for stopping criterion
self.weight_ = None
self.bias_ = None
self.is_fitted = False

# L1 and L2 penalties
self.l1_penalty = self.alpha * self.l1_ratio
self.l2_penalty = self.alpha * (1 - self.l1_ratio)
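
        # Assumed objective minimized by fit() below (a sketch; the diff does not state it explicitly):
        #   J(w, b) = (1/n) * ( ||y - X @ w - b||^2 + l1_penalty * ||w||_1 + l2_penalty * ||w||_2^2 )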

def _validate_input(self, X, y=None):
if not isinstance(X, np.ndarray):
raise TypeError("X must be a numpy array")

if y is not None:
if not isinstance(y, np.ndarray):
raise TypeError("y must be a numpy array")
if len(X) != len(y):
raise ValueError("X and y must have the same number of samples")

if np.isnan(X).any():
raise ValueError("X contains NaN values")

if y is not None and np.isnan(y).any():
raise ValueError("y contains NaN values")

def fit(self, X, y):
return ElasticNetModelResults()
self._validate_input(X, y)

n_samples, n_features = X.shape
self.weight_ = np.zeros(n_features) # Initialize weights
self.bias_ = 0 # Initialize bias

self.is_fitted = True

for _ in range(self.max_iter):
y_pred = np.dot(X, self.weight_) + self.bias_

if np.isnan(y_pred).any():
raise ValueError("NaN values detected in predictions during gradient descent")

residuals = y - y_pred

            # Gradient of the ElasticNet objective, using the subgradient of the L1 term
            # (the sign of each weight determines the direction of the L1 contribution)
            dW = np.zeros(n_features)  # Gradient for the weights
            for j in range(n_features):
                if self.weight_[j] > 0:
                    dW[j] = (-2 * X[:, j].dot(residuals) + self.l1_penalty +
                             2 * self.l2_penalty * self.weight_[j]) / n_samples
                else:
                    dW[j] = (-2 * X[:, j].dot(residuals) - self.l1_penalty +
                             2 * self.l2_penalty * self.weight_[j]) / n_samples

# Gradient for bias
db = -2 * np.sum(residuals) / n_samples

# Update weights and bias
self.weight_ -= self.learning_rate * dW
self.bias_ -= self.learning_rate * db

# Check stopping criterion
if np.linalg.norm(self.learning_rate * dW) < self.tol and abs(self.learning_rate * db) < self.tol:
break

return ElasticNetModelResults(self.weight_, self.bias_)

class ElasticNetModelResults():
def __init__(self):
pass
def __init__(self, weight_, bias_):
self.weight_ = weight_
self.bias_ = bias_

def predict(self, x):
return 0.5
def predict(self, X):
if not isinstance(X, np.ndarray):
raise TypeError("X must be a numpy array")

if np.isnan(X).any():
raise ValueError("X contains NaN values")

return np.dot(X, self.weight_) + self.bias_
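
A minimal usage sketch for the classes above, run on synthetic data. The data, random seed, and hyperparameter values here are illustrative assumptions, not taken from this PR:

import numpy as np
from elasticnet.models.ElasticNet import ElasticNetModel

# Synthetic regression problem: y = 3*x0 - 2*x1 + 1 + noise
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
y = 3 * X[:, 0] - 2 * X[:, 1] + 1 + 0.1 * rng.normal(size=200)

model = ElasticNetModel(alpha=0.1, l1_ratio=0.5, learning_rate=0.01, max_iter=5000, tol=1e-6)
results = model.fit(X, y)     # returns an ElasticNetModelResults instance
preds = results.predict(X)    # predictions from the learned weights and bias

print("weights:", results.weight_)
print("bias:", results.bias_)
print("training MSE:", np.mean((preds - y) ** 2))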
20,641 changes: 20,641 additions & 0 deletions elasticnet/tests/california_housing.csv

Large diffs are not rendered by default.

83 changes: 83 additions & 0 deletions elasticnet/tests/california_housing_plot.ipynb

Large diffs are not rendered by default.

1,010 changes: 1,010 additions & 0 deletions elasticnet/tests/netflix.csv

Large diffs are not rendered by default.

79 changes: 79 additions & 0 deletions elasticnet/tests/netflix_dataset_plot.ipynb

Large diffs are not rendered by default.

79 changes: 70 additions & 9 deletions elasticnet/tests/test_ElasticNetModel.py
@@ -1,19 +1,80 @@
import csv

import numpy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from elasticnet.models.ElasticNet import ElasticNetModel

def test_predict():
model = ElasticNetModel()
model = ElasticNetModel(alpha=35, l1_ratio=0.7, learning_rate=0.001, max_iter=5000, tol=1e-5)
data = []
with open("small_test.csv", "r") as file:
column_names = None
with open("elasticnet/tests/small_test.csv", "r") as file:
reader = csv.DictReader(file)
column_names = [k for k in reader.fieldnames if k.startswith('x')]
for row in reader:
data.append(row)

X = numpy.array([[v for k,v in datum.items() if k.startswith('x')] for datum in data])
y = numpy.array([[v for k,v in datum.items() if k=='y'] for datum in data])
results = model.fit(X,y)
preds = results.predict(X)
assert preds == 0.5
X = np.array([[float(v) for k,v in datum.items() if k.startswith('x')] for datum in data])
y = np.array([float(v) for datum in data for k,v in datum.items() if k=='y'])

# Standardize the features to improve model performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (to evaluate generalization performance)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

results = model.fit(X_train, y_train)

# Predict on both training and testing sets
preds_train = results.predict(X_train)
preds_test = results.predict(X_test)

# Calculate Mean Squared Error (MSE) for both train and test sets
mse_train = np.mean((preds_train - y_train) ** 2)
mse_test = np.mean((preds_test - y_test) ** 2)

# Calculate R-squared for both train and test sets
ss_res_train = np.sum((y_train - preds_train) ** 2)
ss_tot_train = np.sum((y_train - np.mean(y_train)) ** 2)
r_squared_train = 1 - (ss_res_train / ss_tot_train)

ss_res_test = np.sum((y_test - preds_test) ** 2)
ss_tot_test = np.sum((y_test - np.mean(y_test)) ** 2)
r_squared_test = 1 - (ss_res_test / ss_tot_test)

# Print MSE and R-squared for both training and testing sets
print(f"Train Mean Squared Error: {mse_train}")
print(f"Test Mean Squared Error: {mse_test}")
print(f"Train R-squared: {r_squared_train}")
print(f"Test R-squared: {r_squared_test}")

# Print first 5 predictions and actual values for the test set
print("First 5 test predictions:", preds_test[:5])
print("First 5 test actual values:", y_test[:5])

# Plot actual vs predicted values for test set against a selected feature (e.g., first feature in X_test)
feature_index = 0
X_test_feature = X_test[:, feature_index]
feature_name = column_names[feature_index]

plt.figure(figsize=(10, 6))

# Plot actual values (y_test) in green
plt.scatter(X_test_feature, y_test, color='green', label='Actual Values', alpha=0.6)

# Plot predicted values (y_pred) in blue
plt.scatter(X_test_feature, preds_test, color='blue', label='Predicted Values', alpha=0.6)

# Adding labels and title
plt.xlabel(f'Feature {feature_name}')
plt.ylabel('Target Value (y)')
plt.title(f'Feature {feature_name} vs Actual and Predicted Values')

plt.legend()
plt.show()
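    # A basic sanity assertion could be added here (a suggestion, not part of the original diff), e.g.:
    # assert r_squared_test > 0, "model should beat a constant-mean baseline on the test split"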

if __name__ == "__main__":
test_predict()
129 changes: 129 additions & 0 deletions elasticnet/tests/test_california_housing.py
@@ -0,0 +1,129 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from elasticnet.models.ElasticNet import ElasticNetModel

class ElasticNetTest:
def __init__(self, data_path):
self.data_path = data_path
        self.model = None
        self.results = None
self.X_train = None
self.X_test = None
self.y_train = None
self.y_test = None
self.scaler = StandardScaler()

def load_and_preprocess_data(self):
df = pd.read_csv(self.data_path)

# Drop the ocean_proximity column as it's categorical
df = df.drop('ocean_proximity', axis=1)

# Handle missing values if any
df = df.fillna(df.mean())

self.feature_names = df.drop('median_house_value', axis=1).columns.tolist()

# Separate features and target
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

# Scale the features
X_scaled = self.scaler.fit_transform(X)

# Split the data
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
X_scaled, y, test_size=0.2, random_state=42
)

# Convert to numpy arrays and ensure correct shapes
self.X_train = np.array(self.X_train)
self.X_test = np.array(self.X_test)

# Reshape y values to match model expectations (flatten to 1D array)
self.y_train = np.array(self.y_train).flatten()
self.y_test = np.array(self.y_test).flatten()

return self

def train_and_evaluate(self):
# Initialize and train the model
self.model = ElasticNetModel(
alpha=30,
l1_ratio=0.7,
learning_rate=0.01,
max_iter=1000,
tol=1e-4
)

# Fit the model
        self.results = self.model.fit(self.X_train, self.y_train)

        # Make predictions with the fitted results
        train_predictions = self.results.predict(self.X_train)
        test_predictions = self.results.predict(self.X_test)

# Reshape predictions if needed for metric calculations
train_predictions = np.array(train_predictions).flatten()
test_predictions = np.array(test_predictions).flatten()

# Calculate metrics
metrics = {
'train_mse': mean_squared_error(self.y_train, train_predictions),
'test_mse': mean_squared_error(self.y_test, test_predictions),
'train_r2': r2_score(self.y_train, train_predictions),
'test_r2': r2_score(self.y_test, test_predictions)
}

return metrics

def plot_test_vs_pred(self, X_test, y_test, y_pred):
feature_index = 0
X_test_feature = X_test[:, feature_index]

feature_name = self.feature_names[feature_index]

# Plot y_test and y_pred against the selected feature from X_test
plt.figure(figsize=(10, 6))

# Plot actual values (y_test) in green
plt.scatter(X_test_feature, y_test, color='green', label='Actual Values', alpha=0.6)

# Plot predicted values (y_pred) in blue
plt.scatter(X_test_feature, y_pred, color='blue', label='Predicted Values', alpha=0.6)

plt.xlabel(f'Feature {feature_name}')
plt.ylabel('Median House Value')
plt.title(f'Feature {feature_name} vs Actual and Predicted Median House Value')

plt.legend()
plt.show()

def run_test():
# Initialize test
test = ElasticNetTest('elasticnet/tests/california_housing.csv')

# Preprocess data
test.load_and_preprocess_data()

# Train and evaluate
metrics = test.train_and_evaluate()

# Print results
print("\nModel Evaluation Metrics:")
print(f"Training MSE: {metrics['train_mse']:.2f}")
print(f"Test MSE: {metrics['test_mse']:.2f}")
print(f"Training R-squared: {metrics['train_r2']:.4f}")
print(f"Test R-squared: {metrics['test_r2']:.4f}")

    # Reuse the already-fitted results for plotting (no need to refit)
    test_predictions = test.results.predict(test.X_test)

# Plot the results
test.plot_test_vs_pred(test.X_test, test.y_test, test_predictions)

if __name__ == "__main__":
run_test()