Fall2024CS584 · Jerry-zirui · Oct 11, 2024 · Oct 11, 2024 · Oct 11, 2024
diff --git a/Data_Generator.bat b/Data_Generator.bat
@@ -0,0 +1,2 @@
+python generate_regression_data.py -N 100 -m 2.5 1.5 -b 0.5 -scale 1.0 -rnge -10 10 -seed 42 -output_file train_data.csv
+python generate_regression_data.py -N 30 -m 2.5 1.5 -b 0.5 -scale 1.0 -rnge -10 10 -seed 42 -output_file test_data.csv
diff --git a/Data_Generator.sh b/Data_Generator.sh
@@ -0,0 +1,2 @@
+python generate_regression_data.py -N 100 -m 2.5 1.5 -b 0.5 -scale 1.0 -rnge -10 10 -seed 42 -output_file train_data.csv
+python generate_regression_data.py -N 30 -m 2.5 1.5 -b 0.5 -scale 1.0 -rnge -10 10 -seed 42 -output_file test_data.csv
diff --git a/README.md b/README.md
@@ -1,8 +1,41 @@
-# Project 1 
+## Group Member(s)
+ZIRUI OU A20516756
 
-Put your README here. Answer the following questions.
+### What does the model do and when should it be used?
 
-* What does the model you have implemented do and when should it be used?
-* How did you test your model to determine if it is working reasonably correctly?
-* What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples.)
-* Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental?
+The ElasticNetModel is a type of linear regression that combines L1 and L2 regularization. It's ideal for predicting continuous outcomes, especially useful in datasets with irrelevant features or highly correlated features.
+
+This model should be used when you suspect that your data contains irrelevant features or when the features are highly correlated. It's particularly useful in scenarios where you need a model that's robust against issues like multicollinearity (where independent variables are correlated) and when you want to prevent overfitting in your predictive model.
+
+### How did you test your model to determine if it is working reasonably correctly?
+The script tests the ElasticNet model by training it on a set of training data, making predictions on a separate test dataset, and then calculating the Mean Squared Error (MSE) between the predicted and actual values to assess accuracy. It ensures the MSE is below a threshold of 1 to verify the model's performance.
+
+
+### What parameters are exposed to users to tune performance?
+
+- **`lr` (Learning Rate):** Controls the update magnitude of model coefficients.
+- **`n_iter` (Number of Iterations):** Determines how many times the model will process the entire dataset.
+- **`l1_ratio` (L1 Ratio):** Balances between L1 and L2 regularization.
+- **`alpha` (Regularization Strength):** Adjusts the overall strength of the regularization.
+
+#### Basic Usage Example
+```python
+
+model = ElasticNetModel(lr=0.01, n_iter=1000, l1_ratio=0.5, alpha=1.0)
+model.fit(X_train, y_train)
+predictions = model.predict(X_test)
+print(predictions)
+
+```
+
+### Are there specific inputs that your implementation has trouble with?
+Yes, the model struggles with non-numeric data, missing values, due to its basic implementation.
+
+### Given more time, could these issues be worked around?
+Yes, with more time, enhancements like automatic handling of non-numeric data and missing values, could be implemented to make the model more robust and efficient.
+
+
+### Before you RUN:
+1. please using `pip install numba numpy` to install numba and numpy before run it.
+1. And make sure `test_data.csv` and `train_data.csv` are in the correct location, if not there, use one of the `Data_Generator` scripts to generate it according to the platform you are using..
+2. Now you should ready to run the test program using `python elasticnet\tests\test_ElasticNetModel.py`.
diff --git a/elasticnet/models/ElasticNet.py b/elasticnet/models/ElasticNet.py
@@ -1,17 +1,67 @@
+import numpy as np
+from numba import jit
+from typing import Tuple
 
+class ElasticNetModel:
+    def __init__(
+            self,
+            learning_rate: float = 0.01,
+            iterations: int = 1000,
+            l1_ratio: float = 0.5,
+            alpha: float = 1.0) -> None:
+
+        self.learning_rate = learning_rate
+        self.iterations = iterations
+        self.l1_ratio = l1_ratio
+        self.alpha = alpha
+        self.weights = np.empty(0)
+        self.bias = 0.0
 
-class ElasticNetModel():
-    def __init__(self):
-        pass
+    def fit(
+            self,
+            features: np.ndarray,
+            target: np.ndarray) -> None:
+
+        num_samples, num_features = features.shape
+        self.weights = np.zeros(num_features)
+        self.bias = 0.0
 
+        self.weights, self.bias = self._optimize(
+            features,
+            target,
+            self.weights,
+            self.bias,
+            self.learning_rate,
+            self.iterations,
+            self.alpha,
+            self.l1_ratio,
+            num_samples)
 
-    def fit(self, X, y):
-        return ElasticNetModelResults()
+    @staticmethod
+    @jit(nopython=True, nogil=True)
+    def _optimize(
+        features: np.ndarray,
+        target: np.ndarray,
+        weights: np.ndarray,
+        bias: float,
+        learning_rate: float,
+        iterations: int,
+        alpha: float,
+        l1_ratio: float,
+        num_samples: int) -> Tuple[np.ndarray, float]:
 
+        for _ in range(iterations):
+            predictions = np.dot(features, weights) + bias
+            errors = predictions - target
 
-class ElasticNetModelResults():
-    def __init__(self):
-        pass
+            l2_gradient = 2 * weights
+            l1_gradient = np.sign(weights)
 
-    def predict(self, x):
-        return 0.5
+            weights -= learning_rate * ((1 / num_samples) * np.dot(features.T, errors) + alpha * ((1 - l1_ratio) * l2_gradient + l1_ratio * l1_gradient))
+            bias -= learning_rate * (1 / num_samples) * np.sum(errors)
+
+        return weights, bias
+
+    def predict(self, features: np.ndarray) -> np.ndarray:
+        predictions = np.dot(features, self.weights) + self.bias
+        return predictions
diff --git a/elasticnet/tests/small_test.csv b/elasticnet/tests/small_test.csv
diff --git a/elasticnet/tests/test_ElasticNetModel.py b/elasticnet/tests/test_ElasticNetModel.py
@@ -1,19 +1,45 @@
 import csv
+import numpy as np
+import sys
+import os
 
-import numpy
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(os.path.dirname(current_dir))
+sys.path.append(project_root)
 
 from elasticnet.models.ElasticNet import ElasticNetModel
 
-def test_predict():
-    model = ElasticNetModel()
+def load_data(filepath):
     data = []
-    with open("small_test.csv", "r") as file:
+    with open(filepath, "r") as file:
         reader = csv.DictReader(file)
         for row in reader:
             data.append(row)
+    X = np.array([[float(v) for k, v in datum.items() if k.startswith('x')] for datum in data])
+    y = np.array([float(datum['y']) for datum in data])
+    return X, y
+
+def test_predict():
+    model = ElasticNetModel()
+
+    train_X, train_y = load_data(os.path.join(project_root, 'train_data.csv'))
+
+    test_X, test_y = load_data(os.path.join(project_root, 'test_data.csv'))
+
+    model.fit(train_X, train_y)
+
+    preds = model.predict(test_X)
+    #print(f"prediction:\n {preds}")
+
+    mse = np.mean((preds - test_y) ** 2)
+    #print(mse)
+    assert mse < 1
+
+    print("Actual\tPredicted\tAbs Error")
+
+    [print(f"{a}\t{p}\t{abs(a-p)}") for a, p in zip(test_y, preds)]
+
+    print(f"MSE : {mse}")
 
-    X = numpy.array([[v for k,v in datum.items() if k.startswith('x')] for datum in data])
-    y = numpy.array([[v for k,v in datum.items() if k=='y'] for datum in data])
-    results = model.fit(X,y)
-    preds = results.predict(X)
-    assert preds == 0.5
+if __name__ == "__main__":
+    test_predict()
diff --git a/regularized_discriminant_analysis/models/RegularizedDiscriminantAnalysis.py b/regularized_discriminant_analysis/models/RegularizedDiscriminantAnalysis.py
diff --git a/regularized_discriminant_analysis/test_rdamodel.py b/regularized_discriminant_analysis/test_rdamodel.py
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,5 @@
 numpy
 pytest
 ipython
+numba
+scipy
diff --git a/test_data.csv b/test_data.csv
@@ -0,0 +1,31 @@
+x_0,x_1,y
+5.479120971119267,-1.2224312049589532,11.498324504666494
+7.171958398227648,3.9473605811872776,25.319215221941516
+-8.11645304224701,9.512447032735118,-7.205331828130651
+5.222794039807059,5.721286105539075,21.804029227840488
+-7.4377273464890825,-0.9922812420886569,-19.41998716425069
+-2.5840395153483753,8.535299776972035,7.429073208446392
+2.8773024016132904,6.455232265416598,18.087330981950977
+-1.131716023453377,-5.455225564304462,-9.718781169890208
+1.0916957403166965,-8.723654877916493,-10.204968038331437
+6.55262343985164,2.6332879824412974,20.36913878062648
+5.1617548017074775,-2.9094806374026323,9.8981419294219
+9.413960487898066,7.862422426443953,35.63723053452948
+5.567669941475238,-6.1072258429606485,3.9826497659091995
+-0.6655799254593155,-9.123924684255424,-15.983124054034906
+-6.914210158649043,3.6609790648490925,-12.21350908535058
+4.895243118156342,9.350194648684202,27.260560512470924
+-3.4834928372369607,-2.5908058793026223,-11.952515175975769
+-0.6088837744838411,-6.2105728183142865,-9.647583309613264
+-7.401569893290567,-0.48590147548132556,-19.16002959278494
+-5.461813018982317,3.396279893650206,-7.901573015903769
+-1.2569616225533853,6.653563921156749,7.9635322193189975
+4.005302040044983,-3.7526671723591782,4.574907801853451
+6.645196027904021,6.095287149936038,26.71269603222152
+-2.2504324193965104,-4.233437921395118,-12.138163871650603
+3.6499100794995094,-7.204950327813804,-1.5457041395370035
+-6.001835950497833,-9.85275460497989,-29.665459677712747
+5.738487550042768,3.2970171318406436,18.595904927278845
+4.1033075725267025,5.614580620439359,19.667112342761378
+-0.8216844892332009,1.3748239190578744,0.0386223153010854
+-7.204060037446851,-7.709398529280531,-29.061753768810235
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		python generate_regression_data.py -N 100 -m 2.5 1.5 -b 0.5 -scale 1.0 -rnge -10 10 -seed 42 -output_file train_data.csv
		python generate_regression_data.py -N 30 -m 2.5 1.5 -b 0.5 -scale 1.0 -rnge -10 10 -seed 42 -output_file test_data.csv
-Original file line number
+Diff line change
@@ -1,3 +1,5 @@
     numpy
     pytest
     ipython
+    numba
+    scipy