"""Gradient boosting for regression built on small NumPy regression trees."""

import numpy as np


# Define the function to calculate the (negative) gradient of the squared loss
def squared_loss_gradient(y, f):
    """
    Compute the negative gradient of the squared loss 0.5 * (y - f) ** 2.

    For the squared loss the negative gradient with respect to the
    prediction is simply the residual, which is what each boosting stage
    is trained to predict.

    Parameters:
    - y (np.array): The target values.
    - f (np.array): The predicted values.

    Returns:
    - np.array: The residuals y - f.
    """
    return y - f


# Define the Node class to represent each node in the decision tree
class Node:
    """
    A node in the decision tree.

    A node is a leaf when `value` is not None; otherwise it is an internal
    node and `feature`, `threshold`, `left` and `right` are set.

    Attributes:
    - value (float): The value at the node, used for leaf nodes.
    - left (Node): Left child node (samples with feature <= threshold).
    - right (Node): Right child node (all other samples).
    - threshold (float): The threshold for splitting.
    - feature (int): The index of the feature used for splitting.
    """
    def __init__(self, value=None, left=None, right=None, threshold=None, feature=None):
        self.value = value
        self.left = left
        self.right = right
        self.threshold = threshold
        self.feature = feature


# Define the DecisionTree class for building the regression tree
class DecisionTree:
    """
    A simple decision tree for regression.

    Attributes:
    - max_depth (int): The maximum depth of the tree.
    - root (Node): The root node of the tree (None until fit is called).
    """
    def __init__(self, max_depth=3):
        self.max_depth = max_depth
        self.root = None

    def fit(self, X, residuals):
        """
        Fit the decision tree to the residuals.

        Parameters:
        - X (np.array): Feature matrix of shape (n_samples, n_features).
        - residuals (np.array): Residuals to fit, one per sample.
        """
        X = np.asarray(X)
        residuals = np.asarray(residuals, dtype=float)
        self.root = self._build_tree(X, residuals, depth=0)

    def _build_tree(self, X, residuals, depth):
        """
        Recursively build the decision tree.

        Parameters:
        - X (np.array): Feature matrix.
        - residuals (np.array): Residuals to fit.
        - depth (int): Current depth of the tree.

        Returns:
        - Node: The constructed tree node.
        """
        num_samples = X.shape[0]
        if depth >= self.max_depth or num_samples <= 1:
            return Node(value=np.mean(residuals))

        best_feature, best_threshold = self._best_split(X, residuals)
        if best_feature is None:
            # No threshold separates the samples (every feature is constant
            # here), so this node has to be a leaf. Without this guard the
            # code below would index X with None and crash.
            return Node(value=np.mean(residuals))

        # The right branch is the complement of the left one, which matches
        # the if/else routing used at prediction time.
        left_mask = X[:, best_feature] <= best_threshold
        right_mask = ~left_mask
        left_node = self._build_tree(X[left_mask], residuals[left_mask], depth + 1)
        right_node = self._build_tree(X[right_mask], residuals[right_mask], depth + 1)
        return Node(feature=best_feature, threshold=best_threshold, left=left_node, right=right_node)

    @staticmethod
    def _best_split(X, residuals):
        """
        Find the split that minimises the total squared error of the children.

        Each child's variance is weighted by its number of samples. An
        unweighted sum of variances is biased towards splits that isolate a
        single sample, because a one-element child always has variance 0.

        Parameters:
        - X (np.array): Feature matrix.
        - residuals (np.array): Residuals to fit.

        Returns:
        - tuple: (feature index, threshold), or (None, None) when no split
          leaves samples on both sides.
        """
        num_samples = X.shape[0]
        best_feature, best_threshold, best_error = None, None, np.inf
        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                num_left = np.count_nonzero(left_mask)
                if num_left == 0 or num_left == num_samples:
                    continue
                num_right = num_samples - num_left
                error = (num_left * np.var(residuals[left_mask])
                         + num_right * np.var(residuals[~left_mask]))
                if error < best_error:
                    best_feature, best_threshold, best_error = feature, threshold, error
        return best_feature, best_threshold

    def predict(self, X):
        """
        Make predictions using the decision tree.

        Parameters:
        - X (np.array): Feature matrix.

        Returns:
        - np.array: Predicted values.
        """
        X = np.asarray(X)
        return np.array([self._predict(x, self.root) for x in X])

    def _predict(self, x, node):
        """
        Predict a single sample by walking from `node` down to a leaf.

        Parameters:
        - x (np.array): Single feature vector.
        - node (Node): Node to start from (normally the root).

        Returns:
        - float: Predicted value.
        """
        # Iterative descent: internal nodes are the ones without a value.
        while node.value is None:
            if x[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value


# Define the GradientBoosting class for boosting decision trees
class GradientBoosting:
    """
    Gradient Boosting for regression with squared loss.

    Attributes:
    - n_estimators (int): Number of boosting stages to perform.
    - learning_rate (float): Learning rate shrinks the contribution of each tree.
    - max_depth (int): Maximum depth of each decision tree.
    - trees (list): Fitted decision trees, in boosting order.
    - initial_prediction (float): Mean of the training targets (None until fit).
    """
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.initial_prediction = None

    def fit(self, X, y):
        """
        Fit the gradient boosting model.

        Calling fit again discards the previously fitted trees.

        Parameters:
        - X (np.array): Feature matrix of shape (n_samples, n_features).
        - y (np.array): Target values of shape (n_samples,).

        Raises:
        - ValueError: If X is not 2-D, y is not 1-D, their lengths differ,
          or there are no samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be a 1-D array with one target per row of X")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty dataset")

        # Start from scratch so a second fit does not stack on stale trees.
        self.trees = []

        # Initialize the first model to the mean of y
        self.initial_prediction = np.mean(y)
        f_m = np.full(y.shape, self.initial_prediction)

        for _ in range(self.n_estimators):
            residuals = squared_loss_gradient(y, f_m)
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X, residuals)
            f_m += self.learning_rate * tree.predict(X)
            self.trees.append(tree)  # Store the tree instead of predictions

    def predict(self, X):
        """
        Make predictions using the boosted model.

        Parameters:
        - X (np.array): Feature matrix.

        Returns:
        - np.array: Predicted values.

        Raises:
        - RuntimeError: If the model has not been fitted yet.
        """
        if self.initial_prediction is None:
            raise RuntimeError("GradientBoosting.predict called before fit")

        X = np.asarray(X, dtype=float)

        # Start with the initial mean prediction
        f_m = np.full(X.shape[0], self.initial_prediction)

        # Accumulate predictions from each tree
        for tree in self.trees:
            f_m += self.learning_rate * tree.predict(X)

        return f_m
+- `max_depth`: The maximum depth of each decision tree (default is 3). + +### Basic Usage Example + +```python +from gradient_boosting import GradientBoosting +from sklearn.model_selection import train_test_split +from sklearn.datasets import fetch_california_housing +from sklearn.preprocessing import StandardScaler + +# Load data +data = fetch_california_housing() +X, y = data.data, data.target + +# Split data +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +# Scale features +scaler = StandardScaler() +X_train_scaled = scaler.fit_transform(X_train) +X_test_scaled = scaler.transform(X_test) + +# Initialize and train the gradient boosting model +model = GradientBoosting(n_estimators=50, learning_rate=0.1, max_depth=3) +model.fit(X_train_scaled, y_train) + +# Predict and evaluate +predictions = model.predict(X_test_scaled) +print(predictions) +``` + +## Potential Issues and Workarounds + +The model may encounter difficulties with specific types of inputs such as: + +- **Extremely Noisy Data**: High levels of noise can lead to overfitting, where the model learns the noise as patterns, degrading prediction accuracy on new data. +- **Outliers**: Outliers can disproportionately influence the decision boundaries established by the decision trees, leading to suboptimal models. + +### Workarounds + +To enhance model robustness and performance: +- **Preprocessing Steps**: Implement robust preprocessing steps to handle outliers and noise, such as outlier detection algorithms or robust scaling methods. +- **Advanced Techniques**: Explore integrating outlier detection algorithms and advanced noise filtering techniques before fitting the model to improve its generalization capabilities. + +## Contribution + +Contributions are highly appreciated and critical for the ongoing improvement of the model. 
# Libraries used by the evaluation script
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# GradientBoosting class is in a file named gradient_boosting.py
from gradient_boosting import GradientBoosting


def main():
    """
    Train GradientBoosting on California Housing and print the test MSE.

    The data is split 80/20 with a fixed seed, the features are
    standardised using statistics from the training split only, and the
    mean squared error on the held-out split is printed.
    """
    housing = fetch_california_housing()

    # Hold out 20% of the rows for evaluation (fixed seed for repeatability)
    features_train, features_test, target_train, target_test = train_test_split(
        housing.data, housing.target, test_size=0.2, random_state=42
    )

    # Learn the scaling on the training split, then reuse it for the test split
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(features_train)
    test_scaled = scaler.transform(features_test)

    booster = GradientBoosting(n_estimators=50, learning_rate=0.1, max_depth=3)
    booster.fit(train_scaled, target_train)

    # Score the held-out predictions
    test_predictions = booster.predict(test_scaled)
    test_error = mean_squared_error(target_test, test_predictions)
    print("Mean Squared Error on Test Set:", test_error)


if __name__ == "__main__":
    main()
- **Advanced Techniques**: Explore integrating outlier detection algorithms and advanced noise filtering techniques before fitting the model to improve its generalization capabilities. -## Contribution - -Contributions are highly appreciated and critical for the ongoing improvement of the model. If you are interested in enhancing functionality, improving efficiency, or extending the usability of this model, you are encouraged to: - -- **Fork the Repository**: Create your fork of the repository where you can make your changes. -- **Make Changes**: Implement your enhancements or fixes. -- **Create a Pull Request**: Submit a pull request for your changes to be reviewed and potentially merged into the main project. - -Your contributions not only help improve the project but also provide valuable learning opportunities through collaboration and feedback. From 7e59a56a545107f25a4abd150e439b19d2614165 Mon Sep 17 00:00:00 2001 From: Ushad2000 <167986189+Ushad2000@users.noreply.github.com> Date: Fri, 22 Nov 2024 03:07:31 +0530 Subject: [PATCH 5/9] Update Readme.md --- Readme.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Readme.md b/Readme.md index 3f55607..a937f4d 100644 --- a/Readme.md +++ b/Readme.md @@ -31,6 +31,17 @@ Users can tune the following parameters to optimize the model's performance: - `learning_rate`: The step size at each iteration to control overfitting (default is 0.1). - `max_depth`: The maximum depth of each decision tree (default is 3). 
+### Prerequisites + +Ensure you have Python installed along with the following libraries: +- `numpy` +- `scikit-learn` + +To install missing dependencies, use: +```bash +pip install numpy scikit-learn +``` + ### Basic Usage Example ```python @@ -60,6 +71,16 @@ predictions = model.predict(X_test_scaled) print(predictions) ``` +## Running Tests +To test the model on the California Housing dataset, run: +```bash +python testing.py +``` +The script will: +- **Load the dataset.** +- **Train and test the Gradient Boosting model.** +- **Output the Mean Squared Error (MSE) of the predictions.** + ## Potential Issues and Workarounds The model may encounter difficulties with specific types of inputs such as: From a2c2cfed146211e1003f1c923c0eb0d00b9d7f88 Mon Sep 17 00:00:00 2001 From: Ushad2000 <167986189+Ushad2000@users.noreply.github.com> Date: Thu, 21 Nov 2024 23:06:09 -0600 Subject: [PATCH 6/9] Update Readme.md --- Readme.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Readme.md b/Readme.md index a937f4d..8c374c5 100644 --- a/Readme.md +++ b/Readme.md @@ -1,10 +1,14 @@ # Project 2 -By Usha Devaraju +BY +A20539949-Usha Devaraju +A20548244-Roopashri Kommana +A20550565-Sai Sandeep Neerukonda # Gradient Boosting for Regression This repository contains a custom implementation of a Gradient Boosting model for regression tasks, using decision trees as base learners. The model is designed to be versatile and easily adjustable to fit various regression problems. +1.What does the model you have implemented do and when should it be used? ## Model Description The Gradient Boosting model implemented here constructs an ensemble of decision trees in a sequential manner, where each tree is built to correct the errors made by the previous ones. The model is particularly useful for datasets where relationships between features and the target variable are complex and non-linear. 
@@ -16,6 +20,7 @@ This model should be used when: - Handling datasets with complex and non-linear relationships. - Situations where other simpler models (like linear regression) are insufficient. +2.How did you test your model to determine if it is working reasonably correctly? ## Testing the Model The model has been rigorously tested using the California Housing dataset, which is a standard dataset for evaluating regression models. The testing involves: @@ -24,6 +29,7 @@ The model has been rigorously tested using the California Housing dataset, which - Training the Gradient Boosting model on the training data. - Evaluating its performance using Mean Squared Error (MSE) on the test set. +3.What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples. ## Exposed Parameters Users can tune the following parameters to optimize the model's performance: @@ -81,6 +87,7 @@ The script will: - **Train and test the Gradient Boosting model.** - **Output the Mean Squared Error (MSE) of the predictions.** +4.Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental? 
## Potential Issues and Workarounds The model may encounter difficulties with specific types of inputs such as: From ad310969bd9bf51f0e73d75509e61fc247f1e7cb Mon Sep 17 00:00:00 2001 From: Ushad2000 <167986189+Ushad2000@users.noreply.github.com> Date: Thu, 21 Nov 2024 23:06:58 -0600 Subject: [PATCH 7/9] Update Readme.md --- Readme.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Readme.md b/Readme.md index 8c374c5..fe642c0 100644 --- a/Readme.md +++ b/Readme.md @@ -1,7 +1,9 @@ # Project 2 BY A20539949-Usha Devaraju + A20548244-Roopashri Kommana + A20550565-Sai Sandeep Neerukonda # Gradient Boosting for Regression From d99ea4a74c0f2da317d3cbc7705f4be4891666e7 Mon Sep 17 00:00:00 2001 From: Ushad2000 <167986189+Ushad2000@users.noreply.github.com> Date: Thu, 21 Nov 2024 23:10:09 -0600 Subject: [PATCH 8/9] Update Readme.md --- Readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/Readme.md b/Readme.md index fe642c0..b0eabfc 100644 --- a/Readme.md +++ b/Readme.md @@ -1,5 +1,6 @@ # Project 2 BY + A20539949-Usha Devaraju A20548244-Roopashri Kommana From 5b024c4fb51aa00b295fef0149b3cc280eb91047 Mon Sep 17 00:00:00 2001 From: Ushad2000 <167986189+Ushad2000@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:57:43 +0530 Subject: [PATCH 9/9] Update Readme.md --- Readme.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Readme.md b/Readme.md index b0eabfc..fb527cc 100644 --- a/Readme.md +++ b/Readme.md @@ -11,7 +11,7 @@ A20550565-Sai Sandeep Neerukonda This repository contains a custom implementation of a Gradient Boosting model for regression tasks, using decision trees as base learners. The model is designed to be versatile and easily adjustable to fit various regression problems. -1.What does the model you have implemented do and when should it be used? +# 1.What does the model you have implemented do and when should it be used? 
## Model Description The Gradient Boosting model implemented here constructs an ensemble of decision trees in a sequential manner, where each tree is built to correct the errors made by the previous ones. The model is particularly useful for datasets where relationships between features and the target variable are complex and non-linear. @@ -23,7 +23,7 @@ This model should be used when: - Handling datasets with complex and non-linear relationships. - Situations where other simpler models (like linear regression) are insufficient. -2.How did you test your model to determine if it is working reasonably correctly? +# 2.How did you test your model to determine if it is working reasonably correctly? ## Testing the Model The model has been rigorously tested using the California Housing dataset, which is a standard dataset for evaluating regression models. The testing involves: @@ -32,7 +32,7 @@ The model has been rigorously tested using the California Housing dataset, which - Training the Gradient Boosting model on the training data. - Evaluating its performance using Mean Squared Error (MSE) on the test set. -3.What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples. +# 3.What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples.) ## Exposed Parameters Users can tune the following parameters to optimize the model's performance: @@ -90,7 +90,7 @@ The script will: - **Train and test the Gradient Boosting model.** - **Output the Mean Squared Error (MSE) of the predictions.** -4.Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental? +# 4.Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental? 
## Potential Issues and Workarounds The model may encounter difficulties with specific types of inputs such as: