179 changes: 179 additions & 0 deletions GradientBoostingTree.py
@@ -0,0 +1,179 @@
import numpy as np
import pandas as pd

# Class for a single Decision Tree
class DecisionTree:
def __init__(self, max_depth=5):
"""
Initialize the Decision Tree.
Parameters:
- max_depth: Maximum depth of the tree to control overfitting.
"""
self.max_depth = max_depth
self.feature = None # Feature to split on
self.threshold = None # Threshold value for splitting
self.left_value = None # Value for the left leaf node
self.right_value = None # Value for the right leaf node
self.left_tree = None # Left subtree
self.right_tree = None # Right subtree

def fit(self, X, y):
"""
Fit the Decision Tree to the data.
Parameters:
- X: Features (input data).
- y: Target values (output data).
"""
self._fit(X, y, depth=0)

def _fit(self, X, y, depth):
"""
Recursive function to build the tree.
Parameters:
- X: Features at the current node.
- y: Target values at the current node.
- depth: Current depth of the tree.
"""
# Stop if maximum depth is reached or no further splitting is possible
if depth >= self.max_depth or len(X) <= 1:
self.left_value = self.right_value = np.mean(y) # Assign mean value
return

best_feature, best_threshold, best_loss = None, None, float("inf")
best_left_idx, best_right_idx = None, None

# Iterate over all features and their unique values
for feature in range(X.shape[1]):
thresholds = np.unique(X[:, feature])
for threshold in thresholds:
left_idx = X[:, feature] <= threshold
right_idx = ~left_idx

# Skip thresholds that create empty splits
if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
continue

# Compute losses only if there are enough elements
left_loss = np.var(y[left_idx]) * np.sum(left_idx) if np.sum(left_idx) > 1 else 0
right_loss = np.var(y[right_idx]) * np.sum(right_idx) if np.sum(right_idx) > 1 else 0
total_loss = left_loss + right_loss

# Update the best split if this split improves the loss
if total_loss < best_loss:
best_loss = total_loss
best_feature = feature
best_threshold = threshold
best_left_idx = left_idx
best_right_idx = right_idx

# Store the best feature and threshold for this node (best split)
self.feature = best_feature
self.threshold = best_threshold

        # Recursively build left and right subtrees; if no valid split was found,
        # turn this node into a leaf that predicts the mean target value
        if best_left_idx is not None:
            self.left_tree = DecisionTree(self.max_depth - 1)
            self.left_tree._fit(X[best_left_idx], y[best_left_idx], depth + 1)
            self.right_tree = DecisionTree(self.max_depth - 1)
            self.right_tree._fit(X[best_right_idx], y[best_right_idx], depth + 1)
        else:
            self.left_value = self.right_value = np.mean(y)

def predict(self, X):
"""
Predict the target values for the given input data.
Parameters:
- X: Features to predict.
Returns:
- Predictions for each row in X.
"""
# If this is a leaf node, return the stored value
if self.feature is None:
return np.full(X.shape[0], self.left_value)

# Otherwise, divide data based on the threshold and recurse
left_idx = X[:, self.feature] <= self.threshold
right_idx = ~left_idx
predictions = np.zeros(X.shape[0])

if self.left_tree:
predictions[left_idx] = self.left_tree.predict(X[left_idx])
if self.right_tree:
predictions[right_idx] = self.right_tree.predict(X[right_idx])

return predictions


# Class for Gradient Boosting using Decision Trees
class GradientBoostingTree:
def __init__(self, M=100, max_depth=5, learning_rate=0.1):
"""
Initialize the Gradient Boosting Tree model.
Parameters:
- M: Number of boosting iterations (trees).
- max_depth: Maximum depth of each tree.
- learning_rate: Step size for each tree's contribution.
"""
self.M = M
self.max_depth = max_depth
self.learning_rate = learning_rate
self.models = []
self.f_0 = None

def fit(self, X, y):
"""
Fit the Gradient Boosting model to the data.
Parameters:
- X: Features (input data).
- y: Target values (output data).
"""
N = len(y)
        # Initial prediction f_0: the mean of y (the optimal constant under squared-error loss)
        self.f_0 = np.mean(y)
f = np.full(N, self.f_0)

for m in range(self.M):
# Compute the gradient (residuals)
residuals = y - f
tree = DecisionTree(max_depth=self.max_depth)
tree.fit(X, residuals)
self.models.append(tree)

predictions = tree.predict(X)
f += self.learning_rate * predictions

def predict(self, X):
"""
Predict the target values for the given input data.
Parameters:
- X: Features to predict.
Returns:
- Predictions for each row in X.
"""
# Start with the initial prediction
f = np.full(X.shape[0], self.f_0)
for tree in self.models:
# Add contributions from each tree
f += self.learning_rate * tree.predict(X)
return f


# Preprocessing function for the dataset
def preprocess_data(df):
"""
Preprocess the dataset.
- Encodes categorical variables using one-hot encoding.
- Splits the dataset into features (X) and target variable (y).
Parameters:
- df: DataFrame containing the dataset.
Returns:
- X: Processed feature matrix.
- y: Target variable.
"""
# 1. Encode categorical variables using one-hot encoding
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

    # 2. Split the dataset into features and target variable
    X = df.drop(columns=['charges'], errors='ignore')
    y = df['charges'] if 'charges' in df.columns else None

    # Return NumPy arrays; y is None when the target column is absent
    return X.values, (y.values if y is not None else None)

188 changes: 170 additions & 18 deletions README.md
@@ -1,29 +1,181 @@
# Project 2: Gradient-Boosting Trees Implementation

## **Authors**
- Clara Aparicio Mendez (A20599326)
- Juan Cantarero Angulo (A20598593)
- Raquel Gimenez Pascual (A20599725)
- Carlota Ruiz de Conejo de la Sen (A20600262)

This project implements the gradient-boosting tree algorithm, following the methodology described in Sections 10.9-10.10 of *Elements of Statistical Learning* (2nd Edition). The implementation provides a fit-predict interface for training and predicting with boosted trees, along with configurable parameters for optimization. The project is organized into two main files:

- **`GradientBoostingTree.py`**: Contains the implementation of the Gradient Boosting Tree model and its underlying Decision Tree class.
- **`TestGradientBoostingTree.py`**: Contains the test script for evaluating the model using the `insurance.csv` dataset. It also allows users to adjust hyperparameters to test the model's performance.

The dataset, `insurance.csv`, is also placed in the same directory as the Python files.

---

## **Overview**

### **What does the model do and when should it be used?**

The gradient-boosting tree model is a powerful ensemble learning method that builds a series of decision trees sequentially. Each subsequent tree corrects the errors of the previous ones by minimizing a loss function. The algorithm is widely used for regression and classification tasks due to its ability to model complex relationships and handle mixed data types effectively.
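
With the squared-error loss used in this project, each boosting stage simply fits a new tree to the residuals of the current model and adds a scaled version of it (this mirrors `GradientBoostingTree.fit`):

$$
F_0(x) = \bar{y}, \qquad
r_i^{(m)} = y_i - F_{m-1}(x_i), \qquad
F_m(x) = F_{m-1}(x) + \nu\, h_m(x),
$$

where $h_m$ is a depth-limited regression tree fit to the residuals $r^{(m)}$ and $\nu$ is the `learning_rate`.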

Gradient boosting is particularly useful in scenarios where:

- The dataset exhibits complex, non-linear relationships.
- High predictive accuracy is required, especially for regression or classification tasks.
- Interpretability is not the primary concern, and performance takes precedence.
- The data is structured or tabular, and the dataset size ranges from small to moderately large.

For this project, we implemented the model specifically for regression tasks, using a medical insurance dataset to predict individual charges based on personal and lifestyle factors such as age, sex, BMI, number of children, smoking status, and region.

---

### **Features**
- Regression support.
- Customizable hyperparameters:
- `learning_rate` to control the contribution of each tree.
- `max_depth` to limit tree complexity.
- `M` (number of trees) to balance bias-variance tradeoff.
- Handles both numerical and categorical data via the included preprocessing step (categorical variables are one-hot encoded).

---

## **Dataset Description**

The model was trained and evaluated using the `insurance.csv` dataset, which contains the following features:

1. **age:** Age of the individual.
2. **sex:** Gender (`male` or `female`).
3. **bmi:** Body Mass Index.
4. **children:** Number of children or dependents.
5. **smoker:** Whether the individual is a smoker (`yes` or `no`).
6. **region:** Geographical region (`northeast`, `southeast`, `southwest`, `northwest`).
7. **charges:** Medical insurance charges (target variable).

The dataset has 1,338 rows with no missing values, making it a clean and practical dataset for regression.

---

## **Training and Testing**

How did you test your model to determine if it is working reasonably correctly?

### **Preprocessing**

The preprocessing steps include:

1. **Categorical Encoding:**
- Used one-hot encoding for `sex`, `smoker`, and `region`.
- Dropped one category per feature to avoid multicollinearity.
2. **Feature Selection:**
- Split `charges` as the target variable (`y`) and all other columns as features (`X`).
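
A minimal sketch of these two steps, assuming `GradientBoostingTree.py` (which defines `preprocess_data`) is importable from the working directory:

```python
import pandas as pd

from GradientBoostingTree import preprocess_data

# Load the raw dataset, one-hot encode the categorical columns,
# and split it into a feature matrix X and target vector y (charges)
df = pd.read_csv('insurance.csv')
X, y = preprocess_data(df)

print(X.shape)  # expected: (1338, number of encoded features)
print(y.shape)  # expected: (1338,)
```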

### **Model Training**

- The dataset was split into training (80%) and testing (20%) subsets using `train_test_split`.
- A Gradient Boosting Tree model was trained with the following parameters:
- `M=200`: The number of boosting stages.
- `max_depth=5`: Maximum depth of each decision tree.
- `learning_rate=0.05`: Step size for each tree's contribution.
- Training involved sequentially fitting trees to minimize the residuals of the previous stage.

---

### **Testing and Validation**

The model was tested on unseen data (test subset) using the following metrics:

1. **R² Score:**
- Measures the proportion of variance explained by the model.
- Higher values indicate better performance.
2. **Mean Absolute Error (MAE):**
- The average magnitude of errors in predictions, measured in the same units as the target variable.
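
Concretely, for predictions $\hat{y}_i$ over $n$ test samples these metrics are:

$$
R^2 = 1 - \frac{\sum_{i}(y_i - \hat{y}_i)^2}{\sum_{i}(y_i - \bar{y})^2},
\qquad
\mathrm{MAE} = \frac{1}{n}\sum_{i}\lvert y_i - \hat{y}_i\rvert .
$$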

---

## **Results**

On the insurance dataset:

- **R² Score:** Achieved a value of ~0.88, indicating a reasonable fit to the data.
- **Mean Absolute Error (MAE):** Approximately $2400, depending on the specific hyperparameters.

---

## **Model Parameters**

What parameters have you exposed to users of your implementation in order to tune performance?

The following parameters can be customized to tune the performance:

- **`M` (number of trees):**
- Default: 100. Increasing this can reduce bias but may lead to overfitting.
- **`max_depth`:**
- Default: 5. Controls the depth of each tree. Higher values allow for more complex splits.
- **`learning_rate`:**
- Default: 0.1. Lower values improve accuracy but require more boosting stages.
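
As a small sketch of how these can be explored (assuming `X_train`, `X_test`, `y_train`, and `y_test` come from the 80/20 split described above):

```python
from itertools import product

from sklearn.metrics import r2_score

from GradientBoostingTree import GradientBoostingTree

# Compare a few (learning_rate, M) combinations on the held-out test split
for lr, m in product([0.1, 0.05], [100, 200]):
    model = GradientBoostingTree(M=m, max_depth=5, learning_rate=lr)
    model.fit(X_train, y_train)
    r2 = r2_score(y_test, model.predict(X_test))
    print(f"learning_rate={lr}, M={m}, R2={r2:.3f}")
```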


---

## **Limitations**

Are there specific inputs that your implementation has trouble with?

Before arriving at the final version of our model, we experimented with a real estate dataset. This process highlighted some important limitations of our implementation:

- **Large datasets:**
The real estate dataset was considerably large and high-dimensional, leading to extended training times and increased memory usage. This made it challenging to efficiently fit our model within a reasonable timeframe.

- **Outliers:**
The dataset contained numerous outliers, especially in property prices, which heavily influenced the predictions. These outliers resulted in a lower R² score, as the model struggled to generalize well across the full range of values.

- **Missing values:**
Many features in the real estate dataset had missing values, requiring extensive preprocessing to handle imputations. Despite these efforts, the presence of missing data degraded overall model performance and increased complexity.

These challenges with the real estate dataset helped us identify areas for improvement in our implementation, such as handling outliers more robustly and optimizing performance for larger datasets. For the final implementation, we selected the insurance dataset, which is cleaner and smaller, allowing us to focus on model performance without being hindered by these issues.

---
## **Usage Instructions**

1. Ensure the files `GradientBoostingTree.py`, `TestGradientBoostingTree.py`, and `insurance.csv` are in the same directory.
2. Open `TestGradientBoostingTree.py` to modify hyperparameters as needed (`M`, `max_depth`, `learning_rate`).
3. Run the test script to train and evaluate the model:

```bash
python TestGradientBoostingTree.py
```

## **Usage Example**

```python

# Imports (GradientBoostingTree and preprocess_data live in GradientBoostingTree.py)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

from GradientBoostingTree import GradientBoostingTree, preprocess_data

# Step 1: Load the dataset
df = pd.read_csv('insurance.csv')

# Step 2: Preprocess the data
X, y = preprocess_data(df)

# Step 3: Split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Initialize the Gradient Boosting Tree model with the tuning parameters
gb_tree = GradientBoostingTree(M=200, max_depth=5, learning_rate=0.05)

# Step 5: Train the model
gb_tree.fit(X_train, y_train)

# Step 6: Make predictions on the test set
predictions = gb_tree.predict(X_test)

# Step 7: Evaluate the model
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

# Step 8: Print results
print("Predictions:", predictions)
print("R2 Score:", r2) # Closer to 1 is better
print("Mean Absolute Error (MAE):", mae) # Lower values indicate better predictions
```