179 changes: 179 additions & 0 deletions GradientBoostingTree.py
@@ -0,0 +1,179 @@
import numpy as np
import pandas as pd

# Class for a single Decision Tree
class DecisionTree:
def __init__(self, max_depth=5):
"""
Initialize the Decision Tree.
Parameters:
- max_depth: Maximum depth of the tree to control overfitting.
"""
self.max_depth = max_depth
self.feature = None # Feature to split on
self.threshold = None # Threshold value for splitting
self.left_value = None # Value for the left leaf node
self.right_value = None # Value for the right leaf node
self.left_tree = None # Left subtree
self.right_tree = None # Right subtree

def fit(self, X, y):
"""
Fit the Decision Tree to the data.
Parameters:
- X: Features (input data).
- y: Target values (output data).
"""
self._fit(X, y, depth=0)

def _fit(self, X, y, depth):
"""
Recursive function to build the tree.
Parameters:
- X: Features at the current node.
- y: Target values at the current node.
- depth: Current depth of the tree.
"""
# Stop if maximum depth is reached or no further splitting is possible
if depth >= self.max_depth or len(X) <= 1:
self.left_value = self.right_value = np.mean(y) # Assign mean value
return

best_feature, best_threshold, best_loss = None, None, float("inf")
best_left_idx, best_right_idx = None, None

# Iterate over all features and their unique values
for feature in range(X.shape[1]):
thresholds = np.unique(X[:, feature])
for threshold in thresholds:
left_idx = X[:, feature] <= threshold
right_idx = ~left_idx

# Skip thresholds that create empty splits
if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
continue

# Compute losses only if there are enough elements
left_loss = np.var(y[left_idx]) * np.sum(left_idx) if np.sum(left_idx) > 1 else 0
right_loss = np.var(y[right_idx]) * np.sum(right_idx) if np.sum(right_idx) > 1 else 0
total_loss = left_loss + right_loss

# Update the best split if this split improves the loss
if total_loss < best_loss:
best_loss = total_loss
best_feature = feature
best_threshold = threshold
best_left_idx = left_idx
best_right_idx = right_idx

# Store the best feature and threshold for this node (best split)
self.feature = best_feature
self.threshold = best_threshold

        # Recursively build left and right subtrees; if no valid split was found,
        # turn this node into a leaf that predicts the mean target value
        if best_left_idx is not None:
            self.left_tree = DecisionTree(self.max_depth - 1)
            self.left_tree._fit(X[best_left_idx], y[best_left_idx], depth + 1)
            self.right_tree = DecisionTree(self.max_depth - 1)
            self.right_tree._fit(X[best_right_idx], y[best_right_idx], depth + 1)
        else:
            self.left_value = self.right_value = np.mean(y)

def predict(self, X):
"""
Predict the target values for the given input data.
Parameters:
- X: Features to predict.
Returns:
- Predictions for each row in X.
"""
# If this is a leaf node, return the stored value
if self.feature is None:
return np.full(X.shape[0], self.left_value)

# Otherwise, divide data based on the threshold and recurse
left_idx = X[:, self.feature] <= self.threshold
right_idx = ~left_idx
predictions = np.zeros(X.shape[0])

if self.left_tree:
predictions[left_idx] = self.left_tree.predict(X[left_idx])
if self.right_tree:
predictions[right_idx] = self.right_tree.predict(X[right_idx])

return predictions


# Class for Gradient Boosting using Decision Trees
class GradientBoostingTree:
def __init__(self, M=100, max_depth=5, learning_rate=0.1):
"""
Initialize the Gradient Boosting Tree model.
Parameters:
- M: Number of boosting iterations (trees).
- max_depth: Maximum depth of each tree.
- learning_rate: Step size for each tree's contribution.
"""
self.M = M
self.max_depth = max_depth
self.learning_rate = learning_rate
self.models = []
self.f_0 = None

def fit(self, X, y):
"""
Fit the Gradient Boosting model to the data.
Parameters:
- X: Features (input data).
- y: Target values (output data).
"""
N = len(y)
        # Initial prediction f_0: the mean of y (the optimal constant under squared-error loss)
        self.f_0 = np.mean(y)
f = np.full(N, self.f_0)

for m in range(self.M):
# Compute the gradient (residuals)
residuals = y - f
tree = DecisionTree(max_depth=self.max_depth)
tree.fit(X, residuals)
self.models.append(tree)

predictions = tree.predict(X)
f += self.learning_rate * predictions

def predict(self, X):
"""
Predict the target values for the given input data.
Parameters:
- X: Features to predict.
Returns:
- Predictions for each row in X.
"""
# Start with the initial prediction
f = np.full(X.shape[0], self.f_0)
for tree in self.models:
# Add contributions from each tree
f += self.learning_rate * tree.predict(X)
return f


# Preprocessing function for the dataset
def preprocess_data(df):
"""
Preprocess the dataset.
- Encodes categorical variables using one-hot encoding.
- Splits the dataset into features (X) and target variable (y).
Parameters:
- df: DataFrame containing the dataset.
Returns:
- X: Processed feature matrix.
- y: Target variable.
"""
# 1. Encode categorical variables using one-hot encoding
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

    # 2. Split the dataset into features and target variable
    X = df.drop(columns=['charges'], errors='ignore')
    y = df['charges'] if 'charges' in df.columns else None

    # Return NumPy arrays; y is None when the target column is absent
    return X.values, (y.values if y is not None else None)

188 changes: 170 additions & 18 deletions README.md
@@ -1,29 +1,181 @@
# Project 2: Gradient-Boosting Trees Implementation

## **Authors**
- Clara Aparicio Mendez (A20599326)
- Juan Cantarero Angulo (A20598593)
- Raquel Gimenez Pascual (A20599725)
- Carlota Ruiz de Conejo de la Sen (A20600262)

This project implements the gradient-boosting tree algorithm, following the methodology described in Sections 10.9-10.10 of *Elements of Statistical Learning* (2nd Edition). The implementation provides a fit-predict interface for training and predicting with boosted trees, along with configurable parameters for optimization. The project is organized into two main files:

- **`GradientBoostingTree.py`**: Contains the implementation of the Gradient Boosting Tree model and its underlying Decision Tree class.
- **`TestGradientBoostingTree.py`**: Contains the test script for evaluating the model using the `insurance.csv` dataset. It also allows users to adjust hyperparameters to test the model's performance.

The dataset, `insurance.csv`, is also placed in the same directory as the Python files.

---

## **Overview**

### **What does the model do and when should it be used?**

The gradient-boosting tree model is a powerful ensemble learning method that builds a series of decision trees sequentially. Each subsequent tree corrects the errors of the previous ones by minimizing a loss function. The algorithm is widely used for regression and classification tasks due to its ability to model complex relationships and handle mixed data types effectively.
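
With the squared-error loss used in this project, each boosting stage simply fits a new tree to the residuals of the current model and adds a scaled version of it (this mirrors `GradientBoostingTree.fit`):

$$
F_0(x) = \bar{y}, \qquad
r_i^{(m)} = y_i - F_{m-1}(x_i), \qquad
F_m(x) = F_{m-1}(x) + \nu\, h_m(x),
$$

where $h_m$ is a depth-limited regression tree fit to the residuals $r^{(m)}$ and $\nu$ is the `learning_rate`.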

Gradient boosting is particularly useful in scenarios where:

- The dataset exhibits complex, non-linear relationships.
- High predictive accuracy is required, especially for regression or classification tasks.
- Interpretability is not the primary concern, and performance takes precedence.
- The data is structured or tabular, and the dataset size ranges from small to moderately large.

For this project, we implemented the model specifically for regression tasks, using a medical insurance dataset to predict individual charges based on personal and lifestyle factors such as age, sex, BMI, number of children, smoking status, and region.

---

### **Features**
- Regression support.
- Customizable hyperparameters:
- `learning_rate` to control the contribution of each tree.
- `max_depth` to limit tree complexity.
- `M` (number of trees) to balance bias-variance tradeoff.
- Handles both numerical and categorical data via the included preprocessing step (categorical variables are one-hot encoded).

---

## **Dataset Description**

The model was trained and evaluated using the `insurance.csv` dataset, which contains the following features:

1. **age:** Age of the individual.
2. **sex:** Gender (`male` or `female`).
3. **bmi:** Body Mass Index.
4. **children:** Number of children or dependents.
5. **smoker:** Whether the individual is a smoker (`yes` or `no`).
6. **region:** Geographical region (`northeast`, `southeast`, `southwest`, `northwest`).
7. **charges:** Medical insurance charges (target variable).

The dataset has 1,338 rows with no missing values, making it a clean and practical dataset for regression.

---

## **Training and Testing**

How did you test your model to determine if it is working reasonably correctly?

### **Preprocessing**

The preprocessing steps include:

1. **Categorical Encoding:**
- Used one-hot encoding for `sex`, `smoker`, and `region`.
- Dropped one category per feature to avoid multicollinearity.
2. **Feature Selection:**
- Split `charges` as the target variable (`y`) and all other columns as features (`X`).
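
A minimal sketch of these two steps, assuming `GradientBoostingTree.py` (which defines `preprocess_data`) is importable from the working directory:

```python
import pandas as pd

from GradientBoostingTree import preprocess_data

# Load the raw dataset, one-hot encode the categorical columns,
# and split it into a feature matrix X and target vector y (charges)
df = pd.read_csv('insurance.csv')
X, y = preprocess_data(df)

print(X.shape)  # expected: (1338, number of encoded features)
print(y.shape)  # expected: (1338,)
```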

### **Model Training**

- The dataset was split into training (80%) and testing (20%) subsets using `train_test_split`.
- A Gradient Boosting Tree model was trained with the following parameters:
- `M=200`: The number of boosting stages.
- `max_depth=5`: Maximum depth of each decision tree.
- `learning_rate=0.05`: Step size for each tree's contribution.
- Training involved sequentially fitting trees to minimize the residuals of the previous stage.

---

### **Testing and Validation**

The model was tested on unseen data (test subset) using the following metrics:

1. **R² Score:**
- Measures the proportion of variance explained by the model.
- Higher values indicate better performance.
2. **Mean Absolute Error (MAE):**
- The average magnitude of errors in predictions, measured in the same units as the target variable.
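
Concretely, for predictions $\hat{y}_i$ over $n$ test samples these metrics are:

$$
R^2 = 1 - \frac{\sum_{i}(y_i - \hat{y}_i)^2}{\sum_{i}(y_i - \bar{y})^2},
\qquad
\mathrm{MAE} = \frac{1}{n}\sum_{i}\lvert y_i - \hat{y}_i\rvert .
$$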

---

## **Results**

On the insurance dataset:

- **R² Score:** Achieved a value of ~0.88, indicating a reasonable fit to the data.
- **Mean Absolute Error (MAE):** Approximately $2400, depending on the specific hyperparameters.

---

## **Model Parameters**

What parameters have you exposed to users of your implementation in order to tune performance?

The following parameters can be customized to tune the performance:

- **`M` (number of trees):**
- Default: 100. Increasing this can reduce bias but may lead to overfitting.
- **`max_depth`:**
- Default: 5. Controls the depth of each tree. Higher values allow for more complex splits.
- **`learning_rate`:**
- Default: 0.1. Lower values improve accuracy but require more boosting stages.
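
As a small sketch of how these can be explored (assuming `X_train`, `X_test`, `y_train`, and `y_test` come from the 80/20 split described above):

```python
from itertools import product

from sklearn.metrics import r2_score

from GradientBoostingTree import GradientBoostingTree

# Compare a few (learning_rate, M) combinations on the held-out test split
for lr, m in product([0.1, 0.05], [100, 200]):
    model = GradientBoostingTree(M=m, max_depth=5, learning_rate=lr)
    model.fit(X_train, y_train)
    r2 = r2_score(y_test, model.predict(X_test))
    print(f"learning_rate={lr}, M={m}, R2={r2:.3f}")
```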


---

## **Limitations**

Are there specific inputs that your implementation has trouble with?

Before arriving at the final version of our model, we experimented with a real estate dataset. This process highlighted some important limitations of our implementation:

- **Large datasets:**
The real estate dataset was considerably large and high-dimensional, leading to extended training times and increased memory usage. This made it challenging to efficiently fit our model within a reasonable timeframe.

- **Outliers:**
The dataset contained numerous outliers, especially in property prices, which heavily influenced the predictions. These outliers resulted in a lower R² score, as the model struggled to generalize well across the full range of values.

- **Missing values:**
Many features in the real estate dataset had missing values, requiring extensive preprocessing to handle imputations. Despite these efforts, the presence of missing data degraded overall model performance and increased complexity.

These challenges with the real estate dataset helped us identify areas for improvement in our implementation, such as handling outliers more robustly and optimizing performance for larger datasets. For the final implementation, we selected the insurance dataset, which is cleaner and smaller, allowing us to focus on model performance without being hindered by these issues.

---
## **Usage Instructions**

1. Ensure the files `GradientBoostingTree.py`, `TestGradientBoostingTree.py`, and `insurance.csv` are in the same directory.
2. Open `TestGradientBoostingTree.py` to modify hyperparameters as needed (`M`, `max_depth`, `learning_rate`).
3. Run the test script to train and evaluate the model:

```bash
python TestGradientBoostingTree.py
```

## **Usage Example**

```python

# Imports (GradientBoostingTree and preprocess_data live in GradientBoostingTree.py)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

from GradientBoostingTree import GradientBoostingTree, preprocess_data

# Step 1: Load the dataset
df = pd.read_csv('insurance.csv')

# Step 2: Preprocess the data
X, y = preprocess_data(df)

# Step 3: Split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Initialize the Gradient Boosting Tree model with the tuning parameters
gb_tree = GradientBoostingTree(M=200, max_depth=5, learning_rate=0.05)

# Step 5: Train the model
gb_tree.fit(X_train, y_train)

# Step 6: Make predictions on the test set
predictions = gb_tree.predict(X_test)

# Step 7: Evaluate the model
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

# Step 8: Print results
print("Predictions:", predictions)
print("R2 Score:", r2) # Closer to 1 is better
print("Mean Absolute Error (MAE):", mae) # Lower values indicate better predictions
```