Merge pull request #448 from hardik1408/main

moe18 · web-flow · commit 178f9d3c6949 · 2025-06-10T18:35:34.000-04:00
New problem: Gini impurity
diff --git a/Problems/138_gini_impurity/learn.md b/Problems/138_gini_impurity/learn.md
@@ -0,0 +1,50 @@
+# Learn: Gini Impurity and Best Split in Decision Trees
+
+## Overview
+
+A core concept in Decision Trees (and by extension, Random Forests) is how the model chooses where to split the data at each node. One popular criterion used for splitting is **Gini Impurity**.
+
+In this task, you will implement:
+- Gini impurity computation
+- Finding the best feature and threshold to split on based on impurity reduction
+
+This helps build the foundation for how trees grow in a Random Forest.
+
+---
+
+## Gini Impurity
+
+For a set of samples with class labels \( y \), the Gini Impurity is defined as:
+
+$$
+G(y) = 1 - \sum_{i=1}^{k} p_i^2
+$$
+
+Where \( p_i \) is the proportion of samples belonging to class \( i \).
+
+A pure node (all one class) has \( G = 0 \), and higher values indicate more class diversity.
+
+---
+
+## Weighted Gini Impurity of a Split
+
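+Given a feature and a threshold, samples whose feature value is less than or equal to the threshold go to the left subset and the rest go to the right subset. The weighted Gini impurity of the split is: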
+
+$$
+G_{\text{split}} = \frac{n_{\text{left}}}{n} G(y_{\text{left}}) + \frac{n_{\text{right}}}{n} G(y_{\text{right}})
+$$
+
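+Here $n_{\text{left}}$ and $n_{\text{right}}$ are the numbers of samples in each subset and $n$ is their total. We choose the split that **minimizes** $G_{\text{split}}$.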
+
+---
+
+## Problem Statement
+
+You are given a dataset $X \in \mathbb{R}^{n \times d}$ and labels $y \in \{0, 1\}^n$. Implement the following function, which returns the `(feature_index, threshold)` pair with the lowest weighted Gini impurity:
+
+### Function to Implement
+
+```python
+def find_best_split(X: np.ndarray, y: np.ndarray) -> Tuple[int, float]:
+    ...
+```
diff --git a/Problems/138_gini_impurity/solution.py b/Problems/138_gini_impurity/solution.py
@@ -0,0 +1,88 @@
+import numpy as np
+from typing import Tuple
+
+def find_best_split(X: np.ndarray, y: np.ndarray) -> Tuple[int, float]:
+    """
+    Find the feature and threshold whose split has the lowest weighted Gini impurity.
+
+    Every distinct value of every feature is tried as a threshold; samples with
+    value <= threshold go left, the rest go right. Ties keep the first candidate
+    tried (lowest feature index, then smallest threshold).
+
+    :param X: Feature matrix of shape (n_samples, n_features)
+    :param y: Labels array of shape (n_samples,); any number of distinct classes
+    :return: (feature_index, threshold) as a plain int and float, or (-1, inf)
+             when X has no rows or no columns
+    """
+    def gini_impurity(y_subset: np.ndarray) -> float:
+        if len(y_subset) == 0:
+            return 0.0  # an empty side adds nothing to the weighted sum
+        # Proportions come from per-class counts, so labels need not be {0, 1}.
+        _, counts = np.unique(y_subset, return_counts=True)
+        p = counts / len(y_subset)
+        return float(1.0 - np.sum(p ** 2))
+
+    n_samples, n_features = X.shape
+    best_feature, best_threshold, best_gini = -1, float('inf'), float('inf')
+
+    for feature_index in range(n_features):
+        column = X[:, feature_index]  # sliced once, reused for every threshold
+        for threshold in np.unique(column):
+            left_mask = column <= threshold
+            y_left, y_right = y[left_mask], y[~left_mask]
+            weighted_gini = (len(y_left) * gini_impurity(y_left)
+                             + len(y_right) * gini_impurity(y_right)) / n_samples
+            # Strict '<' keeps the earliest candidate when impurities tie.
+            if weighted_gini < best_gini:
+                best_gini = weighted_gini
+                best_feature, best_threshold = feature_index, float(threshold)
+
+    return best_feature, best_threshold
+
+def test():
+    """Self-check find_best_split; thresholds are pinned wherever the best split is unique."""
+    # Test 1: Balanced binary split (only 2.5 separates the classes)
+    X1 = np.array([[2.5], [3.5], [1.0], [4.0]])
+    y1 = np.array([0, 1, 0, 1])
+    f1, t1 = find_best_split(X1, y1)
+    assert f1 == 0
+    assert t1 == 2.5
+
+    # Test 2: Pure set (Gini = 0 for every threshold, so any is acceptable)
+    X2 = np.array([[1], [2], [3]])
+    y2 = np.array([1, 1, 1])
+    f2, t2 = find_best_split(X2, y2)
+    assert f2 == 0
+    assert t2 in [1, 2, 3]
+
+    # Test 3: Alternating labels (1 and 3 are both optimal; 2 and 4 are not)
+    X3 = np.array([[1], [2], [3], [4]])
+    y3 = np.array([0, 1, 0, 1])
+    f3, t3 = find_best_split(X3, y3)
+    assert f3 == 0
+    assert t3 in [1, 3]
+
+    # Test 4: No good split (non-separable)
+    X4 = np.array([[1], [1], [1]])
+    y4 = np.array([0, 1, 0])
+    f4, t4 = find_best_split(X4, y4)
+    assert f4 == 0
+    assert t4 == 1
+
+    # Test 5: Two features, first one irrelevant (only 2 separates the classes)
+    X5 = np.array([[0, 1], [0, 2], [0, 3], [0, 4]])
+    y5 = np.array([0, 0, 1, 1])
+    f5, t5 = find_best_split(X5, y5)
+    assert f5 == 1
+    assert t5 == 2
+
+    # Test 6: Tiny dataset (threshold 2 would put both samples on the left)
+    X6 = np.array([[1], [2]])
+    y6 = np.array([0, 1])
+    f6, t6 = find_best_split(X6, y6)
+    assert f6 == 0
+    assert t6 == 1
+    print("All test cases passed.")
+
+if __name__ == "__main__":  # run the self-checks only when executed as a script
+    test()