Added stratify option to train_test_split function. (#4322)

nandwalritik · mariosasko · lhoestq · web-flow · commit 961e596afa1a · 2022-05-25T22:43:51.000+02:00
* Add stratify option to train_test_split

* Add utility functions for performing stratified split

* Removed unused import

* Add suggested changes

* Remove unused import from splits.py

* Add example usage of train_test_split with stratify arg to docstring

* Add test cases to test stratified_train_test_split

* Move stratify functions to utils/stratify.py and refactor code.

* Fix test cases according to ClassLabel class

* Add changes for error handling and recommended changes

* Add error handling for KeyErr for stratify_by_column arg

* Add tests for checking error handling in stratified train_test_split

* Removed unwanted imports

* Remove `import datasets`

* Update src/datasets/arrow_dataset.py

Co-authored-by: Mario Šaško <mario@huggingface.co>
Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -93,6 +93,7 @@
 from .utils.file_utils import _retry, estimate_dataset_size
 from .utils.info_utils import is_small_dataset
 from .utils.py_utils import convert_file_size_to_int, temporary_assignment, unique_values
+from .utils.stratify import stratified_shuffle_split_generate_indices
 from .utils.typing import PathLike
 
 
@@ -3255,6 +3256,7 @@ def train_test_split(
         test_size: Union[float, int, None] = None,
         train_size: Union[float, int, None] = None,
         shuffle: bool = True,
+        stratify_by_column: Optional[str] = None,
         seed: Optional[int] = None,
         generator: Optional[np.random.Generator] = None,
         keep_in_memory: bool = False,
@@ -3281,6 +3283,7 @@ def train_test_split(
                 If int, represents the absolute number of train samples.
                 If None, the value is automatically set to the complement of the test size.
             shuffle (:obj:`bool`, optional, default `True`): Whether or not to shuffle the data before splitting.
+            stratify_by_column (:obj:`str`, optional, default `None`): The column name of labels to be used to perform stratified split of data.
             seed (:obj:`int`, optional): A seed to initialize the default BitGenerator if ``generator=None``.
                 If None, then fresh, unpredictable entropy will be pulled from the OS.
                 If an int or array_like[ints] is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state.
@@ -3320,6 +3323,24 @@ def train_test_split(
 
         # set a seed
         >>> ds = ds.train_test_split(test_size=0.2, seed=42)
+
+        # stratified split
+        >>> ds = load_dataset("imdb",split="train")
+        Dataset({
+            features: ['text', 'label'],
+            num_rows: 25000
+        })
+        >>> ds = ds.train_test_split(test_size=0.2, stratify_by_column="label")
+        DatasetDict({
+            train: Dataset({
+                features: ['text', 'label'],
+                num_rows: 20000
+            })
+            test: Dataset({
+                features: ['text', 'label'],
+                num_rows: 5000
+            })
+        })
         ```
         """
         from .dataset_dict import DatasetDict  # import here because of circular dependency
@@ -3437,15 +3458,42 @@ def train_test_split(
                         ),
                     }
                 )
-
         if not shuffle:
+            if stratify_by_column is not None:
+                raise ValueError("Stratified train/test split is not implemented for `shuffle=False`")
             train_indices = np.arange(n_train)
             test_indices = np.arange(n_train, n_train + n_test)
         else:
+            # stratified partition
+            if stratify_by_column is not None:
+                if stratify_by_column not in self.features.keys():
+                    raise ValueError(f"Key {stratify_by_column} not found in {self.features.keys()}")
+                if not isinstance(self.features[stratify_by_column], ClassLabel):
+                    raise ValueError(
+                        f"Stratifying by column is only supported for {ClassLabel.__name__} column, and column {stratify_by_column} is {type(self.features[stratify_by_column]).__name__}."
+                    )
+                try:
+                    train_indices, test_indices = next(
+                        stratified_shuffle_split_generate_indices(
+                            self.with_format("numpy")[stratify_by_column], n_train, n_test, rng=generator
+                        )
+                    )
+                except Exception as error:
+                    if str(error) == "Minimum class count error":
+                        raise ValueError(
+                            f"The least populated class in {stratify_by_column} column has only 1"
+                            " member, which is too few. The minimum"
+                            " number of groups for any class cannot"
+                            " be less than 2."
+                        )
+                    else:
+                        raise error
+
             # random partition
-            permutation = generator.permutation(len(self))
-            test_indices = permutation[:n_test]
-            train_indices = permutation[n_test : (n_test + n_train)]
+            else:
+                permutation = generator.permutation(len(self))
+                test_indices = permutation[:n_test]
+                train_indices = permutation[n_test : (n_test + n_train)]
 
         train_split = self.select(
             indices=train_indices,
diff --git a/src/datasets/utils/stratify.py b/src/datasets/utils/stratify.py
@@ -0,0 +1,107 @@
+import numpy as np
+
+
+def approximate_mode(class_counts, n_draws, rng):
+    """Compute an approximate mode of a multivariate hypergeometric distribution.
+
+    The result approximates the most likely per-class outcome of drawing
+    ``n_draws`` samples from the population described by ``class_counts``.
+    It shouldn't be off by more than one.
+
+    Args
+    ----------
+    class_counts : ndarray of int
+        Population per class.
+    n_draws : int
+        Number of draws (samples to draw) from the overall population.
+    rng : numpy.random.Generator
+        Used to break ties between classes with equal leftover fractions;
+        only its ``choice`` method is called here.
+
+    Returns
+    -------
+    sampled_classes : ndarray of int
+        Number of samples drawn from each class.
+        np.sum(sampled_classes) == n_draws
+    """
+    # proportional (fractional) allocation of the draws to each class
+    continuous = n_draws * class_counts / class_counts.sum()
+    # flooring never overshoots n_draws, but usually undershoots it
+    floored = np.floor(continuous)
+    # the missing draws are handed out according to how much "left over"
+    # fraction each class had, until n_draws is reached
+    need_to_add = int(n_draws - floored.sum())
+    if need_to_add > 0:
+        remainder = continuous - floored
+        # np.unique returns ascending values; reverse for largest first
+        values = np.unique(remainder)[::-1]
+        for value in values:
+            (inds,) = np.where(remainder == value)
+            # if fewer draws are missing than there are tied classes, pick
+            # the receivers at random to avoid a positional bias; otherwise
+            # give one to each and move on to the next leftover value
+            add_now = min(len(inds), need_to_add)
+            inds = rng.choice(inds, size=add_now, replace=False)
+            floored[inds] += 1
+            need_to_add -= add_now
+            if need_to_add == 0:
+                break
+    # the np.int alias was deprecated in NumPy 1.20 and removed in 1.24;
+    # the builtin int selects the same dtype on every NumPy version
+    return floored.astype(int)
+
+
+def stratified_shuffle_split_generate_indices(y, n_train, n_test, rng, n_splits=10):
+    """Yield ``n_splits`` stratified, shuffled ``(train, test)`` index pairs.
+
+    Adapted from scikit-learn's StratifiedShuffleSplit. This is a generator, so
+    the validation below only runs (and raises) when the first pair is requested.
+
+    Args
+    ----------
+    y : array-like
+        Class label of every sample; the splits follow its class proportions.
+    n_train : int
+        Absolute number of train samples.
+    n_test : int
+        Absolute number of test samples.
+    rng : numpy.random.Generator
+        Source of randomness (``permutation`` here, ``choice`` in approximate_mode).
+    n_splits : int, default=10
+        Number of re-shuffling & splitting iterations.
+
+    Yields
+    ------
+    train, test : ndarray of int
+        Shuffled positions into ``y`` for the train and the test set.
+
+    Raises
+    ------
+    ValueError
+        If a class has fewer than 2 members, or ``n_train`` / ``n_test`` is
+        smaller than the number of classes.
+    """
+    labels, encoded = np.unique(y, return_inverse=True)
+    n_classes = labels.shape[0]
+    per_class = np.bincount(encoded)
+    if np.min(per_class) < 2:
+        raise ValueError("Minimum class count error")
+    if n_train < n_classes:
+        raise ValueError(
+            "The train_size = %d should be greater or equal to the number of classes = %d" % (n_train, n_classes)
+        )
+    if n_test < n_classes:
+        raise ValueError(
+            "The test_size = %d should be greater or equal to the number of classes = %d" % (n_test, n_classes)
+        )
+    # a stable sort groups the sample positions by class, in original order
+    members_by_class = np.split(np.argsort(encoded, kind="mergesort"), np.cumsum(per_class)[:-1])
+    for _ in range(n_splits):
+        train_quota = approximate_mode(per_class, n_train, rng)
+        test_quota = approximate_mode(per_class - train_quota, n_test, rng)
+        train, test = [], []
+        for members, size, k_train, k_test in zip(members_by_class, per_class, train_quota, test_quota):
+            shuffled = members.take(rng.permutation(size), mode="clip")
+            train.extend(shuffled[:k_train])
+            test.extend(shuffled[k_train : k_train + k_test])
+        yield rng.permutation(train), rng.permutation(test)
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
@@ -11,6 +11,7 @@
 from unittest.mock import patch
 
 import numpy as np
+import numpy.testing as npt
 import pandas as pd
 import pyarrow as pa
 import pytest
@@ -3553,3 +3554,69 @@ def test_task_text_classification_when_columns_removed(self):
         with Dataset.from_dict(data, info=info) as dset:
             with dset.map(lambda x: {"new_column": 0}, remove_columns=dset.column_names) as dset:
                 self.assertDictEqual(dset.features, features_after_map)
+
+
+class StratifiedTest(TestCase):
+    def test_errors_train_test_split_stratify(self):
+        """Invalid stratified split requests must raise ``ValueError``.
+
+        One label array is built per failure mode; the matching entry of
+        ``bad_requests`` holds the arguments expected to be rejected.
+        """
+        ys = [
+            np.array([0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2]),
+            np.array([0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
+            np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
+            np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]),
+            np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]),
+        ]
+
+        # (test_size, stratify_by_column, keep the ClassLabel features) per entry of ys
+        bad_requests = [
+            (0.33, "labl", True),  # the requested column does not exist
+            (0.33, "label", True),  # class 0 has a single member
+            (0.33, "label", False),  # the column is not a ClassLabel
+            (0.30, "label", True),  # test split smaller than the number of classes
+            (0.60, "label", True),  # train split smaller than the number of classes
+        ]
+
+        for labels, (test_size, column, typed) in zip(ys, bad_requests):
+            features = Features({"text": Value("int64"), "label": ClassLabel(len(np.unique(labels)))})
+            data = {"text": np.ones(len(labels)), "label": labels}
+            dset = Dataset.from_dict(data, features=features)
+            if not typed:
+                # rebuild without the explicit features so that the label
+                # column is no longer declared as a ClassLabel
+                dset = Dataset.from_dict(data)
+            # every case is expected to fail with the same exception type
+            self.assertRaises(ValueError, dset.train_test_split, test_size, stratify_by_column=column)
+
+    def test_train_test_split_startify(self):
+        """A stratified split keeps the class proportions and the requested sizes."""
+        ys = [
+            np.array([0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2]),
+            np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
+            np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
+            np.array([0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]),
+            np.array([0] * 800 + [1] * 50),
+        ]
+        for labels in ys:
+            n_samples = len(labels)
+            expected_test = np.ceil(0.33 * n_samples)
+            expected_train = n_samples - expected_test
+            features = Features({"text": Value("int64"), "label": ClassLabel(len(np.unique(labels)))})
+            dset = Dataset.from_dict({"text": np.ones(n_samples), "label": labels}, features=features)
+            splits = dset.train_test_split(test_size=0.33, stratify_by_column="label")
+            train_labels = splits["train"]["label"]
+            test_labels = splits["test"]["label"]
+            # both splits must contain exactly the same set of classes
+            npt.assert_array_equal(np.unique(train_labels), np.unique(test_labels))
+            # class frequencies must agree to about one decimal place
+            _, train_codes = np.unique(train_labels, return_inverse=True)
+            _, test_codes = np.unique(test_labels, return_inverse=True)
+            p_train = np.bincount(train_codes) / float(len(train_labels))
+            p_test = np.bincount(test_codes) / float(len(test_labels))
+            npt.assert_array_almost_equal(p_train, p_test, 1)
+            assert len(splits["train"]["text"]) + len(splits["test"]["text"]) == n_samples
+            assert len(splits["train"]["text"]) == expected_train
+            assert len(splits["test"]["text"]) == expected_test