Commit 961e596
Added stratify option to train_test_split function. (#4322)
* Add stratify option to train_test_split
* Add utility functions for performing stratified split
* Removed unused import
* Add suggested changes
* Remove unused import from splits.py
* Add example usage of train_test_split with stratify arg to docstring
* Add test cases to test stratified_train_test_split
* Move stratify functions to utils/stratify.py and refactor code.
* Fix test cases according to ClassLabel class
* Add changes for error handling and recommended changes
* Add error handling for KeyErr for stratify_by_column arg
* Add tests for checking error handling in stratified train_test_split
* Removed unwanted imports
* Remove `import datasets`
* Update src/datasets/arrow_dataset.py
Co-authored-by: Mario Šaško <mario@huggingface.co>
Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>1 parent ea4df89 commit 961e596
File tree
3 files changed
+226
-4
lines changed- src/datasets
- utils
- tests
3 files changed
+226
-4
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
93 | 93 | | |
94 | 94 | | |
95 | 95 | | |
| 96 | + | |
96 | 97 | | |
97 | 98 | | |
98 | 99 | | |
| |||
3255 | 3256 | | |
3256 | 3257 | | |
3257 | 3258 | | |
| 3259 | + | |
3258 | 3260 | | |
3259 | 3261 | | |
3260 | 3262 | | |
| |||
3281 | 3283 | | |
3282 | 3284 | | |
3283 | 3285 | | |
| 3286 | + | |
3284 | 3287 | | |
3285 | 3288 | | |
3286 | 3289 | | |
| |||
3320 | 3323 | | |
3321 | 3324 | | |
3322 | 3325 | | |
| 3326 | + | |
| 3327 | + | |
| 3328 | + | |
| 3329 | + | |
| 3330 | + | |
| 3331 | + | |
| 3332 | + | |
| 3333 | + | |
| 3334 | + | |
| 3335 | + | |
| 3336 | + | |
| 3337 | + | |
| 3338 | + | |
| 3339 | + | |
| 3340 | + | |
| 3341 | + | |
| 3342 | + | |
| 3343 | + | |
3323 | 3344 | | |
3324 | 3345 | | |
3325 | 3346 | | |
| |||
3437 | 3458 | | |
3438 | 3459 | | |
3439 | 3460 | | |
3440 | | - | |
3441 | 3461 | | |
| 3462 | + | |
| 3463 | + | |
3442 | 3464 | | |
3443 | 3465 | | |
3444 | 3466 | | |
| 3467 | + | |
| 3468 | + | |
| 3469 | + | |
| 3470 | + | |
| 3471 | + | |
| 3472 | + | |
| 3473 | + | |
| 3474 | + | |
| 3475 | + | |
| 3476 | + | |
| 3477 | + | |
| 3478 | + | |
| 3479 | + | |
| 3480 | + | |
| 3481 | + | |
| 3482 | + | |
| 3483 | + | |
| 3484 | + | |
| 3485 | + | |
| 3486 | + | |
| 3487 | + | |
| 3488 | + | |
| 3489 | + | |
| 3490 | + | |
| 3491 | + | |
3445 | 3492 | | |
3446 | | - | |
3447 | | - | |
3448 | | - | |
| 3493 | + | |
| 3494 | + | |
| 3495 | + | |
| 3496 | + | |
3449 | 3497 | | |
3450 | 3498 | | |
3451 | 3499 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
| 1 | + | |
| 2 | + | |
| 3 | + | |
| 4 | + | |
| 5 | + | |
| 6 | + | |
| 7 | + | |
| 8 | + | |
| 9 | + | |
| 10 | + | |
| 11 | + | |
| 12 | + | |
| 13 | + | |
| 14 | + | |
| 15 | + | |
| 16 | + | |
| 17 | + | |
| 18 | + | |
| 19 | + | |
| 20 | + | |
| 21 | + | |
| 22 | + | |
| 23 | + | |
| 24 | + | |
| 25 | + | |
| 26 | + | |
| 27 | + | |
| 28 | + | |
| 29 | + | |
| 30 | + | |
| 31 | + | |
| 32 | + | |
| 33 | + | |
| 34 | + | |
| 35 | + | |
| 36 | + | |
| 37 | + | |
| 38 | + | |
| 39 | + | |
| 40 | + | |
| 41 | + | |
| 42 | + | |
| 43 | + | |
| 44 | + | |
| 45 | + | |
| 46 | + | |
| 47 | + | |
| 48 | + | |
| 49 | + | |
| 50 | + | |
| 51 | + | |
| 52 | + | |
| 53 | + | |
| 54 | + | |
| 55 | + | |
| 56 | + | |
| 57 | + | |
| 58 | + | |
| 59 | + | |
| 60 | + | |
| 61 | + | |
| 62 | + | |
| 63 | + | |
| 64 | + | |
| 65 | + | |
| 66 | + | |
| 67 | + | |
| 68 | + | |
| 69 | + | |
| 70 | + | |
| 71 | + | |
| 72 | + | |
| 73 | + | |
| 74 | + | |
| 75 | + | |
| 76 | + | |
| 77 | + | |
| 78 | + | |
| 79 | + | |
| 80 | + | |
| 81 | + | |
| 82 | + | |
| 83 | + | |
| 84 | + | |
| 85 | + | |
| 86 | + | |
| 87 | + | |
| 88 | + | |
| 89 | + | |
| 90 | + | |
| 91 | + | |
| 92 | + | |
| 93 | + | |
| 94 | + | |
| 95 | + | |
| 96 | + | |
| 97 | + | |
| 98 | + | |
| 99 | + | |
| 100 | + | |
| 101 | + | |
| 102 | + | |
| 103 | + | |
| 104 | + | |
| 105 | + | |
| 106 | + | |
| 107 | + | |
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
11 | 11 | | |
12 | 12 | | |
13 | 13 | | |
| 14 | + | |
14 | 15 | | |
15 | 16 | | |
16 | 17 | | |
| |||
3553 | 3554 | | |
3554 | 3555 | | |
3555 | 3556 | | |
| 3557 | + | |
| 3558 | + | |
| 3559 | + | |
| 3560 | + | |
| 3561 | + | |
| 3562 | + | |
| 3563 | + | |
| 3564 | + | |
| 3565 | + | |
| 3566 | + | |
| 3567 | + | |
| 3568 | + | |
| 3569 | + | |
| 3570 | + | |
| 3571 | + | |
| 3572 | + | |
| 3573 | + | |
| 3574 | + | |
| 3575 | + | |
| 3576 | + | |
| 3577 | + | |
| 3578 | + | |
| 3579 | + | |
| 3580 | + | |
| 3581 | + | |
| 3582 | + | |
| 3583 | + | |
| 3584 | + | |
| 3585 | + | |
| 3586 | + | |
| 3587 | + | |
| 3588 | + | |
| 3589 | + | |
| 3590 | + | |
| 3591 | + | |
| 3592 | + | |
| 3593 | + | |
| 3594 | + | |
| 3595 | + | |
| 3596 | + | |
| 3597 | + | |
| 3598 | + | |
| 3599 | + | |
| 3600 | + | |
| 3601 | + | |
| 3602 | + | |
| 3603 | + | |
| 3604 | + | |
| 3605 | + | |
| 3606 | + | |
| 3607 | + | |
| 3608 | + | |
| 3609 | + | |
| 3610 | + | |
| 3611 | + | |
| 3612 | + | |
| 3613 | + | |
| 3614 | + | |
| 3615 | + | |
| 3616 | + | |
| 3617 | + | |
| 3618 | + | |
| 3619 | + | |
| 3620 | + | |
| 3621 | + | |
| 3622 | + | |
0 commit comments