
Commit 5348e01

Add szilard1m and szilard10m datasets (#199)

* add szilard1m/10m sets
* fix
* lint
* black
* remove szilard_1m from weekly
* Update sklbench/datasets/loaders.py

Co-authored-by: david-cortes-intel <david.cortes@intel.com>

---------

Co-authored-by: Dmitry Razdoburdin <>
Co-authored-by: david-cortes-intel <david.cortes@intel.com>
1 parent 0212cd9 commit 5348e01

File tree: 2 files changed (+94, −0)


configs/weekly/xgboost_binary.json

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+{
+    "INCLUDE": ["../common/xgboost.json"],
+    "PARAMETERS_SETS": {
+        "xgboost data": [
+            {
+                "data": {
+                    "dataset": "szilard_1m"
+                },
+                "algorithm": {
+                    "estimator_params": {
+                        "n_estimators": 100,
+                        "max_depth": 10,
+                        "learning_rate": 0.1
+                    }
+                }
+            }
+        ]
+    },
+    "TEMPLATES": {
+        "regression": {
+            "SETS": [
+                "xgboost binary classification",
+                "xgboost implementations",
+                "xgboost data"
+            ]
+        }
+    }
+}
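For context: if this weekly config is consumed like the other sklbench configs (presumably via something like `python -m sklbench --config configs/weekly/xgboost_binary.json`), the `estimator_params` block should map directly onto keyword arguments of XGBoost's scikit-learn estimator. A minimal sketch of the equivalent by-hand construction, assuming `xgboost.XGBClassifier` is the estimator behind the "xgboost implementations" set:

    # Illustrative only: assumes "estimator_params" is forwarded verbatim
    # to xgboost.XGBClassifier (an assumption, not confirmed by this diff).
    import xgboost as xgb

    clf = xgb.XGBClassifier(
        n_estimators=100,   # matches "n_estimators": 100 above
        max_depth=10,       # matches "max_depth": 10
        learning_rate=0.1,  # matches "learning_rate": 0.1
    )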

sklbench/datasets/loaders.py

Lines changed: 66 additions & 0 deletions
@@ -663,6 +663,70 @@ def load_sensit(
     return {"x": x, "y": y}, data_desc


+@cache
+def load_szilard_1m(
+    data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
+) -> Tuple[Dict, Dict]:
+    """
+    https://github.com/szilard/GBM-perf
+    """
+    url = "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv"
+    d_train = download_and_read_csv(url, raw_data_cache)
+
+    url = "https://s3.amazonaws.com/benchm-ml--main/test.csv"
+    d_test = download_and_read_csv(url, raw_data_cache)
+
+    label_col = "dep_delayed_15min"
+    y_train = (d_train[label_col] == "Y").astype(int).values
+    y_test = (d_test[label_col] == "Y").astype(int).values
+    y = np.concatenate([y_train, y_test])
+
+    X_train_raw = d_train.drop(columns=[label_col])
+    X_test_raw = d_test.drop(columns=[label_col])
+
+    combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
+    X_combined_oh = pd.get_dummies(combined)
+    x = sparse.csr_matrix(X_combined_oh.values)
+
+    n_train = len(d_train)
+    n_test = len(d_test)
+    data_desc = {"default_split": {"train_size": n_train, "test_size": n_test}}
+
+    return {"x": x, "y": y}, data_desc
+
+
+@cache
+def load_szilard_10m(
+    data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
+) -> Tuple[Dict, Dict]:
+    """
+    https://github.com/szilard/GBM-perf
+    """
+    url = "https://s3.amazonaws.com/benchm-ml--main/train-10m.csv"
+    d_train = download_and_read_csv(url, raw_data_cache)
+
+    url = "https://s3.amazonaws.com/benchm-ml--main/test.csv"
+    d_test = download_and_read_csv(url, raw_data_cache)
+
+    label_col = "dep_delayed_15min"
+    y_train = (d_train[label_col] == "Y").astype(int).values
+    y_test = (d_test[label_col] == "Y").astype(int).values
+    y = np.concatenate([y_train, y_test])
+
+    X_train_raw = d_train.drop(columns=[label_col])
+    X_test_raw = d_test.drop(columns=[label_col])
+
+    combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
+    X_combined_oh = pd.get_dummies(combined, sparse=True)
+    x = sparse.csr_matrix(X_combined_oh)
+
+    n_train = len(d_train)
+    n_test = len(d_test)
+    data_desc = {"default_split": {"train_size": n_train, "test_size": n_test}}
+
+    return {"x": x, "y": y}, data_desc
+
+
 """
 Regression datasets
 """
@@ -832,6 +896,8 @@ def load_gist(
     "svhn": load_svhn,
     "sensit": load_sensit,
     "letters": load_letters,
+    "szilard_1m": load_szilard_1m,
+    "szilard_10m": load_szilard_10m,
     # regression
     "abalone": load_abalone,
     "california_housing": load_california_housing,
