diff --git a/configs/weekly/xgboost_binary.json b/configs/weekly/xgboost_binary.json
new file mode 100644
index 00000000..96bc5e82
--- /dev/null
+++ b/configs/weekly/xgboost_binary.json
@@ -0,0 +1,28 @@
+{
+    "INCLUDE": ["../common/xgboost.json"],
+    "PARAMETERS_SETS": {
+        "xgboost data": [
+            {
+                "data": {
+                    "dataset": "szilard_1m"
+                },
+                "algorithm": {
+                    "estimator_params": {
+                        "n_estimators": 100,
+                        "max_depth": 10,
+                        "learning_rate": 0.1
+                    }
+                }
+            }
+        ]
+    },
+    "TEMPLATES": {
+        "binary classification": {
+            "SETS": [
+                "xgboost binary classification",
+                "xgboost implementations",
+                "xgboost data"
+            ]
+        }
+    }
+}
diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py
index a57681ba..b4ba6cef 100644
--- a/sklbench/datasets/loaders.py
+++ b/sklbench/datasets/loaders.py
@@ -663,6 +663,84 @@ def load_sensit(
     return {"x": x, "y": y}, data_desc
 
 
+@cache
+def load_szilard_1m(
+    data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
+) -> Tuple[Dict, Dict]:
+    """
+    Airline departure-delay dataset (1M-row train split) from the szilard
+    GBM-perf benchmark: https://github.com/szilard/GBM-perf
+
+    Features are one-hot encoded over the concatenated train+test frames
+    (rows ordered train-then-test) and returned as a CSR matrix; the label
+    is binary (1 where dep_delayed_15min == "Y").
+    """
+    url = "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv"
+    d_train = download_and_read_csv(url, raw_data_cache)
+
+    url = "https://s3.amazonaws.com/benchm-ml--main/test.csv"
+    d_test = download_and_read_csv(url, raw_data_cache)
+
+    label_col = "dep_delayed_15min"
+    y_train = (d_train[label_col] == "Y").astype(int).values
+    y_test = (d_test[label_col] == "Y").astype(int).values
+    y = np.concatenate([y_train, y_test])
+
+    X_train_raw = d_train.drop(columns=[label_col])
+    X_test_raw = d_test.drop(columns=[label_col])
+
+    # One-hot encode train and test together so both splits share one
+    # column space.
+    combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
+    X_combined_oh = pd.get_dummies(combined)
+    x = sparse.csr_matrix(X_combined_oh.values)
+
+    n_train = len(d_train)
+    n_test = len(d_test)
+    data_desc = {"default_split": {"train_size": n_train, "test_size": n_test}}
+
+    return {"x": x, "y": y}, data_desc
+
+
+@cache
+def load_szilard_10m(
+    data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
+) -> Tuple[Dict, Dict]:
+    """
+    Airline departure-delay dataset (10M-row train split) from the szilard
+    GBM-perf benchmark: https://github.com/szilard/GBM-perf
+
+    Features are one-hot encoded over the concatenated train+test frames
+    (rows ordered train-then-test) and returned as a CSR matrix; the label
+    is binary (1 where dep_delayed_15min == "Y").
+    """
+    url = "https://s3.amazonaws.com/benchm-ml--main/train-10m.csv"
+    d_train = download_and_read_csv(url, raw_data_cache)
+
+    url = "https://s3.amazonaws.com/benchm-ml--main/test.csv"
+    d_test = download_and_read_csv(url, raw_data_cache)
+
+    label_col = "dep_delayed_15min"
+    y_train = (d_train[label_col] == "Y").astype(int).values
+    y_test = (d_test[label_col] == "Y").astype(int).values
+    y = np.concatenate([y_train, y_test])
+
+    X_train_raw = d_train.drop(columns=[label_col])
+    X_test_raw = d_test.drop(columns=[label_col])
+
+    # One-hot encode train and test together so both splits share one
+    # column space; sparse=True keeps the dummy columns memory-friendly.
+    combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
+    X_combined_oh = pd.get_dummies(combined, sparse=True)
+    x = sparse.csr_matrix(X_combined_oh)
+
+    n_train = len(d_train)
+    n_test = len(d_test)
+    data_desc = {"default_split": {"train_size": n_train, "test_size": n_test}}
+
+    return {"x": x, "y": y}, data_desc
+
+
 """
 Regression datasets
 """
@@ -832,6 +910,8 @@ def load_gist(
     "svhn": load_svhn,
     "sensit": load_sensit,
     "letters": load_letters,
+    "szilard_1m": load_szilard_1m,
+    "szilard_10m": load_szilard_10m,
     # regression
     "abalone": load_abalone,
     "california_housing": load_california_housing,