@@ -663,6 +663,70 @@ def load_sensit(
     return {"x": x, "y": y}, data_desc
 
 
+@cache
+def load_szilard_1m(
+    data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
+) -> Tuple[Dict, Dict]:
+    """
+    Airline delay benchmark (1M training rows): https://github.com/szilard/GBM-perf
+    """
+    url = "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv"
+    d_train = download_and_read_csv(url, raw_data_cache)
+
+    url = "https://s3.amazonaws.com/benchm-ml--main/test.csv"
+    d_test = download_and_read_csv(url, raw_data_cache)
+
+    label_col = "dep_delayed_15min"
+    y_train = (d_train[label_col] == "Y").astype(int).values
+    y_test = (d_test[label_col] == "Y").astype(int).values
+    y = np.concatenate([y_train, y_test])
+
+    X_train_raw = d_train.drop(columns=[label_col])
+    X_test_raw = d_test.drop(columns=[label_col])
+
+    combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
+    X_combined_oh = pd.get_dummies(combined)
+    x = sparse.csr_matrix(X_combined_oh.values)
+
+    n_train = len(d_train)
+    n_test = len(d_test)
+    data_desc = {"default_split": {"train_size": n_train, "test_size": n_test}}
+
+    return {"x": x, "y": y}, data_desc
+
+
+@cache
+def load_szilard_10m(
+    data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
+) -> Tuple[Dict, Dict]:
+    """
+    Airline delay benchmark (10M training rows): https://github.com/szilard/GBM-perf
+    """
+    url = "https://s3.amazonaws.com/benchm-ml--main/train-10m.csv"
+    d_train = download_and_read_csv(url, raw_data_cache)
+
+    url = "https://s3.amazonaws.com/benchm-ml--main/test.csv"
+    d_test = download_and_read_csv(url, raw_data_cache)
+
+    label_col = "dep_delayed_15min"
+    y_train = (d_train[label_col] == "Y").astype(int).values
+    y_test = (d_test[label_col] == "Y").astype(int).values
+    y = np.concatenate([y_train, y_test])
+
+    X_train_raw = d_train.drop(columns=[label_col])
+    X_test_raw = d_test.drop(columns=[label_col])
+
+    combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
+    X_combined_oh = pd.get_dummies(combined, sparse=True).astype(pd.SparseDtype("float32", 0))
+    x = X_combined_oh.sparse.to_coo().tocsr()  # keep the one-hot matrix sparse end to end
+
+    n_train = len(d_train)
+    n_test = len(d_test)
+    data_desc = {"default_split": {"train_size": n_train, "test_size": n_test}}
+
+    return {"x": x, "y": y}, data_desc
+
+
666730"""
667731Regression datasets
668732"""
@@ -832,6 +896,8 @@ def load_gist(
832896 "svhn" : load_svhn ,
833897 "sensit" : load_sensit ,
834898 "letters" : load_letters ,
899+ "szilard_1m" : load_szilard_1m ,
900+ "szilard_10m" : load_szilard_10m ,
835901 # regression
836902 "abalone" : load_abalone ,
837903 "california_housing" : load_california_housing ,