From d590e4b9071bb717a8e2e28291900dd9079bf2a1 Mon Sep 17 00:00:00 2001 From: winstonq Date: Mon, 6 Dec 2021 15:49:10 -0800 Subject: [PATCH 1/2] Allow passing in cr/cl bounds and settings. Allow CPU execution. Fix GPU support. Fix module loading. --- args.py | 6 +++-- experiment.py | 58 +++++++++++++++++++++++++++++++++++++---------- rrl/components.py | 9 ++++++-- rrl/models.py | 29 +++++++++++++++--------- rrl/utils.py | 42 ++++++++++++++++++++-------------- 5 files changed, 100 insertions(+), 44 deletions(-) diff --git a/args.py b/args.py index b8169e2..ea63852 100644 --- a/args.py +++ b/args.py @@ -1,12 +1,13 @@ import os import argparse +import torch parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-d', '--data_set', type=str, default='tic-tac-toe', help='Set the data set for training. All the data sets in the dataset folder are available.') parser.add_argument('-i', '--device_ids', type=str, default=None, help='Set the device (GPU ids). Split by @.' 
- ' E.g., 0@2@3.') + ' E.g., cuda:0@cuda:2@cuda:3.') parser.add_argument('-nr', '--nr', default=0, type=int, help='ranking within the nodes') parser.add_argument('-e', '--epoch', type=int, default=41, help='Set the total epoch.') parser.add_argument('-bs', '--batch_size', type=int, default=64, help='Set the batch size.') @@ -51,7 +52,8 @@ rrl_args.plot_file = os.path.join(rrl_args.folder_path, 'plot_file.pdf') rrl_args.log = os.path.join(rrl_args.folder_path, 'log.txt') rrl_args.test_res = os.path.join(rrl_args.folder_path, 'test_res.txt') -rrl_args.device_ids = list(map(int, rrl_args.device_ids.strip().split('@'))) +rrl_args.device_ids = list(map(lambda id: torch.device(id), rrl_args.device_ids.strip().split('@'))) \ + if rrl_args.device_ids else [None] rrl_args.gpus = len(rrl_args.device_ids) rrl_args.nodes = 1 rrl_args.world_size = rrl_args.gpus * rrl_args.nodes diff --git a/experiment.py b/experiment.py index a70a20b..f67a7c1 100644 --- a/experiment.py +++ b/experiment.py @@ -1,4 +1,4 @@ -import os +import os, json import numpy as np import torch from torch.utils.data.dataset import random_split @@ -14,15 +14,37 @@ DATA_DIR = './dataset' +def read_settings(settings_path): + if os.path.exists(settings_path): + with open(settings_path, 'r') as f: + settings = json.load(f) + else: + settings = { + 'normalize_continuous': True, + 'one_hot_encode_features': True, + 'impute_continuous': True, + # dict mapping each continuous column to [lower bound, upper bound] + 'bounds': None + # alternatively, pass in individual bounds + # lower_bounds: [continuous cols] + # upper_bounds: [continuous cols] + } + return settings + + def get_data_loader(dataset, world_size, rank, batch_size, k=0, pin_memory=False, save_best=True): data_path = os.path.join(DATA_DIR, dataset + '.data') info_path = os.path.join(DATA_DIR, dataset + '.info') + settings_path = os.path.join(DATA_DIR, dataset + '.settings.json') X_df, y_df, f_df, label_pos = read_csv(data_path, info_path, shuffle=True) - db_enc = 
DBEncoder(f_df, discrete=False) + settings = read_settings(settings_path) + db_enc = DBEncoder(f_df, discrete=False, + one_hot_encode_features=settings['one_hot_encode_features'], + impute_continuous=settings['impute_continuous']) db_enc.fit(X_df, y_df) - X, y = db_enc.transform(X_df, y_df, normalized=True, keep_stat=True) + X, y = db_enc.transform(X_df, y_df, normalized=settings['normalize_continuous'], keep_stat=True) kf = KFold(n_splits=5, shuffle=True, random_state=0) train_index, test_index = list(kf.split(X_df))[k] @@ -45,15 +67,21 @@ def get_data_loader(dataset, world_size, rank, batch_size, k=0, pin_memory=False valid_loader = DataLoader(valid_sub, batch_size=batch_size, shuffle=False, pin_memory=pin_memory) test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, pin_memory=pin_memory) - return db_enc, train_loader, valid_loader, test_loader + if settings['bounds'] is not None and 'lower_bounds' not in settings: + bounds = settings['bounds'] + settings['lower_bounds'] = np.array([bounds[col][0] for col in db_enc.X_fname[db_enc.discrete_flen:]]) + settings['upper_bounds'] = np.array([bounds[col][1] for col in db_enc.X_fname[db_enc.discrete_flen:]]) + return db_enc, train_loader, valid_loader, test_loader, settings -def train_model(gpu, args): +def train_model(gpu, args, distributed=True): rank = args.nr * args.gpus + gpu - dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank) + if distributed: + dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank) torch.manual_seed(42) device_id = args.device_ids[gpu] - torch.cuda.set_device(device_id) + if device_id and device_id.type == 'cuda': + torch.cuda.set_device(device_id) if gpu == 0: writer = SummaryWriter(args.folder_path) @@ -63,8 +91,9 @@ def train_model(gpu, args): is_rank0 = False dataset = args.data_set - db_enc, train_loader, valid_loader, _ = get_data_loader(dataset, args.world_size, rank, 
args.batch_size, - k=args.ith_kfold, pin_memory=True, save_best=args.save_best) + db_enc, train_loader, valid_loader, _, settings = get_data_loader(dataset, args.world_size, rank, args.batch_size, + k=args.ith_kfold, pin_memory=True, + save_best=args.save_best) X_fname = db_enc.X_fname y_fname = db_enc.y_fname @@ -74,11 +103,14 @@ def train_model(gpu, args): rrl = RRL(dim_list=[(discrete_flen, continuous_flen)] + list(map(int, args.structure.split('@'))) + [len(y_fname)], device_id=device_id, use_not=args.use_not, + cl=settings.get('lower_bounds', None), + cr=settings.get('upper_bounds', None), is_rank0=is_rank0, log_file=args.log, writer=writer, save_best=args.save_best, estimated_grad=args.estimated_grad, + distributed=distributed, save_path=args.model) rrl.train_model( @@ -106,7 +138,8 @@ def load_model(path, device_id, log_file=None, distributed=True): stat_dict = checkpoint['model_state_dict'] for key in list(stat_dict.keys()): # remove 'module.' prefix - stat_dict[key[7:]] = stat_dict.pop(key) + if key.startswith('module.'): + stat_dict[key[7:]] = stat_dict.pop(key) rrl.net.load_state_dict(checkpoint['model_state_dict']) return rrl @@ -114,8 +147,9 @@ def load_model(path, device_id, log_file=None, distributed=True): def test_model(args): rrl = load_model(args.model, args.device_ids[0], log_file=args.test_res, distributed=False) dataset = args.data_set - db_enc, train_loader, _, test_loader = get_data_loader(dataset, 4, 0, args.batch_size, args.ith_kfold, save_best=False) - rrl.test(test_loader=test_loader, set_name='Test') + db_enc, train_loader, _, test_loader, _ = get_data_loader(dataset, 4, 0, args.batch_size, args.ith_kfold, + save_best=False) + rrl.test(test_loader=test_loader, set_name='Test', labels=db_enc.y_fname) with open(args.rrl_file, 'w') as rrl_file: rrl.rule_print(db_enc.X_fname, db_enc.y_fname, train_loader, file=rrl_file, mean=db_enc.mean, std=db_enc.std) diff --git a/rrl/components.py b/rrl/components.py index 04c9262..dbf61d6 100644 --- 
a/rrl/components.py +++ b/rrl/components.py @@ -23,7 +23,7 @@ def backward(ctx, grad_output): class BinarizeLayer(nn.Module): """Implement the feature discretization and binarization.""" - def __init__(self, n, input_dim, use_not=False, left=None, right=None): + def __init__(self, n, input_dim, use_not=False, cl=None, cr=None, left=None, right=None): super(BinarizeLayer, self).__init__() self.n = n self.input_dim = input_dim @@ -39,12 +39,17 @@ def __init__(self, n, input_dim, use_not=False, left=None, right=None): self.register_buffer('right', right) if self.input_dim[1] > 0: - if self.left is not None and self.right is not None: + if cl is not None and cr is not None: # bounds are specified + cl = torch.tensor(cl).type(torch.float).t() + cr = torch.tensor(cr).type(torch.float).t() + elif self.left is not None and self.right is not None: cl = self.left + torch.rand(self.n, self.input_dim[1]) * (self.right - self.left) cr = self.left + torch.rand(self.n, self.input_dim[1]) * (self.right - self.left) else: cl = 3. * (2. * torch.rand(self.n, self.input_dim[1]) - 1.) cr = 3. * (2. * torch.rand(self.n, self.input_dim[1]) - 1.) 
+ assert torch.Size([self.n, self.input_dim[1]]) == cl.size() + assert torch.Size([self.n, self.input_dim[1]]) == cr.size() self.register_buffer('cl', cl) self.register_buffer('cr', cr) diff --git a/rrl/models.py b/rrl/models.py index 2063f7b..d4f5cb6 100644 --- a/rrl/models.py +++ b/rrl/models.py @@ -14,7 +14,7 @@ class MLLP(nn.Module): - def __init__(self, dim_list, use_not=False, left=None, right=None, estimated_grad=False): + def __init__(self, dim_list, use_not=False, cl=None, cr=None, left=None, right=None, estimated_grad=False): super(MLLP, self).__init__() self.dim_list = dim_list @@ -30,7 +30,7 @@ def __init__(self, dim_list, use_not=False, left=None, right=None, estimated_gra num += self.layer_list[-2].output_dim if i == 1: - layer = BinarizeLayer(dim_list[i], num, self.use_not, self.left, self.right) + layer = BinarizeLayer(dim_list[i], num, self.use_not, cl=cl, cr=cr, left=self.left, right=self.right) layer_name = 'binary{}'.format(i) elif i == len(dim_list) - 1: layer = LRLayer(dim_list[i], num) @@ -77,7 +77,7 @@ def layer_list(self): class RRL: def __init__(self, dim_list, device_id, use_not=False, is_rank0=False, log_file=None, writer=None, left=None, - right=None, save_best=False, estimated_grad=False, save_path=None, distributed=True): + right=None, cl=None, cr=None, save_best=False, estimated_grad=False, save_path=None, distributed=True): super(RRL, self).__init__() self.dim_list = dim_list self.use_not = use_not @@ -99,9 +99,11 @@ def __init__(self, dim_list, device_id, use_not=False, is_rank0=False, log_file= logging.basicConfig(level=logging.DEBUG, filename=log_file, filemode='w', format=log_format) self.writer = writer - self.net = MLLP(dim_list, use_not=use_not, left=left, right=right, estimated_grad=estimated_grad) + self.net = MLLP(dim_list, use_not=use_not, cl=cl, cr=cr, left=left, right=right, + estimated_grad=estimated_grad) - self.net.cuda(self.device_id) + if self.device_id and self.device_id.type == 'cuda': + 
self.net.cuda(self.device_id) if distributed: self.net = MyDistributedDataParallel(self.net, device_ids=[self.device_id]) @@ -161,8 +163,9 @@ def train_model(self, X=None, y=None, X_validation=None, y_validation=None, data ba_cnt = 0 for X, y in data_loader: ba_cnt += 1 - X = X.cuda(self.device_id, non_blocking=True) - y = y.cuda(self.device_id, non_blocking=True) + if self.device_id and self.device_id.type == 'cuda': + X = X.cuda(self.device_id, non_blocking=True) + y = y.cuda(self.device_id, non_blocking=True) optimizer.zero_grad() # Zero the gradient buffers. y_pred_mllp, y_pred_rrl = self.net.forward(X) with torch.no_grad(): @@ -231,7 +234,8 @@ def train_model(self, X=None, y=None, X_validation=None, y_validation=None, data self.save_model() return epoch_histc - def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=True, set_name='Validation'): + def test(self, X=None, y=None, labels=None, test_loader=None, + batch_size=32, need_transform=True, set_name='Validation'): if X is not None and y is not None and need_transform: X, y = self.data_transform(X, y) with torch.no_grad(): @@ -251,7 +255,8 @@ def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=T y_pred_list = [] y_pred_b_list = [] for X, y in test_loader: - X = X.cuda(self.device_id, non_blocking=True) + if self.device_id and self.device_id.type == 'cuda': + X = X.cuda(self.device_id, non_blocking=True) output = self.net.forward(X) y_pred_list.append(output[0]) y_pred_b_list.append(output[1]) @@ -275,7 +280,8 @@ def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=T logging.info('On {} Set:\n\tAccuracy of RRL Model: {}' '\n\tF1 Score of RRL Model: {}'.format(set_name, accuracy_b, f1_score_b)) logging.info('On {} Set:\nPerformance of RRL Model: \n{}\n{}'.format( - set_name, metrics.confusion_matrix(y_true, y_pred_b_arg), metrics.classification_report(y_true, y_pred_b_arg))) + set_name, metrics.confusion_matrix(y_true, y_pred_b_arg), 
+ metrics.classification_report(y_true, y_pred_b_arg, target_names=labels))) logging.info('-' * 60) return accuracy, accuracy_b, f1_score, f1_score_b @@ -289,7 +295,8 @@ def detect_dead_node(self, data_loader=None): layer.node_activation_cnt = torch.zeros(layer.output_dim, dtype=torch.double, device=self.device_id) layer.forward_tot = 0 for x, y in data_loader: - x = x.cuda(self.device_id) + if self.device_id and self.device_id.type == 'cuda': + x = x.cuda(self.device_id) x_res = None for i, layer in enumerate(self.net.layer_list[:-1]): if i <= 1: diff --git a/rrl/utils.py b/rrl/utils.py index 440f396..3b23872 100644 --- a/rrl/utils.py +++ b/rrl/utils.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +from scipy.sparse import issparse from sklearn import preprocessing from sklearn.impute import SimpleImputer @@ -29,13 +30,14 @@ def read_csv(data_path, info_path, shuffle=False): class DBEncoder: """Encoder used for data discretization and binarization.""" - def __init__(self, f_df, discrete=False, y_one_hot=True, drop='first'): + def __init__(self, f_df, discrete=False, y_one_hot=True, drop='first', + impute_continuous=True, one_hot_encode_features=True): self.f_df = f_df self.discrete = discrete self.y_one_hot = y_one_hot self.label_enc = preprocessing.OneHotEncoder(categories='auto') if y_one_hot else preprocessing.LabelEncoder() - self.feature_enc = preprocessing.OneHotEncoder(categories='auto', drop=drop) - self.imp = SimpleImputer(missing_values=np.nan, strategy='mean') + self.feature_enc = preprocessing.OneHotEncoder(categories='auto', drop=drop) if one_hot_encode_features else None + self.imp = SimpleImputer(missing_values=np.nan, strategy='mean') if impute_continuous else None self.X_fname = None self.y_fname = None self.discrete_flen = None @@ -59,16 +61,18 @@ def fit(self, X_df, y_df): self.y_fname = list(self.label_enc.get_feature_names(y_df.columns)) if self.y_one_hot else y_df.columns if not continuous_data.empty: - # Use mean as missing value 
for continuous columns if do not discretize them. - self.imp.fit(continuous_data.values) + if self.imp is not None: + # Use mean as missing value for continuous columns if we do not discretize them. + self.imp.fit(continuous_data.values) if not discrete_data.empty: - # One-hot encoding - self.feature_enc.fit(discrete_data) - feature_names = discrete_data.columns - self.X_fname = list(self.feature_enc.get_feature_names(feature_names)) + self.X_fname = discrete_data.columns.to_list() + if self.feature_enc is not None: + # One-hot encoding + self.feature_enc.fit(discrete_data) + self.X_fname = list(self.feature_enc.get_feature_names(self.X_fname)) self.discrete_flen = len(self.X_fname) if not self.discrete: - self.X_fname.extend(continuous_data.columns) + self.X_fname.extend(continuous_data.columns.to_list()) else: self.X_fname = continuous_data.columns self.discrete_flen = 0 @@ -84,21 +88,25 @@ def transform(self, X_df, y_df, normalized=False, keep_stat=False): y = y.toarray() if not continuous_data.empty: - # Use mean as missing value for continuous columns if we do not discretize them. - continuous_data = pd.DataFrame(self.imp.transform(continuous_data.values), - columns=continuous_data.columns) + if self.imp is not None: + # Use mean as missing value for continuous columns if we do not discretize them. 
+ continuous_data = pd.DataFrame(self.imp.transform(continuous_data.values), + columns=continuous_data.columns) if normalized: if keep_stat: self.mean = continuous_data.mean() self.std = continuous_data.std() continuous_data = (continuous_data - self.mean) / self.std if not discrete_data.empty: - # One-hot encoding - discrete_data = self.feature_enc.transform(discrete_data) + if self.feature_enc is not None: + # One-hot encoding + discrete_data = self.feature_enc.transform(discrete_data) + if issparse(discrete_data): + discrete_data = discrete_data.toarray() if not self.discrete: - X_df = pd.concat([pd.DataFrame(discrete_data.toarray()), continuous_data], axis=1) + X_df = pd.concat([pd.DataFrame(discrete_data), continuous_data], axis=1) else: - X_df = pd.DataFrame(discrete_data.toarray()) + X_df = pd.DataFrame(discrete_data) else: X_df = continuous_data return X_df.values, y From 9ddf68abf1a12d79ca98bd5ee4c67daa97e4d69c Mon Sep 17 00:00:00 2001 From: winstonq Date: Mon, 6 Dec 2021 16:09:29 -0800 Subject: [PATCH 2/2] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index db5f787..8547817 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ We need to put the data sets in the `dataset` folder. You can specify one data s ```bash # trained on the tic-tac-toe data set with one GPU. -python3 experiment.py -d tic-tac-toe -bs 32 -s 1@16 -e401 -lrde 200 -lr 0.002 -ki 0 -mp 12481 -i 0 -wd 1e-6 & +python3 experiment.py -d tic-tac-toe -bs 32 -s 1@16 -e401 -lrde 200 -lr 0.002 -ki 0 -mp 12481 -i cuda:0 -wd 1e-6 & ``` The demo reads the data set and data set information first, then trains the RRL on the training set. During the training, you can check the training loss and the evaluation result on the validation set by: @@ -78,7 +78,7 @@ optional arguments: the dataset folder are available. (default: tic-tac- toe) -i DEVICE_IDS, --device_ids DEVICE_IDS - Set the device (GPU ids). Split by @. E.g., 0@2@3. 
+ Set the device (GPU ids). Split by @. E.g., cuda:0@cuda:2@cuda:3. (default: None) -nr NR, --nr NR ranking within the nodes (default: 0) -e EPOCH, --epoch EPOCH