Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ We need to put the data sets in the `dataset` folder. You can specify one data s

```bash
# trained on the tic-tac-toe data set with one GPU.
python3 experiment.py -d tic-tac-toe -bs 32 -s 1@16 -e401 -lrde 200 -lr 0.002 -ki 0 -mp 12481 -i 0 -wd 1e-6 &
python3 experiment.py -d tic-tac-toe -bs 32 -s 1@16 -e401 -lrde 200 -lr 0.002 -ki 0 -mp 12481 -i cuda:0 -wd 1e-6 &
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: see review comment on args.py changes

```
The demo reads the data set and data set information first, then trains the RRL on the training set.
During the training, you can check the training loss and the evaluation result on the validation set by:
Expand Down Expand Up @@ -78,7 +78,7 @@ optional arguments:
the dataset folder are available. (default: tic-tac-
toe)
-i DEVICE_IDS, --device_ids DEVICE_IDS
Set the device (GPU ids). Split by @. E.g., 0@2@3.
Set the device (GPU ids). Split by @. E.g., cuda:0@cuda:2@cuda:3.
(default: None)
-nr NR, --nr NR ranking within the nodes (default: 0)
-e EPOCH, --epoch EPOCH
Expand Down
6 changes: 4 additions & 2 deletions args.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import os
import argparse
import torch


parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-d', '--data_set', type=str, default='tic-tac-toe',
help='Set the data set for training. All the data sets in the dataset folder are available.')
parser.add_argument('-i', '--device_ids', type=str, default=None, help='Set the device (GPU ids). Split by @.'
' E.g., 0@2@3.')
' E.g., cuda:0@cuda:2@cuda:3.')
parser.add_argument('-nr', '--nr', default=0, type=int, help='ranking within the nodes')
parser.add_argument('-e', '--epoch', type=int, default=41, help='Set the total epoch.')
parser.add_argument('-bs', '--batch_size', type=int, default=64, help='Set the batch size.')
Expand Down Expand Up @@ -51,7 +52,8 @@
rrl_args.plot_file = os.path.join(rrl_args.folder_path, 'plot_file.pdf')
rrl_args.log = os.path.join(rrl_args.folder_path, 'log.txt')
rrl_args.test_res = os.path.join(rrl_args.folder_path, 'test_res.txt')
rrl_args.device_ids = list(map(int, rrl_args.device_ids.strip().split('@')))
rrl_args.device_ids = list(map(lambda id: torch.device(id), rrl_args.device_ids.strip().split('@'))) \
if rrl_args.device_ids else [None]
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: I found that passing an integer device ID gets the tensors placed in GPU memory, but GPU compute utilization stays at 0%, as shown by nvidia-smi. After changing the device ID to the object returned by torch.device("cuda:0"), the GPU is fully utilized. I do not know why that is the case, since a simple test using a Python loop does drive GPU utilization.

Example run passing in integer device ID:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.142.00   Driver Version: 450.142.00   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
| N/A   47C    P0    70W / 149W |    322MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A     27173      C   ...vs/pytorch_p37/bin/python      319MiB |
+-----------------------------------------------------------------------------+

Example run passing in cuda:*:

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A     27346      C   ...vs/pytorch_p37/bin/python     1736MiB |
+-----------------------------------------------------------------------------+
Sat Dec  4 01:31:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.142.00   Driver Version: 450.142.00   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
| N/A   52C    P0   138W / 149W |   1739MiB / 11441MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A     27346      C   ...vs/pytorch_p37/bin/python     1736MiB |
+-----------------------------------------------------------------------------+

rrl_args.gpus = len(rrl_args.device_ids)
rrl_args.nodes = 1
rrl_args.world_size = rrl_args.gpus * rrl_args.nodes
Expand Down
58 changes: 46 additions & 12 deletions experiment.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
import os, json
import numpy as np
import torch
from torch.utils.data.dataset import random_split
Expand All @@ -14,15 +14,37 @@
DATA_DIR = './dataset'


def read_settings(settings_path):
    """Load per-dataset preprocessing settings from a JSON file.

    Starts from built-in defaults and overlays any keys found in the JSON
    file at ``settings_path``, so a partial settings file no longer causes
    KeyError in callers that subscript the result (e.g. get_data_loader).
    If the file does not exist, the defaults are returned unchanged.

    :param settings_path: path to ``<dataset>.settings.json``.
    :return: dict with at least the keys ``normalize_continuous``,
             ``one_hot_encode_features``, ``impute_continuous``, ``bounds``.
    """
    settings = {
        'normalize_continuous': True,
        'one_hot_encode_features': True,
        'impute_continuous': True,
        # of shape [continuous columns, lower bounds, upper bounds]
        'bounds': None
        # alternatively, pass in individual bounds
        # lower_bound: [continuous cols]
        # upper_bound: [continuous cols]
    }
    if os.path.exists(settings_path):
        with open(settings_path, 'r') as f:
            # User-provided values override the defaults key by key.
            settings.update(json.load(f))
    return settings
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: I added this new settings file so that the user can pass in CR/CL bounds, as well as control normalization, one-hot encoding, etc. (those are currently hard-coded).



def get_data_loader(dataset, world_size, rank, batch_size, k=0, pin_memory=False, save_best=True):
data_path = os.path.join(DATA_DIR, dataset + '.data')
info_path = os.path.join(DATA_DIR, dataset + '.info')
settings_path = os.path.join(DATA_DIR, dataset + '.settings.json')
X_df, y_df, f_df, label_pos = read_csv(data_path, info_path, shuffle=True)

db_enc = DBEncoder(f_df, discrete=False)
settings = read_settings(settings_path)
db_enc = DBEncoder(f_df, discrete=False,
one_hot_encode_features=settings['one_hot_encode_features'],
impute_continuous=settings['impute_continuous'])
db_enc.fit(X_df, y_df)

X, y = db_enc.transform(X_df, y_df, normalized=True, keep_stat=True)
X, y = db_enc.transform(X_df, y_df, normalized=settings['normalize_continuous'], keep_stat=True)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
train_index, test_index = list(kf.split(X_df))[k]
Expand All @@ -45,15 +67,21 @@ def get_data_loader(dataset, world_size, rank, batch_size, k=0, pin_memory=False
valid_loader = DataLoader(valid_sub, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)

return db_enc, train_loader, valid_loader, test_loader
if settings['bounds'] is not None and 'lower_bounds' not in settings:
bounds = settings['bounds']
settings['lower_bounds'] = np.array([bounds[col][0] for col in db_enc.X_fname[db_enc.discrete_flen:]])
settings['upper_bounds'] = np.array([bounds[col][1] for col in db_enc.X_fname[db_enc.discrete_flen:]])
return db_enc, train_loader, valid_loader, test_loader, settings


def train_model(gpu, args):
def train_model(gpu, args, distributed=True):
rank = args.nr * args.gpus + gpu
dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank)
if distributed:
dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank)
torch.manual_seed(42)
device_id = args.device_ids[gpu]
torch.cuda.set_device(device_id)
if device_id and device_id.type == 'cuda':
torch.cuda.set_device(device_id)

if gpu == 0:
writer = SummaryWriter(args.folder_path)
Expand All @@ -63,8 +91,9 @@ def train_model(gpu, args):
is_rank0 = False

dataset = args.data_set
db_enc, train_loader, valid_loader, _ = get_data_loader(dataset, args.world_size, rank, args.batch_size,
k=args.ith_kfold, pin_memory=True, save_best=args.save_best)
db_enc, train_loader, valid_loader, _, settings = get_data_loader(dataset, args.world_size, rank, args.batch_size,
k=args.ith_kfold, pin_memory=True,
save_best=args.save_best)

X_fname = db_enc.X_fname
y_fname = db_enc.y_fname
Expand All @@ -74,11 +103,14 @@ def train_model(gpu, args):
rrl = RRL(dim_list=[(discrete_flen, continuous_flen)] + list(map(int, args.structure.split('@'))) + [len(y_fname)],
device_id=device_id,
use_not=args.use_not,
cl=settings.get('lower_bounds', None),
cr=settings.get('upper_bounds', None),
is_rank0=is_rank0,
log_file=args.log,
writer=writer,
save_best=args.save_best,
estimated_grad=args.estimated_grad,
distributed=distributed,
save_path=args.model)

rrl.train_model(
Expand Down Expand Up @@ -106,16 +138,18 @@ def load_model(path, device_id, log_file=None, distributed=True):
stat_dict = checkpoint['model_state_dict']
for key in list(stat_dict.keys()):
# remove 'module.' prefix
stat_dict[key[7:]] = stat_dict.pop(key)
if key.startswith('module.'):
stat_dict[key[7:]] = stat_dict.pop(key)
rrl.net.load_state_dict(checkpoint['model_state_dict'])
return rrl


def test_model(args):
rrl = load_model(args.model, args.device_ids[0], log_file=args.test_res, distributed=False)
dataset = args.data_set
db_enc, train_loader, _, test_loader = get_data_loader(dataset, 4, 0, args.batch_size, args.ith_kfold, save_best=False)
rrl.test(test_loader=test_loader, set_name='Test')
db_enc, train_loader, _, test_loader, _ = get_data_loader(dataset, 4, 0, args.batch_size, args.ith_kfold,
save_best=False)
rrl.test(test_loader=test_loader, set_name='Test', labels=db_enc.y_fname)
with open(args.rrl_file, 'w') as rrl_file:
rrl.rule_print(db_enc.X_fname, db_enc.y_fname, train_loader, file=rrl_file, mean=db_enc.mean, std=db_enc.std)

Expand Down
9 changes: 7 additions & 2 deletions rrl/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def backward(ctx, grad_output):
class BinarizeLayer(nn.Module):
"""Implement the feature discretization and binarization."""

def __init__(self, n, input_dim, use_not=False, left=None, right=None):
def __init__(self, n, input_dim, use_not=False, cl=None, cr=None, left=None, right=None):
super(BinarizeLayer, self).__init__()
self.n = n
self.input_dim = input_dim
Expand All @@ -39,12 +39,17 @@ def __init__(self, n, input_dim, use_not=False, left=None, right=None):
self.register_buffer('right', right)

if self.input_dim[1] > 0:
if self.left is not None and self.right is not None:
if cl is not None and cr is not None: # bounds are specified
cl = torch.tensor(cl).type(torch.float).t()
cr = torch.tensor(cr).type(torch.float).t()
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: here we can pass in the cl/cr bounds directly.

elif self.left is not None and self.right is not None:
cl = self.left + torch.rand(self.n, self.input_dim[1]) * (self.right - self.left)
cr = self.left + torch.rand(self.n, self.input_dim[1]) * (self.right - self.left)
else:
cl = 3. * (2. * torch.rand(self.n, self.input_dim[1]) - 1.)
cr = 3. * (2. * torch.rand(self.n, self.input_dim[1]) - 1.)
assert torch.Size([self.n, self.input_dim[1]]) == cl.size()
assert torch.Size([self.n, self.input_dim[1]]) == cr.size()
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: and verify the shapes are correct.

self.register_buffer('cl', cl)
self.register_buffer('cr', cr)

Expand Down
29 changes: 18 additions & 11 deletions rrl/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


class MLLP(nn.Module):
def __init__(self, dim_list, use_not=False, left=None, right=None, estimated_grad=False):
def __init__(self, dim_list, use_not=False, cl=None, cr=None, left=None, right=None, estimated_grad=False):
super(MLLP, self).__init__()

self.dim_list = dim_list
Expand All @@ -30,7 +30,7 @@ def __init__(self, dim_list, use_not=False, left=None, right=None, estimated_gra
num += self.layer_list[-2].output_dim

if i == 1:
layer = BinarizeLayer(dim_list[i], num, self.use_not, self.left, self.right)
layer = BinarizeLayer(dim_list[i], num, self.use_not, cl=cl, cr=cr, left=self.left, right=self.right)
layer_name = 'binary{}'.format(i)
elif i == len(dim_list) - 1:
layer = LRLayer(dim_list[i], num)
Expand Down Expand Up @@ -77,7 +77,7 @@ def layer_list(self):

class RRL:
def __init__(self, dim_list, device_id, use_not=False, is_rank0=False, log_file=None, writer=None, left=None,
right=None, save_best=False, estimated_grad=False, save_path=None, distributed=True):
right=None, cl=None, cr=None, save_best=False, estimated_grad=False, save_path=None, distributed=True):
super(RRL, self).__init__()
self.dim_list = dim_list
self.use_not = use_not
Expand All @@ -99,9 +99,11 @@ def __init__(self, dim_list, device_id, use_not=False, is_rank0=False, log_file=
logging.basicConfig(level=logging.DEBUG, filename=log_file, filemode='w', format=log_format)
self.writer = writer

self.net = MLLP(dim_list, use_not=use_not, left=left, right=right, estimated_grad=estimated_grad)
self.net = MLLP(dim_list, use_not=use_not, cl=cl, cr=cr, left=left, right=right,
estimated_grad=estimated_grad)

self.net.cuda(self.device_id)
if self.device_id and self.device_id.type == 'cuda':
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: the condition allows the program to run in CPU mode as well.

self.net.cuda(self.device_id)
if distributed:
self.net = MyDistributedDataParallel(self.net, device_ids=[self.device_id])

Expand Down Expand Up @@ -161,8 +163,9 @@ def train_model(self, X=None, y=None, X_validation=None, y_validation=None, data
ba_cnt = 0
for X, y in data_loader:
ba_cnt += 1
X = X.cuda(self.device_id, non_blocking=True)
y = y.cuda(self.device_id, non_blocking=True)
if self.device_id and self.device_id.type == 'cuda':
X = X.cuda(self.device_id, non_blocking=True)
y = y.cuda(self.device_id, non_blocking=True)
optimizer.zero_grad() # Zero the gradient buffers.
y_pred_mllp, y_pred_rrl = self.net.forward(X)
with torch.no_grad():
Expand Down Expand Up @@ -231,7 +234,8 @@ def train_model(self, X=None, y=None, X_validation=None, y_validation=None, data
self.save_model()
return epoch_histc

def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=True, set_name='Validation'):
def test(self, X=None, y=None, labels=None, test_loader=None,
batch_size=32, need_transform=True, set_name='Validation'):
if X is not None and y is not None and need_transform:
X, y = self.data_transform(X, y)
with torch.no_grad():
Expand All @@ -251,7 +255,8 @@ def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=T
y_pred_list = []
y_pred_b_list = []
for X, y in test_loader:
X = X.cuda(self.device_id, non_blocking=True)
if self.device_id and self.device_id.type == 'cuda':
X = X.cuda(self.device_id, non_blocking=True)
output = self.net.forward(X)
y_pred_list.append(output[0])
y_pred_b_list.append(output[1])
Expand All @@ -275,7 +280,8 @@ def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=T
logging.info('On {} Set:\n\tAccuracy of RRL Model: {}'
'\n\tF1 Score of RRL Model: {}'.format(set_name, accuracy_b, f1_score_b))
logging.info('On {} Set:\nPerformance of RRL Model: \n{}\n{}'.format(
set_name, metrics.confusion_matrix(y_true, y_pred_b_arg), metrics.classification_report(y_true, y_pred_b_arg)))
set_name, metrics.confusion_matrix(y_true, y_pred_b_arg),
metrics.classification_report(y_true, y_pred_b_arg, target_names=labels)))
logging.info('-' * 60)
return accuracy, accuracy_b, f1_score, f1_score_b

Expand All @@ -289,7 +295,8 @@ def detect_dead_node(self, data_loader=None):
layer.node_activation_cnt = torch.zeros(layer.output_dim, dtype=torch.double, device=self.device_id)
layer.forward_tot = 0
for x, y in data_loader:
x = x.cuda(self.device_id)
if self.device_id and self.device_id.type == 'cuda':
x = x.cuda(self.device_id)
x_res = None
for i, layer in enumerate(self.net.layer_list[:-1]):
if i <= 1:
Expand Down
42 changes: 25 additions & 17 deletions rrl/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
from scipy.sparse import issparse
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

Expand Down Expand Up @@ -29,13 +30,14 @@ def read_csv(data_path, info_path, shuffle=False):
class DBEncoder:
"""Encoder used for data discretization and binarization."""

def __init__(self, f_df, discrete=False, y_one_hot=True, drop='first'):
def __init__(self, f_df, discrete=False, y_one_hot=True, drop='first',
impute_continuous=True, one_hot_encode_features=True):
self.f_df = f_df
self.discrete = discrete
self.y_one_hot = y_one_hot
self.label_enc = preprocessing.OneHotEncoder(categories='auto') if y_one_hot else preprocessing.LabelEncoder()
self.feature_enc = preprocessing.OneHotEncoder(categories='auto', drop=drop)
self.imp = SimpleImputer(missing_values=np.nan, strategy='mean')
self.feature_enc = preprocessing.OneHotEncoder(categories='auto', drop=drop) if one_hot_encode_features else None
self.imp = SimpleImputer(missing_values=np.nan, strategy='mean') if impute_continuous else None
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: for datasets that do not require — or already include — one-hot encoding or imputation, those steps can now be skipped.

self.X_fname = None
self.y_fname = None
self.discrete_flen = None
Expand All @@ -59,16 +61,18 @@ def fit(self, X_df, y_df):
self.y_fname = list(self.label_enc.get_feature_names(y_df.columns)) if self.y_one_hot else y_df.columns

if not continuous_data.empty:
# Use mean as missing value for continuous columns if do not discretize them.
self.imp.fit(continuous_data.values)
if self.imp is not None:
# Use mean as missing value for continuous columns if do not discretize them.
self.imp.fit(continuous_data.values)
if not discrete_data.empty:
# One-hot encoding
self.feature_enc.fit(discrete_data)
feature_names = discrete_data.columns
self.X_fname = list(self.feature_enc.get_feature_names(feature_names))
self.X_fname = discrete_data.columns.to_list()
if self.feature_enc is not None:
# One-hot encoding
self.feature_enc.fit(discrete_data)
self.X_fname = list(self.feature_enc.get_feature_names(self.X_fname))
self.discrete_flen = len(self.X_fname)
if not self.discrete:
self.X_fname.extend(continuous_data.columns)
self.X_fname.extend(continuous_data.columns.to_list())
else:
self.X_fname = continuous_data.columns
self.discrete_flen = 0
Expand All @@ -84,21 +88,25 @@ def transform(self, X_df, y_df, normalized=False, keep_stat=False):
y = y.toarray()

if not continuous_data.empty:
# Use mean as missing value for continuous columns if we do not discretize them.
continuous_data = pd.DataFrame(self.imp.transform(continuous_data.values),
columns=continuous_data.columns)
if self.imp is not None:
# Use mean as missing value for continuous columns if we do not discretize them.
continuous_data = pd.DataFrame(self.imp.transform(continuous_data.values),
columns=continuous_data.columns)
if normalized:
if keep_stat:
self.mean = continuous_data.mean()
self.std = continuous_data.std()
continuous_data = (continuous_data - self.mean) / self.std
if not discrete_data.empty:
# One-hot encoding
discrete_data = self.feature_enc.transform(discrete_data)
if self.feature_enc is not None:
# One-hot encoding
discrete_data = self.feature_enc.transform(discrete_data)
if issparse(discrete_data):
discrete_data = discrete_data.toarray()
if not self.discrete:
X_df = pd.concat([pd.DataFrame(discrete_data.toarray()), continuous_data], axis=1)
X_df = pd.concat([pd.DataFrame(discrete_data), continuous_data], axis=1)
else:
X_df = pd.DataFrame(discrete_data.toarray())
X_df = pd.DataFrame(discrete_data)
else:
X_df = continuous_data
return X_df.values, y