From d590e4b9071bb717a8e2e28291900dd9079bf2a1 Mon Sep 17 00:00:00 2001 From: winstonq Date: Mon, 6 Dec 2021 15:49:10 -0800 Subject: [PATCH 1/2] Allow passing in cr/cl bounds and settings. Allow CPU execution. Fix GPU support. Fix module loading. --- args.py | 6 +++-- experiment.py | 58 +++++++++++++++++++++++++++++++++++++---------- rrl/components.py | 9 ++++++-- rrl/models.py | 29 +++++++++++++++--------- rrl/utils.py | 42 ++++++++++++++++++++-------------- 5 files changed, 100 insertions(+), 44 deletions(-) diff --git a/args.py b/args.py index b8169e2..ea63852 100644 --- a/args.py +++ b/args.py @@ -1,12 +1,13 @@ import os import argparse +import torch parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-d', '--data_set', type=str, default='tic-tac-toe', help='Set the data set for training. All the data sets in the dataset folder are available.') parser.add_argument('-i', '--device_ids', type=str, default=None, help='Set the device (GPU ids). Split by @.' 
- ' E.g., 0@2@3.') + ' E.g., cuda:0@cuda:2@cuda:3.') parser.add_argument('-nr', '--nr', default=0, type=int, help='ranking within the nodes') parser.add_argument('-e', '--epoch', type=int, default=41, help='Set the total epoch.') parser.add_argument('-bs', '--batch_size', type=int, default=64, help='Set the batch size.') @@ -51,7 +52,8 @@ rrl_args.plot_file = os.path.join(rrl_args.folder_path, 'plot_file.pdf') rrl_args.log = os.path.join(rrl_args.folder_path, 'log.txt') rrl_args.test_res = os.path.join(rrl_args.folder_path, 'test_res.txt') -rrl_args.device_ids = list(map(int, rrl_args.device_ids.strip().split('@'))) +rrl_args.device_ids = list(map(lambda id: torch.device(id), rrl_args.device_ids.strip().split('@'))) \ + if rrl_args.device_ids else [None] rrl_args.gpus = len(rrl_args.device_ids) rrl_args.nodes = 1 rrl_args.world_size = rrl_args.gpus * rrl_args.nodes diff --git a/experiment.py b/experiment.py index a70a20b..f67a7c1 100644 --- a/experiment.py +++ b/experiment.py @@ -1,4 +1,4 @@ -import os +import os, json import numpy as np import torch from torch.utils.data.dataset import random_split @@ -14,15 +14,37 @@ DATA_DIR = './dataset' +def read_settings(settings_path): + if os.path.exists(settings_path): + with open(settings_path, 'r') as f: + settings = json.load(f) + else: + settings = { + 'normalize_continuous': True, + 'one_hot_encode_features': True, + 'impute_continuous': True, + # dict mapping each continuous column to [lower bound, upper bound] + 'bounds': None + # alternatively, pass in individual bounds + # lower_bounds: [continuous cols] + # upper_bounds: [continuous cols] + } + return settings + + def get_data_loader(dataset, world_size, rank, batch_size, k=0, pin_memory=False, save_best=True): data_path = os.path.join(DATA_DIR, dataset + '.data') info_path = os.path.join(DATA_DIR, dataset + '.info') + settings_path = os.path.join(DATA_DIR, dataset + '.settings.json') X_df, y_df, f_df, label_pos = read_csv(data_path, info_path, shuffle=True) - db_enc = 
DBEncoder(f_df, discrete=False) + settings = read_settings(settings_path) + db_enc = DBEncoder(f_df, discrete=False, + one_hot_encode_features=settings['one_hot_encode_features'], + impute_continuous=settings['impute_continuous']) db_enc.fit(X_df, y_df) - X, y = db_enc.transform(X_df, y_df, normalized=True, keep_stat=True) + X, y = db_enc.transform(X_df, y_df, normalized=settings['normalize_continuous'], keep_stat=True) kf = KFold(n_splits=5, shuffle=True, random_state=0) train_index, test_index = list(kf.split(X_df))[k] @@ -45,15 +67,21 @@ def get_data_loader(dataset, world_size, rank, batch_size, k=0, pin_memory=False valid_loader = DataLoader(valid_sub, batch_size=batch_size, shuffle=False, pin_memory=pin_memory) test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, pin_memory=pin_memory) - return db_enc, train_loader, valid_loader, test_loader + if settings['bounds'] is not None and 'lower_bounds' not in settings: + bounds = settings['bounds'] + settings['lower_bounds'] = np.array([bounds[col][0] for col in db_enc.X_fname[db_enc.discrete_flen:]]) + settings['upper_bounds'] = np.array([bounds[col][1] for col in db_enc.X_fname[db_enc.discrete_flen:]]) + return db_enc, train_loader, valid_loader, test_loader, settings -def train_model(gpu, args): +def train_model(gpu, args, distributed=True): rank = args.nr * args.gpus + gpu - dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank) + if distributed: + dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank) torch.manual_seed(42) device_id = args.device_ids[gpu] - torch.cuda.set_device(device_id) + if device_id and device_id.type == 'cuda': + torch.cuda.set_device(device_id) if gpu == 0: writer = SummaryWriter(args.folder_path) @@ -63,8 +91,9 @@ def train_model(gpu, args): is_rank0 = False dataset = args.data_set - db_enc, train_loader, valid_loader, _ = get_data_loader(dataset, args.world_size, rank, 
args.batch_size, - k=args.ith_kfold, pin_memory=True, save_best=args.save_best) + db_enc, train_loader, valid_loader, _, settings = get_data_loader(dataset, args.world_size, rank, args.batch_size, + k=args.ith_kfold, pin_memory=True, + save_best=args.save_best) X_fname = db_enc.X_fname y_fname = db_enc.y_fname @@ -74,11 +103,14 @@ def train_model(gpu, args): rrl = RRL(dim_list=[(discrete_flen, continuous_flen)] + list(map(int, args.structure.split('@'))) + [len(y_fname)], device_id=device_id, use_not=args.use_not, + cl=settings.get('lower_bounds', None), + cr=settings.get('upper_bounds', None), is_rank0=is_rank0, log_file=args.log, writer=writer, save_best=args.save_best, estimated_grad=args.estimated_grad, + distributed=distributed, save_path=args.model) rrl.train_model( @@ -106,7 +138,8 @@ def load_model(path, device_id, log_file=None, distributed=True): stat_dict = checkpoint['model_state_dict'] for key in list(stat_dict.keys()): # remove 'module.' prefix - stat_dict[key[7:]] = stat_dict.pop(key) + if key.startswith('module.'): + stat_dict[key[7:]] = stat_dict.pop(key) rrl.net.load_state_dict(checkpoint['model_state_dict']) return rrl @@ -114,8 +147,9 @@ def load_model(path, device_id, log_file=None, distributed=True): def test_model(args): rrl = load_model(args.model, args.device_ids[0], log_file=args.test_res, distributed=False) dataset = args.data_set - db_enc, train_loader, _, test_loader = get_data_loader(dataset, 4, 0, args.batch_size, args.ith_kfold, save_best=False) - rrl.test(test_loader=test_loader, set_name='Test') + db_enc, train_loader, _, test_loader, _ = get_data_loader(dataset, 4, 0, args.batch_size, args.ith_kfold, + save_best=False) + rrl.test(test_loader=test_loader, set_name='Test', labels=db_enc.y_fname) with open(args.rrl_file, 'w') as rrl_file: rrl.rule_print(db_enc.X_fname, db_enc.y_fname, train_loader, file=rrl_file, mean=db_enc.mean, std=db_enc.std) diff --git a/rrl/components.py b/rrl/components.py index 04c9262..dbf61d6 100644 --- 
a/rrl/components.py +++ b/rrl/components.py @@ -23,7 +23,7 @@ def backward(ctx, grad_output): class BinarizeLayer(nn.Module): """Implement the feature discretization and binarization.""" - def __init__(self, n, input_dim, use_not=False, left=None, right=None): + def __init__(self, n, input_dim, use_not=False, cl=None, cr=None, left=None, right=None): super(BinarizeLayer, self).__init__() self.n = n self.input_dim = input_dim @@ -39,12 +39,17 @@ def __init__(self, n, input_dim, use_not=False, left=None, right=None): self.register_buffer('right', right) if self.input_dim[1] > 0: - if self.left is not None and self.right is not None: + if cl is not None and cr is not None: # bounds are specified + cl = torch.tensor(cl).type(torch.float).t() + cr = torch.tensor(cr).type(torch.float).t() + elif self.left is not None and self.right is not None: cl = self.left + torch.rand(self.n, self.input_dim[1]) * (self.right - self.left) cr = self.left + torch.rand(self.n, self.input_dim[1]) * (self.right - self.left) else: cl = 3. * (2. * torch.rand(self.n, self.input_dim[1]) - 1.) cr = 3. * (2. * torch.rand(self.n, self.input_dim[1]) - 1.) 
+ assert torch.Size([self.n, self.input_dim[1]]) == cl.size() + assert torch.Size([self.n, self.input_dim[1]]) == cr.size() self.register_buffer('cl', cl) self.register_buffer('cr', cr) diff --git a/rrl/models.py b/rrl/models.py index 2063f7b..d4f5cb6 100644 --- a/rrl/models.py +++ b/rrl/models.py @@ -14,7 +14,7 @@ class MLLP(nn.Module): - def __init__(self, dim_list, use_not=False, left=None, right=None, estimated_grad=False): + def __init__(self, dim_list, use_not=False, cl=None, cr=None, left=None, right=None, estimated_grad=False): super(MLLP, self).__init__() self.dim_list = dim_list @@ -30,7 +30,7 @@ def __init__(self, dim_list, use_not=False, left=None, right=None, estimated_gra num += self.layer_list[-2].output_dim if i == 1: - layer = BinarizeLayer(dim_list[i], num, self.use_not, self.left, self.right) + layer = BinarizeLayer(dim_list[i], num, self.use_not, cl=cl, cr=cr, left=self.left, right=self.right) layer_name = 'binary{}'.format(i) elif i == len(dim_list) - 1: layer = LRLayer(dim_list[i], num) @@ -77,7 +77,7 @@ def layer_list(self): class RRL: def __init__(self, dim_list, device_id, use_not=False, is_rank0=False, log_file=None, writer=None, left=None, - right=None, save_best=False, estimated_grad=False, save_path=None, distributed=True): + right=None, cl=None, cr=None, save_best=False, estimated_grad=False, save_path=None, distributed=True): super(RRL, self).__init__() self.dim_list = dim_list self.use_not = use_not @@ -99,9 +99,11 @@ def __init__(self, dim_list, device_id, use_not=False, is_rank0=False, log_file= logging.basicConfig(level=logging.DEBUG, filename=log_file, filemode='w', format=log_format) self.writer = writer - self.net = MLLP(dim_list, use_not=use_not, left=left, right=right, estimated_grad=estimated_grad) + self.net = MLLP(dim_list, use_not=use_not, cl=cl, cr=cr, left=left, right=right, + estimated_grad=estimated_grad) - self.net.cuda(self.device_id) + if self.device_id and self.device_id.type == 'cuda': + 
self.net.cuda(self.device_id) if distributed: self.net = MyDistributedDataParallel(self.net, device_ids=[self.device_id]) @@ -161,8 +163,9 @@ def train_model(self, X=None, y=None, X_validation=None, y_validation=None, data ba_cnt = 0 for X, y in data_loader: ba_cnt += 1 - X = X.cuda(self.device_id, non_blocking=True) - y = y.cuda(self.device_id, non_blocking=True) + if self.device_id and self.device_id.type == 'cuda': + X = X.cuda(self.device_id, non_blocking=True) + y = y.cuda(self.device_id, non_blocking=True) optimizer.zero_grad() # Zero the gradient buffers. y_pred_mllp, y_pred_rrl = self.net.forward(X) with torch.no_grad(): @@ -231,7 +234,8 @@ def train_model(self, X=None, y=None, X_validation=None, y_validation=None, data self.save_model() return epoch_histc - def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=True, set_name='Validation'): + def test(self, X=None, y=None, labels=None, test_loader=None, + batch_size=32, need_transform=True, set_name='Validation'): if X is not None and y is not None and need_transform: X, y = self.data_transform(X, y) with torch.no_grad(): @@ -251,7 +255,8 @@ def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=T y_pred_list = [] y_pred_b_list = [] for X, y in test_loader: - X = X.cuda(self.device_id, non_blocking=True) + if self.device_id and self.device_id.type == 'cuda': + X = X.cuda(self.device_id, non_blocking=True) output = self.net.forward(X) y_pred_list.append(output[0]) y_pred_b_list.append(output[1]) @@ -275,7 +280,8 @@ def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=T logging.info('On {} Set:\n\tAccuracy of RRL Model: {}' '\n\tF1 Score of RRL Model: {}'.format(set_name, accuracy_b, f1_score_b)) logging.info('On {} Set:\nPerformance of RRL Model: \n{}\n{}'.format( - set_name, metrics.confusion_matrix(y_true, y_pred_b_arg), metrics.classification_report(y_true, y_pred_b_arg))) + set_name, metrics.confusion_matrix(y_true, y_pred_b_arg), 
+ metrics.classification_report(y_true, y_pred_b_arg, target_names=labels))) logging.info('-' * 60) return accuracy, accuracy_b, f1_score, f1_score_b @@ -289,7 +295,8 @@ def detect_dead_node(self, data_loader=None): layer.node_activation_cnt = torch.zeros(layer.output_dim, dtype=torch.double, device=self.device_id) layer.forward_tot = 0 for x, y in data_loader: - x = x.cuda(self.device_id) + if self.device_id and self.device_id.type == 'cuda': + x = x.cuda(self.device_id) x_res = None for i, layer in enumerate(self.net.layer_list[:-1]): if i <= 1: diff --git a/rrl/utils.py b/rrl/utils.py index 440f396..3b23872 100644 --- a/rrl/utils.py +++ b/rrl/utils.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +from scipy.sparse import issparse from sklearn import preprocessing from sklearn.impute import SimpleImputer @@ -29,13 +30,14 @@ def read_csv(data_path, info_path, shuffle=False): class DBEncoder: """Encoder used for data discretization and binarization.""" - def __init__(self, f_df, discrete=False, y_one_hot=True, drop='first'): + def __init__(self, f_df, discrete=False, y_one_hot=True, drop='first', + impute_continuous=True, one_hot_encode_features=True): self.f_df = f_df self.discrete = discrete self.y_one_hot = y_one_hot self.label_enc = preprocessing.OneHotEncoder(categories='auto') if y_one_hot else preprocessing.LabelEncoder() - self.feature_enc = preprocessing.OneHotEncoder(categories='auto', drop=drop) - self.imp = SimpleImputer(missing_values=np.nan, strategy='mean') + self.feature_enc = preprocessing.OneHotEncoder(categories='auto', drop=drop) if one_hot_encode_features else None + self.imp = SimpleImputer(missing_values=np.nan, strategy='mean') if impute_continuous else None self.X_fname = None self.y_fname = None self.discrete_flen = None @@ -59,16 +61,18 @@ def fit(self, X_df, y_df): self.y_fname = list(self.label_enc.get_feature_names(y_df.columns)) if self.y_one_hot else y_df.columns if not continuous_data.empty: - # Use mean as missing value 
for continuous columns if do not discretize them. - self.imp.fit(continuous_data.values) + if self.imp is not None: + # Use mean as missing value for continuous columns if we do not discretize them. + self.imp.fit(continuous_data.values) if not discrete_data.empty: - # One-hot encoding - self.feature_enc.fit(discrete_data) - feature_names = discrete_data.columns - self.X_fname = list(self.feature_enc.get_feature_names(feature_names)) + self.X_fname = discrete_data.columns.to_list() + if self.feature_enc is not None: + # One-hot encoding + self.feature_enc.fit(discrete_data) + self.X_fname = list(self.feature_enc.get_feature_names(self.X_fname)) self.discrete_flen = len(self.X_fname) if not self.discrete: - self.X_fname.extend(continuous_data.columns) + self.X_fname.extend(continuous_data.columns.to_list()) else: self.X_fname = continuous_data.columns self.discrete_flen = 0 @@ -84,21 +88,25 @@ def transform(self, X_df, y_df, normalized=False, keep_stat=False): y = y.toarray() if not continuous_data.empty: - # Use mean as missing value for continuous columns if we do not discretize them. - continuous_data = pd.DataFrame(self.imp.transform(continuous_data.values), - columns=continuous_data.columns) + if self.imp is not None: + # Use mean as missing value for continuous columns if we do not discretize them. 
+ continuous_data = pd.DataFrame(self.imp.transform(continuous_data.values), + columns=continuous_data.columns) if normalized: if keep_stat: self.mean = continuous_data.mean() self.std = continuous_data.std() continuous_data = (continuous_data - self.mean) / self.std if not discrete_data.empty: - # One-hot encoding - discrete_data = self.feature_enc.transform(discrete_data) + if self.feature_enc is not None: + # One-hot encoding + discrete_data = self.feature_enc.transform(discrete_data) + if issparse(discrete_data): + discrete_data = discrete_data.toarray() if not self.discrete: - X_df = pd.concat([pd.DataFrame(discrete_data.toarray()), continuous_data], axis=1) + X_df = pd.concat([pd.DataFrame(discrete_data), continuous_data], axis=1) else: - X_df = pd.DataFrame(discrete_data.toarray()) + X_df = pd.DataFrame(discrete_data) else: X_df = continuous_data return X_df.values, y From 9ddf68abf1a12d79ca98bd5ee4c67daa97e4d69c Mon Sep 17 00:00:00 2001 From: winstonq Date: Mon, 6 Dec 2021 16:09:29 -0800 Subject: [PATCH 2/2] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index db5f787..8547817 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ We need to put the data sets in the `dataset` folder. You can specify one data s ```bash # trained on the tic-tac-toe data set with one GPU. -python3 experiment.py -d tic-tac-toe -bs 32 -s 1@16 -e401 -lrde 200 -lr 0.002 -ki 0 -mp 12481 -i 0 -wd 1e-6 & +python3 experiment.py -d tic-tac-toe -bs 32 -s 1@16 -e401 -lrde 200 -lr 0.002 -ki 0 -mp 12481 -i cuda:0 -wd 1e-6 & ``` The demo reads the data set and data set information first, then trains the RRL on the training set. During the training, you can check the training loss and the evaluation result on the validation set by: @@ -78,7 +78,7 @@ optional arguments: the dataset folder are available. (default: tic-tac- toe) -i DEVICE_IDS, --device_ids DEVICE_IDS - Set the device (GPU ids). Split by @. E.g., 0@2@3. 
+ Set the device (GPU ids). Split by @. E.g., cuda:0@cuda:2@cuda:3. (default: None) -nr NR, --nr NR ranking within the nodes (default: 0) -e EPOCH, --epoch EPOCH