Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ We need to put the data sets in the `dataset` folder. You can specify one data s

```bash
# trained on the tic-tac-toe data set with one GPU.
python3 experiment.py -d tic-tac-toe -bs 32 -s 1@16 -e401 -lrde 200 -lr 0.002 -ki 0 -mp 12481 -i 0 -wd 1e-6 &
python3 experiment.py -d tic-tac-toe -bs 32 -s 1@16 -e401 -lrde 200 -lr 0.002 -ki 0 -mp 12481 -i cuda:0 -wd 1e-6 &
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: see review comment on args.py changes

```
The demo reads the data set and data set information first, then trains the RRL on the training set.
During the training, you can check the training loss and the evaluation result on the validation set by:
Expand Down Expand Up @@ -78,7 +78,7 @@ optional arguments:
the dataset folder are available. (default: tic-tac-
toe)
-i DEVICE_IDS, --device_ids DEVICE_IDS
Set the device (GPU ids). Split by @. E.g., 0@2@3.
Set the device (GPU ids). Split by @. E.g., cuda:0@cuda:2@cuda:3.
(default: None)
-nr NR, --nr NR ranking within the nodes (default: 0)
-e EPOCH, --epoch EPOCH
Expand Down
6 changes: 4 additions & 2 deletions args.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import os
import argparse
import torch


parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-d', '--data_set', type=str, default='tic-tac-toe',
help='Set the data set for training. All the data sets in the dataset folder are available.')
parser.add_argument('-i', '--device_ids', type=str, default=None, help='Set the device (GPU ids). Split by @.'
' E.g., 0@2@3.')
' E.g., cuda:0@cuda:2@cuda:3.')
parser.add_argument('-nr', '--nr', default=0, type=int, help='ranking within the nodes')
parser.add_argument('-e', '--epoch', type=int, default=41, help='Set the total epoch.')
parser.add_argument('-bs', '--batch_size', type=int, default=64, help='Set the batch size.')
Expand Down Expand Up @@ -51,7 +52,8 @@
rrl_args.plot_file = os.path.join(rrl_args.folder_path, 'plot_file.pdf')
rrl_args.log = os.path.join(rrl_args.folder_path, 'log.txt')
rrl_args.test_res = os.path.join(rrl_args.folder_path, 'test_res.txt')
rrl_args.device_ids = list(map(int, rrl_args.device_ids.strip().split('@')))
rrl_args.device_ids = list(map(lambda id: torch.device(id), rrl_args.device_ids.strip().split('@'))) \
if rrl_args.device_ids else [None]
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: I found that passing an integer device ID gets the tensors placed in GPU memory, but GPU compute utilization stays at 0%, as shown by nvidia-smi. After changing the device ID to the object returned by torch.device("cuda:0"), the GPU is fully utilized. I do not know why that is the case, since a simple test using a Python loop does drive GPU utilization.

Example run passing in integer device ID:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.142.00   Driver Version: 450.142.00   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
| N/A   47C    P0    70W / 149W |    322MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A     27173      C   ...vs/pytorch_p37/bin/python      319MiB |
+-----------------------------------------------------------------------------+

Example run passing in cuda:*:

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A     27346      C   ...vs/pytorch_p37/bin/python     1736MiB |
+-----------------------------------------------------------------------------+
Sat Dec  4 01:31:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.142.00   Driver Version: 450.142.00   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
| N/A   52C    P0   138W / 149W |   1739MiB / 11441MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A     27346      C   ...vs/pytorch_p37/bin/python     1736MiB |
+-----------------------------------------------------------------------------+

rrl_args.gpus = len(rrl_args.device_ids)
rrl_args.nodes = 1
rrl_args.world_size = rrl_args.gpus * rrl_args.nodes
Expand Down
58 changes: 46 additions & 12 deletions experiment.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
import os, json
import numpy as np
import torch
from torch.utils.data.dataset import random_split
Expand All @@ -14,15 +14,37 @@
DATA_DIR = './dataset'


def read_settings(settings_path):
    """Load per-dataset preprocessing settings from a JSON file.

    Starts from built-in defaults and overlays any keys found in the JSON
    file at ``settings_path``, so a partial settings file no longer causes
    KeyError in callers that subscript the result (e.g. get_data_loader).
    If the file does not exist, the defaults are returned unchanged.

    :param settings_path: path to ``<dataset>.settings.json``.
    :return: dict with at least the keys ``normalize_continuous``,
             ``one_hot_encode_features``, ``impute_continuous``, ``bounds``.
    """
    settings = {
        'normalize_continuous': True,
        'one_hot_encode_features': True,
        'impute_continuous': True,
        # of shape [continuous columns, lower bounds, upper bounds]
        'bounds': None
        # alternatively, pass in individual bounds
        # lower_bound: [continuous cols]
        # upper_bound: [continuous cols]
    }
    if os.path.exists(settings_path):
        with open(settings_path, 'r') as f:
            # User-provided values override the defaults key by key.
            settings.update(json.load(f))
    return settings
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: I added this new settings file so that the user can pass in CR/CL bounds, as well as control normalization, one-hot encoding, etc. (those are currently hard-coded).



def get_data_loader(dataset, world_size, rank, batch_size, k=0, pin_memory=False, save_best=True):
data_path = os.path.join(DATA_DIR, dataset + '.data')
info_path = os.path.join(DATA_DIR, dataset + '.info')
settings_path = os.path.join(DATA_DIR, dataset + '.settings.json')
X_df, y_df, f_df, label_pos = read_csv(data_path, info_path, shuffle=True)

db_enc = DBEncoder(f_df, discrete=False)
settings = read_settings(settings_path)
db_enc = DBEncoder(f_df, discrete=False,
one_hot_encode_features=settings['one_hot_encode_features'],
impute_continuous=settings['impute_continuous'])
db_enc.fit(X_df, y_df)

X, y = db_enc.transform(X_df, y_df, normalized=True, keep_stat=True)
X, y = db_enc.transform(X_df, y_df, normalized=settings['normalize_continuous'], keep_stat=True)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
train_index, test_index = list(kf.split(X_df))[k]
Expand All @@ -45,15 +67,21 @@ def get_data_loader(dataset, world_size, rank, batch_size, k=0, pin_memory=False
valid_loader = DataLoader(valid_sub, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)

return db_enc, train_loader, valid_loader, test_loader
if settings['bounds'] is not None and 'lower_bounds' not in settings:
bounds = settings['bounds']
settings['lower_bounds'] = np.array([bounds[col][0] for col in db_enc.X_fname[db_enc.discrete_flen:]])
settings['upper_bounds'] = np.array([bounds[col][1] for col in db_enc.X_fname[db_enc.discrete_flen:]])
return db_enc, train_loader, valid_loader, test_loader, settings


def train_model(gpu, args):
def train_model(gpu, args, distributed=True):
rank = args.nr * args.gpus + gpu
dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank)
if distributed:
dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank)
torch.manual_seed(42)
device_id = args.device_ids[gpu]
torch.cuda.set_device(device_id)
if device_id and device_id.type == 'cuda':
torch.cuda.set_device(device_id)

if gpu == 0:
writer = SummaryWriter(args.folder_path)
Expand All @@ -63,8 +91,9 @@ def train_model(gpu, args):
is_rank0 = False

dataset = args.data_set
db_enc, train_loader, valid_loader, _ = get_data_loader(dataset, args.world_size, rank, args.batch_size,
k=args.ith_kfold, pin_memory=True, save_best=args.save_best)
db_enc, train_loader, valid_loader, _, settings = get_data_loader(dataset, args.world_size, rank, args.batch_size,
k=args.ith_kfold, pin_memory=True,
save_best=args.save_best)

X_fname = db_enc.X_fname
y_fname = db_enc.y_fname
Expand All @@ -74,11 +103,14 @@ def train_model(gpu, args):
rrl = RRL(dim_list=[(discrete_flen, continuous_flen)] + list(map(int, args.structure.split('@'))) + [len(y_fname)],
device_id=device_id,
use_not=args.use_not,
cl=settings.get('lower_bounds', None),
cr=settings.get('upper_bounds', None),
is_rank0=is_rank0,
log_file=args.log,
writer=writer,
save_best=args.save_best,
estimated_grad=args.estimated_grad,
distributed=distributed,
save_path=args.model)

rrl.train_model(
Expand Down Expand Up @@ -106,16 +138,18 @@ def load_model(path, device_id, log_file=None, distributed=True):
stat_dict = checkpoint['model_state_dict']
for key in list(stat_dict.keys()):
# remove 'module.' prefix
stat_dict[key[7:]] = stat_dict.pop(key)
if key.startswith('module.'):
stat_dict[key[7:]] = stat_dict.pop(key)
rrl.net.load_state_dict(checkpoint['model_state_dict'])
return rrl


def test_model(args):
rrl = load_model(args.model, args.device_ids[0], log_file=args.test_res, distributed=False)
dataset = args.data_set
db_enc, train_loader, _, test_loader = get_data_loader(dataset, 4, 0, args.batch_size, args.ith_kfold, save_best=False)
rrl.test(test_loader=test_loader, set_name='Test')
db_enc, train_loader, _, test_loader, _ = get_data_loader(dataset, 4, 0, args.batch_size, args.ith_kfold,
save_best=False)
rrl.test(test_loader=test_loader, set_name='Test', labels=db_enc.y_fname)
with open(args.rrl_file, 'w') as rrl_file:
rrl.rule_print(db_enc.X_fname, db_enc.y_fname, train_loader, file=rrl_file, mean=db_enc.mean, std=db_enc.std)

Expand Down
9 changes: 7 additions & 2 deletions rrl/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def backward(ctx, grad_output):
class BinarizeLayer(nn.Module):
"""Implement the feature discretization and binarization."""

def __init__(self, n, input_dim, use_not=False, left=None, right=None):
def __init__(self, n, input_dim, use_not=False, cl=None, cr=None, left=None, right=None):
super(BinarizeLayer, self).__init__()
self.n = n
self.input_dim = input_dim
Expand All @@ -39,12 +39,17 @@ def __init__(self, n, input_dim, use_not=False, left=None, right=None):
self.register_buffer('right', right)

if self.input_dim[1] > 0:
if self.left is not None and self.right is not None:
if cl is not None and cr is not None: # bounds are specified
cl = torch.tensor(cl).type(torch.float).t()
cr = torch.tensor(cr).type(torch.float).t()
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: here we can pass in the cl/cr bounds directly.

elif self.left is not None and self.right is not None:
cl = self.left + torch.rand(self.n, self.input_dim[1]) * (self.right - self.left)
cr = self.left + torch.rand(self.n, self.input_dim[1]) * (self.right - self.left)
else:
cl = 3. * (2. * torch.rand(self.n, self.input_dim[1]) - 1.)
cr = 3. * (2. * torch.rand(self.n, self.input_dim[1]) - 1.)
assert torch.Size([self.n, self.input_dim[1]]) == cl.size()
assert torch.Size([self.n, self.input_dim[1]]) == cr.size()
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: and verify the shapes are correct.

self.register_buffer('cl', cl)
self.register_buffer('cr', cr)

Expand Down
29 changes: 18 additions & 11 deletions rrl/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


class MLLP(nn.Module):
def __init__(self, dim_list, use_not=False, left=None, right=None, estimated_grad=False):
def __init__(self, dim_list, use_not=False, cl=None, cr=None, left=None, right=None, estimated_grad=False):
super(MLLP, self).__init__()

self.dim_list = dim_list
Expand All @@ -30,7 +30,7 @@ def __init__(self, dim_list, use_not=False, left=None, right=None, estimated_gra
num += self.layer_list[-2].output_dim

if i == 1:
layer = BinarizeLayer(dim_list[i], num, self.use_not, self.left, self.right)
layer = BinarizeLayer(dim_list[i], num, self.use_not, cl=cl, cr=cr, left=self.left, right=self.right)
layer_name = 'binary{}'.format(i)
elif i == len(dim_list) - 1:
layer = LRLayer(dim_list[i], num)
Expand Down Expand Up @@ -77,7 +77,7 @@ def layer_list(self):

class RRL:
def __init__(self, dim_list, device_id, use_not=False, is_rank0=False, log_file=None, writer=None, left=None,
right=None, save_best=False, estimated_grad=False, save_path=None, distributed=True):
right=None, cl=None, cr=None, save_best=False, estimated_grad=False, save_path=None, distributed=True):
super(RRL, self).__init__()
self.dim_list = dim_list
self.use_not = use_not
Expand All @@ -99,9 +99,11 @@ def __init__(self, dim_list, device_id, use_not=False, is_rank0=False, log_file=
logging.basicConfig(level=logging.DEBUG, filename=log_file, filemode='w', format=log_format)
self.writer = writer

self.net = MLLP(dim_list, use_not=use_not, left=left, right=right, estimated_grad=estimated_grad)
self.net = MLLP(dim_list, use_not=use_not, cl=cl, cr=cr, left=left, right=right,
estimated_grad=estimated_grad)

self.net.cuda(self.device_id)
if self.device_id and self.device_id.type == 'cuda':
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: the condition allows the program to run in CPU mode as well.

self.net.cuda(self.device_id)
if distributed:
self.net = MyDistributedDataParallel(self.net, device_ids=[self.device_id])

Expand Down Expand Up @@ -161,8 +163,9 @@ def train_model(self, X=None, y=None, X_validation=None, y_validation=None, data
ba_cnt = 0
for X, y in data_loader:
ba_cnt += 1
X = X.cuda(self.device_id, non_blocking=True)
y = y.cuda(self.device_id, non_blocking=True)
if self.device_id and self.device_id.type == 'cuda':
X = X.cuda(self.device_id, non_blocking=True)
y = y.cuda(self.device_id, non_blocking=True)
optimizer.zero_grad() # Zero the gradient buffers.
y_pred_mllp, y_pred_rrl = self.net.forward(X)
with torch.no_grad():
Expand Down Expand Up @@ -231,7 +234,8 @@ def train_model(self, X=None, y=None, X_validation=None, y_validation=None, data
self.save_model()
return epoch_histc

def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=True, set_name='Validation'):
def test(self, X=None, y=None, labels=None, test_loader=None,
batch_size=32, need_transform=True, set_name='Validation'):
if X is not None and y is not None and need_transform:
X, y = self.data_transform(X, y)
with torch.no_grad():
Expand All @@ -251,7 +255,8 @@ def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=T
y_pred_list = []
y_pred_b_list = []
for X, y in test_loader:
X = X.cuda(self.device_id, non_blocking=True)
if self.device_id and self.device_id.type == 'cuda':
X = X.cuda(self.device_id, non_blocking=True)
output = self.net.forward(X)
y_pred_list.append(output[0])
y_pred_b_list.append(output[1])
Expand All @@ -275,7 +280,8 @@ def test(self, X=None, y=None, test_loader=None, batch_size=32, need_transform=T
logging.info('On {} Set:\n\tAccuracy of RRL Model: {}'
'\n\tF1 Score of RRL Model: {}'.format(set_name, accuracy_b, f1_score_b))
logging.info('On {} Set:\nPerformance of RRL Model: \n{}\n{}'.format(
set_name, metrics.confusion_matrix(y_true, y_pred_b_arg), metrics.classification_report(y_true, y_pred_b_arg)))
set_name, metrics.confusion_matrix(y_true, y_pred_b_arg),
metrics.classification_report(y_true, y_pred_b_arg, target_names=labels)))
logging.info('-' * 60)
return accuracy, accuracy_b, f1_score, f1_score_b

Expand All @@ -289,7 +295,8 @@ def detect_dead_node(self, data_loader=None):
layer.node_activation_cnt = torch.zeros(layer.output_dim, dtype=torch.double, device=self.device_id)
layer.forward_tot = 0
for x, y in data_loader:
x = x.cuda(self.device_id)
if self.device_id and self.device_id.type == 'cuda':
x = x.cuda(self.device_id)
x_res = None
for i, layer in enumerate(self.net.layer_list[:-1]):
if i <= 1:
Expand Down
42 changes: 25 additions & 17 deletions rrl/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
from scipy.sparse import issparse
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

Expand Down Expand Up @@ -29,13 +30,14 @@ def read_csv(data_path, info_path, shuffle=False):
class DBEncoder:
"""Encoder used for data discretization and binarization."""

def __init__(self, f_df, discrete=False, y_one_hot=True, drop='first'):
def __init__(self, f_df, discrete=False, y_one_hot=True, drop='first',
impute_continuous=True, one_hot_encode_features=True):
self.f_df = f_df
self.discrete = discrete
self.y_one_hot = y_one_hot
self.label_enc = preprocessing.OneHotEncoder(categories='auto') if y_one_hot else preprocessing.LabelEncoder()
self.feature_enc = preprocessing.OneHotEncoder(categories='auto', drop=drop)
self.imp = SimpleImputer(missing_values=np.nan, strategy='mean')
self.feature_enc = preprocessing.OneHotEncoder(categories='auto', drop=drop) if one_hot_encode_features else None
self.imp = SimpleImputer(missing_values=np.nan, strategy='mean') if impute_continuous else None
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: for datasets that do not require — or already include — one-hot encoding or imputation, those steps can now be skipped.

self.X_fname = None
self.y_fname = None
self.discrete_flen = None
Expand All @@ -59,16 +61,18 @@ def fit(self, X_df, y_df):
self.y_fname = list(self.label_enc.get_feature_names(y_df.columns)) if self.y_one_hot else y_df.columns

if not continuous_data.empty:
# Use mean as missing value for continuous columns if do not discretize them.
self.imp.fit(continuous_data.values)
if self.imp is not None:
# Use mean as missing value for continuous columns if do not discretize them.
self.imp.fit(continuous_data.values)
if not discrete_data.empty:
# One-hot encoding
self.feature_enc.fit(discrete_data)
feature_names = discrete_data.columns
self.X_fname = list(self.feature_enc.get_feature_names(feature_names))
self.X_fname = discrete_data.columns.to_list()
if self.feature_enc is not None:
# One-hot encoding
self.feature_enc.fit(discrete_data)
self.X_fname = list(self.feature_enc.get_feature_names(self.X_fname))
self.discrete_flen = len(self.X_fname)
if not self.discrete:
self.X_fname.extend(continuous_data.columns)
self.X_fname.extend(continuous_data.columns.to_list())
else:
self.X_fname = continuous_data.columns
self.discrete_flen = 0
Expand All @@ -84,21 +88,25 @@ def transform(self, X_df, y_df, normalized=False, keep_stat=False):
y = y.toarray()

if not continuous_data.empty:
# Use mean as missing value for continuous columns if we do not discretize them.
continuous_data = pd.DataFrame(self.imp.transform(continuous_data.values),
columns=continuous_data.columns)
if self.imp is not None:
# Use mean as missing value for continuous columns if we do not discretize them.
continuous_data = pd.DataFrame(self.imp.transform(continuous_data.values),
columns=continuous_data.columns)
if normalized:
if keep_stat:
self.mean = continuous_data.mean()
self.std = continuous_data.std()
continuous_data = (continuous_data - self.mean) / self.std
if not discrete_data.empty:
# One-hot encoding
discrete_data = self.feature_enc.transform(discrete_data)
if self.feature_enc is not None:
# One-hot encoding
discrete_data = self.feature_enc.transform(discrete_data)
if issparse(discrete_data):
discrete_data = discrete_data.toarray()
if not self.discrete:
X_df = pd.concat([pd.DataFrame(discrete_data.toarray()), continuous_data], axis=1)
X_df = pd.concat([pd.DataFrame(discrete_data), continuous_data], axis=1)
else:
X_df = pd.DataFrame(discrete_data.toarray())
X_df = pd.DataFrame(discrete_data)
else:
X_df = continuous_data
return X_df.values, y