From b887d33e482e93cd9fb4ed871d5c9c6e336f99d6 Mon Sep 17 00:00:00 2001 From: sndnyang Date: Wed, 25 Mar 2020 10:18:25 -0400 Subject: [PATCH 1/5] update to newer PyTorch(>=1.1.0) --- README.md | 9 ++++++++- base_layers.py | 4 ++-- l0_layers.py | 20 ++++++++++---------- requirements.txt | 35 +++++++++++++++++++++++++++++++++++ train_lenet5.py | 20 ++++++++++---------- train_wide_resnet.py | 22 +++++++++++----------- 6 files changed, 76 insertions(+), 34 deletions(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index e80261b..f1307eb 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,10 @@ Example implementation of the L0 regularization method described at -Learning Sparse Neural Networks through L0 regularization, Christos Louizos, Max Welling & Diederik P. Kingma, https://openreview.net/pdf?id=BkdI3hgRZ \ No newline at end of file +Learning Sparse Neural Networks through L0 regularization, Christos Louizos, Max Welling & Diederik P. Kingma, https://openreview.net/pdf?id=BkdI3hgRZ + +# ChangeLog + +## 2020.3.5 + +Update the code for newer PyTorch (>= 1.1.0) +Add requirements.txt \ No newline at end of file diff --git a/base_layers.py b/base_layers.py index 0deaa4b..b3da9af 100644 --- a/base_layers.py +++ b/base_layers.py @@ -23,7 +23,7 @@ def __init__(self, in_features, out_features, bias=True, weight_decay=1., **kwar print(self) def reset_parameters(self): - init.kaiming_normal(self.weight, mode='fan_out') + init.kaiming_normal_(self.weight, mode='fan_out') if self.bias is not None: self.bias.data.normal_(0, 1e-2) @@ -90,7 +90,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, print(self) def reset_parameters(self): - init.kaiming_normal(self.weight, mode='fan_in') + init.kaiming_normal_(self.weight, mode='fan_in') if self.bias is not None: self.bias.data.normal_(0, 1e-2) diff --git a/l0_layers.py b/l0_layers.py index b2f844d..5601d14 100644 --- a/l0_layers.py +++ b/l0_layers.py @@ -43,7 +43,7 @@ def __init__(self, in_features, out_features, bias=True, weight_decay=1., dropra print(self) def reset_parameters(self): - init.kaiming_normal(self.weights, mode='fan_out') + init.kaiming_normal_(self.weights, mode='fan_out') self.qz_loga.data.normal_(math.log(1 - self.droprate_init) - math.log(self.droprate_init), 1e-2) @@ -57,11 +57,11 @@ def cdf_qz(self, x): """Implements the CDF of the 'stretched' concrete distribution""" xn = (x - limit_a) / (limit_b - limit_a) logits = math.log(xn) - math.log(1 - xn) - return F.sigmoid(logits * self.temperature - self.qz_loga).clamp(min=epsilon, max=1 - epsilon) + return torch.sigmoid(logits * self.temperature - self.qz_loga).clamp(min=epsilon, max=1 - epsilon) def quantile_concrete(self, x): """Implements the quantile, aka inverse CDF, of the 'stretched' concrete distribution""" - y = F.sigmoid((torch.log(x) - torch.log(1 - x) + self.qz_loga) / self.temperature) + y = torch.sigmoid((torch.log(x) - torch.log(1 - x) + self.qz_loga) / self.temperature) return y * (limit_b - limit_a) + limit_a def _reg_w(self): @@ -85,7 +85,7 @@ def count_expected_flops_and_l0(self): if self.use_bias: expected_flops += self.out_features expected_l0 += self.out_features - return expected_flops.data[0], expected_l0.data[0] + return expected_flops.item(), expected_l0.item() def get_eps(self, size): """Uniform random numbers for the concrete distribution""" @@ -100,7 +100,7 @@ def sample_z(self, batch_size, sample=True): z = self.quantile_concrete(eps) return F.hardtanh(z, min_val=0, max_val=1) else: # mode - pi = 
F.sigmoid(self.qz_loga).view(1, self.in_features).expand(batch_size, self.in_features) + pi = torch.sigmoid(self.qz_loga).view(1, self.in_features).expand(batch_size, self.in_features) return F.hardtanh(pi * (limit_b - limit_a) + limit_a, min_val=0, max_val=1) def sample_weights(self): @@ -182,7 +182,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, print(self) def reset_parameters(self): - init.kaiming_normal(self.weights, mode='fan_in') + init.kaiming_normal_(self.weights, mode='fan_in') self.qz_loga.data.normal_(math.log(1 - self.droprate_init) - math.log(self.droprate_init), 1e-2) @@ -196,11 +196,11 @@ def cdf_qz(self, x): """Implements the CDF of the 'stretched' concrete distribution""" xn = (x - limit_a) / (limit_b - limit_a) logits = math.log(xn) - math.log(1 - xn) - return F.sigmoid(logits * self.temperature - self.qz_loga).clamp(min=epsilon, max=1 - epsilon) + return torch.sigmoid(logits * self.temperature - self.qz_loga).clamp(min=epsilon, max=1 - epsilon) def quantile_concrete(self, x): """Implements the quantile, aka inverse CDF, of the 'stretched' concrete distribution""" - y = F.sigmoid((torch.log(x) - torch.log(1 - x) + self.qz_loga) / self.temperature) + y = torch.sigmoid((torch.log(x) - torch.log(1 - x) + self.qz_loga) / self.temperature) return y * (limit_b - limit_a) + limit_a def _reg_w(self): @@ -233,7 +233,7 @@ def count_expected_flops_and_l0(self): expected_flops += num_instances_per_filter * ppos expected_l0 += ppos - return expected_flops.data[0], expected_l0.data[0] + return expected_flops.item(), expected_l0.item() def get_eps(self, size): """Uniform random numbers for the concrete distribution""" @@ -248,7 +248,7 @@ def sample_z(self, batch_size, sample=True): z = self.quantile_concrete(eps).view(batch_size, self.dim_z, 1, 1) return F.hardtanh(z, min_val=0, max_val=1) else: # mode - pi = F.sigmoid(self.qz_loga).view(1, self.dim_z, 1, 1) + pi = torch.sigmoid(self.qz_loga).view(1, self.dim_z, 1, 1) return F.hardtanh(pi * (limit_b - limit_a) + limit_a, min_val=0, max_val=1) def sample_weights(self): diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..40f6700 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,35 @@ +absl-py==0.9.0 +certifi==2019.11.28 +chardet==3.0.4 +cycler==0.10.0 +fire==0.2.1 +GPUtil==1.4.0 +grpcio==1.27.2 +idna==2.9 +jsonpatch==1.25 +jsonpointer==2.0 +kiwisolver==1.1.0 +Markdown==3.2.1 +matplotlib==3.2.0 +numpy==1.17.4 +Pillow==6.2.1 +protobuf==3.11.3 +pyparsing==2.4.6 +python-dateutil==2.8.1 +pyzmq==19.0.0 +requests==2.23.0 +scipy==1.4.1 +six==1.13.0 +tensorboard==1.14.0 +tensorboardX==1.6 +termcolor==1.1.0 +torch==1.3.1 +torchfile==0.1.0 +torchnet==0.0.4 +torchvision==0.4.2 +tornado==6.0.4 +tqdm==4.43.0 +urllib3==1.25.8 +visdom==0.1.8.9 +websocket-client==0.57.0 +Werkzeug==1.0.0 diff --git a/train_lenet5.py b/train_lenet5.py index f7f0792..d9c8cad 100755 --- a/train_lenet5.py +++ b/train_lenet5.py @@ -35,7 +35,7 @@ parser.add_argument('--beta_ema', type=float, default=0.999) parser.add_argument('--lambas', nargs='*', type=float, default=[1., 1., 1., 1.]) parser.add_argument('--local_rep', action='store_true') -parser.add_argument('--temp', type=float, default=2./3.) +parser.add_argument('--temp', type=float, default=2. / 3.) 
parser.add_argument('--multi_gpu', action='store_true') parser.set_defaults(tensorboard=True) @@ -60,7 +60,7 @@ def main(): else: os.makedirs(directory) writer = SummaryWriter(directory) - + # Data loading code print('[0, 1] normalization of input') train_loader, val_loader, num_classes = mnist(args.batch_size, pm=False) @@ -167,8 +167,8 @@ def train(train_loader, model, criterion, optimizer, epoch): # measure accuracy and record loss prec1 = accuracy(output.data, target, topk=(1,))[0] - losses.update(loss.data[0], input_.size(0)) - top1.update(100 - prec1[0], input_.size(0)) + losses.update(loss.item(), input_.size(0)) + top1.update(100 - prec1.item(), input_.size(0)) # compute gradient and do SGD step optimizer.zero_grad() @@ -206,8 +206,8 @@ def train(train_loader, model, criterion, optimizer, epoch): 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Err@1 {top1.val:.3f} ({top1.avg:.3f})'.format( - epoch, i, len(train_loader), batch_time=batch_time, - data_time=data_time, loss=losses, top1=top1)) + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses, top1=top1)) # log to TensorBoard if writer is not None: @@ -249,8 +249,8 @@ def validate(val_loader, model, criterion, epoch): # measure accuracy and record loss prec1 = accuracy(output.data, target, topk=(1,))[0] - losses.update(loss.data[0], input_.size(0)) - top1.update(100 - prec1[0], input_.size(0)) + losses.update(loss.item(), input_.size(0)) + top1.update(100 - prec1.item(), input_.size(0)) # measure elapsed time batch_time.update(time.time() - end) @@ -261,8 +261,8 @@ def validate(val_loader, model, criterion, epoch): 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Err@1 {top1.val:.3f} ({top1.avg:.3f})'.format( - i, len(val_loader), batch_time=batch_time, loss=losses, - top1=top1)) + i, len(val_loader), batch_time=batch_time, loss=losses, + top1=top1)) print(' * Err@1 {top1.avg:.3f}'.format(top1=top1)) if not args.multi_gpu: diff --git a/train_wide_resnet.py b/train_wide_resnet.py index f074eb9..4ca52c9 100755 --- a/train_wide_resnet.py +++ b/train_wide_resnet.py @@ -52,7 +52,7 @@ parser.add_argument('--dataset', choices=['c10', 'c100'], default='c10') parser.add_argument('--local_rep', action='store_true') parser.add_argument('--epoch_drop', nargs='*', type=int, default=(60, 120, 160)) -parser.add_argument('--temp', type=float, default=2./3.) +parser.add_argument('--temp', type=float, default=2. / 3.) parser.set_defaults(bottleneck=True) parser.set_defaults(augment=True) parser.set_defaults(tensorboard=True) @@ -93,8 +93,8 @@ def main(): lamba=args.lamba, temperature=args.temp) print('Number of model parameters: {}'.format(sum([p.data.nelement() for p in model.parameters()]))) - - # for training on multiple GPUs. + + # for training on multiple GPUs. 
# Use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use if args.multi_gpu: model = torch.nn.DataParallel(model).cuda() @@ -201,7 +201,6 @@ def train(train_loader, model, criterion, optimizer, lr_schedule, epoch): # switch to train mode model.train() - lr_schedule.step(epoch=epoch) if writer is not None: writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch) @@ -221,8 +220,8 @@ def train(train_loader, model, criterion, optimizer, lr_schedule, epoch): # measure accuracy and record loss prec1 = accuracy(output.data, target, topk=(1,))[0] - losses.update(loss.data[0], input_.size(0)) - top1.update(100 - prec1[0], input_.size(0)) + losses.update(loss.item(), input_.size(0)) + top1.update(100 - prec1.item(), input_.size(0)) # compute gradient and do SGD step optimizer.zero_grad() @@ -260,14 +259,15 @@ def train(train_loader, model, criterion, optimizer, lr_schedule, epoch): 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Err@1 {top1.val:.3f} ({top1.avg:.3f})'.format( - epoch, i, len(train_loader), batch_time=batch_time, - data_time=data_time, loss=losses, top1=top1)) + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses, top1=top1)) # log to TensorBoard if writer is not None: writer.add_scalar('train/loss', losses.avg, epoch) writer.add_scalar('train/err', top1.avg, epoch) + lr_schedule.step(epoch=epoch) return top1.avg @@ -303,7 +303,7 @@ def validate(val_loader, model, criterion, epoch): # measure accuracy and record loss prec1 = accuracy(output.data, target, topk=(1,))[0] - losses.update(loss.data[0], input_.size(0)) + losses.update(loss.item(), input_.size(0)) top1.update(100 - prec1[0], input_.size(0)) # measure elapsed time @@ -315,8 +315,8 @@ def validate(val_loader, model, criterion, epoch): 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Err@1 {top1.val:.3f} ({top1.avg:.3f})'.format( - i, len(val_loader), batch_time=batch_time, loss=losses, - top1=top1)) + i, len(val_loader), batch_time=batch_time, loss=losses, + top1=top1)) print(' * Err@1 {top1.avg:.3f}'.format(top1=top1)) if not args.multi_gpu: From 6b8ca00ddc92782080d569f32455da63e1b8c7dc Mon Sep 17 00:00:00 2001 From: sndnyang Date: Wed, 25 Mar 2020 10:19:45 -0400 Subject: [PATCH 2/5] update to newer PyTorch(>=1.1.0) --- train_wide_resnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_wide_resnet.py b/train_wide_resnet.py index 4ca52c9..f63594e 100755 --- a/train_wide_resnet.py +++ b/train_wide_resnet.py @@ -304,7 +304,7 @@ def validate(val_loader, model, criterion, epoch): # measure accuracy and record loss prec1 = accuracy(output.data, target, topk=(1,))[0] losses.update(loss.item(), input_.size(0)) - top1.update(100 - prec1[0], input_.size(0)) + top1.update(100 - prec1.item(), input_.size(0)) # measure elapsed time batch_time.update(time.time() - end) From 57508e878e8fa8b49b8aaf653c2b4c139bfa55da Mon Sep 17 00:00:00 2001 From: sndnyang Date: Thu, 9 Apr 2020 22:54:25 -0400 Subject: [PATCH 3/5] refactor --- .gitignore | 2 + models.py | 305 ------------------------- base_layers.py => models/BaseLayers.py | 0 l0_layers.py => models/L0Layers.py | 20 +- models/LeNet.py | 176 ++++++++++++++ models/MLP.py | 75 ++++++ models/WideResNet.py | 159 +++++++++++++ train_lenet5.py | 103 +++++---- train_wide_resnet.py | 2 +- 9 files changed, 482 insertions(+), 360 deletions(-) create mode 100644 .gitignore delete mode 100644 models.py rename base_layers.py => 
models/BaseLayers.py (100%) rename l0_layers.py => models/L0Layers.py (96%) create mode 100644 models/LeNet.py create mode 100644 models/MLP.py create mode 100644 models/WideResNet.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4fa5ec4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea/ +*.pyc diff --git a/models.py b/models.py deleted file mode 100644 index 79a62dd..0000000 --- a/models.py +++ /dev/null @@ -1,305 +0,0 @@ -import torch -import torch.nn as nn -from l0_layers import L0Conv2d, L0Dense -from base_layers import MAPConv2d, MAPDense -from utils import get_flat_fts -from copy import deepcopy -import torch.nn.functional as F - - -class L0MLP(nn.Module): - def __init__(self, input_dim, num_classes, layer_dims=(300, 100), N=50000, beta_ema=0.999, - weight_decay=1, lambas=(1., 1., 1.), local_rep=False, temperature=2./3.): - super(L0MLP, self).__init__() - self.layer_dims = layer_dims - self.input_dim = input_dim - self.N = N - self.beta_ema = beta_ema - self.weight_decay = self.N * weight_decay - self.lambas = lambas - - layers = [] - for i, dimh in enumerate(self.layer_dims): - inp_dim = self.input_dim if i == 0 else self.layer_dims[i - 1] - droprate_init, lamba = 0.2 if i == 0 else 0.5, lambas[i] if len(lambas) > 1 else lambas[0] - layers += [L0Dense(inp_dim, dimh, droprate_init=droprate_init, weight_decay=self.weight_decay, - lamba=lamba, local_rep=local_rep, temperature=temperature), nn.ReLU()] - - layers.append(L0Dense(self.layer_dims[-1], num_classes, droprate_init=0.5, weight_decay=self.weight_decay, - lamba=lambas[-1], local_rep=local_rep, temperature=temperature)) - self.output = nn.Sequential(*layers) - - self.layers = [] - for m in self.modules(): - if isinstance(m, L0Dense): - self.layers.append(m) - - if beta_ema > 0.: - print('Using temporal averaging with beta: {}'.format(beta_ema)) - self.avg_param = deepcopy(list(p.data for p in self.parameters())) - if torch.cuda.is_available(): - self.avg_param = [a.cuda() for a in self.avg_param] - self.steps_ema = 0. - - def forward(self, x): - return self.output(x) - - def regularization(self): - regularization = 0. - for layer in self.layers: - regularization += - (1. / self.N) * layer.regularization() - if torch.cuda.is_available(): - regularization = regularization.cuda() - return regularization - - def get_exp_flops_l0(self): - expected_flops, expected_l0 = 0., 0. 
- for layer in self.layers: - e_fl, e_l0 = layer.count_expected_flops_and_l0() - expected_flops += e_fl - expected_l0 += e_l0 - return expected_flops, expected_l0 - - def update_ema(self): - self.steps_ema += 1 - for p, avg_p in zip(self.parameters(), self.avg_param): - avg_p.mul_(self.beta_ema).add_((1 - self.beta_ema) * p.data) - - def load_ema_params(self): - for p, avg_p in zip(self.parameters(), self.avg_param): - p.data.copy_(avg_p / (1 - self.beta_ema**self.steps_ema)) - - def load_params(self, params): - for p, avg_p in zip(self.parameters(), params): - p.data.copy_(avg_p) - - def get_params(self): - params = deepcopy(list(p.data for p in self.parameters())) - return params - - -class L0LeNet5(nn.Module): - def __init__(self, num_classes, input_size=(1, 28, 28), conv_dims=(20, 50), fc_dims=500, - N=50000, beta_ema=0., weight_decay=1, lambas=(1., 1., 1., 1.), local_rep=False, - temperature=2./3.): - super(L0LeNet5, self).__init__() - self.N = N - assert(len(conv_dims) == 2) - self.conv_dims = conv_dims - self.fc_dims = fc_dims - self.beta_ema = beta_ema - self.weight_decay = weight_decay - - convs = [L0Conv2d(input_size[0], conv_dims[0], 5, droprate_init=0.5, temperature=temperature, - weight_decay=self.weight_decay, lamba=lambas[0], local_rep=local_rep), - nn.ReLU(), nn.MaxPool2d(2), - L0Conv2d(conv_dims[0], conv_dims[1], 5, droprate_init=0.5, temperature=temperature, - weight_decay=self.weight_decay, lamba=lambas[1], local_rep=local_rep), - nn.ReLU(), nn.MaxPool2d(2)] - self.convs = nn.Sequential(*convs) - if torch.cuda.is_available(): - self.convs = self.convs.cuda() - - flat_fts = get_flat_fts(input_size, self.convs) - fcs = [L0Dense(flat_fts, self.fc_dims, droprate_init=0.5, weight_decay=self.weight_decay, - lamba=lambas[2], local_rep=local_rep, temperature=temperature), nn.ReLU(), - L0Dense(self.fc_dims, num_classes, droprate_init=0.5, weight_decay=self.weight_decay, - lamba=lambas[3], local_rep=local_rep, temperature=temperature)] - self.fcs = nn.Sequential(*fcs) - - self.layers = [] - for m in self.modules(): - if isinstance(m, L0Dense) or isinstance(m, L0Conv2d): - self.layers.append(m) - - if beta_ema > 0.: - print('Using temporal averaging with beta: {}'.format(beta_ema)) - self.avg_param = deepcopy(list(p.data for p in self.parameters())) - if torch.cuda.is_available(): - self.avg_param = [a.cuda() for a in self.avg_param] - self.steps_ema = 0. - - def forward(self, x): - o = self.convs(x) - o = o.view(o.size(0), -1) - return self.fcs(o) - - def regularization(self): - regularization = 0. - for layer in self.layers: - regularization += - (1. / self.N) * layer.regularization() - if torch.cuda.is_available(): - regularization = regularization.cuda() - return regularization - - def get_exp_flops_l0(self): - expected_flops, expected_l0 = 0., 0. 
- for layer in self.layers: - e_fl, e_l0 = layer.count_expected_flops_and_l0() - expected_flops += e_fl - expected_l0 += e_l0 - return expected_flops, expected_l0 - - def update_ema(self): - self.steps_ema += 1 - for p, avg_p in zip(self.parameters(), self.avg_param): - avg_p.mul_(self.beta_ema).add_((1 - self.beta_ema) * p.data) - - def load_ema_params(self): - for p, avg_p in zip(self.parameters(), self.avg_param): - p.data.copy_(avg_p / (1 - self.beta_ema**self.steps_ema)) - - def load_params(self, params): - for p, avg_p in zip(self.parameters(), params): - p.data.copy_(avg_p) - - def get_params(self): - params = deepcopy(list(p.data for p in self.parameters())) - return params - - -class BasicBlock(nn.Module): - def __init__(self, in_planes, out_planes, stride, droprate_init=0.0, weight_decay=0., lamba=0.01, local_rep=False, - temperature=2./3.): - super(BasicBlock, self).__init__() - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv1 = L0Conv2d(in_planes, out_planes, kernel_size=3, stride=1, padding=1, bias=False, - droprate_init=droprate_init, weight_decay=weight_decay / (1 - 0.3), local_rep=local_rep, - lamba=lamba, temperature=temperature) - - self.bn2 = nn.BatchNorm2d(out_planes) - self.conv2 = MAPConv2d(out_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False, - weight_decay=weight_decay) - - self.equalInOut = (in_planes == out_planes) - self.convShortcut = (not self.equalInOut) and \ - MAPConv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False, - weight_decay=weight_decay) or None - - def forward(self, x): - if not self.equalInOut: - x = F.relu(self.bn1(x)) - else: - out = F.relu(self.bn1(x)) - - out = self.conv1(out if self.equalInOut else x) - out = self.conv2(F.relu(self.bn2(out))) - return torch.add(out, x if self.equalInOut else self.convShortcut(x)) - - -class NetworkBlock(nn.Module): - def __init__(self, nb_layers, in_planes, out_planes, block, stride, droprate_init=0.0, weight_decay=0., lamba=0.01, - local_rep=False, temperature=2./3.): - super(NetworkBlock, self).__init__() - self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, stride, droprate_init, - weight_decay=weight_decay, lamba=lamba, local_rep=local_rep, - temperature=temperature) - - def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, droprate_init, - weight_decay=0., lamba=0.01, local_rep=False, temperature=2./3.): - layers = [] - for i in range(nb_layers): - layers.append(block(i == 0 and in_planes or out_planes, out_planes, i == 0 and stride or 1, - droprate_init, weight_decay, lamba, local_rep=local_rep, temperature=temperature)) - return nn.Sequential(*layers) - - def forward(self, x): - return self.layer(x) - - -class L0WideResNet(nn.Module): - def __init__(self, depth, num_classes, widen_factor=1, droprate_init=0.3, N=50000, beta_ema=0.99, - weight_decay=5e-4, local_rep=False, lamba=0.01, temperature=2./3.): - super(L0WideResNet, self).__init__() - nChannels = [16, 16*widen_factor, 32*widen_factor, 64*widen_factor] - assert((depth - 4) % 6 == 0) - self.n = (depth - 4) // 6 - self.N = N - self.beta_ema = beta_ema - block = BasicBlock - - self.weight_decay = N * weight_decay - self.lamba = lamba - - # 1st conv before any network block - self.conv1 = MAPConv2d(3, nChannels[0], kernel_size=3, stride=1, padding=1, bias=False, - weight_decay=self.weight_decay) - # 1st block - self.block1 = NetworkBlock(self.n, nChannels[0], nChannels[1], block, 1, droprate_init, self.weight_decay, - self.lamba, local_rep=local_rep, 
temperature=temperature) - # 2nd block - self.block2 = NetworkBlock(self.n, nChannels[1], nChannels[2], block, 2, droprate_init, self.weight_decay, - self.lamba, local_rep=local_rep, temperature=temperature) - # 3rd block - self.block3 = NetworkBlock(self.n, nChannels[2], nChannels[3], block, 2, droprate_init, self.weight_decay, - self.lamba, local_rep=local_rep, temperature=temperature) - # bn, relu and classifier - self.bn = nn.BatchNorm2d(nChannels[3]) - self.fcout = MAPDense(nChannels[3], num_classes, weight_decay=self.weight_decay) - - self.layers, self.bn_params = [], [] - for m in self.modules(): - if isinstance(m, MAPDense) or isinstance(m, MAPConv2d) or isinstance(m, L0Conv2d): - self.layers.append(m) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - self.bn_params += [m.weight, m.bias] - - if beta_ema > 0.: - print('Using temporal averaging with beta: {}'.format(beta_ema)) - self.avg_param = deepcopy(list(p.data for p in self.parameters())) - if torch.cuda.is_available(): - self.avg_param = [a.cuda() for a in self.avg_param] - self.steps_ema = 0. - - print('Using weight decay: {}'.format(self.weight_decay)) - - def forward(self, x): - out = self.conv1(x) - out = self.block1(out) - out = self.block2(out) - out = self.block3(out) - out = F.relu(self.bn(out)) - out = F.avg_pool2d(out, 8) - out = out.view(out.size(0), -1) - return self.fcout(out) - - def regularization(self): - regularization = 0. - for layer in self.layers: - regularization += - (1. / self.N) * layer.regularization() - for bnw in self.bn_params: - if self.weight_decay > 0: - regularization += (self.weight_decay / self.N) * .5 * torch.sum(bnw.pow(2)) - if torch.cuda.is_available(): - regularization = regularization.cuda() - return regularization - - def get_exp_flops_l0(self): - expected_flops, expected_l0 = 0., 0. 
- for layer in self.layers: - try: - e_fl, e_l0 = layer.count_expected_flops_and_l0() - expected_flops += e_fl - expected_l0 += e_l0 - except: - pass - return expected_flops, expected_l0 - - def update_ema(self): - self.steps_ema += 1 - for p, avg_p in zip(self.parameters(), self.avg_param): - avg_p.mul_(self.beta_ema).add_((1 - self.beta_ema) * p.data) - - def load_ema_params(self): - for p, avg_p in zip(self.parameters(), self.avg_param): - p.data.copy_(avg_p / (1 - self.beta_ema**self.steps_ema)) - - def load_params(self, params): - for p, avg_p in zip(self.parameters(), params): - p.data.copy_(avg_p) - - def get_params(self): - params = deepcopy(list(p.data for p in self.parameters())) - return params diff --git a/base_layers.py b/models/BaseLayers.py similarity index 100% rename from base_layers.py rename to models/BaseLayers.py diff --git a/l0_layers.py b/models/L0Layers.py similarity index 96% rename from l0_layers.py rename to models/L0Layers.py index 5601d14..89888a2 100644 --- a/l0_layers.py +++ b/models/L0Layers.py @@ -111,6 +111,7 @@ def sample_weights(self): def forward(self, input): if self.local_rep or not self.training: z = self.sample_z(input.size(0), sample=self.training) + self.test_z = z xin = input.mul(z) output = xin.mm(self.weights) else: @@ -120,6 +121,12 @@ def forward(self, input): output.add_(self.bias) return output + def activated_neurons(self): + return (self.test_z > 0).sum() / self.test_z.size(0) + + def masked_weight(self): + return self.weights * self.test_z[0].reshape(self.out_channels, 1, 1, 1) + def __repr__(self): s = ('{name}({in_features} -> {out_features}, droprate_init={droprate_init}, ' 'lamba={lamba}, temperature={temperature}, weight_decay={prior_prec}, ' @@ -262,12 +269,19 @@ def forward(self, input_): if self.local_rep or not self.training: output = F.conv2d(input_, self.weights, b, self.stride, self.padding, self.dilation, self.groups) z = self.sample_z(output.size(0), sample=self.training) + self.test_z = z return output.mul(z) else: weights = self.sample_weights() output = F.conv2d(input_, weights, None, self.stride, self.padding, self.dilation, self.groups) return output + def activated_neurons(self): + return (self.test_z > 0).sum() / self.test_z.size(0) + + def masked_weight(self): + return self.weights * self.test_z[0].reshape(self.out_channels, 1, 1, 1) + def __repr__(self): s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}, stride={stride}, ' 'droprate_init={droprate_init}, temperature={temperature}, prior_prec={prior_prec}, ' @@ -284,9 +298,3 @@ def __repr__(self): s += ', bias=False' s += ')' return s.format(name=self.__class__.__name__, **self.__dict__) - - - - - - diff --git a/models/LeNet.py b/models/LeNet.py new file mode 100644 index 0000000..7fa74ab --- /dev/null +++ b/models/LeNet.py @@ -0,0 +1,176 @@ +import torch +import torch.nn as nn +from copy import deepcopy +from utils import get_flat_fts +from models.L0Layers import L0Conv2d, L0Dense + + +class LeNet5(nn.Module): + def __init__(self, num_classes, input_size=(1, 28, 28), conv_dims=(20, 50), fc_dims=500, + N=50000, beta_ema=0., weight_decay=1, lambas=(1., 1., 1., 1.), local_rep=False, + temperature=2./3.): + super(LeNet5, self).__init__() + self.N = N + assert(len(conv_dims) == 2) + self.conv_dims = conv_dims + self.fc_dims = fc_dims + self.beta_ema = beta_ema + self.weight_decay = weight_decay + + convs = [nn.Conv2d(input_size[0], conv_dims[0], 5, droprate_init=0.5, temperature=temperature, + weight_decay=self.weight_decay, lamba=lambas[0], 
local_rep=local_rep), + nn.ReLU(), nn.MaxPool2d(2), + nn.Conv2d(conv_dims[0], conv_dims[1], 5, droprate_init=0.5, temperature=temperature, + weight_decay=self.weight_decay, lamba=lambas[1], local_rep=local_rep), + nn.ReLU(), nn.MaxPool2d(2)] + self.convs = nn.Sequential(*convs) + if torch.cuda.is_available(): + self.convs = self.convs.cuda() + + flat_fts = get_flat_fts(input_size, self.convs) + fcs = [nn.Linear(flat_fts, self.fc_dims, droprate_init=0.5, weight_decay=self.weight_decay, + lamba=lambas[2], local_rep=local_rep, temperature=temperature), nn.ReLU(), + nn.Linear(self.fc_dims, num_classes, droprate_init=0.5, weight_decay=self.weight_decay, + lamba=lambas[3], local_rep=local_rep, temperature=temperature)] + self.fcs = nn.Sequential(*fcs) + + self.layers = [] + for m in self.modules(): + if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d): + self.layers.append(m) + + if beta_ema > 0.: + print('Using temporal averaging with beta: {}'.format(beta_ema)) + self.avg_param = deepcopy(list(p.data for p in self.parameters())) + if torch.cuda.is_available(): + self.avg_param = [a.cuda() for a in self.avg_param] + self.steps_ema = 0. + + def forward(self, x): + o = self.convs(x) + o = o.view(o.size(0), -1) + return self.fcs(o) + + def regularization(self): + regularization = 0. + for layer in self.layers: + regularization += - (1. / self.N) * layer.regularization() + if torch.cuda.is_available(): + regularization = regularization.cuda() + return regularization + + def get_exp_flops_l0(self): + expected_flops, expected_l0 = 0., 0. + for layer in self.layers: + e_fl, e_l0 = layer.count_expected_flops_and_l0() + expected_flops += e_fl + expected_l0 += e_l0 + return expected_flops, expected_l0 + + def update_ema(self): + self.steps_ema += 1 + for p, avg_p in zip(self.parameters(), self.avg_param): + avg_p.mul_(self.beta_ema).add_((1 - self.beta_ema) * p.data) + + def load_ema_params(self): + for p, avg_p in zip(self.parameters(), self.avg_param): + p.data.copy_(avg_p / (1 - self.beta_ema**self.steps_ema)) + + def load_params(self, params): + for p, avg_p in zip(self.parameters(), params): + p.data.copy_(avg_p) + + def get_params(self): + params = deepcopy(list(p.data for p in self.parameters())) + return params + + +class L0LeNet5(nn.Module): + def __init__(self, num_classes, input_size=(1, 28, 28), conv_dims=(20, 50), fc_dims=500, + N=50000, beta_ema=0., weight_decay=1, lambas=(1., 1., 1., 1.), local_rep=False, + temperature=2./3.): + super(L0LeNet5, self).__init__() + self.N = N + assert(len(conv_dims) == 2) + self.conv_dims = conv_dims + self.fc_dims = fc_dims + self.beta_ema = beta_ema + self.weight_decay = weight_decay + + convs = [L0Conv2d(input_size[0], conv_dims[0], 5, droprate_init=0.5, temperature=temperature, + weight_decay=self.weight_decay, lamba=lambas[0], local_rep=local_rep), + nn.ReLU(), nn.MaxPool2d(2), + L0Conv2d(conv_dims[0], conv_dims[1], 5, droprate_init=0.5, temperature=temperature, + weight_decay=self.weight_decay, lamba=lambas[1], local_rep=local_rep), + nn.ReLU(), nn.MaxPool2d(2)] + self.convs = nn.Sequential(*convs) + if torch.cuda.is_available(): + self.convs = self.convs.cuda() + + flat_fts = get_flat_fts(input_size, self.convs) + fcs = [L0Dense(flat_fts, self.fc_dims, droprate_init=0.5, weight_decay=self.weight_decay, + lamba=lambas[2], local_rep=local_rep, temperature=temperature), nn.ReLU(), + L0Dense(self.fc_dims, num_classes, droprate_init=0.5, weight_decay=self.weight_decay, + lamba=lambas[3], local_rep=local_rep, temperature=temperature)] + self.fcs = 
nn.Sequential(*fcs) + + self.layers = [] + for m in self.modules(): + if isinstance(m, L0Dense) or isinstance(m, L0Conv2d): + self.layers.append(m) + + if beta_ema > 0.: + print('Using temporal averaging with beta: {}'.format(beta_ema)) + self.avg_param = deepcopy(list(p.data for p in self.parameters())) + if torch.cuda.is_available(): + self.avg_param = [a.cuda() for a in self.avg_param] + self.steps_ema = 0. + + def forward(self, x): + o = self.convs(x) + o = o.view(o.size(0), -1) + return self.fcs(o) + + def regularization(self): + regularization = 0. + for layer in self.layers: + regularization += - (1. / self.N) * layer.regularization() + if torch.cuda.is_available(): + regularization = regularization.cuda() + return regularization + + def get_exp_flops_l0(self): + expected_flops, expected_l0 = 0., 0. + for layer in self.layers: + e_fl, e_l0 = layer.count_expected_flops_and_l0() + expected_flops += e_fl + expected_l0 += e_l0 + return expected_flops, expected_l0 + + def update_ema(self): + self.steps_ema += 1 + for p, avg_p in zip(self.parameters(), self.avg_param): + avg_p.mul_(self.beta_ema).add_((1 - self.beta_ema) * p.data) + + def load_ema_params(self): + for p, avg_p in zip(self.parameters(), self.avg_param): + p.data.copy_(avg_p / (1 - self.beta_ema**self.steps_ema)) + + def load_params(self, params): + for p, avg_p in zip(self.parameters(), params): + p.data.copy_(avg_p) + + def get_params(self): + params = deepcopy(list(p.data for p in self.parameters())) + return params + + def arch(self): + return [layer.activated_neurons().cpu().numpy() for layer in self.layers] + + def prune_rate(self): + ''' + the number of parameters being pruned / the number of parameters + ''' + l = [layer.activated_neurons().cpu().numpy() for layer in self.layers] + return 100 - 100.0 * (l[0] * 25.0 + l[1] * l[0] * 25.0 + l[2] * l[3] + l[3] * 10.0) / ( + 20.0 * 25 + 50 * 20 * 25 + 800 * 500 + 5000) \ No newline at end of file diff --git a/models/MLP.py b/models/MLP.py new file mode 100644 index 0000000..b14160d --- /dev/null +++ b/models/MLP.py @@ -0,0 +1,75 @@ +import torch +import torch.nn as nn +from models.L0Layers import L0Dense +from copy import deepcopy + + +class L0MLP(nn.Module): + def __init__(self, input_dim, num_classes, layer_dims=(300, 100), N=50000, beta_ema=0.999, + weight_decay=1, lambdas=(1., 1., 1.), local_rep=False, temperature=2. 
/ 3.): + super(L0MLP, self).__init__() + self.layer_dims = layer_dims + self.input_dim = input_dim + self.N = N + self.beta_ema = beta_ema + self.weight_decay = self.N * weight_decay + self.lambdas = lambdas + + layers = [] + for i, dim_h in enumerate(self.layer_dims): + inp_dim = self.input_dim if i == 0 else self.layer_dims[i - 1] + drop_rate_init, lamb = 0.2 if i == 0 else 0.5, lambdas[i] if len(lambdas) > 1 else lambdas[0] + layers += [L0Dense(inp_dim, dim_h, droprate_init=drop_rate_init, weight_decay=self.weight_decay, + lamba=lamb, local_rep=local_rep, temperature=temperature), nn.ReLU()] + + layers.append(L0Dense(self.layer_dims[-1], num_classes, droprate_init=0.5, weight_decay=self.weight_decay, + lamba=lambdas[-1], local_rep=local_rep, temperature=temperature)) + self.output = nn.Sequential(*layers) + + self.layers = [] + for m in self.modules(): + if isinstance(m, L0Dense): + self.layers.append(m) + + if beta_ema > 0.: + print('Using temporal averaging with beta: {}'.format(beta_ema)) + self.avg_param = deepcopy(list(p.data for p in self.parameters())) + if torch.cuda.is_available(): + self.avg_param = [a.cuda() for a in self.avg_param] + self.steps_ema = 0. + + def forward(self, x): + return self.output(x) + + def regularization(self): + regularization = 0. + for layer in self.layers: + regularization += - (1. / self.N) * layer.regularization() + if torch.cuda.is_available(): + regularization = regularization.cuda() + return regularization + + def get_exp_flops_l0(self): + expected_flops, expected_l0 = 0., 0. + for layer in self.layers: + e_fl, e_l0 = layer.count_expected_flops_and_l0() + expected_flops += e_fl + expected_l0 += e_l0 + return expected_flops, expected_l0 + + def update_ema(self): + self.steps_ema += 1 + for p, avg_p in zip(self.parameters(), self.avg_param): + avg_p.mul_(self.beta_ema).add_((1 - self.beta_ema) * p.data) + + def load_ema_params(self): + for p, avg_p in zip(self.parameters(), self.avg_param): + p.data.copy_(avg_p / (1 - self.beta_ema**self.steps_ema)) + + def load_params(self, params): + for p, avg_p in zip(self.parameters(), params): + p.data.copy_(avg_p) + + def get_params(self): + params = deepcopy(list(p.data for p in self.parameters())) + return params diff --git a/models/WideResNet.py b/models/WideResNet.py new file mode 100644 index 0000000..8e0e6b8 --- /dev/null +++ b/models/WideResNet.py @@ -0,0 +1,159 @@ +import torch +import torch.nn as nn +from copy import deepcopy +from models.BaseLayers import MAPConv2d, MAPDense +from models.L0Layers import L0Conv2d +import torch.nn.functional as tn_func + + +class BasicBlock(nn.Module): + def __init__(self, in_planes, out_planes, stride, drop_rate_init=0.0, weight_decay=0., lmbd=0.01, local_rep=False, + temperature=2. 
/ 3.): + super(BasicBlock, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = L0Conv2d(in_planes, out_planes, kernel_size=3, stride=1, padding=1, bias=False, + droprate_init=drop_rate_init, weight_decay=weight_decay / (1 - 0.3), local_rep=local_rep, + lamba=lmbd, temperature=temperature) + + self.bn2 = nn.BatchNorm2d(out_planes) + self.conv2 = MAPConv2d(out_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False, + weight_decay=weight_decay) + + self.equalInOut = (in_planes == out_planes) + self.convShortcut = (not self.equalInOut) and \ + MAPConv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False, + weight_decay=weight_decay) or None + + def forward(self, x): + if not self.equalInOut: + x = tn_func.relu(self.bn1(x)) + else: + out = tn_func.relu(self.bn1(x)) + + out = self.conv1(out if self.equalInOut else x) + out = self.conv2(tn_func.relu(self.bn2(out))) + return torch.add(out, x if self.equalInOut else self.convShortcut(x)) + + +class NetworkBlock(nn.Module): + def __init__(self, nb_layers, in_planes, out_planes, block, stride, droprate_init=0.0, weight_decay=0., lamba=0.01, + local_rep=False, temperature=2. / 3.): + super(NetworkBlock, self).__init__() + self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, stride, droprate_init, + weight_decay=weight_decay, lamba=lamba, local_rep=local_rep, + temperature=temperature) + + def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, droprate_init, + weight_decay=0., lamba=0.01, local_rep=False, temperature=2. / 3.): + layers = [] + for i in range(nb_layers): + layers.append(block(i == 0 and in_planes or out_planes, out_planes, i == 0 and stride or 1, + droprate_init, weight_decay, lamba, local_rep=local_rep, temperature=temperature)) + return nn.Sequential(*layers) + + def forward(self, x): + return self.layer(x) + + +class L0WideResNet(nn.Module): + def __init__(self, depth, num_classes, widen_factor=1, droprate_init=0.3, N=50000, beta_ema=0.99, + weight_decay=5e-4, local_rep=False, lamba=0.01, temperature=2. 
/ 3.): + super(L0WideResNet, self).__init__() + nChannels = [16, 16 * widen_factor, 32 * widen_factor, 64 * widen_factor] + assert((depth - 4) % 6 == 0) + self.n = (depth - 4) // 6 + self.N = N + self.beta_ema = beta_ema + block = BasicBlock + + self.weight_decay = N * weight_decay + self.lamba = lamba + + # 1st conv before any network block + self.conv1 = MAPConv2d(3, nChannels[0], kernel_size=3, stride=1, padding=1, bias=False, + weight_decay=self.weight_decay) + # 1st block + self.block1 = NetworkBlock(self.n, nChannels[0], nChannels[1], block, 1, droprate_init, self.weight_decay, + self.lamba, local_rep=local_rep, temperature=temperature) + # 2nd block + self.block2 = NetworkBlock(self.n, nChannels[1], nChannels[2], block, 2, droprate_init, self.weight_decay, + self.lamba, local_rep=local_rep, temperature=temperature) + # 3rd block + self.block3 = NetworkBlock(self.n, nChannels[2], nChannels[3], block, 2, droprate_init, self.weight_decay, + self.lamba, local_rep=local_rep, temperature=temperature) + # bn, relu and classifier + self.bn = nn.BatchNorm2d(nChannels[3]) + self.fcout = MAPDense(nChannels[3], num_classes, weight_decay=self.weight_decay) + + self.layers, self.bn_params = [], [] + for m in self.modules(): + if isinstance(m, MAPDense) or isinstance(m, MAPConv2d) or isinstance(m, L0Conv2d): + self.layers.append(m) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + self.bn_params += [m.weight, m.bias] + + if beta_ema > 0.: + print('Using temporal averaging with beta: {}'.format(beta_ema)) + self.avg_param = deepcopy(list(p.data for p in self.parameters())) + if torch.cuda.is_available(): + self.avg_param = [a.cuda() for a in self.avg_param] + self.steps_ema = 0. + + print('Using weight decay: {}'.format(self.weight_decay)) + + def forward(self, x): + out = self.conv1(x) + out = self.block1(out) + out = self.block2(out) + out = self.block3(out) + out = tn_func.relu(self.bn(out)) + out = tn_func.avg_pool2d(out, 8) + out = out.view(out.size(0), -1) + return self.fcout(out) + + def regularization(self): + regularization = 0. + for layer in self.layers: + regularization += - (1. / self.N) * layer.regularization() + for bnw in self.bn_params: + if self.weight_decay > 0: + regularization += (self.weight_decay / self.N) * .5 * torch.sum(bnw.pow(2)) + if torch.cuda.is_available(): + regularization = regularization.cuda() + return regularization + + def get_exp_flops_l0(self): + expected_flops, expected_l0 = 0., 0. + for layer in self.layers: + try: + e_fl, e_l0 = layer.count_expected_flops_and_l0() + expected_flops += e_fl + expected_l0 += e_l0 + except BaseException: + pass + return expected_flops, expected_l0 + + def update_ema(self): + self.steps_ema += 1 + for p, avg_p in zip(self.parameters(), self.avg_param): + avg_p.mul_(self.beta_ema).add_((1 - self.beta_ema) * p.data) + + def load_ema_params(self): + for p, avg_p in zip(self.parameters(), self.avg_param): + p.data.copy_(avg_p / (1 - self.beta_ema**self.steps_ema)) + + def load_params(self, params): + for p, avg_p in zip(self.parameters(), params): + p.data.copy_(avg_p) + + def get_params(self): + params = deepcopy(list(p.data for p in self.parameters())) + return params + + def prune_rate(self): + l = [layer.activated_neurons().cpu().numpy() for layer in self.l0_layers] + return 100 - 100. 
* (l[0] * 16 + (l[1] + l[2] + l[3] + l[4]) * 160 + (l[5] + l[6] + l[7] + l[8]) * 320 + ( + l[9] + l[10] + l[11]) * 640) \ + / (16 * 160 + 160 * 160 * 3 + 160 * 320 + 320 * 320 * 3 + 320 * 640 + 640 * 640 * 3) diff --git a/train_lenet5.py b/train_lenet5.py index d9c8cad..9af8c6a 100755 --- a/train_lenet5.py +++ b/train_lenet5.py @@ -7,18 +7,18 @@ import torch.nn as nn import torch.backends.cudnn as cudnn -from models import L0LeNet5 +from models.LeNet import L0LeNet5, LeNet5 from utils import save_checkpoint from dataloaders import mnist from utils import AverageMeter, accuracy parser = argparse.ArgumentParser(description='PyTorch LeNet5 Training') -parser.add_argument('--epochs', default=200, type=int, +parser.add_argument('--epochs', default=100, type=int, help='number of total epochs to run') parser.add_argument('--start-epoch', default=0, type=int, help='manual epoch number (useful on restarts)') -parser.add_argument('-b', '--batch-size', default=100, type=int, +parser.add_argument('-b', '--batch-size', default=128, type=int, help='mini-batch size (default: 100)') parser.add_argument('--lr', '--learning-rate', default=0.001, type=float, help='initial learning rate') @@ -33,9 +33,11 @@ parser.add_argument('--no-tensorboard', dest='tensorboard', action='store_false', help='whether to use tensorboard (default: True)') parser.add_argument('--beta_ema', type=float, default=0.999) -parser.add_argument('--lambas', nargs='*', type=float, default=[1., 1., 1., 1.]) +parser.add_argument('--lambas', nargs='*', type=float, default=[.1, .1, .1, .1]) parser.add_argument('--local_rep', action='store_true') parser.add_argument('--temp', type=float, default=2. / 3.) +parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') +parser.add_argument('--gpu-id', default='0', type=str, help='GPU id list') parser.add_argument('--multi_gpu', action='store_true') parser.set_defaults(tensorboard=True) @@ -44,10 +46,14 @@ total_steps = 0 exp_flops, exp_l0 = [], [] +args = parser.parse_args() +os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id +cuda = torch.cuda.is_available() and not args.no_cuda +args.device = torch.device('cuda' if cuda else 'cpu') + def main(): global args, best_prec1, writer, total_steps, exp_flops, exp_l0 - args = parser.parse_args() log_dir_net = args.name print('model:', args.name) if args.tensorboard: @@ -66,15 +72,19 @@ def main(): train_loader, val_loader, num_classes = mnist(args.batch_size, pm=False) # create model - model = L0LeNet5(num_classes, input_size=(1, 28, 28), conv_dims=(20, 50), fc_dims=500, N=60000, - weight_decay=args.weight_decay, lambas=args.lambas, local_rep=args.local_rep, - temperature=args.temp) + if args.name in ['lenet', 'lenet5', 'vanilla']: + model = LeNet5(num_classes, input_size=(1, 28, 28), conv_dims=(20, 50), fc_dims=500, N=60000, + weight_decay=args.weight_decay, lambas=args.lambas, local_rep=args.local_rep, + temperature=args.temp) + else: + model = L0LeNet5(num_classes, input_size=(1, 28, 28), conv_dims=(20, 50), fc_dims=500, N=60000, + weight_decay=args.weight_decay, lambas=args.lambas, local_rep=args.local_rep, + temperature=args.temp) optimizer = torch.optim.Adam(model.parameters(), args.lr) print('Number of model parameters: {}'.format(sum([p.data.nelement() for p in model.parameters()]))) - if torch.cuda.is_available(): - model = model.cuda() + model = model.to(args.device) # optionally resume from a checkpoint if args.resume: @@ -135,6 +145,8 @@ def loss_function(output, target_var, model): state['avg_params'] = 
model.avg_param state['steps_ema'] = model.steps_ema save_checkpoint(state, is_best, args.name) + print("Prune rate %.3g" % model.prune_rate()) + print('Arch', model.arch()) print('Best error: ', best_prec1) if args.tensorboard: writer.close() @@ -152,23 +164,20 @@ def train(train_loader, model, criterion, optimizer, epoch): model.train() end = time.time() - for i, (input_, target) in enumerate(train_loader): + for i, (data, label) in enumerate(train_loader): data_time.update(time.time() - end) total_steps += 1 - if torch.cuda.is_available(): - target = target.cuda(async=True) - input_ = input_.cuda() - input_var = torch.autograd.Variable(input_) - target_var = torch.autograd.Variable(target) + label = label.to(args.device) + data = data.to(args.device) # compute output - output = model(input_var) - loss = criterion(output, target_var, model) + output = model(data) + loss = criterion(output, label, model) # measure accuracy and record loss - prec1 = accuracy(output.data, target, topk=(1,))[0] - losses.update(loss.item(), input_.size(0)) - top1.update(100 - prec1.item(), input_.size(0)) + prec1 = accuracy(output.data, label, topk=(1,))[0] + losses.update(loss.item(), data.size(0)) + top1.update(100 - prec1.item(), data.size(0)) # compute gradient and do SGD step optimizer.zero_grad() @@ -236,33 +245,31 @@ def validate(val_loader, model, criterion, epoch): model.module.load_ema_params() end = time.time() - for i, (input_, target) in enumerate(val_loader): - if torch.cuda.is_available(): - target = target.cuda(async=True) - input_ = input_.cuda() - input_var = torch.autograd.Variable(input_, volatile=True) - target_var = torch.autograd.Variable(target, volatile=True) - - # compute output - output = model(input_var) - loss = criterion(output, target_var, model) - - # measure accuracy and record loss - prec1 = accuracy(output.data, target, topk=(1,))[0] - losses.update(loss.item(), input_.size(0)) - top1.update(100 - prec1.item(), input_.size(0)) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % args.print_freq == 0: - print('Test: [{0}/{1}]\t' - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' - 'Err@1 {top1.val:.3f} ({top1.avg:.3f})'.format( - i, len(val_loader), batch_time=batch_time, loss=losses, - top1=top1)) + with torch.no_grad(): + for i, (data, label) in enumerate(val_loader): + label = label.to(args.device) + data = data.to(args.device) + + # compute output + output = model(data) + loss = criterion(output, label, model) + + # measure accuracy and record loss + prec1 = accuracy(output.data, label, topk=(1,))[0] + losses.update(loss.item(), data.size(0)) + top1.update(100 - prec1.item(), data.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print('Test: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Err@1 {top1.val:.3f} ({top1.avg:.3f})'.format( + i, len(val_loader), batch_time=batch_time, loss=losses, + top1=top1)) print(' * Err@1 {top1.avg:.3f}'.format(top1=top1)) if not args.multi_gpu: diff --git a/train_wide_resnet.py b/train_wide_resnet.py index f63594e..c1706ca 100755 --- a/train_wide_resnet.py +++ b/train_wide_resnet.py @@ -8,7 +8,7 @@ import torch.backends.cudnn as cudnn -from models import L0WideResNet +from models.WideResNet import L0WideResNet from dataloaders import cifar10, cifar100 from utils import save_checkpoint, AverageMeter, 
accuracy from torch.optim import lr_scheduler From 16461d517b3da41e298ddd3bd6a0f96f0abf18e8 Mon Sep 17 00:00:00 2001 From: sndnyang Date: Fri, 10 Apr 2020 04:13:45 -0400 Subject: [PATCH 4/5] base attack framework from kWTA --- torch_func/attack.py | 309 +++++++++++++++++++++ torch_func/training.py | 613 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 922 insertions(+) create mode 100644 torch_func/attack.py create mode 100644 torch_func/training.py diff --git a/torch_func/attack.py b/torch_func/attack.py new file mode 100644 index 0000000..26848b3 --- /dev/null +++ b/torch_func/attack.py @@ -0,0 +1,309 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from matplotlib import pyplot as plt +import torchvision +# from kWTA import cwl2 + + +def gen_rand_labels(y, num_classes): + targets = torch.randint_like(y, low=0, high=num_classes) + for i in range(len(targets)): + while targets[i] == y[i]: + targets[i] = torch.randint(low=0, high=10, size=(1,)) + return targets + + +def gen_least_likely_labels(model, X): + preds = model(X) + return preds.min(dim=1)[1] + + +def fgsm_linf_untargeted(model, X, y, epsilon=0.1): + """ Construct FGSM adversarial examples on the examples X""" + delta = torch.zeros_like(X, requires_grad=True) + loss = nn.CrossEntropyLoss()(model(X + delta), y) + loss.backward() + return epsilon * delta.grad.detach().sign() + + +def norms(Z): + """Compute norms over all but the first dimension""" + return Z.view(Z.shape[0], -1).norm(dim=1)[:, None, None, None] + + +def pgd_l2_untargeted(model, X, y, epsilon, alpha, num_iter): + delta = torch.zeros_like(X, requires_grad=True) + for t in range(num_iter): + loss = nn.CrossEntropyLoss()(model(X + delta), y) + loss.backward() + delta.data += alpha * delta.grad.detach() / norms(delta.grad.detach()) + delta.data = torch.min(torch.max(delta.detach(), -X), 1 - X) # clip X+delta to [0,1] + delta.data *= epsilon / norms(delta.detach()).clamp(min=epsilon) + delta.grad.zero_() + + return delta.detach() + + +def pgd_linf_untargeted(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20, randomize=False): + """ Construct FGSM adversarial examples on the examples X""" + if randomize: + delta = torch.rand_like(X, requires_grad=True) + delta.data = delta.data * 2 * epsilon - epsilon + else: + delta = torch.zeros_like(X, requires_grad=True) + + for t in range(num_iter): + loss = nn.CrossEntropyLoss()(model(X + delta), y) + loss.backward() + delta.data = (delta + alpha * delta.grad.detach().sign()).clamp(-epsilon, epsilon) + delta.grad.zero_() + return delta.detach() + + +def pgd_linf_untargeted2(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20, randomize=False): + """ Construct FGSM adversarial examples on the examples X""" + if randomize: + delta = torch.rand_like(X, requires_grad=True) + delta.data = delta.data * 2 * epsilon - epsilon + else: + delta = torch.zeros_like(X, requires_grad=True) + + for t in range(num_iter): + yp = model(X + delta) + loss = - yp.gather(1, y[:, None])[:, 0] + loss = loss.sum() + loss.backward() + delta.data = (delta + alpha * delta.grad.detach().sign()).clamp(-epsilon, epsilon) + delta.grad.zero_() + return delta.detach() + + +def pgd_linf_untargeted_mostlikely(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20, randomize=False): + """ Construct FGSM adversarial examples on the examples X""" + if randomize: + delta = torch.rand_like(X, requires_grad=True) + delta.data = delta.data * 2 * epsilon - epsilon + else: + delta = torch.zeros_like(X, requires_grad=True) + + yp = 
model(X) + y = yp.max(dim=1)[1] + + for t in range(num_iter): + loss = nn.CrossEntropyLoss()(model(X + delta), y) + loss.backward() + delta.data = (delta + alpha * delta.grad.detach().sign()).clamp(-epsilon, epsilon) + delta.grad.zero_() + return delta.detach() + + +def pgd_linf_untargeted_maxce(model, X, y, epsilon=0.1, alpha=0.01, num_iter=20, randomize=False): + if randomize: + delta = torch.rand_like(X, requires_grad=True) + delta.data = delta.data * 2 * epsilon - epsilon + else: + delta = torch.zeros_like(X, requires_grad=True) + + yp = model(X).detach() + + for t in range(num_iter): + loss = nn.KLDivLoss()(F.log_softmax(model(X + delta), dim=1), F.softmax(yp, dim=1)) + loss.backward() + delta.data = (delta + alpha * delta.grad.detach().sign()).clamp(-epsilon, epsilon) + delta.grad.zero_() + return delta.detach() + + +def pgd_linf_targ(model, X, y, epsilon=0.1, alpha=0.01, use_y=True, + num_iter=20, y_targ='rand', num_classes=10, randomize=False): + """ Construct targeted adversarial examples on the examples X""" + + if isinstance(y_targ, str): + strlist = ['rand', 'leastlikely'] + assert(y_targ in strlist) + if y_targ == 'rand': + y_targ = gen_rand_labels(y, num_classes) + elif y_targ == 'leastlikely': + y_targ = gen_least_likely_labels(model, X) + + if randomize: + delta = torch.rand_like(X, requires_grad=True) + delta.data = delta.data * 2 * epsilon - epsilon + else: + delta = torch.zeros_like(X, requires_grad=True) + + delta = torch.zeros_like(X, requires_grad=True) + for t in range(num_iter): + yp = model(X + delta) + loss = yp[:, y_targ] + if not use_y: + y = yp.max(dim=1)[1] + loss = loss - yp.gather(1, y[:, None])[:, 0] + loss = loss.sum() + loss.backward() + delta.data = (delta + alpha * delta.grad.detach().sign()).clamp(-epsilon, epsilon) + delta.grad.zero_() + return delta.detach() + + +def pgd_linf_targ3(model, X, y, epsilon=0.1, alpha=0.01, + num_iter=20, y_targ='rand', num_classes=10, randomize=False): + """ Without using the label information""" + + if isinstance(y_targ, str): + strlist = ['rand', 'leastlikely'] + assert(y_targ in strlist) + if y_targ == 'rand': + y_targ = gen_rand_labels(y, num_classes) + elif y_targ == 'leastlikely': + y_targ = gen_least_likely_labels(model, X) + + if randomize: + delta = torch.rand_like(X, requires_grad=True) + delta.data = delta.data * 2 * epsilon - epsilon + else: + delta = torch.zeros_like(X, requires_grad=True) + + delta = torch.zeros_like(X, requires_grad=True) + for t in range(num_iter): + yp = model(X + delta) + loss = yp[:, y_targ].sum() + loss.backward() + delta.data = (delta + alpha * delta.grad.detach().sign()).clamp(-epsilon, epsilon) + delta.grad.zero_() + return delta.detach() + + +def pgd_linf_targ2(model, X, y, epsilon=0.1, alpha=0.01, + num_iter=20, y_targ='rand', num_classes=10, randomize=False): + """ Construct targeted adversarial examples on the examples X""" + + if isinstance(y_targ, str): + strlist = ['rand', 'leastlikely'] + assert(y_targ in strlist) + if y_targ == 'rand': + y_targ = gen_rand_labels(y, num_classes) + elif y_targ == 'leastlikely': + y_targ = gen_least_likely_labels(model, X) + + if randomize: + delta = torch.rand_like(X, requires_grad=True) + delta.data = delta.data * 2 * epsilon - epsilon + else: + delta = torch.zeros_like(X, requires_grad=True) + + for t in range(num_iter): + yp = model(X + delta) + loss = 2 * yp[:, y_targ].sum() - yp.sum() + loss.backward() + delta.data = (delta + alpha * delta.grad.detach().sign()).clamp(-epsilon, epsilon) + delta.grad.zero_() + return delta.detach() + + 
+def pgd_linf_targ4(model, X, y, epsilon=0.1, alpha=0.01, + num_iter=20, y_targ='rand', num_classes=10, randomize=False): + """ Construct targeted adversarial examples on the examples X""" + + if isinstance(y_targ, str): + strlist = ['rand', 'leastlikely'] + assert(y_targ in strlist) + if y_targ == 'rand': + y_targ = gen_rand_labels(y, num_classes) + elif y_targ == 'leastlikely': + y_targ = gen_least_likely_labels(model, X) + + if randomize: + delta = torch.rand_like(X, requires_grad=True) + delta.data = delta.data * 2 * epsilon - epsilon + else: + delta = torch.zeros_like(X, requires_grad=True) + + for t in range(num_iter): + yp = model(X + delta) + loss = nn.CrossEntropyLoss(yp, y) - nn.CrossEntropyLoss(yp, y_targ) + loss.backward() + delta.data = (delta + alpha * delta.grad.detach().sign()).clamp(-epsilon, epsilon) + delta.grad.zero_() + return delta.detach() + + +def deepfool(model, X, y, epsilon=0.1, num_iter=50): + model.eval() + delta = torch.zeros_like(X) + X = X.clone() + X.requires_grad_() + + out = model(X + delta) + n_class = out.shape[1] + py = out.max(1)[1].item() + ny = out.max(1)[1].item() + + i_iter = 0 + + while py == ny and i_iter < num_iter: + out[0, py].backward(retain_graph=True) + grad_np = X.grad.data.clone() + value_l = np.inf + ri = None + + for i in range(n_class): + if i == py: + continue + + X.grad.data.zero_() + out[0, i].backward(retain_graph=True) + grad_i = X.grad.data.clone() + + wi = grad_i - grad_np + fi = out[0, i] - out[0, py] + value_i = np.abs(fi.item()) / np.linalg.norm(wi.cpu().numpy().flatten()) + + if value_i < value_l: + ri = value_i / np.linalg.norm(wi.cpu().numpy().flatten()) * wi + + delta += ri.clone() + X.grad.data.zero_() + out = model(X + delta) + py = out.max(1)[1].item() + i_iter += 1 + + delta = delta.clamp(-epsilon, epsilon) + + return delta.detach() + + +def cw_l2_random_label(model, X, y, num_classes=10, confidence=20, + debug=False, randomize=False, max_steps=1000): + targets = gen_rand_labels(y, num_classes) + + attack = cwl2.AttackCarliniWagnerL2(num_classes=num_classes, + confidence=confidence, debug=debug, randomize=randomize, max_steps=max_steps) + return attack.run(model, X, targets) + +# def cw_l2(model, X, targets, c, kappa=0, alpha=0.01, num_iter=20): +# omega = torch.zeros_like(X, requires_grad=True) + + +# for i in range(num_iter): +# tan_omega = F.tanh(omega) +# delta = 0.5*(tan_omega+1) - x + +# loss1 = torch.norm(delta, p=2, dim=1) + +# logits = model(X+delta) +# target_logits = logits[:,:] + + +def one_pixel_perturb(p, img): + img_size = img.size(1) + + +def one_pixel_evolve(): + pass + + +def one_pixel_attack(model, X, y): + raise NotImplementedError diff --git a/torch_func/training.py b/torch_func/training.py new file mode 100644 index 0000000..1387932 --- /dev/null +++ b/torch_func/training.py @@ -0,0 +1,613 @@ +import time +import os +import argparse +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.optim import lr_scheduler +from torchvision import datasets, transforms +import torchvision +from torch.autograd import Variable + + +def isnotebook(): + try: + shell = get_ipython().__class__.__name__ + if shell == 'ZMQInteractiveShell': + return True # Jupyter notebook or qtconsole + elif shell == 'TerminalInteractiveShell': + return False # Terminal running IPython + else: + return False # Other type (?) 
+ except NameError: + return False # Probably standard Python interpreter + + +if isnotebook(): + from tqdm import tqdm_notebook as tqdm +else: + from tqdm import tqdm + + +def epoch(loader, model, opt=None, device=None, use_tqdm=False): + """Standard training/evaluation epoch over the dataset""" + total_loss, total_err = 0., 0. + if opt is None: + model.eval() + else: + model.train() + + if use_tqdm: + pbar = tqdm(total=len(loader)) + + for X, y in loader: + X, y = X.to(device), y.to(device) + + yp = model(X) + loss = nn.CrossEntropyLoss()(yp, y) + if opt: + opt.zero_grad() + loss.backward() + opt.step() + + total_err += (yp.max(dim=1)[1] != y).sum().item() + total_loss += loss.item() * X.shape[0] + + if use_tqdm: + pbar.update(1) + + return total_err / len(loader.dataset), total_loss / len(loader.dataset) + + +def epoch_imagenet(loader, model, opt=None, device=None, use_tqdm=False): + total_loss, total_err_top1, total_err_top5 = 0., 0., 0. + + if opt is None: + model.eval() + + if use_tqdm: + pbar = tqdm(total=len(loader)) + + model.to(device) + for X, y in loader: + X, y = X.to(device), y.to(device) + + yp = model(X) + loss = nn.CrossEntropyLoss()(yp, y) + if opt: + opt.zero_grad() + loss.backward() + opt.step() + + total_err_top1 += (yp.max(dim=1)[1] != y).sum().item() + + _, pred = yp.topk(5, dim=1, sorted=True, largest=True) + pred = pred.t() + total_err_top5 += pred.eq(y.view(1, -1).expand_as(pred)).sum().item() + + total_loss += loss.item() * X.shape[0] + + if use_tqdm: + pbar.update(1) + + return total_err_top1 / len(loader.dataset), total_err_top5 / len(loader.dataset), total_loss / len(loader.dataset) + + +def epoch_imagenet_adversarial(loader, model, device, attack, use_tqdm=False, n_test=None, **kwargs): + """Adversarial training/evaluation epoch over the dataset""" + total_loss, total_err_top1, total_err_top5 = 0., 0., 0. + + if use_tqdm: + if n_test is None: + pbar = tqdm(total=len(loader.dataset)) + else: + pbar = tqdm(total=n_test) + + total_n = 0 + model.to(device) + for X, y in loader: + X, y = X.to(device), y.to(device) + delta = attack(model, X, y, **kwargs) + yp = model(X + delta) + loss = nn.CrossEntropyLoss()(yp, y) + + total_err_top1 += (yp.max(dim=1)[1] != y).sum().item() + _, pred = yp.topk(5, dim=1, sorted=True, largest=True) + pred = pred.t() + total_err_top5 += pred.eq(y.view(1, -1).expand_as(pred)).sum().item() + total_loss += loss.item() * X.shape[0] + + if use_tqdm: + pbar.update(X.shape[0]) + + total_n += X.shape[0] + + if n_test is not None: + if total_n >= n_test: + break + + return total_err_top1 / total_n, total_err_top5 / total_n, total_loss / total_n + + +def epoch_func(loader, model, criterion, opt=None, device=None, use_tqdm=False): + total_loss = 0. + model.to(device) + if use_tqdm: + pbar = tqdm(total=len(loader)) + + for X, y in loader: + X, y = X.to(device), y.to(device) + yp = model(X) + loss = criterion(yp, y) + if opt: + opt.zero_grad() + loss.backward() + opt.step() + + total_loss += loss.item() * X.shape[0] + + if use_tqdm: + pbar.update(1) + + return total_loss / len(loader.dataset) + + +def epoch_distill_func(loader, model_teacher, model, device, opt=None, use_tqdm=True, n_test=None, loss_func='mse'): + total_loss, total_err = 0., 0. 
+ total_n = 0 + + model_teacher.to(device) + model.to(device) + + if use_tqdm: + if n_test is None: + pbar = tqdm(total=len(loader.dataset)) + else: + pbar = tqdm(total=n_test) + + for X, y in loader: + X, y = X.to(device), y.to(device) + + teacher_output = model_teacher(X).detach() + output = model(X) + + if loss_func == 'mse': + loss = nn.MSELoss()(output, teacher_output) + elif loss_func == 'l1': + loss = nn.L1Loss()(output, teacher_output) + elif loss_func == 'kl': + loss = nn.KLDivLoss()(F.log_softmax(output, dim=1), + F.softmax(teacher_output, dim=1)) + else: + raise NotImplementedError + + if opt: + opt.zero_grad() + loss.backward() + opt.step() + + total_loss += loss.item() * X.shape[0] + total_n += X.shape[0] + + if use_tqdm: + pbar.update(X.shape[0]) + + if n_test is not None: + if total_n > n_test: + break + + return total_loss / total_n + + +def epoch_distill(loader, model_teacher, model, device, opt=None, use_tqdm=True, n_test=None, loss_func='mse'): + total_loss, total_err = 0., 0. + total_n = 0 + + model_teacher.to(device) + model.to(device) + + if use_tqdm: + if n_test is None: + pbar = tqdm(total=len(loader.dataset)) + else: + pbar = tqdm(total=n_test) + + for X, y in loader: + X, y = X.to(device), y.to(device) + + teacher_output = model_teacher(X).detach() + output = model(X) + + if loss_func == 'mse': + loss = nn.MSELoss()(output, teacher_output) + elif loss_func == 'l1': + loss = nn.L1Loss()(output, teacher_output) + elif loss_func == 'kl': + loss = nn.KLDivLoss()(F.log_softmax(output, dim=1), + F.softmax(teacher_output, dim=1)) + else: + raise NotImplementedError + + if opt: + opt.zero_grad() + loss.backward() + opt.step() + + total_err += (output.max(dim=1)[1] != y).sum().item() + total_loss += loss.item() * X.shape[0] + total_n += X.shape[0] + + if use_tqdm: + pbar.update(X.shape[0]) + + if n_test is not None: + if total_n > n_test: + break + + return total_loss / total_n, total_err / total_n + + +def epoch_transfer_attack(loader, model_source, model_target, attack, device, success_only=False, use_tqdm=True, n_test=None, **kwargs): + source_err = 0. + target_err = 0. + target_err2 = 0. 
+ + success_total_n = 0 + + model_source.eval() + model_target.eval() + + total_n = 0 + + if use_tqdm: + pbar = tqdm(total=n_test) + + model_source.to(device) + model_target.to(device) + for X, y in loader: + X, y = X.to(device), y.to(device) + delta = attack(model_source, X, y, **kwargs) + + if success_only: + raise NotImplementedError + else: + yp_target = model_target(X + delta).detach() + yp_source = model_source(X + delta).detach() + yp_origin = model_target(X).detach() + source_err += (yp_source.max(dim=1)[1] != y).sum().item() + target_err += (yp_target.max(dim=1)[1] != y).sum().item() + target_err2 += (yp_origin.max(dim=1)[1] != y).sum().item() + success_total_n += (yp_origin.max(dim=1)[1] == y) + if use_tqdm: + pbar.update(X.shape[0]) + + total_n += X.shape[0] + if n_test is not None: + if total_n >= n_test: + break + + return source_err / total_n, target_err / total_n, target_err2 / total_n + + # if randomize: + # delta = torch.rand_like(X, requires_grad=True) + # delta.data = delta.data * 2 * epsilon - epsilon + # else: + # delta = torch.zeros_like(X, requires_grad=True) + + # for t in range(num_iter): + # loss = nn.CrossEntropyLoss()(model(X + delta), y) + # loss.backward() + # delta.data = (delta + alpha*delta.grad.detach().sign()).clamp(-epsilon,epsilon) + # delta.grad.zero_() + # return delta.detach() + + +def epoch_free_adversarial(loader, model, m, epsilon, opt, device, use_tqdm=False): + """free adversarial training""" + total_loss, total_err = 0., 0. + total_n = 0 + + pbar = tqdm(total=len(loader)) + + for X, y in loader: + X, y = X.to(device), y.to(device) + delta = torch.zeros_like(X, requires_grad=True) + for i in range(m): + model.train() + yp = model(X + delta) + loss_nn = nn.CrossEntropyLoss()(yp, y) + + total_err += (yp.max(dim=1)[1] != y).sum().item() + total_loss += loss_nn.item() * X.shape[0] + total_n += X.shape[0] + + # update network + opt.zero_grad() + loss_nn.backward() + opt.step() + + # update perturbation + delta.data = delta + epsilon * delta.grad.detach().sign() + delta.data = delta.data.clamp(-epsilon, epsilon) + delta.grad.zero_() + + if use_tqdm: + pbar.update(1) + + return total_err / total_n, total_loss / total_n + + +def epoch_ALP(loader, model, attack, alp_weight=0.5, + opt=None, device=None, use_tqdm=False, n_test=None, **kwargs): + """Adversarial Logit Pairing epoch over the dataset""" + total_loss, total_err = 0., 0. + + # assert(opt is not None) + model.train() + + if use_tqdm: + if n_test is None: + pbar = tqdm(total=len(loader.dataset)) + else: + pbar = tqdm(total=n_test) + total_n = 0 + for X, y in loader: + X, y = X.to(device), y.to(device) + model.eval() + with torch.no_grad(): + clean_logit = model(X) + delta = attack(model, X, y, **kwargs) + + model.train() + yp = model(X + delta) + loss = nn.CrossEntropyLoss()(yp, y) + alp_weight * nn.MSELoss()(yp, clean_logit) + + opt.zero_grad() + loss.backward() + opt.step() + + total_err += (yp.max(dim=1)[1] != y).sum().item() + total_loss += loss.item() * X.shape[0] + if use_tqdm: + pbar.update(X.shape[0]) + + total_n += X.shape[0] + + if n_test is not None: + if total_n >= n_test: + break + + return total_err / total_n, total_loss / total_n + + +def epoch_adversarial(loader, model, attack, + opt=None, device=None, use_tqdm=False, n_test=None, **kwargs): + """Adversarial training/evaluation epoch over the dataset""" + total_loss, total_err = 0., 0. 
+ + if opt is None: + model.eval() + else: + model.train() + + if use_tqdm: + if n_test is None: + pbar = tqdm(total=len(loader.dataset)) + else: + pbar = tqdm(total=n_test) + total_n = 0 + example = None + for X, y in loader: + X, y = X.to(device), y.to(device) + model.eval() + delta = attack(model, X, y, **kwargs) + # temp = torch.abs(delta / X).view(X.shape[0], -1).norm(p=float(2), dim=1) + # print('L2 mag of example', temp.mean().item(), temp.min().item(), temp.max().item()) + # temp = torch.abs(delta / X).view(X.shape[0], -1).norm(p=float(np.inf), dim=1) + # print('L inf mag of example', temp.mean().item(), temp.min().item(), temp.max().item()) + + if opt: + model.train() + + yp = model(X + delta) + example = X + delta + loss = nn.CrossEntropyLoss()(yp, y) + if opt: + opt.zero_grad() + loss.backward() + opt.step() + + total_err += (yp.max(dim=1)[1] != y).sum().item() + total_loss += loss.item() * X.shape[0] + if use_tqdm: + pbar.update(X.shape[0]) + + total_n += X.shape[0] + + if n_test is not None: + if total_n >= n_test: + break + + return total_err / total_n, total_loss / total_n, X, example + + +def get_activation(model, activation, name): + def hook(model, input, output): + activation[name] = output.cpu().detach() + return hook + + +def register_layer(model, layer, activation, name): + layer.register_forward_hook(get_activation(model, activation, name)) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +def validate(val_loader, model, criterion, device): + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + + # switch to evaluate mode + model.eval() + + end = time.time() + for i, (inp, target) in enumerate(val_loader): + target = target.to(device) + inp = inp.to(device) + + # compute output + output = model(inp) + loss = criterion(output, target) + + # measure accuracy and record loss + prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) + losses.update(loss.item(), inp.size(0)) + top1.update(prec1.item(), inp.size(0)) + top5.update(prec5.item(), inp.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % 10 == 0: + print('Test: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + i, len(val_loader), batch_time=batch_time, loss=losses, + top1=top1, top5=top5)) + + print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + + return top1.avg + + +def squared_l2_norm(x): + flattened = x.view(x.shape[0], -1) + return (flattened ** 2).sum(1) + + +def l2_norm(x): + return squared_l2_norm(x).sqrt() + + +def trades_loss(model, + x_natural, + y, + optimizer, + step_size=0.003, + epsilon=0.031, + 
perturb_steps=10, + beta=1.0, + distance='l_inf'): + # define KL-loss + criterion_kl = nn.KLDivLoss(size_average=False) + model.eval() + batch_size = len(x_natural) + # generate adversarial example + x_adv = x_natural.detach() + 0.001 * torch.randn_like(x_natural).detach() + if distance == 'l_inf': + for _ in range(perturb_steps): + x_adv.requires_grad_() + with torch.enable_grad(): + loss_kl = criterion_kl(F.log_softmax(model(x_adv), dim=1), + F.softmax(model(x_natural), dim=1)) + grad = torch.autograd.grad(loss_kl, [x_adv])[0] + x_adv = x_adv.detach() + step_size * torch.sign(grad.detach()) + x_adv = torch.min(torch.max(x_adv, x_natural - epsilon), x_natural + epsilon) + x_adv = torch.clamp(x_adv, 0.0, 1.0) + elif distance == 'l_2': + for _ in range(perturb_steps): + x_adv.requires_grad_() + with torch.enable_grad(): + loss_kl = criterion_kl(F.log_softmax(model(x_adv), dim=1), + F.softmax(model(x_natural), dim=1)) + grad = torch.autograd.grad(loss_kl, [x_adv])[0] + for idx_batch in range(batch_size): + grad_idx = grad[idx_batch] + grad_idx_norm = l2_norm(grad_idx) + grad_idx /= (grad_idx_norm + 1e-8) + x_adv[idx_batch] = x_adv[idx_batch].detach() + step_size * grad_idx + eta_x_adv = x_adv[idx_batch] - x_natural[idx_batch] + norm_eta = l2_norm(eta_x_adv) + if norm_eta > epsilon: + eta_x_adv = eta_x_adv * epsilon / l2_norm(eta_x_adv) + x_adv[idx_batch] = x_natural[idx_batch] + eta_x_adv + x_adv = torch.clamp(x_adv, 0.0, 1.0) + else: + x_adv = torch.clamp(x_adv, 0.0, 1.0) + model.train() + + x_adv = Variable(torch.clamp(x_adv, 0.0, 1.0), requires_grad=False) + # zero gradient + optimizer.zero_grad() + # calculate robust loss + logits = model(x_natural) + loss_natural = F.cross_entropy(logits, y) + loss_robust = (1.0 / batch_size) * criterion_kl(F.log_softmax(model(x_adv), dim=1), + F.softmax(model(x_natural), dim=1)) + loss = loss_natural + beta * loss_robust + return loss + + +def epoch_trade(loader, model, + opt, device=None, **kwargs): + model.train() + for batch_idx, (data, target) in enumerate(loader): + data, target = data.to(device), target.to(device) + + opt.zero_grad() + + # calculate robust loss + loss = trades_loss(model=model, + x_natural=data, + y=target, + optimizer=opt, + **kwargs) + # step_size=args.step_size, + # epsilon=args.epsilon, + # perturb_steps=args.num_steps, + # beta=args.beta) + loss.backward() + opt.step() + + return 0, 0 From 2a196113df75b08a0863493f49317aa32d57659b Mon Sep 17 00:00:00 2001 From: sndnyang Date: Fri, 10 Apr 2020 04:14:04 -0400 Subject: [PATCH 5/5] AT lenet5 for mnist --- at_lenet5.py | 305 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 at_lenet5.py diff --git a/at_lenet5.py b/at_lenet5.py new file mode 100644 index 0000000..bacb036 --- /dev/null +++ b/at_lenet5.py @@ -0,0 +1,305 @@ +import argparse +import shutil +import os +import time + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn + +from torch_func.attack import pgd_linf_untargeted +from models.LeNet import L0LeNet5, LeNet5 +from utils import save_checkpoint +from dataloaders import mnist +from utils import AverageMeter, accuracy + + +parser = argparse.ArgumentParser(description='PyTorch LeNet5 Adversarial Training') +parser.add_argument('--epochs', default=100, type=int, + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=128, type=int, + help='mini-batch 
size (default: 100)') +parser.add_argument('--lr', '--learning-rate', default=0.001, type=float, + help='initial learning rate') +parser.add_argument('--weight-decay', '--wd', default=0.0005, type=float, + help='weight decay (default: 5e-4)') +parser.add_argument('--print-freq', '-p', default=100, type=int, + help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, + help='path to latest checkpoint (default: none)') +parser.add_argument('--name', default='L0LeNet5', type=str, + help='name of experiment') +parser.add_argument('--no-tensorboard', dest='tensorboard', action='store_false', + help='whether to use tensorboard (default: True)') +parser.add_argument('--beta_ema', type=float, default=0.999) +parser.add_argument('--lambas', nargs='*', type=float, default=[30, 1, 0.3, 5]) +parser.add_argument('--local_rep', action='store_true') +parser.add_argument('--temp', type=float, default=2. / 3.) + +parser.add_argument('--eps', type=float, default=0.031) +parser.add_argument('--attack-iter', type=int, default=10) + +parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') +parser.add_argument('--gpu-id', default='0', type=str, help='GPU id list') +parser.add_argument('--multi_gpu', action='store_true') +parser.set_defaults(tensorboard=True) + +best_prec1 = 100 +writer = None +total_steps = 0 +exp_flops, exp_l0 = [], [] + +args = parser.parse_args() +os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id +cuda = torch.cuda.is_available() and not args.no_cuda +args.device = torch.device('cuda' if cuda else 'cpu') + + +def main(): + global args, best_prec1, writer, total_steps, exp_flops, exp_l0 + log_dir_net = args.name + print('model:', args.name) + if args.tensorboard: + # used for logging to TensorBoard + from tensorboardX import SummaryWriter + directory = 'logs/{}/{}'.format(log_dir_net, args.name) + if os.path.exists(directory): + shutil.rmtree(directory) + os.makedirs(directory) + else: + os.makedirs(directory) + writer = SummaryWriter(directory) + + # Data loading code + print('[0, 1] normalization of input') + train_loader, val_loader, num_classes = mnist(args.batch_size, pm=False) + + # create model + if args.name in ['lenet', 'lenet5', 'vanilla']: + model = LeNet5(num_classes, input_size=(1, 28, 28), conv_dims=(20, 50), fc_dims=500, N=60000, + weight_decay=args.weight_decay, lambas=args.lambas, local_rep=args.local_rep, + temperature=args.temp) + else: + model = L0LeNet5(num_classes, input_size=(1, 28, 28), conv_dims=(20, 50), fc_dims=500, N=60000, + weight_decay=args.weight_decay, lambas=args.lambas, local_rep=args.local_rep, + temperature=args.temp) + + optimizer = torch.optim.Adam(model.parameters(), args.lr) + print('Number of model parameters: {}'.format(sum([p.data.nelement() for p in model.parameters()]))) + + model = model.to(args.device) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume) + args.start_epoch = checkpoint['epoch'] + best_prec1 = checkpoint['best_prec1'] + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + total_steps = checkpoint['total_steps'] + exp_flops = checkpoint['exp_flops'] + exp_l0 = checkpoint['exp_l0'] + if checkpoint['beta_ema'] > 0: + model.beta_ema = checkpoint['beta_ema'] + model.avg_param = checkpoint['avg_params'] + model.steps_ema = checkpoint['steps_ema'] + print("=> loaded 
checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + total_steps, exp_flops, exp_l0 = 0, [], [] + cudnn.benchmark = True + + loglike = nn.CrossEntropyLoss() + if torch.cuda.is_available(): + loglike = loglike.cuda() + + # define loss function (criterion) and optimizer + def loss_function(output, target_var, model): + loss = loglike(output, target_var) + total_loss = loss + model.regularization() + if torch.cuda.is_available(): + total_loss = total_loss.cuda() + return total_loss + + for epoch in range(args.start_epoch, args.epochs): + # train for one epoch + train(train_loader, model, loss_function, optimizer, epoch) + # evaluate on validation set + prec1 = validate(val_loader, model, loss_function, epoch) + + # remember best prec@1 and save checkpoint + is_best = prec1 < best_prec1 + best_prec1 = min(prec1, best_prec1) + state = { + 'epoch': epoch + 1, + 'state_dict': model.state_dict(), + 'best_prec1': best_prec1, + 'curr_prec1': prec1, + 'beta_ema': model.beta_ema, + 'optimizer': optimizer.state_dict(), + 'total_steps': total_steps, + 'exp_flops': exp_flops, + 'exp_l0': exp_l0 + } + if model.beta_ema > 0: + state['avg_params'] = model.avg_param + state['steps_ema'] = model.steps_ema + save_checkpoint(state, is_best, args.name + '_AT_%d' % args.attack_iter) + print("Prune rate %.5g" % model.prune_rate()) + print('Arch', model.arch()) + print('Best error: ', best_prec1) + if args.tensorboard: + writer.close() + + +def train(train_loader, model, criterion, optimizer, epoch): + """Train for one epoch on the training set""" + global total_steps, exp_flops, exp_l0, args, writer + attack = pgd_linf_untargeted + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + + # switch to train mode + model.train() + + end = time.time() + for i, (data, label) in enumerate(train_loader): + data_time.update(time.time() - end) + total_steps += 1 + label = label.to(args.device) + data = data.to(args.device) + + model.eval() + delta = attack(model, data, label, epsilon=args.eps, randomize=True, alpha=0.003, num_iter=args.attack_iter) + # compute output + model.train() + output = model(data + delta) + loss = criterion(output, label, model) + + # measure accuracy and record loss + prec1 = accuracy(output.data, label, topk=(1,))[0] + losses.update(loss.item(), data.size(0)) + top1.update(100 - prec1.item(), data.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # clamp the parameters + layers = model.layers if not args.multi_gpu else model.module.layers + for k, layer in enumerate(layers): + layer.constrain_parameters() + + e_fl, e_l0 = model.get_exp_flops_l0() if not args.multi_gpu else \ + model.module.get_exp_flops_l0() + exp_flops.append(e_fl) + exp_l0.append(e_l0) + if writer is not None: + writer.add_scalar('stats_comp/exp_flops', e_fl, total_steps) + writer.add_scalar('stats_comp/exp_l0', e_l0, total_steps) + + if not args.multi_gpu: + if model.beta_ema > 0.: + model.update_ema() + else: + if model.module.beta_ema > 0.: + model.module.update_ema() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + # input() + if i % args.print_freq == 0: + print(' Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Err@1 {top1.val:.3f} 
({top1.avg:.3f})'.format( + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses, top1=top1)) + + # log to TensorBoard + if writer is not None: + writer.add_scalar('train/loss', losses.avg, epoch) + writer.add_scalar('train/err', top1.avg, epoch) + + return top1.avg + + +def validate(val_loader, model, criterion, epoch): + """Perform validation on the validation set""" + global args, writer + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + + # switch to evaluate mode + model.eval() + if not args.multi_gpu: + if model.beta_ema > 0: + old_params = model.get_params() + model.load_ema_params() + else: + if model.module.beta_ema > 0: + old_params = model.module.get_params() + model.module.load_ema_params() + + end = time.time() + with torch.no_grad(): + for i, (data, label) in enumerate(val_loader): + label = label.to(args.device) + data = data.to(args.device) + + # compute output + output = model(data) + loss = criterion(output, label, model) + + # measure accuracy and record loss + prec1 = accuracy(output.data, label, topk=(1,))[0] + losses.update(loss.item(), data.size(0)) + top1.update(100 - prec1.item(), data.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print('Test: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Err@1 {top1.val:.3f} ({top1.avg:.3f})'.format( + i, len(val_loader), batch_time=batch_time, loss=losses, + top1=top1)) + + print(' * Err@1 {top1.avg:.3f}'.format(top1=top1)) + if not args.multi_gpu: + if model.beta_ema > 0: + model.load_params(old_params) + else: + if model.module.beta_ema > 0: + model.module.load_params(old_params) + + # log to TensorBoard + if writer is not None: + writer.add_scalar('val/loss', losses.avg, epoch) + writer.add_scalar('val/err', top1.avg, epoch) + layers = model.layers if not args.multi_gpu else model.module.layers + for k, layer in enumerate(layers): + if hasattr(layer, 'qz_loga'): + mode_z = layer.sample_z(1, sample=0).view(-1) + writer.add_histogram('mode_z/layer{}'.format(k), mode_z.cpu().data.numpy(), epoch) + + return top1.avg + + +if __name__ == '__main__': + main()
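+
+# Example invocation (illustrative only; the flag values below are assumptions,
+# not recommended settings):
+#     python at_lenet5.py --name lenet5 --epochs 100 --eps 0.3 --attack-iter 40
+# --name in {lenet, lenet5, vanilla} trains the dense LeNet5 baseline; any
+# other value (including the default 'L0LeNet5') trains the L0-sparse model.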