From 290306c8d4d3271663d06ee69d03626c1990dfbe Mon Sep 17 00:00:00 2001
From: nicholas-leonard <nick@nikopia.org>
Date: Thu, 2 Jun 2016 11:22:37 -0400
Subject: [PATCH 1/7] fix cpu bug

---
 examples/recurrent-visual-attention.lua | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/recurrent-visual-attention.lua b/examples/recurrent-visual-attention.lua
index eba8282..a7eedf3 100644
--- a/examples/recurrent-visual-attention.lua
+++ b/examples/recurrent-visual-attention.lua
@@ -246,6 +246,8 @@ if opt.cuda then
    require 'cunn'
    cutorch.setDevice(opt.useDevice)
    xp:cuda()
+else
+   xp:float()
 end
 
 xp:verbose(not opt.silent)

From 2612ee23b227f25cb96243f20b1b81dca08ebbbb Mon Sep 17 00:00:00 2001
From: nicholas-leonard <nick@nikopia.org>
Date: Thu, 2 Jun 2016 13:23:38 -0400
Subject: [PATCH 2/7] initial commit for multi-gpu

---
 examples/noise-contrastive-estimate.lua | 130 +++++++++++++++++-------
 1 file changed, 91 insertions(+), 39 deletions(-)

diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua
index c9f618f..3f346f8 100644
--- a/examples/noise-contrastive-estimate.lua
+++ b/examples/noise-contrastive-estimate.lua
@@ -45,7 +45,7 @@ cmd:option('--savepath', paths.concat(dl.SAVE_PATH, 'rnnlm'), 'path to directory
 cmd:option('--id', '', 'id string of this experiment (used to name output file) (defaults to a unique id)')
 cmd:option('--tiny', false, 'use train_tiny.th7 training file')
 cmd:option('--dontsave', false, 'dont save the model')
-cmd:option('--cpulookup', false, 'keep lookuptable on CPU')
+cmd:option('--multigpu', false, 'distribute the model over 4 gpus')
 
 cmd:text()
 local opt = cmd:parse(arg or {})
@@ -91,54 +91,106 @@ end
 --[[ language model ]]--
 
 if not lm then
-   lm = nn.Sequential()
-
-   -- input layer (i.e. word embedding space)
-   local lookup = nn.LookupTableMaskZero(#trainset.ivocab, opt.inputsize)
-   lookup.maxnormout = -1 -- prevent weird maxnormout behaviour
-   if opt.cpulookup then 
-      -- this will be slower but will use up less memory.
-      lookup = nn.DontCast(lookup:float(), false, true)
-   end
-   lm:add(lookup) -- input is seqlen x batchsize
-   if opt.dropout > 0 then
-      lm:add(nn.Dropout(opt.dropout))
-   end
+   if opt.multigpu then
+      lm = nn.Sequential()
+
+      -- input layer (i.e. word embedding space)
+      local concat = nn.Concat(3)
+      for device=1,2 do
+         local inpusize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2)
+         local lookup = nn.LookupTableMaskZero(#trainset.ivocab, inputsize)
+         lookup.maxnormout = -1 -- prevent weird maxnormout behaviour
+         concat:add(nn.GPU(lookup, device)) -- input is seqlen x batchsize
+      end
+      
+      lm:add(nn.GPU(concat, 2))
+      if opt.dropout > 0 then
+         lm:add(nn.GPU(nn.Dropout(opt.dropout), 2))
+      end
+
+      -- rnn layers
+      local inputsize = opt.inputsize
+      for i,hiddensize in ipairs(opt.hiddensize) do
+         -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize))
+         local rnn = nn.SeqLSTM(inputsize, hiddensize)
+         rnn.maskzero = true
+         local device = i < opt.hiddensize/2 and 2 or 3
+         lm:add(nn.GPU(rnn, device))
+         if opt.dropout > 0 then
+            lm:add(nn.GPU(nn.Dropout(opt.dropout), device))
+         end
+         inputsize = hiddensize
+      end
+
+      lm:add(nn.GPU(nn.SplitTable(1), 3))
+
+      -- output layer
+      local unigram = trainset.wordfreq:float()
+      local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z)
 
-   -- rnn layers
-   local inputsize = opt.inputsize
-   for i,hiddensize in ipairs(opt.hiddensize) do
-      -- this is a faster version of nnSequencer(nn.FastLSTM(inpusize, hiddensize))
-      local rnn = nn.SeqLSTM(inputsize, hiddensize)
-      rnn.maskzero = true
-      lm:add(rnn)
+      -- NCE requires {input, target} as inputs
+      lm = nn.Sequential()
+         :add(nn.ParallelTable()
+            :add(lm):add(nn.Identity()))
+         :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...}
+
+      -- encapsulate stepmodule into a Sequencer
+      lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1)))
+
+      -- remember previous state between batches
+      lm:remember()
+
+      if opt.uniform > 0 then
+         for k,param in ipairs(lm:parameters()) do
+            param:uniform(-opt.uniform, opt.uniform)
+         end
+      end
+   else
+      lm = nn.Sequential()
+
+      -- input layer (i.e. word embedding space)
+      local lookup = nn.LookupTableMaskZero(#trainset.ivocab, opt.inputsize)
+      lookup.maxnormout = -1 -- prevent weird maxnormout behaviour
+      lm:add(lookup) -- input is seqlen x batchsize
       if opt.dropout > 0 then
          lm:add(nn.Dropout(opt.dropout))
       end
-      inputsize = hiddensize
-   end
 
-   lm:add(nn.SplitTable(1))
+      -- rnn layers
+      local inputsize = opt.inputsize
+      for i,hiddensize in ipairs(opt.hiddensize) do
+         -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize))
+         local rnn = nn.SeqLSTM(inputsize, hiddensize)
+         rnn.maskzero = true
+         lm:add(rnn)
+         if opt.dropout > 0 then
+            lm:add(nn.Dropout(opt.dropout))
+         end
+         inputsize = hiddensize
+      end
+
+      lm:add(nn.SplitTable(1))
 
-   -- output layer
-   local unigram = trainset.wordfreq:float()
-   local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z)
+      -- output layer
+      local unigram = trainset.wordfreq:float()
+      local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z)
 
-   -- NCE requires {input, target} as inputs
-   lm = nn.Sequential()
-      :add(nn.ParallelTable()
-         :add(lm):add(nn.Identity()))
-      :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...}
+      -- NCE requires {input, target} as inputs
+      lm = nn.Sequential()
+         :add(nn.ParallelTable()
+            :add(lm):add(nn.Identity()))
+         :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...}
 
-   -- encapsulate stepmodule into a Sequencer
-   lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1)))
+      -- encapsulate stepmodule into a Sequencer
+      lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1)))
 
-   -- remember previous state between batches
-   lm:remember()
+      -- remember previous state between batches
+      lm:remember()
 
-   if opt.uniform > 0 then
-      for k,param in ipairs(lm:parameters()) do
-         param:uniform(-opt.uniform, opt.uniform)
+      if opt.uniform > 0 then
+         for k,param in ipairs(lm:parameters()) do
+            param:uniform(-opt.uniform, opt.uniform)
+         end
       end
    end
 end

From 5d6058c5bbaac94b3bc393ac5c83c47decdb4a24 Mon Sep 17 00:00:00 2001
From: nicholas-leonard <nick@nikopia.org>
Date: Fri, 3 Jun 2016 17:05:02 -0400
Subject: [PATCH 3/7] nce --multigpu

---
 examples/noise-contrastive-estimate.lua | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua
index 3f346f8..4d6f3e7 100644
--- a/examples/noise-contrastive-estimate.lua
+++ b/examples/noise-contrastive-estimate.lua
@@ -92,12 +92,14 @@ end
 
 if not lm then
    if opt.multigpu then
+      assert(opt.maxnormout <= 0)
       lm = nn.Sequential()
-
+      lm:add(nn.Convert())
+      
       -- input layer (i.e. word embedding space)
       local concat = nn.Concat(3)
       for device=1,2 do
-         local inpusize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2)
+         local inputsize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2)
          local lookup = nn.LookupTableMaskZero(#trainset.ivocab, inputsize)
          lookup.maxnormout = -1 -- prevent weird maxnormout behaviour
          concat:add(nn.GPU(lookup, device)) -- input is seqlen x batchsize
@@ -114,7 +116,7 @@ if not lm then
          -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize))
          local rnn = nn.SeqLSTM(inputsize, hiddensize)
          rnn.maskzero = true
-         local device = i < opt.hiddensize/2 and 2 or 3
+         local device = 2 -- i < #opt.hiddensize/2 and 1 or 2
          lm:add(nn.GPU(rnn, device))
          if opt.dropout > 0 then
             lm:add(nn.GPU(nn.Dropout(opt.dropout), device))
@@ -126,7 +128,9 @@ if not lm then
 
       -- output layer
       local unigram = trainset.wordfreq:float()
-      local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z)
+      ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z)
+      -- distribute weight, gradWeight and momentum on devices 3 and 4
+      ncemodule:multicuda(3,4) 
 
       -- NCE requires {input, target} as inputs
       lm = nn.Sequential()
@@ -135,7 +139,7 @@ if not lm then
          :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...}
 
       -- encapsulate stepmodule into a Sequencer
-      lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1)))
+      lm:add(nn.GPU(nn.Sequencer(nn.MaskZero(ncemodule, 1)), 1, opt.device))
 
       -- remember previous state between batches
       lm:remember()
@@ -347,8 +351,8 @@ while opt.maxepoch <= 0 or epoch <= opt.maxepoch do
       xplog.minvalnceloss = nceloss
       xplog.epoch = epoch 
       local filename = paths.concat(opt.savepath, opt.id..'.t7')
-      print("Found new minima. Saving to "..filename)
       if not opt.dontsave then
+         print("Found new minima. Saving to "..filename)
          torch.save(filename, xplog)
       end
       ntrial = 0

From 08c0706ce8f88b17ae50f591e2de79138f45de46 Mon Sep 17 00:00:00 2001
From: nicholas-leonard <nick@nikopia.org>
Date: Tue, 14 Jun 2016 10:22:52 -0400
Subject: [PATCH 4/7] refactored multigpu nce

---
 examples/multigpu-nce-rnnlm.lua         | 317 ++++++++++++++++++++++++
 examples/noise-contrastive-estimate.lua | 137 +++-------
 test/test.lua                           | 145 +++++++++++
 3 files changed, 501 insertions(+), 98 deletions(-)
 create mode 100644 examples/multigpu-nce-rnnlm.lua

diff --git a/examples/multigpu-nce-rnnlm.lua b/examples/multigpu-nce-rnnlm.lua
new file mode 100644
index 0000000..5dc42e8
--- /dev/null
+++ b/examples/multigpu-nce-rnnlm.lua
@@ -0,0 +1,317 @@
+require 'paths'
+require 'rnn'
+require 'nngraph'
+local dl = require 'dataload'
+assert(nn.NCEModule and nn.NCEModule.version and nn.NCEModule.version >= 4, "update dpnn : luarocks install dpnn")
+require 'cunn'
+
+--[[ command line arguments (declared on a script-local CmdLine, parsed into opt) ]]--
+local cmd = torch.CmdLine()
+cmd:text()
+cmd:text('Train a Language Model using stacked LSTM on Google Billion Words dataset')
+cmd:text('Example:')
+cmd:text("th examples/multigpu-nce-rnnlm.lua --progress --earlystop 50 --device 2 --seqlen 20 --hiddensize '{200,200}' --batchsize 20 --startlr 1 --uniform 0.1 --cutoff 5 --schedule '{[5]=0.5,[6]=0.25,[7]=0.125,[8]=0.0625,[9]=0.03125,[10]=0.015625,[11]=0.0078125,[12]=0.00390625}'")
+cmd:text("th examples/multigpu-nce-rnnlm.lua --trainsize 400000 --validsize 40000 --cutoff 10 --batchsize 128 --seqlen 100 --hiddensize '{250,250}' --progress --device 2")
+cmd:text("th scripts/evaluate-rnnlm.lua --xplogpath /data/save/rnnlm/ptb:atlas:1458081269:1.t7 --cuda")
+cmd:text('Options:')
+-- training
+cmd:option('--startlr', 0.05, 'learning rate at t=0')
+cmd:option('--minlr', 0.00001, 'minimum learning rate')
+cmd:option('--saturate', 400, 'epoch at which linear decayed LR will reach minlr')
+cmd:option('--schedule', '', 'learning rate schedule. e.g. {[5] = 0.004, [6] = 0.001}')
+cmd:option('--momentum', 0.9, 'momentum')
+cmd:option('--maxnormout', -1, 'max l2-norm of each layer\'s output neuron weights')
+cmd:option('--cutoff', -1, 'max l2-norm of concatenation of all gradParam tensors')
+cmd:option('--device', 1, 'sets the device (GPU) to use')
+cmd:option('--profile', false, 'profile updateOutput,updateGradInput and accGradParameters in Sequential')
+cmd:option('--maxepoch', 1000, 'maximum number of epochs to run')
+cmd:option('--earlystop', 50, 'maximum number of epochs to wait to find a better local minima for early-stopping')
+cmd:option('--progress', false, 'print progress bar')
+cmd:option('--silent', false, 'don\'t print anything to stdout')
+cmd:option('--uniform', 0.1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization')
+cmd:option('--k', 25, 'how many noise samples to use for NCE')
+cmd:option('--continue', '', 'path to model for which training should be continued. Note that current options (except for device, cuda and tiny) will be ignored.')
+cmd:option('--Z', -1, 'normalization constant for NCE module (-1 approximates it from first batch).')
+cmd:option('--rownoise', false, 'sample k noise samples for each row for NCE module')
+-- rnn layer 
+cmd:option('--seqlen', 50, 'sequence length : back-propagate through time (BPTT) for this many time-steps')
+cmd:option('--inputsize', -1, 'size of lookup table embeddings. -1 defaults to hiddensize[1]')
+cmd:option('--hiddensize', '{200,200}', 'number of hidden units used at output of each recurrent layer. When more than one is specified, RNN/LSTMs/GRUs are stacked')
+cmd:option('--dropout', 0, 'apply dropout with this probability after each rnn layer. dropout <= 0 disables it.')
+-- data
+cmd:option('--batchsize', 32, 'number of examples per batch')
+cmd:option('--trainsize', -1, 'number of train time-steps seen between each epoch')
+cmd:option('--validsize', -1, 'number of valid time-steps used for early stopping and cross-validation') 
+cmd:option('--savepath', paths.concat(dl.SAVE_PATH, 'rnnlm'), 'path to directory where experiment log (includes model) will be saved')
+cmd:option('--id', '', 'id string of this experiment (used to name output file) (defaults to a unique id)')
+cmd:option('--tiny', false, 'use train_tiny.th7 training file')
+cmd:option('--dontsave', false, 'dont save the model')
+
+cmd:text()
+local opt = cmd:parse(arg or {}) -- opt holds one field per option declared above
+opt.hiddensize = loadstring(" return "..opt.hiddensize)() -- Lua table literal string, e.g. '{200,200}' -> {200,200}
+opt.schedule = loadstring(" return "..opt.schedule)() -- the default '' evaluates to nil, i.e. no schedule
+opt.inputsize = opt.inputsize == -1 and opt.hiddensize[1] or opt.inputsize
+if not opt.silent then
+   table.print(opt)
+end
+opt.id = opt.id == '' and ('gbw' .. ':' .. dl.uniqueid()) or opt.id
+opt.version = 1
+opt.cuda = true -- no --cuda flag exists in this script; without this the opt.cuda-guarded casts below never run
+cutorch.setDevice(opt.device)
+
+local xplog, lm, criterion, targetmodule -- filled in here when --continue is given; otherwise left nil for the code below
+if opt.continue ~= '' then
+   xplog = torch.load(opt.continue)
+   xplog.opt.cuda = true
+   xplog.opt.device = opt.device
+   xplog.opt.tiny = opt.tiny
+   opt = xplog.opt -- from here on the saved options replace the ones parsed from the command line
+   lm = xplog.model.module
+   -- prevent re-casting bug: clear the __input field of every LookupTableMaskZero in the restored model
+   for i,lookup in ipairs(lm:findModules('nn.LookupTableMaskZero')) do
+      lookup.__input = nil
+   end
+   criterion = xplog.criterion
+   targetmodule = xplog.targetmodule
+   assert(opt)
+end
+
+--[[ data set: train/valid/test splits from dl.loadGBW, each loaded with opt.batchsize ]]--
+
+local trainset, validset, testset = dl.loadGBW({opt.batchsize,opt.batchsize,opt.batchsize}, opt.tiny and 'train_tiny.th7' or nil)
+if not opt.silent then 
+   print("Vocabulary size : "..#trainset.ivocab) 
+   print("Train set split into "..opt.batchsize.." sequences of length "..trainset:size())
+end
+
+--[[ language model (built from scratch unless one was restored via --continue) ]]--
+
+if not lm then
+   assert(opt.maxnormout <= 0, "the multi-GPU model requires --maxnormout <= 0")
+   lm = nn.Sequential()
+   lm:add(nn.Convert())
+   
+   -- input layer (i.e. word embedding space), split into two lookup tables on devices 1 and 2
+   local concat = nn.Concat(3) -- the two embedding halves are joined along dimension 3
+   for device=1,2 do
+      local inputsize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2)
+      local lookup = nn.LookupTableMaskZero(#trainset.ivocab, inputsize)
+      lookup.maxnormout = -1 -- prevent weird maxnormout behaviour
+      concat:add(nn.GPU(lookup, device)) -- input is seqlen x batchsize
+   end
+   
+   lm:add(nn.GPU(concat, 2))
+   if opt.dropout > 0 then
+      lm:add(nn.GPU(nn.Dropout(opt.dropout), 2))
+   end
+
+   -- rnn layers
+   local inputsize = opt.inputsize
+   for i,hiddensize in ipairs(opt.hiddensize) do
+      -- this is a faster version of nn.Sequencer(nn.FastLSTM(inputsize, hiddensize))
+      local rnn = nn.SeqLSTM(inputsize, hiddensize)
+      rnn.maskzero = true
+      local device = 2 -- i < #opt.hiddensize/2 and 1 or 2
+      lm:add(nn.GPU(rnn, device))
+      if opt.dropout > 0 then
+         lm:add(nn.GPU(nn.Dropout(opt.dropout), device))
+      end
+      inputsize = hiddensize -- the next layer consumes this layer's output size
+   end
+
+   lm:add(nn.GPU(nn.SplitTable(1), 3))
+
+   -- output layer
+   local unigram = trainset.wordfreq:float()
+   local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) -- local: only used inside this block
+   ncemodule.batchnoise = not opt.rownoise
+   -- distribute weight, gradWeight and momentum on devices 3 and 4
+   ncemodule:multicuda(3,4) 
+
+   -- NCE requires {input, target} as inputs
+   lm = nn.Sequential()
+      :add(nn.ParallelTable()
+         :add(lm):add(nn.Identity()))
+      :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...}
+
+   -- encapsulate stepmodule into a Sequencer
+   lm:add(nn.GPU(nn.Sequencer(nn.MaskZero(ncemodule, 1)), 1, opt.device))
+   
+   -- remember previous state between batches
+   lm:remember()
+
+   if opt.uniform > 0 then
+      for k,param in ipairs(lm:parameters()) do
+         param:uniform(-opt.uniform, opt.uniform)
+      end
+   end
+end
+
+if opt.profile then -- set by the --profile flag
+   lm:profile()
+end
+
+if not opt.silent then -- --silent suppresses the model dump
+   print"Language Model:"
+   print(lm)
+end
+
+if not (criterion and targetmodule) then -- both are restored from the log on --continue; otherwise built here
+   --[[ loss function ]]--
+
+   local crit = nn.MaskZeroCriterion(nn.NCECriterion(), 0)
+
+   -- target is also seqlen x batchsize.
+   targetmodule = nn.SplitTable(1)
+   if opt.cuda then -- NOTE(review): opt.cuda is not a declared command-line option here; it has to be set by other code
+      targetmodule = nn.Sequential()
+         :add(nn.Convert())
+         :add(targetmodule)
+   end
+    
+   criterion = nn.SequencerCriterion(crit)
+end
+
+--[[ CUDA ]]--
+
+if opt.cuda then -- cast model, criterion and target module together so they agree on tensor type
+   lm:cuda()
+   criterion:cuda()
+   targetmodule:cuda()
+end
+
+--[[ experiment log ]]--
+
+-- is saved to file every time a new validation minima is found
+if not xplog then -- fresh run: nothing was loaded through --continue
+   xplog = {}
+   xplog.opt = opt -- save all hyper-parameters and such
+   xplog.dataset = 'GoogleBillionWords'
+   xplog.vocab = trainset.vocab
+   -- will only serialize params
+   xplog.model = nn.Serial(lm)
+   xplog.model:mediumSerial()
+   xplog.criterion = criterion
+   xplog.targetmodule = targetmodule
+   -- keep a log of NLL for each epoch
+   xplog.trainnceloss = {}
+   xplog.valnceloss = {}
+   -- will be used for early-stopping
+   xplog.minvalnceloss = 99999999
+   xplog.epoch = 0
+   paths.mkdir(opt.savepath)
+end
+local ntrial = 0 -- epochs since the last validation improvement; compared against opt.earlystop in the loop
+
+local epoch = xplog.epoch+1 -- 1 for a fresh log, otherwise one past the epoch recorded in the loaded log
+opt.lr = opt.lr or opt.startlr -- keep an lr already present in opt, else start from --startlr
+opt.trainsize = opt.trainsize == -1 and trainset:size() or opt.trainsize -- -1 means the whole training set
+opt.validsize = opt.validsize == -1 and validset:size() or opt.validsize -- -1 means the whole validation set
+while opt.maxepoch <= 0 or epoch <= opt.maxepoch do -- maxepoch <= 0 means no epoch limit
+   print("")
+   print("Epoch #"..epoch.." :")
+
+   -- 1. training
+   
+   local epochTimer = torch.Timer() -- times the whole training pass for the words/second report
+   lm:training()
+   local sumErr = 0
+   for i, inputs, targets in trainset:subiter(opt.seqlen, opt.trainsize) do
+      targets = targetmodule:forward(targets)
+      inputs = {inputs, targets}
+      -- forward
+      local outputs = lm:forward(inputs)
+      local err = criterion:forward(outputs, targets)
+      sumErr = sumErr + err
+      -- backward 
+      local gradOutputs = criterion:backward(outputs, targets)
+      -- reset parameter gradients before back-propagating this batch
+      lm:zeroGradParameters()
+      lm:backward(inputs, gradOutputs)
+      
+      -- update
+      if opt.cutoff > 0 then
+         local norm = lm:gradParamClip(opt.cutoff) -- affects gradParams
+         opt.meanNorm = opt.meanNorm and (opt.meanNorm*0.9 + norm*0.1) or norm
+      end
+      lm:updateGradParameters(opt.momentum) -- affects gradParams
+      lm:updateParameters(opt.lr) -- affects params
+      lm:maxParamNorm(opt.maxnormout) -- affects params
+
+      if opt.progress then
+         xlua.progress(i, opt.trainsize)
+      end
+
+      if i % 2000 == 0 then
+         collectgarbage()
+      end
+
+   end
+   
+   -- learning rate decay
+   if opt.schedule then
+      opt.lr = opt.schedule[epoch] or opt.lr
+   else
+      opt.lr = opt.lr + (opt.minlr - opt.startlr)/opt.saturate
+   end
+   opt.lr = math.max(opt.minlr, opt.lr)
+   
+   if not opt.silent then
+      print("learning rate", opt.lr)
+      if opt.meanNorm then
+         print("mean gradParam norm", opt.meanNorm)
+      end
+   end
+
+   if cutorch then cutorch.synchronize() end
+   local speed = opt.trainsize*opt.batchsize/epochTimer:time().real
+   print(string.format("Speed : %f words/second; %f ms/word", speed, 1000/speed))
+
+   local nceloss = sumErr/opt.trainsize
+   print("Training error : "..nceloss)
+
+   xplog.trainnceloss[epoch] = nceloss
+
+   -- 2. cross-validation
+
+   lm:evaluate()
+   sumErr = 0 -- reuse the accumulator for the validation pass
+   for i, inputs, targets in validset:subiter(opt.seqlen, opt.validsize) do
+      targets = targetmodule:forward(targets)
+      local outputs = lm:forward{inputs, targets}
+      local err = criterion:forward(outputs, targets)
+      sumErr = sumErr + err
+      
+      if opt.progress then
+         xlua.progress(i, opt.validsize)
+      end
+   end
+
+   nceloss = sumErr/opt.validsize -- from here on nceloss is the validation loss
+   print("Validation error : "..nceloss)
+
+   xplog.valnceloss[epoch] = nceloss
+   ntrial = ntrial + 1
+
+   -- early-stopping
+   if nceloss < xplog.minvalnceloss then
+      -- save best version of model
+      xplog.minvalnceloss = nceloss
+      xplog.epoch = epoch 
+      local filename = paths.concat(opt.savepath, opt.id..'.t7')
+      if not opt.dontsave then
+         print("Found new minima. Saving to "..filename)
+         torch.save(filename, xplog)
+      end
+      ntrial = 0
+   elseif ntrial >= opt.earlystop then
+      print("No new minima found after "..ntrial.." epochs.")
+      print("Stopping experiment.")
+      print("Best model can be found in "..paths.concat(opt.savepath, opt.id..'.t7'))
+      os.exit()
+   end
+
+   collectgarbage()
+   epoch = epoch + 1
+end
diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua
index 4d6f3e7..ab2a2c7 100644
--- a/examples/noise-contrastive-estimate.lua
+++ b/examples/noise-contrastive-estimate.lua
@@ -2,7 +2,7 @@ require 'paths'
 require 'rnn'
 require 'nngraph'
 local dl = require 'dataload'
-assert(nn.NCEModule and nn.NCEModule.version and nn.NCEModule.version >= 3, "update dpnn : luarocks install dpnn")
+assert(nn.NCEModule and nn.NCEModule.version and nn.NCEModule.version >= 4, "update dpnn : luarocks install dpnn")
 
 --[[ command line arguments ]]--
 cmd = torch.CmdLine()
@@ -32,6 +32,7 @@ cmd:option('--uniform', 0.1, 'initialize parameters using uniform distribution b
 cmd:option('--k', 25, 'how many noise samples to use for NCE')
 cmd:option('--continue', '', 'path to model for which training should be continued. Note that current options (except for device, cuda and tiny) will be ignored.')
 cmd:option('--Z', -1, 'normalization constant for NCE module (-1 approximates it from first batch).')
+cmd:option('--rownoise', false, 'sample k noise samples for each row for NCE module')
 -- rnn layer 
 cmd:option('--seqlen', 50, 'sequence length : back-propagate through time (BPTT) for this many time-steps')
 cmd:option('--inputsize', -1, 'size of lookup table embeddings. -1 defaults to hiddensize[1]')
@@ -45,7 +46,6 @@ cmd:option('--savepath', paths.concat(dl.SAVE_PATH, 'rnnlm'), 'path to directory
 cmd:option('--id', '', 'id string of this experiment (used to name output file) (defaults to a unique id)')
 cmd:option('--tiny', false, 'use train_tiny.th7 training file')
 cmd:option('--dontsave', false, 'dont save the model')
-cmd:option('--multigpu', false, 'distribute the model over 4 gpus')
 
 cmd:text()
 local opt = cmd:parse(arg or {})
@@ -56,7 +56,7 @@ if not opt.silent then
    table.print(opt)
 end
 opt.id = opt.id == '' and ('gbw' .. ':' .. dl.uniqueid()) or opt.id
-opt.version = 4
+opt.version = 5 -- refactored multigpu into its own file
 
 if opt.cuda then -- do this before building model to prevent segfault
    require 'cunn' 
@@ -91,110 +91,51 @@ end
 --[[ language model ]]--
 
 if not lm then
-   if opt.multigpu then
-      assert(opt.maxnormout <= 0)
-      lm = nn.Sequential()
-      lm:add(nn.Convert())
-      
-      -- input layer (i.e. word embedding space)
-      local concat = nn.Concat(3)
-      for device=1,2 do
-         local inputsize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2)
-         local lookup = nn.LookupTableMaskZero(#trainset.ivocab, inputsize)
-         lookup.maxnormout = -1 -- prevent weird maxnormout behaviour
-         concat:add(nn.GPU(lookup, device)) -- input is seqlen x batchsize
-      end
-      
-      lm:add(nn.GPU(concat, 2))
-      if opt.dropout > 0 then
-         lm:add(nn.GPU(nn.Dropout(opt.dropout), 2))
-      end
-
-      -- rnn layers
-      local inputsize = opt.inputsize
-      for i,hiddensize in ipairs(opt.hiddensize) do
-         -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize))
-         local rnn = nn.SeqLSTM(inputsize, hiddensize)
-         rnn.maskzero = true
-         local device = 2 -- i < #opt.hiddensize/2 and 1 or 2
-         lm:add(nn.GPU(rnn, device))
-         if opt.dropout > 0 then
-            lm:add(nn.GPU(nn.Dropout(opt.dropout), device))
-         end
-         inputsize = hiddensize
-      end
-
-      lm:add(nn.GPU(nn.SplitTable(1), 3))
-
-      -- output layer
-      local unigram = trainset.wordfreq:float()
-      ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z)
-      -- distribute weight, gradWeight and momentum on devices 3 and 4
-      ncemodule:multicuda(3,4) 
-
-      -- NCE requires {input, target} as inputs
-      lm = nn.Sequential()
-         :add(nn.ParallelTable()
-            :add(lm):add(nn.Identity()))
-         :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...}
-
-      -- encapsulate stepmodule into a Sequencer
-      lm:add(nn.GPU(nn.Sequencer(nn.MaskZero(ncemodule, 1)), 1, opt.device))
-
-      -- remember previous state between batches
-      lm:remember()
-
-      if opt.uniform > 0 then
-         for k,param in ipairs(lm:parameters()) do
-            param:uniform(-opt.uniform, opt.uniform)
-         end
-      end
-   else
-      lm = nn.Sequential()
+   lm = nn.Sequential()
+
+   -- input layer (i.e. word embedding space)
+   local lookup = nn.LookupTableMaskZero(#trainset.ivocab, opt.inputsize)
+   lookup.maxnormout = -1 -- prevent weird maxnormout behaviour
+   lm:add(lookup) -- input is seqlen x batchsize
+   if opt.dropout > 0 then
+      lm:add(nn.Dropout(opt.dropout))
+   end
 
-      -- input layer (i.e. word embedding space)
-      local lookup = nn.LookupTableMaskZero(#trainset.ivocab, opt.inputsize)
-      lookup.maxnormout = -1 -- prevent weird maxnormout behaviour
-      lm:add(lookup) -- input is seqlen x batchsize
+   -- rnn layers
+   local inputsize = opt.inputsize
+   for i,hiddensize in ipairs(opt.hiddensize) do
+      -- this is a faster version of nn.Sequencer(nn.FastLSTM(inputsize, hiddensize))
+      local rnn = nn.SeqLSTM(inputsize, hiddensize)
+      rnn.maskzero = true
+      lm:add(rnn)
       if opt.dropout > 0 then
          lm:add(nn.Dropout(opt.dropout))
       end
+      inputsize = hiddensize
+   end
 
-      -- rnn layers
-      local inputsize = opt.inputsize
-      for i,hiddensize in ipairs(opt.hiddensize) do
-         -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize))
-         local rnn = nn.SeqLSTM(inputsize, hiddensize)
-         rnn.maskzero = true
-         lm:add(rnn)
-         if opt.dropout > 0 then
-            lm:add(nn.Dropout(opt.dropout))
-         end
-         inputsize = hiddensize
-      end
-
-      lm:add(nn.SplitTable(1))
-
-      -- output layer
-      local unigram = trainset.wordfreq:float()
-      local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z)
+   lm:add(nn.SplitTable(1))
 
-      -- NCE requires {input, target} as inputs
-      lm = nn.Sequential()
-         :add(nn.ParallelTable()
-            :add(lm):add(nn.Identity()))
-         :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...}
+   -- output layer
+   local unigram = trainset.wordfreq:float()
+   local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z)
+   ncemodule.batchnoise = not opt.rownoise
 
-      -- encapsulate stepmodule into a Sequencer
-      lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1)))
+   -- NCE requires {input, target} as inputs
+   lm = nn.Sequential()
+      :add(nn.ParallelTable()
+         :add(lm):add(nn.Identity()))
+      :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...}
 
-      -- remember previous state between batches
-      lm:remember()
+   -- encapsulate stepmodule into a Sequencer
+   lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1)))
+   
+   -- remember previous state between batches
+   lm:remember()
 
-      if opt.uniform > 0 then
-         for k,param in ipairs(lm:parameters()) do
-            param:uniform(-opt.uniform, opt.uniform)
-         end
+   if opt.uniform > 0 then
+      for k,param in ipairs(lm:parameters()) do
+         param:uniform(-opt.uniform, opt.uniform)
       end
    end
 end
diff --git a/test/test.lua b/test/test.lua
index 2c4c01c..05b678b 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -6400,6 +6400,151 @@ function rnntest.FastLSTM_batchNorm()
    nn.FastLSTM.bn = false
 end
 
+function rnntest.inplaceBackward() -- compares inplaceBackward() clones against the regular backward + updateParameters path
+   local lr = 0.1 -- step size: given to updateParameters(lr) and, negated, as third argument of backward
+   local seqlen, batchsize, hiddensize = 3, 4, 5
+   local input = torch.randn(seqlen, batchsize, hiddensize)
+   local gradOutput = torch.randn(seqlen, batchsize, hiddensize)
+   -- each section: clone a module, call inplaceBackward() on the clone, then compare the clone to the original
+   -- test sequencer(linear)
+   
+   local seq = nn.Sequencer(nn.Linear(hiddensize, hiddensize))
+   local seq2 = seq:clone()
+   seq2:inplaceBackward()
+   -- forward outputs of original and clone are asserted equal before any backward/update is run
+   local output = seq:forward(input)
+   local output2 = seq2:forward(input)
+   
+   mytester:assertTensorEq(output, output2, 0.000001)
+   -- reference path: zero the gradients, backward, then an explicit parameter update with lr
+   seq:zeroGradParameters()
+   local gradInput = seq:backward(input, gradOutput)
+   seq:updateParameters(lr)
+   -- path under test: a single backward call that receives -lr as its third argument
+   local gradInput2 = seq2:backward(input, gradOutput, -lr)
+   
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001)
+   -- after this one step both modules are asserted to hold the same parameter values
+   local params = seq:parameters()
+   local params2 = seq2:parameters()
+   
+   for i=1,#params do
+      mytester:assertTensorEq(params[i], params2[i], 0.000001)
+   end
+   
+   -- test seqlstm (same procedure as above, with nn.SeqLSTM as the module)
+   
+   local seq = nn.SeqLSTM(hiddensize, hiddensize)
+   local seq2 = seq:clone()
+   seq2:inplaceBackward()
+   
+   local output = seq:forward(input)
+   local output2 = seq2:forward(input)
+   
+   mytester:assertTensorEq(output, output2, 0.000001)
+   -- reference path (zeroGradParameters, backward, updateParameters) vs. single backward(..., -lr)
+   seq:zeroGradParameters()
+   local gradInput = seq:backward(input, gradOutput)
+   seq:updateParameters(lr)
+   
+   local gradInput2 = seq2:backward(input, gradOutput, -lr)
+   
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001)
+   
+   local params = seq:parameters()
+   local params2 = seq2:parameters()
+   
+   for i=1,#params do
+      mytester:assertTensorEq(params[i], params2[i], 0.000001)
+   end
+   
+   -- NOTE(review): the unconditional return below means nothing after it in this function ever runs
+   if true then return end
+   -- test language model
+   -- (unreachable while the early return above is in place)
+   local vocabsize = 100
+   local input = torch.LongTensor(seqlen, batchsize):random(1,vocabsize)
+   local target = torch.LongTensor(seqlen, batchsize):random(1,vocabsize)
+   
+   local lm = nn.Sequential()
+   local lookup = nn.LookupTableMaskZero(vocabsize, hiddensize)
+   lm:add(lookup)
+
+   for i=1,2 do
+      local rnn = nn.SeqLSTM(hiddensize, hiddensize)
+      rnn.maskzero = true
+      lm:add(rnn)
+   end
+
+   lm:add(nn.SplitTable(1))
+
+   local unigram = torch.FloatTensor(vocabsize):uniform(1,10)
+   local ncemodule = nn.NCEModule(hiddensize, vocabsize, 10, unigram, -1)
+   local _sampleidx = torch.Tensor(1,10):random(1,vocabsize)
+   -- override noiseSample so every call copies in the same fixed indices held in _sampleidx
+   function ncemodule.noiseSample(self, sampleidx, batchsize, k)
+      assert(batchsize == 1)
+      assert(k == 10)
+      sampleidx:resize(1, k):copy(_sampleidx)
+      return sampleidx
+   end
+
+   lm = nn.Sequential()
+      :add(nn.ParallelTable()
+         :add(lm):add(nn.Identity()))
+      :add(nn.ZipTable()) 
+
+   lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1)))
+   lm:remember()
+   
+   local crit = nn.MaskZeroCriterion(nn.NCECriterion(), 0)
+   local targetmodule = nn.SplitTable(1)
+   local criterion = nn.SequencerCriterion(crit)
+   -- lm2 is the clone switched to inplaceBackward(); criterion2 is a plain clone of the criterion
+   local lm2 = lm:clone()
+   lm2:inplaceBackward()
+   
+   local criterion2 = criterion:clone()
+   
+   local target = targetmodule:forward(target)
+   
+   local inputTable = {input, target}
+   
+   local output = lm:forward(inputTable)
+   local output2 = lm2:forward(inputTable)
+   -- each time-step's output is indexed as a table; its entries 1..4 are compared one by one
+   for i=1,seqlen do
+      mytester:assertTensorEq(output[i][1], output2[i][1], 0.000001)
+      mytester:assertTensorEq(output[i][2], output2[i][2], 0.000001)
+      mytester:assertTensorEq(output[i][3], output2[i][3], 0.000001)
+      mytester:assertTensorEq(output[i][4], output2[i][4], 0.000001)
+   end
+   -- NOTE(review): criterion2 is fed lm's output below (forward and backward), not output2; confirm this is intended
+   local loss = criterion:forward(output, target)
+   local loss2 = criterion2:forward(output, target)
+   
+   local gradOutput = criterion:backward(output, target)
+   local gradOutput2 = criterion2:backward(output, target)
+   
+   for i=1,seqlen do
+      mytester:assertTensorEq(gradOutput[i][1], gradOutput2[i][1], 0.000001)
+      mytester:assertTensorEq(gradOutput[i][2], gradOutput2[i][2], 0.000001)
+   end
+   -- same two update paths as in the sections above, now applied to the whole language model
+   lm:zeroGradParameters()
+   lm:backward(inputTable, gradOutput)
+   lm:updateParameters(lr)
+   
+   lm2:backward(inputTable, gradOutput2, -lr)
+   
+   local params = lm:parameters()
+   local params2 = lm2:parameters()
+   
+   for i=1,#params do
+      mytester:assertTensorEq(params[i], params2[i], 0.000001, "error in params "..i..": "..tostring(params[i]:size()))
+   end
+end
+
 function rnn.test(tests, benchmark_)
    mytester = torch.Tester()
    benchmark = benchmark_

From f7a7fd7e0240f300ea247931b7ab70a546ec98ed Mon Sep 17 00:00:00 2001
From: nicholas-leonard <nick@nikopia.org>
Date: Tue, 21 Jun 2016 15:06:51 -0400
Subject: [PATCH 5/7] small fixes

---
 CMakeLists.txt                        |  2 +-
 examples/multigpu-nce-rnnlm.lua       |  2 +-
 examples/recurrent-language-model.lua |  1 -
 init.lua                              |  1 +
 scripts/evaluate-rnnlm.lua            | 15 +++++++++++++--
 test/test.lua                         | 12 +++++-------
 6 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7b99715..6ca16ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,5 +10,5 @@ FIND_PACKAGE(Torch REQUIRED)
 
 SET(src)
 FILE(GLOB luasrc *.lua)
-SET(luasrc ${luasrc} test/test.lua test/mnistsample.t7)
+SET(luasrc ${luasrc} test/test.lua test/mnistsample.t7 test/bigtest.lua)
 ADD_TORCH_PACKAGE(rnn "${src}" "${luasrc}" "Recurrent Neural Networks")
diff --git a/examples/multigpu-nce-rnnlm.lua b/examples/multigpu-nce-rnnlm.lua
index 5dc42e8..bd088b5 100644
--- a/examples/multigpu-nce-rnnlm.lua
+++ b/examples/multigpu-nce-rnnlm.lua
@@ -8,7 +8,7 @@ require 'cunn'
 --[[ command line arguments ]]--
 cmd = torch.CmdLine()
 cmd:text()
-cmd:text('Train a Language Model using stacked LSTM on Google Billion Words dataset')
+cmd:text('Train a multi-GPU Language Model using stacked LSTM on Google Billion Words dataset')
 cmd:text('Example:')
 cmd:text("th examples/multigpu-nce-rnnlm.lua --progress --earlystop 50 --device 2 --seqlen 20 --hiddensize '{200,200}' --batchsize 20 --startlr 1 --uniform 0.1 --cutoff 5 --schedule '{[5]=0.5,[6]=0.25,[7]=0.125,[8]=0.0625,[9]=0.03125,[10]=0.015625,[11]=0.0078125,[12]=0.00390625}'")
 cmd:text("th examples/multigpu-nce-rnnlm.lua.lua --trainsize 400000 --validsize 40000 --cutoff 10 --batchsize 128 --seqlen 100 --hiddensize '{250,250}' --progress --device 2")
diff --git a/examples/recurrent-language-model.lua b/examples/recurrent-language-model.lua
index 635fa74..60c5aac 100644
--- a/examples/recurrent-language-model.lua
+++ b/examples/recurrent-language-model.lua
@@ -260,7 +260,6 @@ while opt.maxepoch <= 0 or epoch <= opt.maxepoch do
    -- Note :
    -- Perplexity = exp( sum ( NLL ) / #w)
    -- Bits Per Word = log2(Perplexity)
-   -- Bits per Char = BPW * (#w / #c)
    print("Validation PPL : "..ppl)
 
    xplog.valppl[epoch] = ppl
diff --git a/init.lua b/init.lua
index 02a4f2d..c210f91 100644
--- a/init.lua
+++ b/init.lua
@@ -19,6 +19,7 @@ torch.include('rnn', 'Dropout.lua')
 
 -- for testing:
 torch.include('rnn', 'test.lua')
+torch.include('rnn', 'bigtest.lua')
 
 -- support modules
 torch.include('rnn', 'ZeroGrad.lua')
diff --git a/scripts/evaluate-rnnlm.lua b/scripts/evaluate-rnnlm.lua
index 1c3d434..8228847 100644
--- a/scripts/evaluate-rnnlm.lua
+++ b/scripts/evaluate-rnnlm.lua
@@ -116,17 +116,28 @@ if opt.nsample > 0 then
       print(table.concat(sampletext, ' '))
    end
 else
-   local sumErr = 0
+   local sumErr, count = 0, 0
    
    for i, inputs, targets in testset:subiter(xplog.opt.seqlen or 100) do
+      inputs:apply(function(x)
+         if x > 0 then
+            count = count + 1
+         end
+      end)
       local targets = targetmodule:forward(targets)
       local inputs = opt.nce and {inputs, targets} or inputs
       local outputs = lm:forward(inputs)
       local err = criterion:forward(outputs, targets)
       sumErr = sumErr + err
    end
+   
+   if count ~= testset:size() then
+      local meanseqlen = testset:size()/(testset:size() - count)
+      print("mean sequence length : "..meanseqlen)
+      print("Old (wrong) Test PPL : "..torch.exp(sumErr/testset:size()))
+   end
 
-   local ppl = torch.exp(sumErr/testset:size())
+   local ppl = torch.exp(sumErr/count)
    print("Test PPL : "..ppl)
 end
 
diff --git a/test/test.lua b/test/test.lua
index 05b678b..5c84b82 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -2647,13 +2647,11 @@ function rnntest.SequencerCriterion()
       sc:cuda()
    
       local gradInput4 = {}
-      for i=1,nStep do
-         input[i] = input[i]:cuda()
-         target[i] = target[i]:cuda()
-      end
+      input = input:cuda()
+      target = target:cuda()
       
       local err4 = sc:forward(input, target)
-      mytester:assert(math.abs(err - err4) < 0.000001, "SequencerCriterion forward cuda err") 
+      mytester:assert(math.abs(errTensorInput - err4) < 0.000001, "SequencerCriterion forward cuda err") 
       local gradInput4 = sc:backward(input, target)
       for i=1,nStep do
          mytester:assertTensorEq(gradInput4[i]:float(), gradInput3[i], 0.000001, "SequencerCriterion backward cuda err "..i)
@@ -5891,7 +5889,7 @@ function rnntest.NCE_MaskZero()
       batchsize = 4,
       seqlen = 5,
       uniform = 0.1,
-      hiddensize = {10},
+      hiddensize = {100},
       vocabsize = 100,
       dropout = 0,
       k = 25
@@ -5989,7 +5987,7 @@ function rnntest.NCE_MaskZero()
       end
    end
    mytester:assert(found)
-   mytester:assert(err < starterr)
+   mytester:assert(err < starterr, string.format("err=%f should be smaller than starterr=%f", err, starterr))
 end
 
 local function check_size(x, dims)

From 539f0cb6740e92661f2105e33469830c0d62c27a Mon Sep 17 00:00:00 2001
From: nicholas-leonard <nick@nikopia.org>
Date: Wed, 22 Jun 2016 16:32:59 -0400
Subject: [PATCH 6/7] multigpu runs again

---
 examples/multigpu-nce-rnnlm.lua         | 60 ++++++++++++-------------
 examples/noise-contrastive-estimate.lua | 17 +++----
 2 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/examples/multigpu-nce-rnnlm.lua b/examples/multigpu-nce-rnnlm.lua
index bd088b5..ebb2777 100644
--- a/examples/multigpu-nce-rnnlm.lua
+++ b/examples/multigpu-nce-rnnlm.lua
@@ -15,13 +15,13 @@ cmd:text("th examples/multigpu-nce-rnnlm.lua.lua --trainsize 400000 --validsize
 cmd:text("th scripts/evaluate-rnnlm.lua --xplogpath /data/save/rnnlm/ptb:atlas:1458081269:1.t7 --cuda")
 cmd:text('Options:')
 -- training
-cmd:option('--startlr', 0.05, 'learning rate at t=0')
-cmd:option('--minlr', 0.00001, 'minimum learning rate')
-cmd:option('--saturate', 400, 'epoch at which linear decayed LR will reach minlr')
+cmd:option('--startlr', 0.7, 'learning rate at t=0')
+cmd:option('--minlr', 0.001, 'minimum learning rate')
+cmd:option('--saturate', 300, 'epoch at which linear decayed LR will reach minlr')
 cmd:option('--schedule', '', 'learning rate schedule. e.g. {[5] = 0.004, [6] = 0.001}')
-cmd:option('--momentum', 0.9, 'momentum')
+cmd:option('--momentum', -1, 'momentum (requires an additional copy of all params)')
 cmd:option('--maxnormout', -1, 'max l2-norm of each layer\'s output neuron weights')
-cmd:option('--cutoff', -1, 'max l2-norm of concatenation of all gradParam tensors')
+cmd:option('--cutoff', 10, 'max l2-norm of concatenation of all gradParam tensors')
 cmd:option('--device', 1, 'sets the device (GPU) to use')
 cmd:option('--profile', false, 'profile updateOutput,updateGradInput and accGradParameters in Sequential')
 cmd:option('--maxepoch', 1000, 'maximum number of epochs to run')
@@ -29,7 +29,7 @@ cmd:option('--earlystop', 50, 'maximum number of epochs to wait to find a better
 cmd:option('--progress', false, 'print progress bar')
 cmd:option('--silent', false, 'don\'t print anything to stdout')
 cmd:option('--uniform', 0.1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization')
-cmd:option('--k', 25, 'how many noise samples to use for NCE')
+cmd:option('--k', 400, 'how many noise samples to use for NCE')
 cmd:option('--continue', '', 'path to model for which training should be continued. Note that current options (except for device, cuda and tiny) will be ignored.')
 cmd:option('--Z', -1, 'normalization constant for NCE module (-1 approximates it from first batch).')
 cmd:option('--rownoise', false, 'sample k noise samples for each row for NCE module')
@@ -98,12 +98,12 @@ if not lm then
       local inputsize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2)
       local lookup = nn.LookupTableMaskZero(#trainset.ivocab, inputsize)
       lookup.maxnormout = -1 -- prevent weird maxnormout behaviour
-      concat:add(nn.GPU(lookup, device)) -- input is seqlen x batchsize
+      concat:add(nn.GPU(lookup, device):cuda()) -- input is seqlen x batchsize
    end
    
-   lm:add(nn.GPU(concat, 2))
+   lm:add(nn.GPU(concat, 2):cuda())
    if opt.dropout > 0 then
-      lm:add(nn.GPU(nn.Dropout(opt.dropout), 2))
+      lm:add(nn.GPU(nn.Dropout(opt.dropout), 2):cuda())
    end
 
    -- rnn layers
@@ -112,19 +112,27 @@ if not lm then
       -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize))
       local rnn = nn.SeqLSTM(inputsize, hiddensize)
       rnn.maskzero = true
-      local device = 2 -- i < #opt.hiddensize/2 and 1 or 2
-      lm:add(nn.GPU(rnn, device))
+      local device = i <= #opt.hiddensize/2 and 1 or 2
+      lm:add(nn.GPU(rnn, device):cuda())
       if opt.dropout > 0 then
-         lm:add(nn.GPU(nn.Dropout(opt.dropout), device))
+         lm:add(nn.GPU(nn.Dropout(opt.dropout), device):cuda())
       end
       inputsize = hiddensize
    end
 
-   lm:add(nn.GPU(nn.SplitTable(1), 3))
+   lm:add(nn.GPU(nn.SplitTable(1), 3):cuda())
+   
+   if opt.uniform > 0 then
+      for k,param in ipairs(lm:parameters()) do
+         assert(torch.type(param) == 'torch.CudaTensor')
+         cutorch.withDevice(param:getDevice(), function() param:uniform(-opt.uniform, opt.uniform) end)
+      end
+   end
 
    -- output layer
    local unigram = trainset.wordfreq:float()
    ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z)
+   ncemodule:reset() -- initializes bias to get approx. Z = 1
    ncemodule.batchnoise = not opt.rownoise
    -- distribute weight, gradWeight and momentum on devices 3 and 4
    ncemodule:multicuda(3,4) 
@@ -136,16 +144,11 @@ if not lm then
       :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...}
 
    -- encapsulate stepmodule into a Sequencer
-   lm:add(nn.GPU(nn.Sequencer(nn.MaskZero(ncemodule, 1)), 1, opt.device))
+   local masked = nn.MaskZero(ncemodule, 1):cuda()
+   lm:add(nn.GPU(nn.Sequencer(masked), 3, opt.device):cuda())
    
    -- remember previous state between batches
    lm:remember()
-
-   if opt.uniform > 0 then
-      for k,param in ipairs(lm:parameters()) do
-         param:uniform(-opt.uniform, opt.uniform)
-      end
-   end
 end
 
 if opt.profile then
@@ -163,23 +166,18 @@ if not (criterion and targetmodule) then
    local crit = nn.MaskZeroCriterion(nn.NCECriterion(), 0)
 
    -- target is also seqlen x batchsize.
-   targetmodule = nn.SplitTable(1)
-   if opt.cuda then
-      targetmodule = nn.Sequential()
-         :add(nn.Convert())
-         :add(targetmodule)
-   end
+   targetmodule = nn.Sequential()
+      :add(nn.Convert())
+      :add(nn.SplitTable(1))
     
    criterion = nn.SequencerCriterion(crit)
 end
 
 --[[ CUDA ]]--
 
-if opt.cuda then
-   lm:cuda()
-   criterion:cuda()
-   targetmodule:cuda()
-end
+lm:cuda()
+criterion:cuda()
+targetmodule:cuda()
 
 --[[ experiment log ]]--
 
diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua
index ab2a2c7..d896707 100644
--- a/examples/noise-contrastive-estimate.lua
+++ b/examples/noise-contrastive-estimate.lua
@@ -2,7 +2,7 @@ require 'paths'
 require 'rnn'
 require 'nngraph'
 local dl = require 'dataload'
-assert(nn.NCEModule and nn.NCEModule.version and nn.NCEModule.version >= 4, "update dpnn : luarocks install dpnn")
+assert(nn.NCEModule and nn.NCEModule.version and nn.NCEModule.version >= 6, "update dpnn : luarocks install dpnn")
 
 --[[ command line arguments ]]--
 cmd = torch.CmdLine()
@@ -29,19 +29,19 @@ cmd:option('--earlystop', 50, 'maximum number of epochs to wait to find a better
 cmd:option('--progress', false, 'print progress bar')
 cmd:option('--silent', false, 'don\'t print anything to stdout')
 cmd:option('--uniform', 0.1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization')
-cmd:option('--k', 25, 'how many noise samples to use for NCE')
+cmd:option('--k', 100, 'how many noise samples to use for NCE')
 cmd:option('--continue', '', 'path to model for which training should be continued. Note that current options (except for device, cuda and tiny) will be ignored.')
-cmd:option('--Z', -1, 'normalization constant for NCE module (-1 approximates it from first batch).')
+cmd:option('--Z', 1, 'normalization constant for NCE module (-1 approximates it from first batch).')
 cmd:option('--rownoise', false, 'sample k noise samples for each row for NCE module')
 -- rnn layer 
 cmd:option('--seqlen', 50, 'sequence length : back-propagate through time (BPTT) for this many time-steps')
 cmd:option('--inputsize', -1, 'size of lookup table embeddings. -1 defaults to hiddensize[1]')
-cmd:option('--hiddensize', '{200,200}', 'number of hidden units used at output of each recurrent layer. When more than one is specified, RNN/LSTMs/GRUs are stacked')
+cmd:option('--hiddensize', '{256,256}', 'number of hidden units used at output of each recurrent layer. When more than one is specified, RNN/LSTMs/GRUs are stacked')
 cmd:option('--dropout', 0, 'ancelossy dropout with this probability after each rnn layer. dropout <= 0 disables it.')
 -- data
-cmd:option('--batchsize', 32, 'number of examples per batch')
-cmd:option('--trainsize', -1, 'number of train time-steps seen between each epoch')
-cmd:option('--validsize', -1, 'number of valid time-steps used for early stopping and cross-validation') 
+cmd:option('--batchsize', 128, 'number of examples per batch')
+cmd:option('--trainsize', 400000, 'number of train time-steps seen between each epoch')
+cmd:option('--validsize', 40000, 'number of valid time-steps used for early stopping and cross-validation') 
 cmd:option('--savepath', paths.concat(dl.SAVE_PATH, 'rnnlm'), 'path to directory where experiment log (includes model) will be saved')
 cmd:option('--id', '', 'id string of this experiment (used to name output file) (defaults to a unique id)')
 cmd:option('--tiny', false, 'use train_tiny.th7 training file')
@@ -56,7 +56,7 @@ if not opt.silent then
    table.print(opt)
 end
 opt.id = opt.id == '' and ('gbw' .. ':' .. dl.uniqueid()) or opt.id
-opt.version = 5 -- refactored multigpu into its own file
+opt.version = 6 -- better NCE bias initialization + new default hyper-params
 
 if opt.cuda then -- do this before building model to prevent segfault
    require 'cunn' 
@@ -137,6 +137,7 @@ if not lm then
       for k,param in ipairs(lm:parameters()) do
          param:uniform(-opt.uniform, opt.uniform)
       end
+      ncemodule:reset()
    end
 end
 

From bcad6c9a05ec49edf2ad487ba4ba6d92ef480906 Mon Sep 17 00:00:00 2001
From: nicholas-leonard <nick@nikopia.org>
Date: Tue, 28 Jun 2016 18:08:08 -0400
Subject: [PATCH 7/7] set opt.id and opt.version before printing opt

---
 examples/noise-contrastive-estimate.lua | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua
index d896707..88df764 100644
--- a/examples/noise-contrastive-estimate.lua
+++ b/examples/noise-contrastive-estimate.lua
@@ -52,11 +52,11 @@ local opt = cmd:parse(arg or {})
 opt.hiddensize = loadstring(" return "..opt.hiddensize)()
 opt.schedule = loadstring(" return "..opt.schedule)()
 opt.inputsize = opt.inputsize == -1 and opt.hiddensize[1] or opt.inputsize
+opt.id = opt.id == '' and ('gbw' .. ':' .. dl.uniqueid()) or opt.id
+opt.version = 6 -- better NCE bias initialization + new default hyper-params
 if not opt.silent then
    table.print(opt)
 end
-opt.id = opt.id == '' and ('gbw' .. ':' .. dl.uniqueid()) or opt.id
-opt.version = 6 -- better NCE bias initialization + new default hyper-params
 
 if opt.cuda then -- do this before building model to prevent segfault
    require 'cunn'