From 290306c8d4d3271663d06ee69d03626c1990dfbe Mon Sep 17 00:00:00 2001 From: nicholas-leonard Date: Thu, 2 Jun 2016 11:22:37 -0400 Subject: [PATCH 1/7] fix cpu bug --- examples/recurrent-visual-attention.lua | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/recurrent-visual-attention.lua b/examples/recurrent-visual-attention.lua index eba8282..a7eedf3 100644 --- a/examples/recurrent-visual-attention.lua +++ b/examples/recurrent-visual-attention.lua @@ -246,6 +246,8 @@ if opt.cuda then require 'cunn' cutorch.setDevice(opt.useDevice) xp:cuda() +else + xp:float() end xp:verbose(not opt.silent) From 2612ee23b227f25cb96243f20b1b81dca08ebbbb Mon Sep 17 00:00:00 2001 From: nicholas-leonard Date: Thu, 2 Jun 2016 13:23:38 -0400 Subject: [PATCH 2/7] initial commit for multi-gpu --- examples/noise-contrastive-estimate.lua | 130 +++++++++++++++++------- 1 file changed, 91 insertions(+), 39 deletions(-) diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua index c9f618f..3f346f8 100644 --- a/examples/noise-contrastive-estimate.lua +++ b/examples/noise-contrastive-estimate.lua @@ -45,7 +45,7 @@ cmd:option('--savepath', paths.concat(dl.SAVE_PATH, 'rnnlm'), 'path to directory cmd:option('--id', '', 'id string of this experiment (used to name output file) (defaults to a unique id)') cmd:option('--tiny', false, 'use train_tiny.th7 training file') cmd:option('--dontsave', false, 'dont save the model') -cmd:option('--cpulookup', false, 'keep lookuptable on CPU') +cmd:option('--multigpu', false, 'distribute the model over 4 gpus') cmd:text() local opt = cmd:parse(arg or {}) @@ -91,54 +91,106 @@ end --[[ language model ]]-- if not lm then - lm = nn.Sequential() - - -- input layer (i.e. word embedding space) - local lookup = nn.LookupTableMaskZero(#trainset.ivocab, opt.inputsize) - lookup.maxnormout = -1 -- prevent weird maxnormout behaviour - if opt.cpulookup then - -- this will be slower but will use up less memory. - lookup = nn.DontCast(lookup:float(), false, true) - end - lm:add(lookup) -- input is seqlen x batchsize - if opt.dropout > 0 then - lm:add(nn.Dropout(opt.dropout)) - end + if opt.multigpu then + lm = nn.Sequential() + + -- input layer (i.e. word embedding space) + local concat = nn.Concat(3) + for device=1,2 do + local inpusize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2) + local lookup = nn.LookupTableMaskZero(#trainset.ivocab, inputsize) + lookup.maxnormout = -1 -- prevent weird maxnormout behaviour + concat:add(nn.GPU(lookup, device)) -- input is seqlen x batchsize + end + + lm:add(nn.GPU(concat, 2)) + if opt.dropout > 0 then + lm:add(nn.GPU(nn.Dropout(opt.dropout), 2)) + end + + -- rnn layers + local inputsize = opt.inputsize + for i,hiddensize in ipairs(opt.hiddensize) do + -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize)) + local rnn = nn.SeqLSTM(inputsize, hiddensize) + rnn.maskzero = true + local device = i < opt.hiddensize/2 and 2 or 3 + lm:add(nn.GPU(rnn, device)) + if opt.dropout > 0 then + lm:add(nn.GPU(nn.Dropout(opt.dropout), device)) + end + inputsize = hiddensize + end + + lm:add(nn.GPU(nn.SplitTable(1), 3)) + + -- output layer + local unigram = trainset.wordfreq:float() + local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) - -- rnn layers - local inputsize = opt.inputsize - for i,hiddensize in ipairs(opt.hiddensize) do - -- this is a faster version of nnSequencer(nn.FastLSTM(inpusize, hiddensize)) - local rnn = nn.SeqLSTM(inputsize, hiddensize) - rnn.maskzero = true - lm:add(rnn) + -- NCE requires {input, target} as inputs + lm = nn.Sequential() + :add(nn.ParallelTable() + :add(lm):add(nn.Identity())) + :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...} + + -- encapsulate stepmodule into a Sequencer + lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1))) + + -- remember previous state between batches + lm:remember() + + if opt.uniform > 0 then + for k,param in ipairs(lm:parameters()) do + param:uniform(-opt.uniform, opt.uniform) + end + end + else + lm = nn.Sequential() + + -- input layer (i.e. word embedding space) + local lookup = nn.LookupTableMaskZero(#trainset.ivocab, opt.inputsize) + lookup.maxnormout = -1 -- prevent weird maxnormout behaviour + lm:add(lookup) -- input is seqlen x batchsize if opt.dropout > 0 then lm:add(nn.Dropout(opt.dropout)) end - inputsize = hiddensize - end - lm:add(nn.SplitTable(1)) + -- rnn layers + local inputsize = opt.inputsize + for i,hiddensize in ipairs(opt.hiddensize) do + -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize)) + local rnn = nn.SeqLSTM(inputsize, hiddensize) + rnn.maskzero = true + lm:add(rnn) + if opt.dropout > 0 then + lm:add(nn.Dropout(opt.dropout)) + end + inputsize = hiddensize + end + + lm:add(nn.SplitTable(1)) - -- output layer - local unigram = trainset.wordfreq:float() - local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) + -- output layer + local unigram = trainset.wordfreq:float() + local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) - -- NCE requires {input, target} as inputs - lm = nn.Sequential() - :add(nn.ParallelTable() - :add(lm):add(nn.Identity())) - :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...} + -- NCE requires {input, target} as inputs + lm = nn.Sequential() + :add(nn.ParallelTable() + :add(lm):add(nn.Identity())) + :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...} - -- encapsulate stepmodule into a Sequencer - lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1))) + -- encapsulate stepmodule into a Sequencer + lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1))) - -- remember previous state between batches - lm:remember() + -- remember previous state between batches + lm:remember() - if opt.uniform > 0 then - for k,param in ipairs(lm:parameters()) do - param:uniform(-opt.uniform, opt.uniform) + if opt.uniform > 0 then + for k,param in ipairs(lm:parameters()) do + param:uniform(-opt.uniform, opt.uniform) + end end end end From 5d6058c5bbaac94b3bc393ac5c83c47decdb4a24 Mon Sep 17 00:00:00 2001 From: nicholas-leonard Date: Fri, 3 Jun 2016 17:05:02 -0400 Subject: [PATCH 3/7] nce --multigpu --- examples/noise-contrastive-estimate.lua | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua index 3f346f8..4d6f3e7 100644 --- a/examples/noise-contrastive-estimate.lua +++ b/examples/noise-contrastive-estimate.lua @@ -92,12 +92,14 @@ end if not lm then if opt.multigpu then + assert(opt.maxnormout <= 0) lm = nn.Sequential() - + lm:add(nn.Convert()) + -- input layer (i.e. word embedding space) local concat = nn.Concat(3) for device=1,2 do - local inpusize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2) + local inputsize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2) local lookup = nn.LookupTableMaskZero(#trainset.ivocab, inputsize) lookup.maxnormout = -1 -- prevent weird maxnormout behaviour concat:add(nn.GPU(lookup, device)) -- input is seqlen x batchsize @@ -114,7 +116,7 @@ if not lm then -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize)) local rnn = nn.SeqLSTM(inputsize, hiddensize) rnn.maskzero = true - local device = i < opt.hiddensize/2 and 2 or 3 + local device = 2 -- i < #opt.hiddensize/2 and 1 or 2 lm:add(nn.GPU(rnn, device)) if opt.dropout > 0 then lm:add(nn.GPU(nn.Dropout(opt.dropout), device)) @@ -126,7 +128,9 @@ if not lm then -- output layer local unigram = trainset.wordfreq:float() - local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) + ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) + -- distribute weight, gradWeight and momentum on devices 3 and 4 + ncemodule:multicuda(3,4) -- NCE requires {input, target} as inputs lm = nn.Sequential() @@ -135,7 +139,7 @@ if not lm then :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...} -- encapsulate stepmodule into a Sequencer - lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1))) + lm:add(nn.GPU(nn.Sequencer(nn.MaskZero(ncemodule, 1)), 1, opt.device)) -- remember previous state between batches lm:remember() @@ -347,8 +351,8 @@ while opt.maxepoch <= 0 or epoch <= opt.maxepoch do xplog.minvalnceloss = nceloss xplog.epoch = epoch local filename = paths.concat(opt.savepath, opt.id..'.t7') - print("Found new minima. Saving to "..filename) if not opt.dontsave then + print("Found new minima. Saving to "..filename) torch.save(filename, xplog) end ntrial = 0 From 08c0706ce8f88b17ae50f591e2de79138f45de46 Mon Sep 17 00:00:00 2001 From: nicholas-leonard Date: Tue, 14 Jun 2016 10:22:52 -0400 Subject: [PATCH 4/7] refactored multigpu nce --- examples/multigpu-nce-rnnlm.lua | 317 ++++++++++++++++++++++++ examples/noise-contrastive-estimate.lua | 137 +++------- test/test.lua | 145 +++++++++++ 3 files changed, 501 insertions(+), 98 deletions(-) create mode 100644 examples/multigpu-nce-rnnlm.lua diff --git a/examples/multigpu-nce-rnnlm.lua b/examples/multigpu-nce-rnnlm.lua new file mode 100644 index 0000000..5dc42e8 --- /dev/null +++ b/examples/multigpu-nce-rnnlm.lua @@ -0,0 +1,317 @@ +require 'paths' +require 'rnn' +require 'nngraph' +local dl = require 'dataload' +assert(nn.NCEModule and nn.NCEModule.version and nn.NCEModule.version >= 4, "update dpnn : luarocks install dpnn") +require 'cunn' + +--[[ command line arguments ]]-- +cmd = torch.CmdLine() +cmd:text() +cmd:text('Train a Language Model using stacked LSTM on Google Billion Words dataset') +cmd:text('Example:') +cmd:text("th examples/multigpu-nce-rnnlm.lua --progress --earlystop 50 --device 2 --seqlen 20 --hiddensize '{200,200}' --batchsize 20 --startlr 1 --uniform 0.1 --cutoff 5 --schedule '{[5]=0.5,[6]=0.25,[7]=0.125,[8]=0.0625,[9]=0.03125,[10]=0.015625,[11]=0.0078125,[12]=0.00390625}'") +cmd:text("th examples/multigpu-nce-rnnlm.lua.lua --trainsize 400000 --validsize 40000 --cutoff 10 --batchsize 128 --seqlen 100 --hiddensize '{250,250}' --progress --device 2") +cmd:text("th scripts/evaluate-rnnlm.lua --xplogpath /data/save/rnnlm/ptb:atlas:1458081269:1.t7 --cuda") +cmd:text('Options:') +-- training +cmd:option('--startlr', 0.05, 'learning rate at t=0') +cmd:option('--minlr', 0.00001, 'minimum learning rate') +cmd:option('--saturate', 400, 'epoch at which linear decayed LR will reach minlr') +cmd:option('--schedule', '', 'learning rate schedule. e.g. {[5] = 0.004, [6] = 0.001}') +cmd:option('--momentum', 0.9, 'momentum') +cmd:option('--maxnormout', -1, 'max l2-norm of each layer\'s output neuron weights') +cmd:option('--cutoff', -1, 'max l2-norm of concatenation of all gradParam tensors') +cmd:option('--device', 1, 'sets the device (GPU) to use') +cmd:option('--profile', false, 'profile updateOutput,updateGradInput and accGradParameters in Sequential') +cmd:option('--maxepoch', 1000, 'maximum number of epochs to run') +cmd:option('--earlystop', 50, 'maximum number of epochs to wait to find a better local minima for early-stopping') +cmd:option('--progress', false, 'print progress bar') +cmd:option('--silent', false, 'don\'t print anything to stdout') +cmd:option('--uniform', 0.1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization') +cmd:option('--k', 25, 'how many noise samples to use for NCE') +cmd:option('--continue', '', 'path to model for which training should be continued. Note that current options (except for device, cuda and tiny) will be ignored.') +cmd:option('--Z', -1, 'normalization constant for NCE module (-1 approximates it from first batch).') +cmd:option('--rownoise', false, 'sample k noise samples for each row for NCE module') +-- rnn layer +cmd:option('--seqlen', 50, 'sequence length : back-propagate through time (BPTT) for this many time-steps') +cmd:option('--inputsize', -1, 'size of lookup table embeddings. -1 defaults to hiddensize[1]') +cmd:option('--hiddensize', '{200,200}', 'number of hidden units used at output of each recurrent layer. When more than one is specified, RNN/LSTMs/GRUs are stacked') +cmd:option('--dropout', 0, 'ancelossy dropout with this probability after each rnn layer. dropout <= 0 disables it.') +-- data +cmd:option('--batchsize', 32, 'number of examples per batch') +cmd:option('--trainsize', -1, 'number of train time-steps seen between each epoch') +cmd:option('--validsize', -1, 'number of valid time-steps used for early stopping and cross-validation') +cmd:option('--savepath', paths.concat(dl.SAVE_PATH, 'rnnlm'), 'path to directory where experiment log (includes model) will be saved') +cmd:option('--id', '', 'id string of this experiment (used to name output file) (defaults to a unique id)') +cmd:option('--tiny', false, 'use train_tiny.th7 training file') +cmd:option('--dontsave', false, 'dont save the model') + +cmd:text() +local opt = cmd:parse(arg or {}) +opt.hiddensize = loadstring(" return "..opt.hiddensize)() +opt.schedule = loadstring(" return "..opt.schedule)() +opt.inputsize = opt.inputsize == -1 and opt.hiddensize[1] or opt.inputsize +if not opt.silent then + table.print(opt) +end +opt.id = opt.id == '' and ('gbw' .. ':' .. dl.uniqueid()) or opt.id +opt.version = 1 + +cutorch.setDevice(opt.device) + +local xplog, lm, criterion, targetmodule +if opt.continue ~= '' then + xplog = torch.load(opt.continue) + xplog.opt.cuda = true + xplog.opt.device = opt.device + xplog.opt.tiny = opt.tiny + opt = xplog.opt + lm = xplog.model.module + -- prevent re-casting bug + for i,lookup in ipairs(lm:findModules('nn.LookupTableMaskZero')) do + lookup.__input = nil + end + criterion = xplog.criterion + targetmodule = xplog.targetmodule + assert(opt) +end + +--[[ data set ]]-- + +local trainset, validset, testset = dl.loadGBW({opt.batchsize,opt.batchsize,opt.batchsize}, opt.tiny and 'train_tiny.th7' or nil) +if not opt.silent then + print("Vocabulary size : "..#trainset.ivocab) + print("Train set split into "..opt.batchsize.." sequences of length "..trainset:size()) +end + +--[[ language model ]]-- + +if not lm then + assert(opt.maxnormout <= 0) + lm = nn.Sequential() + lm:add(nn.Convert()) + + -- input layer (i.e. word embedding space) + local concat = nn.Concat(3) + for device=1,2 do + local inputsize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2) + local lookup = nn.LookupTableMaskZero(#trainset.ivocab, inputsize) + lookup.maxnormout = -1 -- prevent weird maxnormout behaviour + concat:add(nn.GPU(lookup, device)) -- input is seqlen x batchsize + end + + lm:add(nn.GPU(concat, 2)) + if opt.dropout > 0 then + lm:add(nn.GPU(nn.Dropout(opt.dropout), 2)) + end + + -- rnn layers + local inputsize = opt.inputsize + for i,hiddensize in ipairs(opt.hiddensize) do + -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize)) + local rnn = nn.SeqLSTM(inputsize, hiddensize) + rnn.maskzero = true + local device = 2 -- i < #opt.hiddensize/2 and 1 or 2 + lm:add(nn.GPU(rnn, device)) + if opt.dropout > 0 then + lm:add(nn.GPU(nn.Dropout(opt.dropout), device)) + end + inputsize = hiddensize + end + + lm:add(nn.GPU(nn.SplitTable(1), 3)) + + -- output layer + local unigram = trainset.wordfreq:float() + ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) + ncemodule.batchnoise = not opt.rownoise + -- distribute weight, gradWeight and momentum on devices 3 and 4 + ncemodule:multicuda(3,4) + + -- NCE requires {input, target} as inputs + lm = nn.Sequential() + :add(nn.ParallelTable() + :add(lm):add(nn.Identity())) + :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...} + + -- encapsulate stepmodule into a Sequencer + lm:add(nn.GPU(nn.Sequencer(nn.MaskZero(ncemodule, 1)), 1, opt.device)) + + -- remember previous state between batches + lm:remember() + + if opt.uniform > 0 then + for k,param in ipairs(lm:parameters()) do + param:uniform(-opt.uniform, opt.uniform) + end + end +end + +if opt.profile then + lm:profile() +end + +if not opt.silent then + print"Language Model:" + print(lm) +end + +if not (criterion and targetmodule) then + --[[ loss function ]]-- + + local crit = nn.MaskZeroCriterion(nn.NCECriterion(), 0) + + -- target is also seqlen x batchsize. + targetmodule = nn.SplitTable(1) + if opt.cuda then + targetmodule = nn.Sequential() + :add(nn.Convert()) + :add(targetmodule) + end + + criterion = nn.SequencerCriterion(crit) +end + +--[[ CUDA ]]-- + +if opt.cuda then + lm:cuda() + criterion:cuda() + targetmodule:cuda() +end + +--[[ experiment log ]]-- + +-- is saved to file every time a new validation minima is found +if not xplog then + xplog = {} + xplog.opt = opt -- save all hyper-parameters and such + xplog.dataset = 'GoogleBillionWords' + xplog.vocab = trainset.vocab + -- will only serialize params + xplog.model = nn.Serial(lm) + xplog.model:mediumSerial() + xplog.criterion = criterion + xplog.targetmodule = targetmodule + -- keep a log of NLL for each epoch + xplog.trainnceloss = {} + xplog.valnceloss = {} + -- will be used for early-stopping + xplog.minvalnceloss = 99999999 + xplog.epoch = 0 + paths.mkdir(opt.savepath) +end +local ntrial = 0 + +local epoch = xplog.epoch+1 +opt.lr = opt.lr or opt.startlr +opt.trainsize = opt.trainsize == -1 and trainset:size() or opt.trainsize +opt.validsize = opt.validsize == -1 and validset:size() or opt.validsize +while opt.maxepoch <= 0 or epoch <= opt.maxepoch do + print("") + print("Epoch #"..epoch.." :") + + -- 1. training + + local a = torch.Timer() + lm:training() + local sumErr = 0 + for i, inputs, targets in trainset:subiter(opt.seqlen, opt.trainsize) do + targets = targetmodule:forward(targets) + inputs = {inputs, targets} + -- forward + local outputs = lm:forward(inputs) + local err = criterion:forward(outputs, targets) + sumErr = sumErr + err + -- backward + local gradOutputs = criterion:backward(outputs, targets) + local a = torch.Timer() + lm:zeroGradParameters() + lm:backward(inputs, gradOutputs) + + -- update + if opt.cutoff > 0 then + local norm = lm:gradParamClip(opt.cutoff) -- affects gradParams + opt.meanNorm = opt.meanNorm and (opt.meanNorm*0.9 + norm*0.1) or norm + end + lm:updateGradParameters(opt.momentum) -- affects gradParams + lm:updateParameters(opt.lr) -- affects params + lm:maxParamNorm(opt.maxnormout) -- affects params + + if opt.progress then + xlua.progress(i, opt.trainsize) + end + + if i % 2000 == 0 then + collectgarbage() + end + + end + + -- learning rate decay + if opt.schedule then + opt.lr = opt.schedule[epoch] or opt.lr + else + opt.lr = opt.lr + (opt.minlr - opt.startlr)/opt.saturate + end + opt.lr = math.max(opt.minlr, opt.lr) + + if not opt.silent then + print("learning rate", opt.lr) + if opt.meanNorm then + print("mean gradParam norm", opt.meanNorm) + end + end + + if cutorch then cutorch.synchronize() end + local speed = opt.trainsize*opt.batchsize/a:time().real + print(string.format("Speed : %f words/second; %f ms/word", speed, 1000/speed)) + + local nceloss = sumErr/opt.trainsize + print("Training error : "..nceloss) + + xplog.trainnceloss[epoch] = nceloss + + -- 2. cross-validation + + lm:evaluate() + local sumErr = 0 + for i, inputs, targets in validset:subiter(opt.seqlen, opt.validsize) do + targets = targetmodule:forward(targets) + local outputs = lm:forward{inputs, targets} + local err = criterion:forward(outputs, targets) + sumErr = sumErr + err + + if opt.progress then + xlua.progress(i, opt.validsize) + end + end + + local nceloss = sumErr/opt.validsize + print("Validation error : "..nceloss) + + xplog.valnceloss[epoch] = nceloss + ntrial = ntrial + 1 + + -- early-stopping + if nceloss < xplog.minvalnceloss then + -- save best version of model + xplog.minvalnceloss = nceloss + xplog.epoch = epoch + local filename = paths.concat(opt.savepath, opt.id..'.t7') + if not opt.dontsave then + print("Found new minima. Saving to "..filename) + torch.save(filename, xplog) + end + ntrial = 0 + elseif ntrial >= opt.earlystop then + print("No new minima found after "..ntrial.." epochs.") + print("Stopping experiment.") + print("Best model can be found in "..paths.concat(opt.savepath, opt.id..'.t7')) + os.exit() + end + + collectgarbage() + epoch = epoch + 1 +end diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua index 4d6f3e7..ab2a2c7 100644 --- a/examples/noise-contrastive-estimate.lua +++ b/examples/noise-contrastive-estimate.lua @@ -2,7 +2,7 @@ require 'paths' require 'rnn' require 'nngraph' local dl = require 'dataload' -assert(nn.NCEModule and nn.NCEModule.version and nn.NCEModule.version >= 3, "update dpnn : luarocks install dpnn") +assert(nn.NCEModule and nn.NCEModule.version and nn.NCEModule.version >= 4, "update dpnn : luarocks install dpnn") --[[ command line arguments ]]-- cmd = torch.CmdLine() @@ -32,6 +32,7 @@ cmd:option('--uniform', 0.1, 'initialize parameters using uniform distribution b cmd:option('--k', 25, 'how many noise samples to use for NCE') cmd:option('--continue', '', 'path to model for which training should be continued. Note that current options (except for device, cuda and tiny) will be ignored.') cmd:option('--Z', -1, 'normalization constant for NCE module (-1 approximates it from first batch).') +cmd:option('--rownoise', false, 'sample k noise samples for each row for NCE module') -- rnn layer cmd:option('--seqlen', 50, 'sequence length : back-propagate through time (BPTT) for this many time-steps') cmd:option('--inputsize', -1, 'size of lookup table embeddings. -1 defaults to hiddensize[1]') @@ -45,7 +46,6 @@ cmd:option('--savepath', paths.concat(dl.SAVE_PATH, 'rnnlm'), 'path to directory cmd:option('--id', '', 'id string of this experiment (used to name output file) (defaults to a unique id)') cmd:option('--tiny', false, 'use train_tiny.th7 training file') cmd:option('--dontsave', false, 'dont save the model') -cmd:option('--multigpu', false, 'distribute the model over 4 gpus') cmd:text() local opt = cmd:parse(arg or {}) @@ -56,7 +56,7 @@ if not opt.silent then table.print(opt) end opt.id = opt.id == '' and ('gbw' .. ':' .. dl.uniqueid()) or opt.id -opt.version = 4 +opt.version = 5 -- refactored multigpu into its own file if opt.cuda then -- do this before building model to prevent segfault require 'cunn' @@ -91,110 +91,51 @@ end --[[ language model ]]-- if not lm then - if opt.multigpu then - assert(opt.maxnormout <= 0) - lm = nn.Sequential() - lm:add(nn.Convert()) - - -- input layer (i.e. word embedding space) - local concat = nn.Concat(3) - for device=1,2 do - local inputsize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2) - local lookup = nn.LookupTableMaskZero(#trainset.ivocab, inputsize) - lookup.maxnormout = -1 -- prevent weird maxnormout behaviour - concat:add(nn.GPU(lookup, device)) -- input is seqlen x batchsize - end - - lm:add(nn.GPU(concat, 2)) - if opt.dropout > 0 then - lm:add(nn.GPU(nn.Dropout(opt.dropout), 2)) - end - - -- rnn layers - local inputsize = opt.inputsize - for i,hiddensize in ipairs(opt.hiddensize) do - -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize)) - local rnn = nn.SeqLSTM(inputsize, hiddensize) - rnn.maskzero = true - local device = 2 -- i < #opt.hiddensize/2 and 1 or 2 - lm:add(nn.GPU(rnn, device)) - if opt.dropout > 0 then - lm:add(nn.GPU(nn.Dropout(opt.dropout), device)) - end - inputsize = hiddensize - end - - lm:add(nn.GPU(nn.SplitTable(1), 3)) - - -- output layer - local unigram = trainset.wordfreq:float() - ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) - -- distribute weight, gradWeight and momentum on devices 3 and 4 - ncemodule:multicuda(3,4) - - -- NCE requires {input, target} as inputs - lm = nn.Sequential() - :add(nn.ParallelTable() - :add(lm):add(nn.Identity())) - :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...} - - -- encapsulate stepmodule into a Sequencer - lm:add(nn.GPU(nn.Sequencer(nn.MaskZero(ncemodule, 1)), 1, opt.device)) - - -- remember previous state between batches - lm:remember() - - if opt.uniform > 0 then - for k,param in ipairs(lm:parameters()) do - param:uniform(-opt.uniform, opt.uniform) - end - end - else - lm = nn.Sequential() + lm = nn.Sequential() + + -- input layer (i.e. word embedding space) + local lookup = nn.LookupTableMaskZero(#trainset.ivocab, opt.inputsize) + lookup.maxnormout = -1 -- prevent weird maxnormout behaviour + lm:add(lookup) -- input is seqlen x batchsize + if opt.dropout > 0 then + lm:add(nn.Dropout(opt.dropout)) + end - -- input layer (i.e. word embedding space) - local lookup = nn.LookupTableMaskZero(#trainset.ivocab, opt.inputsize) - lookup.maxnormout = -1 -- prevent weird maxnormout behaviour - lm:add(lookup) -- input is seqlen x batchsize + -- rnn layers + local inputsize = opt.inputsize + for i,hiddensize in ipairs(opt.hiddensize) do + -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize)) + local rnn = nn.SeqLSTM(inputsize, hiddensize) + rnn.maskzero = true + lm:add(rnn) if opt.dropout > 0 then lm:add(nn.Dropout(opt.dropout)) end + inputsize = hiddensize + end - -- rnn layers - local inputsize = opt.inputsize - for i,hiddensize in ipairs(opt.hiddensize) do - -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize)) - local rnn = nn.SeqLSTM(inputsize, hiddensize) - rnn.maskzero = true - lm:add(rnn) - if opt.dropout > 0 then - lm:add(nn.Dropout(opt.dropout)) - end - inputsize = hiddensize - end - - lm:add(nn.SplitTable(1)) - - -- output layer - local unigram = trainset.wordfreq:float() - local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) + lm:add(nn.SplitTable(1)) - -- NCE requires {input, target} as inputs - lm = nn.Sequential() - :add(nn.ParallelTable() - :add(lm):add(nn.Identity())) - :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...} + -- output layer + local unigram = trainset.wordfreq:float() + local ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) + ncemodule.batchnoise = not opt.rownoise - -- encapsulate stepmodule into a Sequencer - lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1))) + -- NCE requires {input, target} as inputs + lm = nn.Sequential() + :add(nn.ParallelTable() + :add(lm):add(nn.Identity())) + :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...} - -- remember previous state between batches - lm:remember() + -- encapsulate stepmodule into a Sequencer + lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1))) + + -- remember previous state between batches + lm:remember() - if opt.uniform > 0 then - for k,param in ipairs(lm:parameters()) do - param:uniform(-opt.uniform, opt.uniform) - end + if opt.uniform > 0 then + for k,param in ipairs(lm:parameters()) do + param:uniform(-opt.uniform, opt.uniform) end end end diff --git a/test/test.lua b/test/test.lua index 2c4c01c..05b678b 100644 --- a/test/test.lua +++ b/test/test.lua @@ -6400,6 +6400,151 @@ function rnntest.FastLSTM_batchNorm() nn.FastLSTM.bn = false end +function rnntest.inplaceBackward() + local lr = 0.1 + local seqlen, batchsize, hiddensize = 3, 4, 5 + local input = torch.randn(seqlen, batchsize, hiddensize) + local gradOutput = torch.randn(seqlen, batchsize, hiddensize) + + -- test sequencer(linear) + + local seq = nn.Sequencer(nn.Linear(hiddensize, hiddensize)) + local seq2 = seq:clone() + seq2:inplaceBackward() + + local output = seq:forward(input) + local output2 = seq2:forward(input) + + mytester:assertTensorEq(output, output2, 0.000001) + + seq:zeroGradParameters() + local gradInput = seq:backward(input, gradOutput) + seq:updateParameters(lr) + + local gradInput2 = seq2:backward(input, gradOutput, -lr) + + mytester:assertTensorEq(gradInput, gradInput2, 0.000001) + + local params = seq:parameters() + local params2 = seq2:parameters() + + for i=1,#params do + mytester:assertTensorEq(params[i], params2[i], 0.000001) + end + + -- test seqlstm + + local seq = nn.SeqLSTM(hiddensize, hiddensize) + local seq2 = seq:clone() + seq2:inplaceBackward() + + local output = seq:forward(input) + local output2 = seq2:forward(input) + + mytester:assertTensorEq(output, output2, 0.000001) + + seq:zeroGradParameters() + local gradInput = seq:backward(input, gradOutput) + seq:updateParameters(lr) + + local gradInput2 = seq2:backward(input, gradOutput, -lr) + + mytester:assertTensorEq(gradInput, gradInput2, 0.000001) + + local params = seq:parameters() + local params2 = seq2:parameters() + + for i=1,#params do + mytester:assertTensorEq(params[i], params2[i], 0.000001) + end + + + if true then return end + -- test language model + + local vocabsize = 100 + local input = torch.LongTensor(seqlen, batchsize):random(1,vocabsize) + local target = torch.LongTensor(seqlen, batchsize):random(1,vocabsize) + + local lm = nn.Sequential() + local lookup = nn.LookupTableMaskZero(vocabsize, hiddensize) + lm:add(lookup) + + for i=1,2 do + local rnn = nn.SeqLSTM(hiddensize, hiddensize) + rnn.maskzero = true + lm:add(rnn) + end + + lm:add(nn.SplitTable(1)) + + local unigram = torch.FloatTensor(vocabsize):uniform(1,10) + local ncemodule = nn.NCEModule(hiddensize, vocabsize, 10, unigram, -1) + local _sampleidx = torch.Tensor(1,10):random(1,vocabsize) + + function ncemodule.noiseSample(self, sampleidx, batchsize, k) + assert(batchsize == 1) + assert(k == 10) + sampleidx:resize(1, k):copy(_sampleidx) + return sampleidx + end + + lm = nn.Sequential() + :add(nn.ParallelTable() + :add(lm):add(nn.Identity())) + :add(nn.ZipTable()) + + lm:add(nn.Sequencer(nn.MaskZero(ncemodule, 1))) + lm:remember() + + local crit = nn.MaskZeroCriterion(nn.NCECriterion(), 0) + local targetmodule = nn.SplitTable(1) + local criterion = nn.SequencerCriterion(crit) + + local lm2 = lm:clone() + lm2:inplaceBackward() + + local criterion2 = criterion:clone() + + local target = targetmodule:forward(target) + + local inputTable = {input, target} + + local output = lm:forward(inputTable) + local output2 = lm2:forward(inputTable) + + for i=1,seqlen do + mytester:assertTensorEq(output[i][1], output2[i][1], 0.000001) + mytester:assertTensorEq(output[i][2], output2[i][2], 0.000001) + mytester:assertTensorEq(output[i][3], output2[i][3], 0.000001) + mytester:assertTensorEq(output[i][4], output2[i][4], 0.000001) + end + + local loss = criterion:forward(output, target) + local loss2 = criterion2:forward(output, target) + + local gradOutput = criterion:backward(output, target) + local gradOutput2 = criterion2:backward(output, target) + + for i=1,seqlen do + mytester:assertTensorEq(gradOutput[i][1], gradOutput2[i][1], 0.000001) + mytester:assertTensorEq(gradOutput[i][2], gradOutput2[i][2], 0.000001) + end + + lm:zeroGradParameters() + lm:backward(inputTable, gradOutput) + lm:updateParameters(lr) + + lm2:backward(inputTable, gradOutput2, -lr) + + local params = lm:parameters() + local params2 = lm2:parameters() + + for i=1,#params do + mytester:assertTensorEq(params[i], params2[i], 0.000001, "error in params "..i..": "..tostring(params[i]:size())) + end +end + function rnn.test(tests, benchmark_) mytester = torch.Tester() benchmark = benchmark_ From f7a7fd7e0240f300ea247931b7ab70a546ec98ed Mon Sep 17 00:00:00 2001 From: nicholas-leonard Date: Tue, 21 Jun 2016 15:06:51 -0400 Subject: [PATCH 5/7] small fixes --- CMakeLists.txt | 2 +- examples/multigpu-nce-rnnlm.lua | 2 +- examples/recurrent-language-model.lua | 1 - init.lua | 1 + scripts/evaluate-rnnlm.lua | 15 +++++++++++++-- test/test.lua | 12 +++++------- 6 files changed, 21 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b99715..6ca16ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,5 +10,5 @@ FIND_PACKAGE(Torch REQUIRED) SET(src) FILE(GLOB luasrc *.lua) -SET(luasrc ${luasrc} test/test.lua test/mnistsample.t7) +SET(luasrc ${luasrc} test/test.lua test/mnistsample.t7 test/bigtest.lua) ADD_TORCH_PACKAGE(rnn "${src}" "${luasrc}" "Recurrent Neural Networks") diff --git a/examples/multigpu-nce-rnnlm.lua b/examples/multigpu-nce-rnnlm.lua index 5dc42e8..bd088b5 100644 --- a/examples/multigpu-nce-rnnlm.lua +++ b/examples/multigpu-nce-rnnlm.lua @@ -8,7 +8,7 @@ require 'cunn' --[[ command line arguments ]]-- cmd = torch.CmdLine() cmd:text() -cmd:text('Train a Language Model using stacked LSTM on Google Billion Words dataset') +cmd:text('Train a multi-GPU Language Model using stacked LSTM on Google Billion Words dataset') cmd:text('Example:') cmd:text("th examples/multigpu-nce-rnnlm.lua --progress --earlystop 50 --device 2 --seqlen 20 --hiddensize '{200,200}' --batchsize 20 --startlr 1 --uniform 0.1 --cutoff 5 --schedule '{[5]=0.5,[6]=0.25,[7]=0.125,[8]=0.0625,[9]=0.03125,[10]=0.015625,[11]=0.0078125,[12]=0.00390625}'") cmd:text("th examples/multigpu-nce-rnnlm.lua.lua --trainsize 400000 --validsize 40000 --cutoff 10 --batchsize 128 --seqlen 100 --hiddensize '{250,250}' --progress --device 2") diff --git a/examples/recurrent-language-model.lua b/examples/recurrent-language-model.lua index 635fa74..60c5aac 100644 --- a/examples/recurrent-language-model.lua +++ b/examples/recurrent-language-model.lua @@ -260,7 +260,6 @@ while opt.maxepoch <= 0 or epoch <= opt.maxepoch do -- Note : -- Perplexity = exp( sum ( NLL ) / #w) -- Bits Per Word = log2(Perplexity) - -- Bits per Char = BPW * (#w / #c) print("Validation PPL : "..ppl) xplog.valppl[epoch] = ppl diff --git a/init.lua b/init.lua index 02a4f2d..c210f91 100644 --- a/init.lua +++ b/init.lua @@ -19,6 +19,7 @@ torch.include('rnn', 'Dropout.lua') -- for testing: torch.include('rnn', 'test.lua') +torch.include('rnn', 'bigtest.lua') -- support modules torch.include('rnn', 'ZeroGrad.lua') diff --git a/scripts/evaluate-rnnlm.lua b/scripts/evaluate-rnnlm.lua index 1c3d434..8228847 100644 --- a/scripts/evaluate-rnnlm.lua +++ b/scripts/evaluate-rnnlm.lua @@ -116,17 +116,28 @@ if opt.nsample > 0 then print(table.concat(sampletext, ' ')) end else - local sumErr = 0 + local sumErr, count = 0, 0 for i, inputs, targets in testset:subiter(xplog.opt.seqlen or 100) do + inputs:apply(function(x) + if x > 0 then + count = count + 1 + end + end) local targets = targetmodule:forward(targets) local inputs = opt.nce and {inputs, targets} or inputs local outputs = lm:forward(inputs) local err = criterion:forward(outputs, targets) sumErr = sumErr + err end + + if count ~= testset:size() then + local meanseqlen = testset:size()/(testset:size() - count) + print("mean sequence length : "..meanseqlen) + print("Old (wrong) Test PPL : "..torch.exp(sumErr/testset:size())) + end - local ppl = torch.exp(sumErr/testset:size()) + local ppl = torch.exp(sumErr/count) print("Test PPL : "..ppl) end diff --git a/test/test.lua b/test/test.lua index 05b678b..5c84b82 100644 --- a/test/test.lua +++ b/test/test.lua @@ -2647,13 +2647,11 @@ function rnntest.SequencerCriterion() sc:cuda() local gradInput4 = {} - for i=1,nStep do - input[i] = input[i]:cuda() - target[i] = target[i]:cuda() - end + input = input:cuda() + target = target:cuda() local err4 = sc:forward(input, target) - mytester:assert(math.abs(err - err4) < 0.000001, "SequencerCriterion forward cuda err") + mytester:assert(math.abs(errTensorInput - err4) < 0.000001, "SequencerCriterion forward cuda err") local gradInput4 = sc:backward(input, target) for i=1,nStep do mytester:assertTensorEq(gradInput4[i]:float(), gradInput3[i], 0.000001, "SequencerCriterion backward cuda err "..i) @@ -5891,7 +5889,7 @@ function rnntest.NCE_MaskZero() batchsize = 4, seqlen = 5, uniform = 0.1, - hiddensize = {10}, + hiddensize = {100}, vocabsize = 100, dropout = 0, k = 25 @@ -5989,7 +5987,7 @@ function rnntest.NCE_MaskZero() end end mytester:assert(found) - mytester:assert(err < starterr) + mytester:assert(err < starterr, string.format("err=%f should be smaller than starterr=%f", err, starterr)) end local function check_size(x, dims) From 539f0cb6740e92661f2105e33469830c0d62c27a Mon Sep 17 00:00:00 2001 From: nicholas-leonard Date: Wed, 22 Jun 2016 16:32:59 -0400 Subject: [PATCH 6/7] multigpu runs again --- examples/multigpu-nce-rnnlm.lua | 60 ++++++++++++------------- examples/noise-contrastive-estimate.lua | 17 +++---- 2 files changed, 38 insertions(+), 39 deletions(-) diff --git a/examples/multigpu-nce-rnnlm.lua b/examples/multigpu-nce-rnnlm.lua index bd088b5..ebb2777 100644 --- a/examples/multigpu-nce-rnnlm.lua +++ b/examples/multigpu-nce-rnnlm.lua @@ -15,13 +15,13 @@ cmd:text("th examples/multigpu-nce-rnnlm.lua.lua --trainsize 400000 --validsize cmd:text("th scripts/evaluate-rnnlm.lua --xplogpath /data/save/rnnlm/ptb:atlas:1458081269:1.t7 --cuda") cmd:text('Options:') -- training -cmd:option('--startlr', 0.05, 'learning rate at t=0') -cmd:option('--minlr', 0.00001, 'minimum learning rate') -cmd:option('--saturate', 400, 'epoch at which linear decayed LR will reach minlr') +cmd:option('--startlr', 0.7, 'learning rate at t=0') +cmd:option('--minlr', 0.001, 'minimum learning rate') +cmd:option('--saturate', 300, 'epoch at which linear decayed LR will reach minlr') cmd:option('--schedule', '', 'learning rate schedule. e.g. {[5] = 0.004, [6] = 0.001}') -cmd:option('--momentum', 0.9, 'momentum') +cmd:option('--momentum', -1, 'momentum (requires an additional copy of all params)') cmd:option('--maxnormout', -1, 'max l2-norm of each layer\'s output neuron weights') -cmd:option('--cutoff', -1, 'max l2-norm of concatenation of all gradParam tensors') +cmd:option('--cutoff', 10, 'max l2-norm of concatenation of all gradParam tensors') cmd:option('--device', 1, 'sets the device (GPU) to use') cmd:option('--profile', false, 'profile updateOutput,updateGradInput and accGradParameters in Sequential') cmd:option('--maxepoch', 1000, 'maximum number of epochs to run') @@ -29,7 +29,7 @@ cmd:option('--earlystop', 50, 'maximum number of epochs to wait to find a better cmd:option('--progress', false, 'print progress bar') cmd:option('--silent', false, 'don\'t print anything to stdout') cmd:option('--uniform', 0.1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization') -cmd:option('--k', 25, 'how many noise samples to use for NCE') +cmd:option('--k', 400, 'how many noise samples to use for NCE') cmd:option('--continue', '', 'path to model for which training should be continued. Note that current options (except for device, cuda and tiny) will be ignored.') cmd:option('--Z', -1, 'normalization constant for NCE module (-1 approximates it from first batch).') cmd:option('--rownoise', false, 'sample k noise samples for each row for NCE module') @@ -98,12 +98,12 @@ if not lm then local inputsize = device == 1 and torch.floor(opt.inputsize/2) or torch.ceil(opt.inputsize/2) local lookup = nn.LookupTableMaskZero(#trainset.ivocab, inputsize) lookup.maxnormout = -1 -- prevent weird maxnormout behaviour - concat:add(nn.GPU(lookup, device)) -- input is seqlen x batchsize + concat:add(nn.GPU(lookup, device):cuda()) -- input is seqlen x batchsize end - lm:add(nn.GPU(concat, 2)) + lm:add(nn.GPU(concat, 2):cuda()) if opt.dropout > 0 then - lm:add(nn.GPU(nn.Dropout(opt.dropout), 2)) + lm:add(nn.GPU(nn.Dropout(opt.dropout), 2):cuda()) end -- rnn layers @@ -112,19 +112,27 @@ if not lm then -- this is a faster version of nn.Sequencer(nn.FastLSTM(inpusize, hiddensize)) local rnn = nn.SeqLSTM(inputsize, hiddensize) rnn.maskzero = true - local device = 2 -- i < #opt.hiddensize/2 and 1 or 2 - lm:add(nn.GPU(rnn, device)) + local device = i <= #opt.hiddensize/2 and 1 or 2 + lm:add(nn.GPU(rnn, device):cuda()) if opt.dropout > 0 then - lm:add(nn.GPU(nn.Dropout(opt.dropout), device)) + lm:add(nn.GPU(nn.Dropout(opt.dropout), device):cuda()) end inputsize = hiddensize end - lm:add(nn.GPU(nn.SplitTable(1), 3)) + lm:add(nn.GPU(nn.SplitTable(1), 3):cuda()) + + if opt.uniform > 0 then + for k,param in ipairs(lm:parameters()) do + assert(torch.type(param) == 'torch.CudaTensor') + cutorch.withDevice(param:getDevice(), function() param:uniform(-opt.uniform, opt.uniform) end) + end + end -- output layer local unigram = trainset.wordfreq:float() ncemodule = nn.NCEModule(inputsize, #trainset.ivocab, opt.k, unigram, opt.Z) + ncemodule:reset() -- initializes bias to get approx. Z = 1 ncemodule.batchnoise = not opt.rownoise -- distribute weight, gradWeight and momentum on devices 3 and 4 ncemodule:multicuda(3,4) @@ -136,16 +144,11 @@ if not lm then :add(nn.ZipTable()) -- {{x1,x2,...}, {t1,t2,...}} -> {{x1,t1},{x2,t2},...} -- encapsulate stepmodule into a Sequencer - lm:add(nn.GPU(nn.Sequencer(nn.MaskZero(ncemodule, 1)), 1, opt.device)) + local masked = nn.MaskZero(ncemodule, 1):cuda() + lm:add(nn.GPU(nn.Sequencer(masked), 3, opt.device):cuda()) -- remember previous state between batches lm:remember() - - if opt.uniform > 0 then - for k,param in ipairs(lm:parameters()) do - param:uniform(-opt.uniform, opt.uniform) - end - end end if opt.profile then @@ -163,23 +166,18 @@ if not (criterion and targetmodule) then local crit = nn.MaskZeroCriterion(nn.NCECriterion(), 0) -- target is also seqlen x batchsize. - targetmodule = nn.SplitTable(1) - if opt.cuda then - targetmodule = nn.Sequential() - :add(nn.Convert()) - :add(targetmodule) - end + targetmodule = nn.Sequential() + :add(nn.Convert()) + :add(nn.SplitTable(1)) criterion = nn.SequencerCriterion(crit) end --[[ CUDA ]]-- -if opt.cuda then - lm:cuda() - criterion:cuda() - targetmodule:cuda() -end +lm:cuda() +criterion:cuda() +targetmodule:cuda() --[[ experiment log ]]-- diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua index ab2a2c7..d896707 100644 --- a/examples/noise-contrastive-estimate.lua +++ b/examples/noise-contrastive-estimate.lua @@ -2,7 +2,7 @@ require 'paths' require 'rnn' require 'nngraph' local dl = require 'dataload' -assert(nn.NCEModule and nn.NCEModule.version and nn.NCEModule.version >= 4, "update dpnn : luarocks install dpnn") +assert(nn.NCEModule and nn.NCEModule.version and nn.NCEModule.version >= 6, "update dpnn : luarocks install dpnn") --[[ command line arguments ]]-- cmd = torch.CmdLine() @@ -29,19 +29,19 @@ cmd:option('--earlystop', 50, 'maximum number of epochs to wait to find a better cmd:option('--progress', false, 'print progress bar') cmd:option('--silent', false, 'don\'t print anything to stdout') cmd:option('--uniform', 0.1, 'initialize parameters using uniform distribution between -uniform and uniform. -1 means default initialization') -cmd:option('--k', 25, 'how many noise samples to use for NCE') +cmd:option('--k', 100, 'how many noise samples to use for NCE') cmd:option('--continue', '', 'path to model for which training should be continued. Note that current options (except for device, cuda and tiny) will be ignored.') -cmd:option('--Z', -1, 'normalization constant for NCE module (-1 approximates it from first batch).') +cmd:option('--Z', 1, 'normalization constant for NCE module (-1 approximates it from first batch).') cmd:option('--rownoise', false, 'sample k noise samples for each row for NCE module') -- rnn layer cmd:option('--seqlen', 50, 'sequence length : back-propagate through time (BPTT) for this many time-steps') cmd:option('--inputsize', -1, 'size of lookup table embeddings. -1 defaults to hiddensize[1]') -cmd:option('--hiddensize', '{200,200}', 'number of hidden units used at output of each recurrent layer. When more than one is specified, RNN/LSTMs/GRUs are stacked') +cmd:option('--hiddensize', '{256,256}', 'number of hidden units used at output of each recurrent layer. When more than one is specified, RNN/LSTMs/GRUs are stacked') cmd:option('--dropout', 0, 'ancelossy dropout with this probability after each rnn layer. dropout <= 0 disables it.') -- data -cmd:option('--batchsize', 32, 'number of examples per batch') -cmd:option('--trainsize', -1, 'number of train time-steps seen between each epoch') -cmd:option('--validsize', -1, 'number of valid time-steps used for early stopping and cross-validation') +cmd:option('--batchsize', 128, 'number of examples per batch') +cmd:option('--trainsize', 400000, 'number of train time-steps seen between each epoch') +cmd:option('--validsize', 40000, 'number of valid time-steps used for early stopping and cross-validation') cmd:option('--savepath', paths.concat(dl.SAVE_PATH, 'rnnlm'), 'path to directory where experiment log (includes model) will be saved') cmd:option('--id', '', 'id string of this experiment (used to name output file) (defaults to a unique id)') cmd:option('--tiny', false, 'use train_tiny.th7 training file') @@ -56,7 +56,7 @@ if not opt.silent then table.print(opt) end opt.id = opt.id == '' and ('gbw' .. ':' .. dl.uniqueid()) or opt.id -opt.version = 5 -- refactored multigpu into its own file +opt.version = 6 -- better NCE bias initialization + new default hyper-params if opt.cuda then -- do this before building model to prevent segfault require 'cunn' @@ -137,6 +137,7 @@ if not lm then for k,param in ipairs(lm:parameters()) do param:uniform(-opt.uniform, opt.uniform) end + ncemodule:reset() end end From bcad6c9a05ec49edf2ad487ba4ba6d92ef480906 Mon Sep 17 00:00:00 2001 From: nicholas-leonard Date: Tue, 28 Jun 2016 18:08:08 -0400 Subject: [PATCH 7/7] version first --- examples/noise-contrastive-estimate.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/noise-contrastive-estimate.lua b/examples/noise-contrastive-estimate.lua index d896707..88df764 100644 --- a/examples/noise-contrastive-estimate.lua +++ b/examples/noise-contrastive-estimate.lua @@ -52,11 +52,11 @@ local opt = cmd:parse(arg or {}) opt.hiddensize = loadstring(" return "..opt.hiddensize)() opt.schedule = loadstring(" return "..opt.schedule)() opt.inputsize = opt.inputsize == -1 and opt.hiddensize[1] or opt.inputsize +opt.id = opt.id == '' and ('gbw' .. ':' .. dl.uniqueid()) or opt.id +opt.version = 6 -- better NCE bias initialization + new default hyper-params if not opt.silent then table.print(opt) end -opt.id = opt.id == '' and ('gbw' .. ':' .. dl.uniqueid()) or opt.id -opt.version = 6 -- better NCE bias initialization + new default hyper-params if opt.cuda then -- do this before building model to prevent segfault require 'cunn'