diff --git a/CMakeLists.txt b/CMakeLists.txt index 1687a20..12ac42e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ if(DEFINED ENV{BOOST_ROOT}) set(Boost_NO_SYSTEM_PATHS ON) endif() set(Boost_REALPATH ON) -find_package(Boost COMPONENTS program_options serialization REQUIRED) +find_package(Boost COMPONENTS program_options serialization iostreams REQUIRED) include_directories(${Boost_INCLUDE_DIR}) set(LIBS ${LIBS} ${Boost_LIBRARIES}) diff --git a/parser/lstm-parse.cc b/parser/lstm-parse.cc index 3f8b006..0441898 100644 --- a/parser/lstm-parse.cc +++ b/parser/lstm-parse.cc @@ -2,29 +2,25 @@ #include #include #include -#include -#include -#include #include -#include #include #include -#include #include #include +#include #include #include +#include +#include #include #include "cnn/training.h" #include "cnn/cnn.h" #include "cnn/expr.h" -#include "cnn/nodes.h" #include "cnn/lstm.h" -#include "cnn/rnn.h" #include "c2.h" cpyp::Corpus corpus; @@ -80,6 +76,8 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("rel_dim", po::value()->default_value(10), "relation dimension") ("lstm_input_dim", po::value()->default_value(60), "LSTM input dimension") ("train,t", "Should training be run?") + ("maxit,M", po::value()->default_value(8000), "Maximum number of training iterations") + ("tolerance", po::value()->default_value(-1.0), "Tolerance on dev uas for stopping training") ("words,w", po::value(), "Pretrained word embeddings") ("use_spelling,S", "Use spelling model") //Miguel. 
Spelling model ("help,h", "Help"); @@ -847,7 +845,8 @@ void signal_callback_handler(int /* signum */) { requested_stop = true; } -unsigned compute_correct(const map& ref, const map& hyp, unsigned len) { +template +unsigned compute_correct(const map& ref, const map& hyp, unsigned len) { unsigned res = 0; for (unsigned i = 0; i < len; ++i) { auto ri = ref.find(i); @@ -859,20 +858,38 @@ unsigned compute_correct(const map& ref, const map& hyp, unsig return res; } +template +unsigned compute_correct(const map& ref1, const map& hyp1, + const map& ref2, const map& hyp2, unsigned len) { + unsigned res = 0; + for (unsigned i = 0; i < len; ++i) { + auto r1 = ref1.find(i); + auto h1 = hyp1.find(i); + auto r2 = ref2.find(i); + auto h2 = hyp2.find(i); + assert(r1 != ref1.end()); + assert(h1 != hyp1.end()); + assert(r2 != ref2.end()); + assert(h2 != hyp2.end()); + if (r1->second == h1->second && r2->second == h2->second) ++res; + } + return res; +} + void output_conll(const vector& sentence, const vector& pos, - const vector& sentenceUnkStrings, - const map& intToWords, - const map& intToPos, + const vector& sentenceUnkStrings, + const map& intToWords, + const map& intToPos, const map& hyp, const map& rel_hyp) { for (unsigned i = 0; i < (sentence.size()-1); ++i) { auto index = i + 1; - assert(i < sentenceUnkStrings.size() && + assert(i < sentenceUnkStrings.size() && ((sentence[i] == corpus.get_or_add_word(cpyp::Corpus::UNK) && sentenceUnkStrings[i].size() > 0) || (sentence[i] != corpus.get_or_add_word(cpyp::Corpus::UNK) && sentenceUnkStrings[i].size() == 0 && intToWords.find(sentence[i]) != intToWords.end()))); - string wit = (sentenceUnkStrings[i].size() > 0)? + string wit = (sentenceUnkStrings[i].size() > 0)? 
sentenceUnkStrings[i] : intToWords.find(sentence[i])->second; auto pit = intToPos.find(pos[i]); assert(hyp.find(i) != hyp.end()); @@ -884,10 +901,10 @@ void output_conll(const vector<unsigned>& sentence, const vector<unsigned>& pos, size_t first_char_in_rel = hyp_rel.find('(') + 1; size_t last_char_in_rel = hyp_rel.rfind(')') - 1; hyp_rel = hyp_rel.substr(first_char_in_rel, last_char_in_rel - first_char_in_rel + 1); - cout << index << '\t' // 1. ID + cout << index << '\t' // 1. ID << wit << '\t' // 2. FORM - << "_" << '\t' // 3. LEMMA - << "_" << '\t' // 4. CPOSTAG + << "_" << '\t' // 3. LEMMA + << "_" << '\t' // 4. CPOSTAG << pit->second << '\t' // 5. POSTAG << "_" << '\t' // 6. FEATS << hyp_head << '\t' // 7. HEAD @@ -898,10 +915,26 @@ void output_conll(const vector<unsigned>& sentence, const vector<unsigned>& pos, cout << endl; } +void init_pretrained(istream &in) { + string line; + vector<float> v(PRETRAINED_DIM, 0); + string word; + while (getline(in, line)) { + if (word.empty() && line.find('.') == std::string::npos) + continue; // first line contains vocabulary size and dimensions + istringstream lin(line); + lin >> word; + for (unsigned i = 0; i < PRETRAINED_DIM; ++i) lin >> v[i]; + unsigned id = corpus.get_or_add_word(word); + pretrained[id] = v; + } +} + + int main(int argc, char** argv) { cnn::Initialize(argc, argv); - cerr << "COMMAND:"; + cerr << "COMMAND:"; for (unsigned i = 0; i < static_cast<unsigned>(argc); ++i) cerr << ' ' << argv[i]; cerr << endl; unsigned status_every_i_iterations = 100; @@ -931,6 +964,12 @@ int main(int argc, char** argv) { } const double unk_prob = conf["unk_prob"].as<double>(); assert(unk_prob >= 0.); assert(unk_prob <= 1.); + const unsigned maxit = conf["maxit"].as<unsigned>(); + cerr << "Maximum number of iterations: " << maxit << "\n"; + const double tolerance = conf["tolerance"].as<double>(); + if (tolerance > 0.0) { + cerr << "Optimization tolerance: " << tolerance << "\n"; + } ostringstream os; os << "parser_" << (USE_POS ?
"pos" : "nopos") << '_' << LAYERS @@ -945,24 +984,24 @@ int main(int argc, char** argv) { const string fname = os.str(); cerr << "Writing parameters to file: " << fname << endl; bool softlinkCreated = false; - corpus.load_correct_actions(conf["training_data"].as()); + corpus.load_correct_actions(conf["training_data"].as()); const unsigned kUNK = corpus.get_or_add_word(cpyp::Corpus::UNK); kROOT_SYMBOL = corpus.get_or_add_word(ROOT_SYMBOL); if (conf.count("words")) { pretrained[kUNK] = vector(PRETRAINED_DIM, 0); - cerr << "Loading from " << conf["words"].as() << " with" << PRETRAINED_DIM << " dimensions\n"; - ifstream in(conf["words"].as().c_str()); - string line; - getline(in, line); - vector v(PRETRAINED_DIM, 0); - string word; - while (getline(in, line)) { - istringstream lin(line); - lin >> word; - for (unsigned i = 0; i < PRETRAINED_DIM; ++i) lin >> v[i]; - unsigned id = corpus.get_or_add_word(word); - pretrained[id] = v; + const string& words_fname = conf["words"].as(); + cerr << "Loading from " << words_fname << " with " << PRETRAINED_DIM << " dimensions\n"; + if (boost::algorithm::ends_with(words_fname, ".gz")) { + ifstream file(words_fname.c_str(), ios_base::in | ios_base::binary); + boost::iostreams::filtering_streambuf zip; + zip.push(boost::iostreams::zlib_decompressor()); + zip.push(file); + istream in(&zip); + init_pretrained(in); + } else { + ifstream in(words_fname.c_str()); + init_pretrained(in); // read as normal text } } @@ -1019,9 +1058,11 @@ int main(int argc, char** argv) { double right = 0; double llh = 0; bool first = true; - int iter = -1; - while(!requested_stop) { - ++iter; + unsigned iter = 0; + double uas = -1; + double prev_uas = -1; + while(!requested_stop && iter < maxit && + (tolerance < 0 || uas < 0 || prev_uas < 0 || abs(prev_uas - uas) > tolerance)) { for (unsigned sii = 0; sii < status_every_i_iterations; ++sii) { if (si == corpus.nsentences) { si = 0; @@ -1036,8 +1077,8 @@ int main(int argc, char** argv) { for (auto& w : 
tsentence) if (singletons.count(w) && cnn::rand01() < unk_prob) w = kUNK; } - const vector& sentencePos=corpus.sentencesPos[order[si]]; - const vector& actions=corpus.correct_act_sent[order[si]]; + const vector& sentencePos=corpus.sentencesPos[order[si]]; + const vector& actions=corpus.correct_act_sent[order[si]]; ComputationGraph hg; parser.log_prob_parser(&hg,sentence,tsentence,sentencePos,actions,corpus.actions,corpus.intToWords,&right); double lp = as_scalar(hg.incremental_forward()); @@ -1068,8 +1109,8 @@ int main(int argc, char** argv) { auto t_start = std::chrono::high_resolution_clock::now(); for (unsigned sii = 0; sii < dev_size; ++sii) { const vector& sentence=corpus.sentencesDev[sii]; - const vector& sentencePos=corpus.sentencesPosDev[sii]; - const vector& actions=corpus.correct_act_sentDev[sii]; + const vector& sentencePos=corpus.sentencesPosDev[sii]; + const vector& actions=corpus.correct_act_sentDev[sii]; vector tsentence=sentence; if (!USE_SPELLING) { for (auto& w : tsentence) @@ -1077,8 +1118,8 @@ int main(int argc, char** argv) { } ComputationGraph hg; - vector pred = parser.log_prob_parser(&hg,sentence,tsentence,sentencePos,vector(),corpus.actions,corpus.intToWords,&right); - double lp = 0; + vector pred = parser.log_prob_parser(&hg,sentence,tsentence,sentencePos,vector(),corpus.actions,corpus.intToWords,&right); + double lp = 0; //vector pred = parser.log_prob_parser_beam(&hg,sentence,sentencePos,corpus.actions,beam_size,&lp); llh -= lp; trs += actions.size(); @@ -1089,7 +1130,9 @@ int main(int argc, char** argv) { total_heads += sentence.size() - 1; } auto t_end = std::chrono::high_resolution_clock::now(); - cerr << " **dev (iter=" << iter << " epoch=" << (tot_seen / corpus.nsentences) << ")\tllh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " uas: " << (correct_heads / total_heads) << "\t[" << dev_size << " sents in " << std::chrono::duration(t_end-t_start).count() << " ms]" << endl; + prev_uas = uas; + uas = 
correct_heads / total_heads; + cerr << " **dev (iter=" << iter << " epoch=" << (tot_seen / corpus.nsentences) << ")\tllh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " uas: " << uas << "\t[" << dev_size << " sents in " << std::chrono::duration<double, std::milli>(t_end-t_start).count() << " ms]" << endl; if (correct_heads > best_correct_heads) { best_correct_heads = correct_heads; ofstream out(fname); @@ -1099,29 +1142,36 @@ int main(int argc, char** argv) { // easier to refer to it in a shell script. if (!softlinkCreated) { string softlink = " latest_model"; - if (system((string("rm -f ") + softlink).c_str()) == 0 && + if (system((string("rm -f ") + softlink).c_str()) == 0 && system((string("ln -s ") + fname + softlink).c_str()) == 0) { - cerr << "Created " << softlink << " as a soft link to " << fname + cerr << "Created " << softlink << " as a soft link to " << fname << " for convenience." << endl; } softlinkCreated = true; } } } + ++iter; + } + if (iter >= maxit) { + cerr << "\nMaximum number of iterations reached (" << iter << "), terminating optimization...\n"; + } else if (!requested_stop) { + cerr << "\nScore tolerance reached (" << tolerance << "), terminating optimization...\n"; } } // should do training?
if (true) { // do test evaluation double llh = 0; double trs = 0; double right = 0; - double correct_heads = 0; + double correct_heads_unlabeled = 0; + double correct_heads_labeled = 0; double total_heads = 0; auto t_start = std::chrono::high_resolution_clock::now(); unsigned corpus_size = corpus.nsentencesDev; for (unsigned sii = 0; sii < corpus_size; ++sii) { const vector& sentence=corpus.sentencesDev[sii]; - const vector& sentencePos=corpus.sentencesPosDev[sii]; - const vector& sentenceUnkStr=corpus.sentencesStrDev[sii]; + const vector& sentencePos=corpus.sentencesPosDev[sii]; + const vector& sentenceUnkStr=corpus.sentencesStrDev[sii]; const vector& actions=corpus.correct_act_sentDev[sii]; vector tsentence=sentence; if (!USE_SPELLING) { @@ -1141,11 +1191,12 @@ int main(int argc, char** argv) { map ref = parser.compute_heads(sentence.size(), actions, corpus.actions, &rel_ref); map hyp = parser.compute_heads(sentence.size(), pred, corpus.actions, &rel_hyp); output_conll(sentence, sentencePos, sentenceUnkStr, corpus.intToWords, corpus.intToPos, hyp, rel_hyp); - correct_heads += compute_correct(ref, hyp, sentence.size() - 1); + correct_heads_unlabeled += compute_correct(ref, hyp, sentence.size() - 1); + correct_heads_labeled += compute_correct(ref, hyp, rel_ref, rel_hyp, sentence.size() - 1); total_heads += sentence.size() - 1; } auto t_end = std::chrono::high_resolution_clock::now(); - cerr << "TEST llh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " uas: " << (correct_heads / total_heads) << "\t[" << corpus_size << " sents in " << std::chrono::duration(t_end-t_start).count() << " ms]" << endl; + cerr << "TEST llh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " uas: " << (correct_heads_unlabeled / total_heads) << " las: " << (correct_heads_labeled / total_heads) << "\t[" << corpus_size << " sents in " << std::chrono::duration(t_end-t_start).count() << " ms]" << endl; } for (unsigned i = 0; i < 
corpus.actions.size(); ++i) { //cerr << corpus.actions[i] << '\t' << parser.p_r->values[i].transpose() << endl;