From 5f93c727612813a3edca3185f3a801bc9aee9a5b Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 11:32:44 -0500 Subject: [PATCH 01/88] Minor style & documentation updates --- parser/corpus.cc | 4 ++-- parser/corpus.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 1928cfa..9815f48 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -134,8 +134,8 @@ void TrainingCorpus::OracleTransitionsCorpusReader::LoadCorrectActions( }; while (getline(actionsFile, lineS)) { - ReplaceStringInPlace(lineS, "-RRB-", "_RRB_"); - ReplaceStringInPlace(lineS, "-LRB-", "_LRB_"); + ReplaceStringInPlace(&lineS, "-RRB-", "_RRB_"); + ReplaceStringInPlace(&lineS, "-LRB-", "_LRB_"); // An empty line marks the end of a sentence. if (lineS.empty()) { next_is_action_line = false; diff --git a/parser/corpus.h b/parser/corpus.h index 9be1a9d..9a92c0a 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -250,17 +250,17 @@ class TrainingCorpus : public Corpus { else return 0; } private: - bool is_training; + bool is_training; // can be dev rather than actual training void LoadCorrectActions(const std::string& file, TrainingCorpus* corpus) const; }; - static inline void ReplaceStringInPlace(std::string& subject, + static inline void ReplaceStringInPlace(std::string* subject, const std::string& search, const std::string& replace) { size_t pos = 0; - while ((pos = subject.find(search, pos)) != std::string::npos) { - subject.replace(pos, search.length(), replace); + while ((pos = subject->find(search, pos)) != std::string::npos) { + subject->replace(pos, search.length(), replace); pos += replace.length(); } } From 51cda203336a5ccf247056a97cc205c07ac6b354 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 12:08:41 -0500 Subject: [PATCH 02/88] Made TrainingCorpus an intermediate level in the hierarchy --- parser/corpus.cc | 6 +++--- parser/corpus.h | 19 +++++++++++++------ parser/lstm-parser-driver.cc | 16 +++++++++------- parser/lstm-parser.cc | 16 ++++++++-------- parser/lstm-parser.h | 7 ++++--- 5 files changed, 37 insertions(+), 27 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 9815f48..63b353c 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -79,7 +79,7 @@ void ConllUCorpusReader::ReadSentences(const string& file, -void TrainingCorpus::CountSingletons() { +void ParserTrainingCorpus::CountSingletons() { // compute the singletons in the parser's training data map counts; for (const auto& sent : sentences) { @@ -94,8 +94,8 @@ void TrainingCorpus::CountSingletons() { } -void TrainingCorpus::OracleTransitionsCorpusReader::LoadCorrectActions( - const string& file, TrainingCorpus* corpus) const { +void ParserTrainingCorpus::OracleTransitionsCorpusReader::LoadCorrectActions( + const string& file, ParserTrainingCorpus* corpus) const { // TODO: break up this function? cerr << "Loading " << (is_training ? "training" : "dev") << " corpus from " << file << "..." << endl; diff --git a/parser/corpus.h b/parser/corpus.h index 9a92c0a..2562f40 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -207,11 +207,17 @@ class Corpus { // Corpus for subclasses to inherit and use. Subclasses are then responsible // for doing any corpus-reading or setup. Corpus(CorpusVocabulary* vocab) : vocab(vocab) {} - }; class TrainingCorpus : public Corpus { +public: + TrainingCorpus(CorpusVocabulary* vocab) : Corpus(vocab) {} + std::vector> correct_act_sent; +}; + + +class ParserTrainingCorpus : public TrainingCorpus { public: friend class OracleTransitionsCorpusReader; @@ -220,9 +226,9 @@ class TrainingCorpus : public Corpus { std::vector> correct_act_sent; std::set singletons; - TrainingCorpus(CorpusVocabulary* vocab, const std::string& file, - bool is_training) : - Corpus(vocab) { + ParserTrainingCorpus(CorpusVocabulary* vocab, const std::string& file, + bool is_training) : + TrainingCorpus(vocab) { OracleTransitionsCorpusReader reader(is_training); reader.ReadSentences(file, this); } @@ -234,7 +240,8 @@ class TrainingCorpus : public Corpus { is_training(is_training) {} virtual void ReadSentences(const std::string& file, Corpus* corpus) const { - TrainingCorpus* training_corpus = static_cast(corpus); + ParserTrainingCorpus* training_corpus = + static_cast(corpus); LoadCorrectActions(file, training_corpus); } @@ -252,7 +259,7 @@ class TrainingCorpus : public Corpus { private: bool is_training; // can be dev rather than actual training void LoadCorrectActions(const std::string& file, - TrainingCorpus* corpus) const; + ParserTrainingCorpus* corpus) const; }; static inline void ReplaceStringInPlace(std::string* subject, diff --git a/parser/lstm-parser-driver.cc b/parser/lstm-parser-driver.cc index 8d6b235..a1caacc 100644 --- a/parser/lstm-parser-driver.cc +++ b/parser/lstm-parser-driver.cc @@ -135,7 +135,7 @@ int main(int argc, char** argv) { parser.reset(new LSTMParser(cmd_options, words, false)); } - unique_ptr dev_corpus; // shared by train/evaluate + unique_ptr dev_corpus; // shared by train/evaluate if (train) { if (!conf.count("training_data") || !conf.count("dev_data")) { @@ -145,14 +145,16 @@ int main(int argc, char** argv) { } signal(SIGINT, signal_callback_handler); - TrainingCorpus training_corpus(&parser->vocab, - conf["training_data"].as(), true); + ParserTrainingCorpus training_corpus(&parser->vocab, + conf["training_data"].as(), + true); parser->FinalizeVocab(); cerr << "Total number of words: " << training_corpus.vocab->CountWords() << endl; // OOV words will be replaced by UNK tokens - dev_corpus.reset(new TrainingCorpus(&parser->vocab, - conf["dev_data"].as(), false)); + dev_corpus.reset( + new ParserTrainingCorpus(&parser->vocab, conf["dev_data"].as(), + false)); ostringstream os; os << "parser_" << (parser->options.use_pos ? "pos" : "nopos") @@ -180,8 +182,8 @@ int main(int argc, char** argv) { cerr << "Evaluating model on " << conf["dev_data"].as() << endl; if (!train) { // Didn't already load dev corpus for training dev_corpus.reset( - new TrainingCorpus(&parser->vocab, conf["dev_data"].as(), - false)); + new ParserTrainingCorpus(&parser->vocab, + conf["dev_data"].as(), false)); } parser->Evaluate(*dev_corpus); } diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 790aba0..ad1b1cf 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -447,9 +447,9 @@ void LSTMParser::SaveModel(const string& model_fname, bool softlink_created) { } -void LSTMParser::Train(const TrainingCorpus& corpus, - const TrainingCorpus& dev_corpus, const double unk_prob, - const string& model_fname, +void LSTMParser::Train(const ParserTrainingCorpus& corpus, + const ParserTrainingCorpus& dev_corpus, + const double unk_prob, const string& model_fname, const volatile bool* requested_stop) { bool softlink_created = false; int best_correct_heads = 0; @@ -627,11 +627,11 @@ void LSTMParser::DoTest(const Corpus& corpus, bool evaluate, } if (evaluate) { - // Downcast to TrainingCorpus to get gold-standard data. We can only get - // here if this function was called by Evaluate, which statically checks - // that the corpus is in fact a TrainingCorpus, so this cast is safe. - const TrainingCorpus& training_corpus = - static_cast(corpus); + // Downcast to ParserTrainingCorpus to get gold-standard data. We can only + // get here if this function was called by Evaluate, which statically + // checks that the corpus is in fact a TrainingCorpus, so casting is safe. + const ParserTrainingCorpus& training_corpus = + static_cast(corpus); const vector& actions = training_corpus.correct_act_sent[sii]; ParseTree ref = RecoverParseTree(sentence, actions, corpus.vocab->actions, corpus.vocab->actions_to_arc_labels, diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 5d89922..ad2c9b1 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -191,15 +191,16 @@ class LSTMParser { const std::vector& actions_to_arc_labels, bool labeled = false); - void Train(const TrainingCorpus& corpus, const TrainingCorpus& dev_corpus, - const double unk_prob, const std::string& model_fname, + void Train(const ParserTrainingCorpus& corpus, + const ParserTrainingCorpus& dev_corpus, const double unk_prob, + const std::string& model_fname, const volatile bool* requested_stop = nullptr); void Test(const Corpus& corpus) { DoTest(corpus, false, true); } - void Evaluate(const TrainingCorpus& corpus, bool output_parses=false) { + void Evaluate(const ParserTrainingCorpus& corpus, bool output_parses=false) { DoTest(corpus, true, output_parses); } From 4423e464a85daa07062a15372b78e0e8bee6d566 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 12:44:20 -0500 Subject: [PATCH 03/88] Unified sentence-reading for oracle transition readers --- parser/corpus.cc | 2 +- parser/corpus.h | 34 ++++++++++++++++++++++------------ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 63b353c..3e5d532 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -94,7 +94,7 @@ void ParserTrainingCorpus::CountSingletons() { } -void ParserTrainingCorpus::OracleTransitionsCorpusReader::LoadCorrectActions( +void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( const string& file, ParserTrainingCorpus* corpus) const { // TODO: break up this function? cerr << "Loading " << (is_training ? "training" : "dev") diff --git a/parser/corpus.h b/parser/corpus.h index 2562f40..c18fa93 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -212,8 +212,24 @@ class Corpus { class TrainingCorpus : public Corpus { public: - TrainingCorpus(CorpusVocabulary* vocab) : Corpus(vocab) {} std::vector> correct_act_sent; + +protected: + class OracleTransitionsCorpusReader : public CorpusReader { + public: + OracleTransitionsCorpusReader(bool is_training) : + is_training(is_training) { + } + protected: + bool is_training; // can be dev rather than actual training + }; + + TrainingCorpus(CorpusVocabulary* vocab, + const OracleTransitionsCorpusReader& reader, + const std::string& file) : + Corpus(vocab, reader, file) {} + + TrainingCorpus(CorpusVocabulary* vocab) : Corpus(vocab) {} }; @@ -222,22 +238,17 @@ class ParserTrainingCorpus : public TrainingCorpus { friend class OracleTransitionsCorpusReader; bool USE_SPELLING = false; - - std::vector> correct_act_sent; std::set singletons; ParserTrainingCorpus(CorpusVocabulary* vocab, const std::string& file, bool is_training) : - TrainingCorpus(vocab) { - OracleTransitionsCorpusReader reader(is_training); - reader.ReadSentences(file, this); - } + TrainingCorpus(vocab, OracleParseTransitionsReader(is_training), file) {} private: - class OracleTransitionsCorpusReader : public CorpusReader { + class OracleParseTransitionsReader : public OracleTransitionsCorpusReader{ public: - OracleTransitionsCorpusReader(bool is_training) : - is_training(is_training) {} + OracleParseTransitionsReader(bool is_training) : + OracleTransitionsCorpusReader(is_training) {} virtual void ReadSentences(const std::string& file, Corpus* corpus) const { ParserTrainingCorpus* training_corpus = @@ -245,7 +256,7 @@ class ParserTrainingCorpus : public TrainingCorpus { LoadCorrectActions(file, training_corpus); } - virtual ~OracleTransitionsCorpusReader() {}; + virtual ~OracleParseTransitionsReader() {}; static inline unsigned UTF8Len(unsigned char x) { if (x < 0x80) return 1; @@ -257,7 +268,6 @@ class ParserTrainingCorpus : public TrainingCorpus { else return 0; } private: - bool is_training; // can be dev rather than actual training void LoadCorrectActions(const std::string& file, ParserTrainingCorpus* corpus) const; }; From 5e27f5b2ba7e1ea0e9e1ca3be1c6dc9b0fe8a121 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 17:58:10 -0500 Subject: [PATCH 04/88] Fixed build settings to be better cmake style --- CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d065fb4..7d1b1f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,12 @@ project(lstm-parser) cmake_minimum_required(VERSION 2.8 FATAL_ERROR) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo) +endif(NOT CMAKE_BUILD_TYPE) + set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) -set(CMAKE_CXX_FLAGS "-Wall -std=c++11 -O3 -g") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++11") enable_testing() From 1514d35cae0576a8b36a46ff6208256b4ab33eac Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 18:07:06 -0500 Subject: [PATCH 05/88] Split up transition-reading code Shortens function; also allows it to be reused for other transition-based NLP systems. --- parser/corpus.cc | 165 +++++++++++++++++++++++------------------- parser/corpus.h | 26 +++++-- parser/lstm-parser.cc | 3 +- 3 files changed, 111 insertions(+), 83 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 3e5d532..4214e8e 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -94,9 +94,93 @@ void ParserTrainingCorpus::CountSingletons() { } +void ParserTrainingCorpus::OracleParseTransitionsReader::RecordWord( + const string& word, const string& pos, unsigned next_token_index, + CorpusVocabulary* vocab, ParserTrainingCorpus* corpus, + map* sentence, map* sentence_pos, + map* sentence_unk_surface_forms) const { + // We assume that we'll have seen all POS tags in training, so don't + // worry about OOV tags. + unsigned pos_id = vocab->GetOrAddEntry(pos, &vocab->pos_to_int, + &vocab->int_to_pos); + + unsigned word_id; + if (is_training) { + unsigned num_words = vocab->CountWords(); // store for later check + word_id = vocab->GetOrAddWord(word, true); + if (vocab->CountWords() > num_words) { + // A new word was added; add its chars, too. + unsigned j = 0; + while (j < word.length()) { + unsigned char_utf8_len = UTF8Len(word[j]); + string next_utf8_char = word.substr(j, char_utf8_len); + vocab->GetOrAddEntry(next_utf8_char, &vocab->chars_to_int, + &vocab->int_to_chars); + j += char_utf8_len; + } + } else { + // It's an old word. Make sure it's marked as present in training. + vocab->int_to_training_word[word_id] = true; + } + } else { + // add an empty string for any token except OOVs (it is easy to + // recover the surface form of non-OOV using intToWords(id)). + // OOV word + if (corpus->USE_SPELLING) { + word_id = vocab->GetOrAddWord(word); // don't record as training + (*sentence_unk_surface_forms)[next_token_index] = ""; + } else { + auto word_iter = vocab->words_to_int.find(word); + if (word_iter == vocab->words_to_int.end()) { + // Save the surface form of this OOV. + (*sentence_unk_surface_forms)[next_token_index] = word; + word_id = vocab->words_to_int[vocab->UNK]; + } else { + (*sentence_unk_surface_forms)[next_token_index] = ""; + word_id = word_iter->second; + } + } + } + + (*sentence)[next_token_index] = word_id; + (*sentence_pos)[next_token_index] = pos_id; +} + +void ParserTrainingCorpus::OracleParseTransitionsReader::RecordAction( + const string& action, bool start_of_sentence, CorpusVocabulary* vocab, + ParserTrainingCorpus* corpus) const { + auto PushAction = // should be inlined; defined here for DRY reasons + [corpus, start_of_sentence](unsigned action_index) { + if (start_of_sentence) + corpus->correct_act_sent.push_back( {action_index} ); + else + corpus->correct_act_sent.back().push_back(action_index); + }; + + auto action_iter = find(vocab->actions.begin(), vocab->actions.end(), action); + if (action_iter != vocab->actions.end()) { + unsigned action_index = distance(vocab->actions.begin(), action_iter); + PushAction(action_index); + } else { // A not-previously-seen action + if (is_training) { + vocab->actions.push_back(action); + unsigned action_index = vocab->actions.size() - 1; + PushAction(action_index); + } else { + // TODO: right now, new actions which haven't been observed in + // training are not added to correct_act_sent. In dev, this may + // be a problem if there is little training data. + cerr << "WARNING: encountered unknown transition in dev corpus: " + << action << endl; + if (start_of_sentence) + corpus->correct_act_sent.push_back({}); + } + } +} + + void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( const string& file, ParserTrainingCorpus* corpus) const { - // TODO: break up this function? cerr << "Loading " << (is_training ? "training" : "dev") << " corpus from " << file << "..." << endl; ifstream actionsFile(file); @@ -192,84 +276,14 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( pos = CorpusVocabulary::ROOT; } - // We assume that we'll have seen all POS tags in training, so don't - // worry about OOV tags. - unsigned pos_id = vocab->GetOrAddEntry(pos, &vocab->pos_to_int, - &vocab->int_to_pos); // Use 1-indexed token IDs to leave room for ROOT in position 0. unsigned next_token_index = sentence.size() + 1; - unsigned word_id; - if (is_training) { - unsigned num_words = vocab->CountWords(); // store for later check - word_id = vocab->GetOrAddWord(word, true); - if (vocab->CountWords() > num_words) { - // A new word was added; add its chars, too. - unsigned j = 0; - while (j < word.length()) { - unsigned char_utf8_len = UTF8Len(word[j]); - string next_utf8_char = word.substr(j, char_utf8_len); - vocab->GetOrAddEntry(next_utf8_char, &vocab->chars_to_int, - &vocab->int_to_chars); - j += char_utf8_len; - } - } else { - // It's an old word. Make sure it's marked as present in training. - vocab->int_to_training_word[word_id] = true; - } - } else { - // add an empty string for any token except OOVs (it is easy to - // recover the surface form of non-OOV using intToWords(id)). - // OOV word - if (corpus->USE_SPELLING) { - word_id = vocab->GetOrAddWord(word); // don't record as training - sentence_unk_surface_forms[next_token_index] = ""; - } else { - auto word_iter = vocab->words_to_int.find(word); - if (word_iter == vocab->words_to_int.end()) { - // Save the surface form of this OOV. - sentence_unk_surface_forms[next_token_index] = word; - word_id = vocab->words_to_int[vocab->UNK]; - } else { - sentence_unk_surface_forms[next_token_index] = ""; - word_id = word_iter->second; - } - } - } - - sentence[next_token_index] = word_id; - sentence_pos[next_token_index] = pos_id; + RecordWord(word, pos, next_token_index, vocab, corpus, &sentence, + &sentence_pos, &sentence_unk_surface_forms); } while (iss); } - } else if (next_is_action_line) { - auto action_iter = find(vocab->actions.begin(), vocab->actions.end(), - lineS); - if (action_iter != vocab->actions.end()) { - unsigned action_index = distance(vocab->actions.begin(), action_iter); - if (start_of_sentence) - corpus->correct_act_sent.push_back({action_index}); - else - corpus->correct_act_sent.back().push_back(action_index); - } else { // A not-previously-seen action - if (is_training) { - vocab->actions.push_back(lineS); - vocab->actions_to_arc_labels.push_back( - vocab->GetLabelForAction(lineS)); - - unsigned action_index = vocab->actions.size() - 1; - if (start_of_sentence) - corpus->correct_act_sent.push_back({action_index}); - else - corpus->correct_act_sent.back().push_back(action_index); - } else { - // TODO: right now, new actions which haven't been observed in - // training are not added to correct_act_sent. In dev, this may - // be a problem if there is little training data. - cerr << "WARNING: encountered unknown transition in dev corpus: " - << lineS << endl; - if (start_of_sentence) - corpus->correct_act_sent.push_back({}); - } - } + } else { // next_is_action_line + RecordAction(lineS, start_of_sentence, vocab, corpus); start_of_sentence = false; } @@ -292,6 +306,7 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( cerr << "done." << "\n"; if (is_training) { for (auto a : vocab->actions) { + vocab->actions_to_arc_labels.push_back(vocab->GetLabelForAction(a)); cerr << a << "\n"; } } diff --git a/parser/corpus.h b/parser/corpus.h index c18fa93..05710ac 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -91,7 +91,7 @@ class CorpusVocabulary { } } - static inline std::string GetLabelForAction(const std::string& action) { + virtual std::string GetLabelForAction(const std::string& action) { if (boost::starts_with(action, "RIGHT-ARC") || boost::starts_with(action, "LEFT-ARC")) { size_t first_char_in_rel = action.find('(') + 1; @@ -224,11 +224,8 @@ class TrainingCorpus : public Corpus { bool is_training; // can be dev rather than actual training }; - TrainingCorpus(CorpusVocabulary* vocab, - const OracleTransitionsCorpusReader& reader, - const std::string& file) : - Corpus(vocab, reader, file) {} - + // Don't provide access to reader constructor -- object won't be fully + // constructed yet, so it would segfault. TrainingCorpus(CorpusVocabulary* vocab) : Corpus(vocab) {} }; @@ -242,7 +239,9 @@ class ParserTrainingCorpus : public TrainingCorpus { ParserTrainingCorpus(CorpusVocabulary* vocab, const std::string& file, bool is_training) : - TrainingCorpus(vocab, OracleParseTransitionsReader(is_training), file) {} + TrainingCorpus(vocab) { + OracleParseTransitionsReader(is_training).ReadSentences(file, this); + } private: class OracleParseTransitionsReader : public OracleTransitionsCorpusReader{ @@ -267,6 +266,19 @@ class ParserTrainingCorpus : public TrainingCorpus { else if ((x >> 1) == 0x7e) return 6; else return 0; } + + protected: + void RecordWord( + const std::string& word, const std::string& pos, + unsigned next_token_index, CorpusVocabulary* vocab, + ParserTrainingCorpus* corpus, std::map* sentence, + std::map* sentence_pos, + std::map* sentence_unk_surface_forms) const; + + void RecordAction(const std::string& action, bool start_of_sentence, + CorpusVocabulary* vocab, + ParserTrainingCorpus* corpus) const; + private: void LoadCorrectActions(const std::string& file, ParserTrainingCorpus* corpus) const; diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index ad1b1cf..720f323 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -629,7 +629,8 @@ void LSTMParser::DoTest(const Corpus& corpus, bool evaluate, if (evaluate) { // Downcast to ParserTrainingCorpus to get gold-standard data. We can only // get here if this function was called by Evaluate, which statically - // checks that the corpus is in fact a TrainingCorpus, so casting is safe. + // checks that the corpus is in fact a ParserTrainingCorpus, so this cast + // is safe. const ParserTrainingCorpus& training_corpus = static_cast(corpus); const vector& actions = training_corpus.correct_act_sent[sii]; From a49a559d4433ffcced8f384eb6d96d4b78e98173 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 18:24:01 -0500 Subject: [PATCH 06/88] Moved string replacement function --- parser/corpus.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/parser/corpus.h b/parser/corpus.h index 05710ac..1098c00 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -220,6 +220,16 @@ class TrainingCorpus : public Corpus { OracleTransitionsCorpusReader(bool is_training) : is_training(is_training) { } + + static inline void ReplaceStringInPlace(std::string* subject, + const std::string& search, + const std::string& replace) { + size_t pos = 0; + while ((pos = subject->find(search, pos)) != std::string::npos) { + subject->replace(pos, search.length(), replace); + pos += replace.length(); + } + } protected: bool is_training; // can be dev rather than actual training }; @@ -284,16 +294,6 @@ class ParserTrainingCorpus : public TrainingCorpus { ParserTrainingCorpus* corpus) const; }; - static inline void ReplaceStringInPlace(std::string* subject, - const std::string& search, - const std::string& replace) { - size_t pos = 0; - while ((pos = subject->find(search, pos)) != std::string::npos) { - subject->replace(pos, search.length(), replace); - pos += replace.length(); - } - } - void CountSingletons(); }; From 39d0efcae5c7e09fca0b021a4b56c2217b2829a1 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 18:39:56 -0500 Subject: [PATCH 07/88] Moved recording functions into base training corpus reader Also some corrections for consistent variable naming conventions --- parser/corpus.cc | 67 ++++++++++++++++++++++++++---------------------- parser/corpus.h | 52 ++++++++++++++++++++----------------- 2 files changed, 64 insertions(+), 55 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 4214e8e..1aa0845 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -78,7 +78,6 @@ void ConllUCorpusReader::ReadSentences(const string& file, } - void ParserTrainingCorpus::CountSingletons() { // compute the singletons in the parser's training data map counts; @@ -94,9 +93,9 @@ void ParserTrainingCorpus::CountSingletons() { } -void ParserTrainingCorpus::OracleParseTransitionsReader::RecordWord( +void TrainingCorpus::OracleTransitionsCorpusReader::RecordWord( const string& word, const string& pos, unsigned next_token_index, - CorpusVocabulary* vocab, ParserTrainingCorpus* corpus, + CorpusVocabulary* vocab, TrainingCorpus* corpus, map* sentence, map* sentence_pos, map* sentence_unk_surface_forms) const { // We assume that we'll have seen all POS tags in training, so don't @@ -146,9 +145,10 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::RecordWord( (*sentence_pos)[next_token_index] = pos_id; } -void ParserTrainingCorpus::OracleParseTransitionsReader::RecordAction( + +void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( const string& action, bool start_of_sentence, CorpusVocabulary* vocab, - ParserTrainingCorpus* corpus) const { + TrainingCorpus* corpus) const { auto PushAction = // should be inlined; defined here for DRY reasons [corpus, start_of_sentence](unsigned action_index) { if (start_of_sentence) @@ -179,12 +179,29 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::RecordAction( } +void TrainingCorpus::OracleTransitionsCorpusReader::RecordSentence( + TrainingCorpus* corpus, map* sentence, + map* sentence_pos, + map* sentence_unk_surface_forms) const { + // Store the sentence variables and clear them for the next sentence. + corpus->sentences.push_back({}); + corpus->sentences.back().swap(*sentence); + corpus->sentences_pos.push_back({}); + corpus->sentences_pos.back().swap(*sentence_pos); + if (!is_training) { + corpus->sentences_unk_surface_forms.push_back({}); + corpus->sentences_unk_surface_forms.back().swap( + *sentence_unk_surface_forms); + } +} + + void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( const string& file, ParserTrainingCorpus* corpus) const { cerr << "Loading " << (is_training ? "training" : "dev") << " corpus from " << file << "..." << endl; - ifstream actionsFile(file); - string lineS; + ifstream actions_file(file); + string line; CorpusVocabulary* vocab = corpus->vocab; bool next_is_action_line = false; @@ -217,24 +234,16 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( } }; - while (getline(actionsFile, lineS)) { - ReplaceStringInPlace(&lineS, "-RRB-", "_RRB_"); - ReplaceStringInPlace(&lineS, "-LRB-", "_LRB_"); + while (getline(actions_file, line)) { + ReplaceStringInPlace(&line, "-RRB-", "_RRB_"); + ReplaceStringInPlace(&line, "-LRB-", "_LRB_"); // An empty line marks the end of a sentence. - if (lineS.empty()) { + if (line.empty()) { next_is_action_line = false; if (!first) { // if first, first line is blank, but no sentence yet FixRootID(); - // Store the sentence variables and clear them for the next sentence. - corpus->sentences.push_back({}); - corpus->sentences.back().swap(sentence); - corpus->sentences_pos.push_back({}); - corpus->sentences_pos.back().swap(sentence_pos); - if (!is_training) { - corpus->sentences_unk_surface_forms.push_back({}); - corpus->sentences_unk_surface_forms.back().swap( - sentence_unk_surface_forms); - } + RecordSentence(corpus, &sentence, &sentence_pos, + &sentence_unk_surface_forms); } start_of_sentence = true; continue; // don't update next_is_action_line @@ -247,9 +256,9 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( // the initial line in each sentence should look like: // [][the-det, cat-noun, is-verb, on-adp, the-det, mat-noun, ,-punct, ROOT-ROOT] // first, get rid of the square brackets. - lineS = lineS.substr(3, lineS.size() - 4); + line = line.substr(3, line.size() - 4); // read the initial line, token by token "the-det," "cat-noun," ... - istringstream iss(lineS); + istringstream iss(line); do { string word; iss >> word; @@ -283,7 +292,7 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( } while (iss); } } else { // next_is_action_line - RecordAction(lineS, start_of_sentence, vocab, corpus); + RecordAction(line, start_of_sentence, vocab, corpus); start_of_sentence = false; } @@ -293,15 +302,11 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( // Add the last sentence. if (sentence.size() > 0) { FixRootID(); - corpus->sentences.push_back(move(sentence)); - corpus->sentences_pos.push_back(move(sentence_pos)); - if (!is_training) { - corpus->sentences_unk_surface_forms.push_back( - move(sentence_unk_surface_forms)); - } + RecordSentence(corpus, &sentence, &sentence_pos, + &sentence_unk_surface_forms); } - actionsFile.close(); + actions_file.close(); cerr << "done." << "\n"; if (is_training) { diff --git a/parser/corpus.h b/parser/corpus.h index 1098c00..c80659d 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -213,6 +213,7 @@ class Corpus { class TrainingCorpus : public Corpus { public: std::vector> correct_act_sent; + bool USE_SPELLING = false; protected: class OracleTransitionsCorpusReader : public CorpusReader { @@ -230,8 +231,34 @@ class TrainingCorpus : public Corpus { pos += replace.length(); } } + protected: bool is_training; // can be dev rather than actual training + + void RecordWord( + const std::string& word, const std::string& pos, + unsigned next_token_index, CorpusVocabulary* vocab, + TrainingCorpus* corpus, std::map* sentence, + std::map* sentence_pos, + std::map* sentence_unk_surface_forms) const; + + void RecordAction(const std::string& action, bool start_of_sentence, + CorpusVocabulary* vocab, TrainingCorpus* corpus) const; + + void RecordSentence( + TrainingCorpus* corpus, std::map* sentence, + std::map* sentence_pos, + std::map* sentence_unk_surface_forms) const; + + static inline unsigned UTF8Len(unsigned char x) { + if (x < 0x80) return 1; + else if ((x >> 5) == 0x06) return 2; + else if ((x >> 4) == 0x0e) return 3; + else if ((x >> 3) == 0x1e) return 4; + else if ((x >> 2) == 0x3e) return 5; + else if ((x >> 1) == 0x7e) return 6; + else return 0; + } }; // Don't provide access to reader constructor -- object won't be fully @@ -244,7 +271,6 @@ class ParserTrainingCorpus : public TrainingCorpus { public: friend class OracleTransitionsCorpusReader; - bool USE_SPELLING = false; std::set singletons; ParserTrainingCorpus(CorpusVocabulary* vocab, const std::string& file, @@ -261,34 +287,12 @@ class ParserTrainingCorpus : public TrainingCorpus { virtual void ReadSentences(const std::string& file, Corpus* corpus) const { ParserTrainingCorpus* training_corpus = - static_cast(corpus); + static_cast(corpus); LoadCorrectActions(file, training_corpus); } virtual ~OracleParseTransitionsReader() {}; - static inline unsigned UTF8Len(unsigned char x) { - if (x < 0x80) return 1; - else if ((x >> 5) == 0x06) return 2; - else if ((x >> 4) == 0x0e) return 3; - else if ((x >> 3) == 0x1e) return 4; - else if ((x >> 2) == 0x3e) return 5; - else if ((x >> 1) == 0x7e) return 6; - else return 0; - } - - protected: - void RecordWord( - const std::string& word, const std::string& pos, - unsigned next_token_index, CorpusVocabulary* vocab, - ParserTrainingCorpus* corpus, std::map* sentence, - std::map* sentence_pos, - std::map* sentence_unk_surface_forms) const; - - void RecordAction(const std::string& action, bool start_of_sentence, - CorpusVocabulary* vocab, - ParserTrainingCorpus* corpus) const; - private: void LoadCorrectActions(const std::string& file, ParserTrainingCorpus* corpus) const; From 40ec8b8b185fc16d3ececc8e477df6f1c58d5caf Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 20:26:25 -0500 Subject: [PATCH 08/88] Style fix --- parser/corpus.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 1aa0845..3696238 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -270,14 +270,14 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( word = word.substr(0, word.size() - 1); } // split the string (at '-') into word and POS tag. - size_t posIndex = word.rfind('-'); - if (posIndex == string::npos) { + size_t pos_index = word.rfind('-'); + if (pos_index == string::npos) { cerr << "can't find the dash in '" << word << "'" << endl; } - assert(posIndex != string::npos); - string pos = word.substr(posIndex + 1); - word = word.substr(0, posIndex); + assert(pos_index != string::npos); + string pos = word.substr(pos_index + 1); + word = word.substr(0, pos_index); if (pos == ORACLE_ROOT_POS) { // Prevent any confusion with the actual word "ROOT". From 04013830158b62e9b5853b6bb3a063f1d967bfd0 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 20:57:42 -0500 Subject: [PATCH 09/88] Indentation fix --- parser/corpus.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 3696238..7470d5b 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -173,7 +173,7 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( cerr << "WARNING: encountered unknown transition in dev corpus: " << action << endl; if (start_of_sentence) - corpus->correct_act_sent.push_back({}); + corpus->correct_act_sent.push_back({}); } } } From dfd6a6e3ee8bf7b715b93c88723d169beaea32df Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 22:07:20 -0500 Subject: [PATCH 10/88] Cleaned up oracle action recording a bit --- parser/corpus.cc | 26 ++++++++++---------------- parser/corpus.h | 7 ++++--- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 7470d5b..9c299da 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -147,33 +147,23 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordWord( void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( - const string& action, bool start_of_sentence, CorpusVocabulary* vocab, + const string& action, CorpusVocabulary* vocab, TrainingCorpus* corpus) const { - auto PushAction = // should be inlined; defined here for DRY reasons - [corpus, start_of_sentence](unsigned action_index) { - if (start_of_sentence) - corpus->correct_act_sent.push_back( {action_index} ); - else - corpus->correct_act_sent.back().push_back(action_index); - }; - auto action_iter = find(vocab->actions.begin(), vocab->actions.end(), action); if (action_iter != vocab->actions.end()) { unsigned action_index = distance(vocab->actions.begin(), action_iter); - PushAction(action_index); + corpus->correct_act_sent.back().push_back(action_index); } else { // A not-previously-seen action if (is_training) { vocab->actions.push_back(action); unsigned action_index = vocab->actions.size() - 1; - PushAction(action_index); + corpus->correct_act_sent.back().push_back(action_index); } else { // TODO: right now, new actions which haven't been observed in // training are not added to correct_act_sent. In dev, this may // be a problem if there is little training data. cerr << "WARNING: encountered unknown transition in dev corpus: " << action << endl; - if (start_of_sentence) - corpus->correct_act_sent.push_back({}); } } } @@ -182,7 +172,7 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( void TrainingCorpus::OracleTransitionsCorpusReader::RecordSentence( TrainingCorpus* corpus, map* sentence, map* sentence_pos, - map* sentence_unk_surface_forms) const { + map* sentence_unk_surface_forms, bool final) const { // Store the sentence variables and clear them for the next sentence. corpus->sentences.push_back({}); corpus->sentences.back().swap(*sentence); @@ -193,6 +183,9 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordSentence( corpus->sentences_unk_surface_forms.back().swap( *sentence_unk_surface_forms); } + if (!final) { + corpus->correct_act_sent.push_back({}); + } } @@ -211,6 +204,7 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( map sentence; map sentence_pos; map sentence_unk_surface_forms; + corpus->correct_act_sent.push_back({}); // We'll need to make sure ROOT token has a consistent ID. // (Should get inlined; defined here for DRY purposes.) @@ -292,7 +286,7 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( } while (iss); } } else { // next_is_action_line - RecordAction(line, start_of_sentence, vocab, corpus); + RecordAction(line, vocab, corpus); start_of_sentence = false; } @@ -303,7 +297,7 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( if (sentence.size() > 0) { FixRootID(); RecordSentence(corpus, &sentence, &sentence_pos, - &sentence_unk_surface_forms); + &sentence_unk_surface_forms, true); } actions_file.close(); diff --git a/parser/corpus.h b/parser/corpus.h index c80659d..a7d1b09 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -242,13 +242,14 @@ class TrainingCorpus : public Corpus { std::map* sentence_pos, std::map* sentence_unk_surface_forms) const; - void RecordAction(const std::string& action, bool start_of_sentence, - CorpusVocabulary* vocab, TrainingCorpus* corpus) const; + void RecordAction(const std::string& action, CorpusVocabulary* vocab, + TrainingCorpus* corpus) const; void RecordSentence( TrainingCorpus* corpus, std::map* sentence, std::map* sentence_pos, - std::map* sentence_unk_surface_forms) const; + std::map* sentence_unk_surface_forms, + bool final = false) const; static inline unsigned UTF8Len(unsigned char x) { if (x < 0x80) return 1; From 1991a7595fd4f9cbac498bce3dfc131d199d6a01 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 22:30:36 -0500 Subject: [PATCH 11/88] Switched action-to-label to be more general and use boost::regex --- CMakeLists.txt | 2 +- parser/corpus.cc | 5 +++++ parser/corpus.h | 14 +++++++------- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d1b1f1..a4e62ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ if(DEFINED ENV{BOOST_ROOT}) set(Boost_NO_SYSTEM_PATHS ON) endif() set(Boost_REALPATH ON) -find_package(Boost COMPONENTS program_options serialization iostreams REQUIRED) +find_package(Boost COMPONENTS program_options serialization iostreams regex REQUIRED) include_directories(${Boost_INCLUDE_DIR}) set(LIBS ${LIBS} ${Boost_LIBRARIES}) diff --git a/parser/corpus.cc b/parser/corpus.cc index 9c299da..37e14ed 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -16,6 +16,11 @@ constexpr unsigned Corpus::ROOT_TOKEN_ID; const string CorpusVocabulary::BAD0 = ""; const string CorpusVocabulary::UNK = ""; const string CorpusVocabulary::ROOT = ""; +// We assume that actions with arcs will be of the form +// "action-name(arc-label)". Allow any non-paren characters, followed by the +// label name in parens. (Group 1 is the label name.) +const boost::regex CorpusVocabulary::ARC_ACTION_REGEX( + {"[^\\(\\)]+\\(([^\\(\\)]+)\\)"}); const string ORACLE_ROOT_POS = "ROOT"; diff --git a/parser/corpus.h b/parser/corpus.h index a7d1b09..cf00a8f 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -2,6 +2,7 @@ #define CORPUS_H #include +#include #include #include #include @@ -91,13 +92,10 @@ class CorpusVocabulary { } } - virtual std::string GetLabelForAction(const std::string& action) { - if (boost::starts_with(action, "RIGHT-ARC") || - boost::starts_with(action, "LEFT-ARC")) { - size_t first_char_in_rel = action.find('(') + 1; - size_t last_char_in_rel = action.rfind(')') - 1; - return action.substr( - first_char_in_rel, last_char_in_rel - first_char_in_rel + 1); + static inline std::string GetLabelForAction(const std::string& action) { + boost::smatch match; + if (boost::regex_search(action, match, ARC_ACTION_REGEX)) { + return match[1]; } else { return "NONE"; } @@ -106,6 +104,8 @@ class CorpusVocabulary { private: friend class boost::serialization::access; + static const boost::regex ARC_ACTION_REGEX; + template // Shared code: serialize the number-to-string mappings, from which the // reverse mappings can be reconstructed. From 566cf5789a31f500074cff7bbd2c04a66a4176cb Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 15 Jan 2017 23:17:11 -0500 Subject: [PATCH 12/88] Removed unnecessary record fn params; added vocab copy ctor --- parser/corpus.cc | 14 ++++++++------ parser/corpus.h | 16 ++++++++++++---- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 37e14ed..3daf5a0 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -100,11 +100,12 @@ void ParserTrainingCorpus::CountSingletons() { void TrainingCorpus::OracleTransitionsCorpusReader::RecordWord( const string& word, const string& pos, unsigned next_token_index, - CorpusVocabulary* vocab, TrainingCorpus* corpus, - map* sentence, map* sentence_pos, + TrainingCorpus* corpus, map* sentence, + map* sentence_pos, map* sentence_unk_surface_forms) const { // We assume that we'll have seen all POS tags in training, so don't // worry about OOV tags. + CorpusVocabulary* vocab = corpus->vocab; unsigned pos_id = vocab->GetOrAddEntry(pos, &vocab->pos_to_int, &vocab->int_to_pos); @@ -152,8 +153,8 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordWord( void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( - const string& action, CorpusVocabulary* vocab, - TrainingCorpus* corpus) const { + const string& action, TrainingCorpus* corpus) const { + CorpusVocabulary* vocab = corpus->vocab; auto action_iter = find(vocab->actions.begin(), vocab->actions.end(), action); if (action_iter != vocab->actions.end()) { unsigned action_index = distance(vocab->actions.begin(), action_iter); @@ -163,6 +164,7 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( vocab->actions.push_back(action); unsigned action_index = vocab->actions.size() - 1; corpus->correct_act_sent.back().push_back(action_index); + vocab->actions_to_arc_labels.push_back(vocab->GetLabelForAction(action)); } else { // TODO: right now, new actions which haven't been observed in // training are not added to correct_act_sent. In dev, this may @@ -286,12 +288,12 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( // Use 1-indexed token IDs to leave room for ROOT in position 0. unsigned next_token_index = sentence.size() + 1; - RecordWord(word, pos, next_token_index, vocab, corpus, &sentence, + RecordWord(word, pos, next_token_index, corpus, &sentence, &sentence_pos, &sentence_unk_surface_forms); } while (iss); } } else { // next_is_action_line - RecordAction(line, vocab, corpus); + RecordAction(line, corpus); start_of_sentence = false; } diff --git a/parser/corpus.h b/parser/corpus.h index cf00a8f..bdf036d 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -44,6 +44,15 @@ class CorpusVocabulary { AddEntry(BAD0, &chars_to_int, &int_to_chars); } + // Copy constructor: Copy everything except action-related stuff, on the + // assumption that we're copying the vocabulary for use in another task with + // different actions. + CorpusVocabulary(const CorpusVocabulary& other) : + words_to_int(other.words_to_int), int_to_words(other.int_to_words), + int_to_training_word(other.int_to_training_word), + pos_to_int(other.pos_to_int), int_to_pos(other.int_to_pos), + chars_to_int(other.chars_to_int), int_to_chars(other.int_to_chars) {} + inline unsigned CountPOS() { return pos_to_int.size(); } inline unsigned CountWords() { return words_to_int.size(); } inline unsigned CountChars() { return chars_to_int.size(); } @@ -237,13 +246,12 @@ class TrainingCorpus : public Corpus { void RecordWord( const std::string& word, const std::string& pos, - unsigned next_token_index, CorpusVocabulary* vocab, - TrainingCorpus* corpus, std::map* sentence, + unsigned next_token_index, TrainingCorpus* corpus, + std::map* sentence, std::map* sentence_pos, std::map* sentence_unk_surface_forms) const; - void RecordAction(const std::string& action, CorpusVocabulary* vocab, - TrainingCorpus* corpus) const; + void RecordAction(const std::string& action, TrainingCorpus* corpus) const; void RecordSentence( TrainingCorpus* corpus, std::map* sentence, From 96fadfc1788f7f041ce84c29fc5c18bfd5279f5c Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Fri, 20 Jan 2017 10:53:33 -0500 Subject: [PATCH 13/88] Fixed training command in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b7d53bd..369808e 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Given a `training.conll` file and a `development.conll` formatted according to t java -jar ParserOracleArcStdWithSwap.jar -t -1 -l 1 -c training.conll > trainingOracle.txt java -jar ParserOracleArcStdWithSwap.jar -t -1 -l 1 -c development.conll > devOracle.txt - parser/lstm-parse -P -t trainingOracle.txt -d devOracle.txt --hidden_dim 100 --lstm_input_dim 100 -w sskip.100.vectors --pretrained_dim 100 --rel_dim 20 --action_dim 20 + parser/lstm-parse --train -t trainingOracle.txt -d devOracle.txt --hidden_dim 100 --lstm_input_dim 100 --words sskip.100.vectors --rel_dim 20 --action_dim 20 --use_pos_tags Link to the word vectors used in the ACL 2015 paper for English: [sskip.100.vectors](https://drive.google.com/file/d/0B8nESzOdPhLsdWF2S1Ayb1RkTXc/view?usp=sharing). From 9fc237dfb07bca04b0c35d6c6fa6935e3cf86e5e Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 24 Jan 2017 14:15:00 -0500 Subject: [PATCH 14/88] Correct-count pointer for logprob defaults to null --- parser/lstm-parser.cc | 2 +- parser/lstm-parser.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 720f323..bea19b9 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -339,7 +339,7 @@ vector LSTMParser::LogProbParser( // If we have reference actions (for training), use the reference action. if (build_training_graph) { action = correct_actions[action_count]; - if (best_a == action) { + if (correct && best_a == action) { (*correct)++; } } diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index ad2c9b1..5ec3d2d 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -181,7 +181,8 @@ class LSTMParser { ParseTree Parse(const std::map& sentence, const std::map& sentence_pos, - const CorpusVocabulary& vocab, bool labeled, double* correct); + const CorpusVocabulary& vocab, bool labeled, + double* correct = nullptr); // take a vector of actions and return a parse tree ParseTree RecoverParseTree( @@ -209,7 +210,7 @@ class LSTMParser { const std::map& sentence, const std::map& sentence_pos, const CorpusVocabulary& vocab, cnn::ComputationGraph *cg, - double* correct); + double* correct = nullptr); void LoadPretrainedWords(const std::string& words_path); @@ -230,7 +231,7 @@ class LSTMParser { const std::map& sentPos, const std::vector& correct_actions, const std::vector& action_names, - const std::vector& int_to_words, double* right); + const std::vector& int_to_words, double* correct); void SaveModel(const std::string& model_fname, bool softlink_created); From 651a3cafc8f168db9289d3f67dc6994732154f5e Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 24 Jan 2017 17:02:07 -0500 Subject: [PATCH 15/88] Minor variable name cleanup for consistency --- parser/lstm-parser.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index bea19b9..5a89940 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -348,16 +348,16 @@ vector LSTMParser::LogProbParser( results.push_back(action); // add current action to action LSTM - Expression actione = lookup(*hg, p_a, action); - action_lstm.add_input(actione); + Expression action_e = lookup(*hg, p_a, action); + action_lstm.add_input(action_e); // get relation embedding from action (TODO: convert to rel from action?) Expression relation = lookup(*hg, p_r, action); // do action - const string& actionString = action_names[action]; - const char ac = actionString[0]; - const char ac2 = actionString[1]; + const string& action_string = action_names[action]; + const char ac = action_string[0]; + const char ac2 = action_string[1]; if (ac == 'S' && ac2 == 'H') { // SHIFT assert(buffer.size() > 1); // dummy symbol means > 1 (not >= 1) From e6afc7375d3a4550184b1825baf48f47e4399a64 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 31 Jan 2017 15:27:30 -0500 Subject: [PATCH 16/88] Allowed accessing final parser state outside LogProbParser --- parser/lstm-parser.cc | 15 +++++++++++---- parser/lstm-parser.h | 6 ++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 5a89940..35096f5 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -227,7 +227,8 @@ vector LSTMParser::LogProbParser( const map& sent, // sentence with OOVs replaced const map& sent_pos, const vector& correct_actions, const vector& action_names, - const vector& int_to_words, double* correct) { + const vector& int_to_words, double* correct, + Expression* final_parser_state) { // TODO: break up this function? assert(finalized); vector results; @@ -306,6 +307,7 @@ vector LSTMParser::LogProbParser( stack_lstm.add_input(stack.back()); vector log_probs; unsigned action_count = 0; // incremented at each prediction + Expression p_t; // declared outside to allow access later while (stack.size() > 2 || buffer.size() > 1) { // get list of possible actions for the current parser state vector current_valid_actions; @@ -317,7 +319,7 @@ vector LSTMParser::LogProbParser( } // p_t = pbias + S * slstm + B * blstm + A * almst - Expression p_t = affine_transform( + p_t = affine_transform( {pbias, S, stack_lstm.back(), B, buffer_lstm.back(), A, action_lstm.back()}); Expression nlp_t = rectify(p_t); @@ -424,6 +426,10 @@ vector LSTMParser::LogProbParser( assert(bufferi.size() == 1); Expression tot_neglogprob = -sum(log_probs); assert(tot_neglogprob.pg != nullptr); + + if (final_parser_state) { + *final_parser_state = p_t; + } return results; } @@ -573,10 +579,11 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, } +// TODO: fix this so that correct actually does something sometimes vector LSTMParser::LogProbParser( const map& sentence, const map& sentence_pos, const CorpusVocabulary& vocab, - ComputationGraph *cg, double* correct) { + ComputationGraph *cg, double* correct, Expression* final_parser_state) { map tsentence(sentence); // sentence with OOVs replaced for (auto& index_and_id : tsentence) { // use reference to overwrite if (!vocab.int_to_training_word[index_and_id.second]) { @@ -585,7 +592,7 @@ vector LSTMParser::LogProbParser( } return LogProbParser(cg, sentence, tsentence, sentence_pos, vector(), vocab.actions, - vocab.int_to_words, correct); + vocab.int_to_words, correct, final_parser_state); } diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 5ec3d2d..6096056 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -210,7 +210,8 @@ class LSTMParser { const std::map& sentence, const std::map& sentence_pos, const CorpusVocabulary& vocab, cnn::ComputationGraph *cg, - double* correct = nullptr); + double* correct = nullptr, + cnn::expr::Expression* final_parser_state = nullptr); void LoadPretrainedWords(const std::string& words_path); @@ -231,7 +232,8 @@ class LSTMParser { const std::map& sentPos, const std::vector& correct_actions, const std::vector& action_names, - const std::vector& int_to_words, double* correct); + const std::vector& int_to_words, double* correct, + cnn::expr::Expression* final_parser_state = nullptr); void SaveModel(const std::string& model_fname, bool softlink_created); From 3f77fcfb9ceec9ef169bec1c5e1bae825fed4c01 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 31 Jan 2017 16:05:44 -0500 Subject: [PATCH 17/88] Fixed command line processing to output help if no args given --- parser/lstm-parser-driver.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/lstm-parser-driver.cc b/parser/lstm-parser-driver.cc index a1caacc..e5aaf91 100644 --- a/parser/lstm-parser-driver.cc +++ b/parser/lstm-parser-driver.cc @@ -54,7 +54,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description dcmdline_options; dcmdline_options.add(opts); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("help")) { + if (conf->count("help") || argc == 1) { cerr << dcmdline_options << endl; exit(0); } From 6975c90b2fdbadbe5dc66f1e85360f260cebf314 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 31 Jan 2017 16:27:29 -0500 Subject: [PATCH 18/88] More naming consistency --- parser/lstm-parser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 6096056..09587f0 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -229,7 +229,7 @@ class LSTMParser { cnn::ComputationGraph* hg, const std::map& raw_sent, // raw sentence const std::map& sent, // sentence with OOVs replaced - const std::map& sentPos, + const std::map& sent_pos, const std::vector& correct_actions, const std::vector& action_names, const std::vector& int_to_words, double* correct, From 0f05ecde018360aeba7197c7f4e8c067a210b180 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 31 Jan 2017 16:41:32 -0500 Subject: [PATCH 19/88] Got rid of meaningless 'err' reporting when evaluating --- parser/lstm-parser.cc | 34 ++++++++++++++++------------------ parser/lstm-parser.h | 4 +--- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 35096f5..82e8543 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -541,7 +541,6 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, // dev_size = 100; double llh = 0; double trs = 0; - double correct = 0; double correct_heads = 0; double total_heads = 0; auto t_start = chrono::high_resolution_clock::now(); @@ -549,7 +548,7 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, const map& sentence = dev_corpus.sentences[sii]; const map& sentence_pos = dev_corpus.sentences_pos[sii]; - ParseTree hyp = Parse(sentence, sentence_pos, vocab, false, &correct); + ParseTree hyp = Parse(sentence, sentence_pos, vocab, false); double lp = 0; llh -= lp; @@ -562,13 +561,15 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, correct_heads += ComputeCorrect(ref, hyp); total_heads += sentence.size() - 1; // -1 to account for ROOT } + auto t_end = chrono::high_resolution_clock::now(); + auto ms = chrono::duration(t_end - t_start).count(); cerr << " **dev (iter=" << iter << " epoch=" - << (tot_seen / num_sentences) << ")\tllh=" << llh << " ppl: " - << exp(llh / trs) << " err: " << (trs - correct) / trs << " uas: " - << (correct_heads / total_heads) << "\t[" << dev_size << " sents in " - << chrono::duration(t_end - t_start).count() << " ms]" - << endl; + << (tot_seen / num_sentences) << ")\tllh=" << llh + << " ppl: " << exp(llh / trs) + << " uas: " << (correct_heads / total_heads) + << "\t[" << dev_size << " sents in " << ms << " ms]" << endl; + if (correct_heads > best_correct_heads) { best_correct_heads = correct_heads; SaveModel(model_fname, softlink_created); @@ -583,7 +584,7 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, vector LSTMParser::LogProbParser( const map& sentence, const map& sentence_pos, const CorpusVocabulary& vocab, - ComputationGraph *cg, double* correct, Expression* final_parser_state) { + ComputationGraph *cg, Expression* final_parser_state) { map tsentence(sentence); // sentence with OOVs replaced for (auto& index_and_id : tsentence) { // use reference to overwrite if (!vocab.int_to_training_word[index_and_id.second]) { @@ -592,17 +593,15 @@ vector LSTMParser::LogProbParser( } return LogProbParser(cg, sentence, tsentence, sentence_pos, vector(), vocab.actions, - vocab.int_to_words, correct, final_parser_state); + vocab.int_to_words, nullptr, final_parser_state); } ParseTree LSTMParser::Parse(const map& sentence, - const map& sentence_pos, - const CorpusVocabulary& vocab, - bool labeled, double* correct) { + const map& sentence_pos, + const CorpusVocabulary& vocab, bool labeled) { ComputationGraph cg; - vector pred = LogProbParser(sentence, sentence_pos, vocab, &cg, - correct); + vector pred = LogProbParser(sentence, sentence_pos, vocab, &cg); return RecoverParseTree(sentence, pred, vocab.actions, vocab.actions_to_arc_labels, labeled); } @@ -616,7 +615,6 @@ void LSTMParser::DoTest(const Corpus& corpus, bool evaluate, } double llh = 0; double trs = 0; - double correct = 0; double correct_heads = 0; double total_heads = 0; auto t_start = chrono::high_resolution_clock::now(); @@ -626,7 +624,7 @@ void LSTMParser::DoTest(const Corpus& corpus, bool evaluate, const map& sentence_pos = corpus.sentences_pos[sii]; const map& sentence_unk_str = corpus.sentences_unk_surface_forms[sii]; - ParseTree hyp = Parse(sentence, sentence_pos, vocab, true, &correct); + ParseTree hyp = Parse(sentence, sentence_pos, vocab, true); if (output_parses) { OutputConll(sentence, sentence_pos, sentence_unk_str, corpus.vocab->int_to_words, corpus.vocab->int_to_pos, @@ -651,8 +649,8 @@ void LSTMParser::DoTest(const Corpus& corpus, bool evaluate, } auto t_end = chrono::high_resolution_clock::now(); if (evaluate) { - cerr << "TEST llh=" << llh << " ppl: " << exp(llh / trs) << " err: " - << (trs - correct) / trs << " uas: " << (correct_heads / total_heads) + cerr << "TEST llh=" << llh << " ppl: " << exp(llh / trs) + << " uas: " << (correct_heads / total_heads) << "\t[" << corpus_size << " sents in " << chrono::duration(t_end - t_start).count() << " ms]" << endl; diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 09587f0..680594a 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -181,8 +181,7 @@ class LSTMParser { ParseTree Parse(const std::map& sentence, const std::map& sentence_pos, - const CorpusVocabulary& vocab, bool labeled, - double* correct = nullptr); + const CorpusVocabulary& vocab, bool labeled); // take a vector of actions and return a parse tree ParseTree RecoverParseTree( @@ -210,7 +209,6 @@ class LSTMParser { const std::map& sentence, const std::map& sentence_pos, const CorpusVocabulary& vocab, cnn::ComputationGraph *cg, - double* correct = nullptr, cnn::expr::Expression* final_parser_state = nullptr); void LoadPretrainedWords(const std::string& words_path); From 01736cb41204ce7db994bca4606b8c28302d6c5a Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 31 Jan 2017 17:04:07 -0500 Subject: [PATCH 20/88] Added logprob from NN to ParseTree data structure Makes numbers in logging accurate --- parser/lstm-parser.cc | 13 +++++++++---- parser/lstm-parser.h | 7 ++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 82e8543..512a093 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -174,7 +174,7 @@ bool LSTMParser::IsActionForbidden(const string& a, unsigned bsize, ParseTree LSTMParser::RecoverParseTree( const map& sentence, const vector& actions, const vector& action_names, - const vector& actions_to_arc_labels, bool labeled) { + const vector& actions_to_arc_labels, double logprob, bool labeled) { ParseTree tree(sentence, labeled); vector bufferi(sentence.size() + 1); bufferi[0] = -999; @@ -217,6 +217,8 @@ ParseTree LSTMParser::RecoverParseTree( } assert(bufferi.size() == 1); //assert(stacki.size() == 2); + + tree.logprob = logprob; return tree; } @@ -548,10 +550,10 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, const map& sentence = dev_corpus.sentences[sii]; const map& sentence_pos = dev_corpus.sentences_pos[sii]; + ParseTree hyp = Parse(sentence, sentence_pos, vocab, false); + llh += hyp.logprob; - double lp = 0; - llh -= lp; const vector& actions = dev_corpus.correct_act_sent[sii]; ParseTree ref = RecoverParseTree( sentence, actions, dev_corpus.vocab->actions, @@ -602,8 +604,9 @@ ParseTree LSTMParser::Parse(const map& sentence, const CorpusVocabulary& vocab, bool labeled) { ComputationGraph cg; vector pred = LogProbParser(sentence, sentence_pos, vocab, &cg); + double lp = as_scalar(cg.incremental_forward()); return RecoverParseTree(sentence, pred, vocab.actions, - vocab.actions_to_arc_labels, labeled); + vocab.actions_to_arc_labels, labeled, lp); } @@ -643,10 +646,12 @@ void LSTMParser::DoTest(const Corpus& corpus, bool evaluate, corpus.vocab->actions_to_arc_labels, true); trs += actions.size(); + llh += hyp.logprob; correct_heads += ComputeCorrect(ref, hyp); total_heads += sentence.size() - 1; // -1 to account for ROOT } } + auto t_end = chrono::high_resolution_clock::now(); if (evaluate) { cerr << "TEST llh=" << llh << " ppl: " << exp(llh / trs) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 680594a..9f90f48 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -71,11 +71,12 @@ class ParseTree { static std::string NO_LABEL; // Barebones representation of a parse tree. const std::map& sentence; + double logprob; ParseTree(const std::map& sentence, bool labeled = true) : sentence(sentence), - arc_labels( labeled ? new std::map : nullptr) { - } + logprob(0), + arc_labels( labeled ? new std::map : nullptr) {} inline void SetParent(unsigned child_index, unsigned parent_index, const std::string& arc_label="") { @@ -188,7 +189,7 @@ class LSTMParser { const std::map& sentence, const std::vector& actions, const std::vector& action_names, - const std::vector& actions_to_arc_labels, + const std::vector& actions_to_arc_labels, double logprob = 0, bool labeled = false); void Train(const ParserTrainingCorpus& corpus, From c693089e9040e7885f0fd13a902f7626c1990024 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 2 Feb 2017 16:02:07 -0500 Subject: [PATCH 21/88] Some typedefs --- parser/corpus.cc | 22 ++++++++++------------ parser/corpus.h | 18 ++++++++++-------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 3daf5a0..929aeed 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -27,9 +27,9 @@ const string ORACLE_ROOT_POS = "ROOT"; void ConllUCorpusReader::ReadSentences(const string& file, Corpus* corpus) const { string next_line; - map current_sentence_unk_surface_forms; - map current_sentence; - map current_sentence_pos; + Corpus::SentenceUnkMap current_sentence_unk_surface_forms; + Corpus::SentenceMap current_sentence; + Corpus::SentenceMap current_sentence_pos; ifstream conll_file(file); unsigned unk_word_symbol = corpus->vocab->GetWord(CorpusVocabulary::UNK); @@ -100,9 +100,8 @@ void ParserTrainingCorpus::CountSingletons() { void TrainingCorpus::OracleTransitionsCorpusReader::RecordWord( const string& word, const string& pos, unsigned next_token_index, - TrainingCorpus* corpus, map* sentence, - map* sentence_pos, - map* sentence_unk_surface_forms) const { + TrainingCorpus* corpus, SentenceMap* sentence, SentenceMap* sentence_pos, + SentenceUnkMap* sentence_unk_surface_forms) const { // We assume that we'll have seen all POS tags in training, so don't // worry about OOV tags. CorpusVocabulary* vocab = corpus->vocab; @@ -177,9 +176,8 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( void TrainingCorpus::OracleTransitionsCorpusReader::RecordSentence( - TrainingCorpus* corpus, map* sentence, - map* sentence_pos, - map* sentence_unk_surface_forms, bool final) const { + TrainingCorpus* corpus, SentenceMap* sentence, SentenceMap* sentence_pos, + SentenceUnkMap* sentence_unk_surface_forms, bool final) const { // Store the sentence variables and clear them for the next sentence. corpus->sentences.push_back({}); corpus->sentences.back().swap(*sentence); @@ -208,9 +206,9 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( bool start_of_sentence = false; bool first = true; - map sentence; - map sentence_pos; - map sentence_unk_surface_forms; + SentenceMap sentence; + SentenceMap sentence_pos; + SentenceUnkMap sentence_unk_surface_forms; corpus->correct_act_sent.push_back({}); // We'll need to make sure ROOT token has a consistent ID. diff --git a/parser/corpus.h b/parser/corpus.h index bdf036d..22ad869 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -201,9 +201,12 @@ class Corpus { // when iterating over a list of tokens in order of IDs. static constexpr unsigned ROOT_TOKEN_ID = -1; - std::vector> sentences; - std::vector> sentences_pos; - std::vector> sentences_unk_surface_forms; + typedef std::map SentenceMap; + typedef std::map SentenceUnkMap; + + std::vector sentences; + std::vector sentences_pos; + std::vector sentences_unk_surface_forms; CorpusVocabulary* vocab; Corpus(CorpusVocabulary* vocab, const CorpusReader& reader, @@ -253,11 +256,10 @@ class TrainingCorpus : public Corpus { void RecordAction(const std::string& action, TrainingCorpus* corpus) const; - void RecordSentence( - TrainingCorpus* corpus, std::map* sentence, - std::map* sentence_pos, - std::map* sentence_unk_surface_forms, - bool final = false) const; + void RecordSentence(TrainingCorpus* corpus, SentenceMap* sentence, + SentenceMap* sentence_pos, + SentenceUnkMap* sentence_unk_surface_forms, + bool final = false) const; static inline unsigned UTF8Len(unsigned char x) { if (x < 0x80) return 1; From a77168d2a5d0f8fdaeb015694c928c097f4cd841 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 2 Feb 2017 17:06:54 -0500 Subject: [PATCH 22/88] Switched everything over to Sentence objects --- parser/corpus.cc | 51 ++++++++++++++-------------- parser/corpus.h | 33 ++++++++++++------- parser/lstm-parser.cc | 77 +++++++++++++++++++------------------------ parser/lstm-parser.h | 27 +++++++-------- 4 files changed, 90 insertions(+), 98 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 929aeed..4f04952 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -27,9 +27,10 @@ const string ORACLE_ROOT_POS = "ROOT"; void ConllUCorpusReader::ReadSentences(const string& file, Corpus* corpus) const { string next_line; - Corpus::SentenceUnkMap current_sentence_unk_surface_forms; - Corpus::SentenceMap current_sentence; - Corpus::SentenceMap current_sentence_pos; + // TODO: Replace this code with simpler Sentence-based code. + Sentence::SentenceUnkMap current_sentence_unk_surface_forms; + Sentence::SentenceMap current_sentence; + Sentence::SentenceMap current_sentence_pos; ifstream conll_file(file); unsigned unk_word_symbol = corpus->vocab->GetWord(CorpusVocabulary::UNK); @@ -43,15 +44,11 @@ void ConllUCorpusReader::ReadSentences(const string& file, current_sentence_pos[Corpus::ROOT_TOKEN_ID] = root_pos_symbol; current_sentence_unk_surface_forms[Corpus::ROOT_TOKEN_ID] = ""; - corpus->sentences.push_back(move(current_sentence)); - current_sentence.clear(); - - corpus->sentences_pos.push_back(move(current_sentence_pos)); - current_sentence_pos.clear(); - - corpus->sentences_unk_surface_forms.push_back( - move(current_sentence_unk_surface_forms)); - current_sentence_unk_surface_forms.clear(); + corpus->sentences.push_back({}); + corpus->sentences.back().words.swap(current_sentence); + corpus->sentences.back().poses.swap(current_sentence_pos); + corpus->sentences.back().unk_surface_forms.swap( + current_sentence_unk_surface_forms); } continue; } else if (next_line[0] == '#') { @@ -87,7 +84,7 @@ void ParserTrainingCorpus::CountSingletons() { // compute the singletons in the parser's training data map counts; for (const auto& sent : sentences) { - for (const auto& index_and_word_id : sent) { + for (const auto& index_and_word_id : sent.words) { counts[index_and_word_id.second]++; } } @@ -100,8 +97,9 @@ void ParserTrainingCorpus::CountSingletons() { void TrainingCorpus::OracleTransitionsCorpusReader::RecordWord( const string& word, const string& pos, unsigned next_token_index, - TrainingCorpus* corpus, SentenceMap* sentence, SentenceMap* sentence_pos, - SentenceUnkMap* sentence_unk_surface_forms) const { + TrainingCorpus* corpus, Sentence::SentenceMap* sentence, + Sentence::SentenceMap* sentence_pos, + Sentence::SentenceUnkMap* sentence_unk_surface_forms) const { // We assume that we'll have seen all POS tags in training, so don't // worry about OOV tags. CorpusVocabulary* vocab = corpus->vocab; @@ -176,18 +174,18 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( void TrainingCorpus::OracleTransitionsCorpusReader::RecordSentence( - TrainingCorpus* corpus, SentenceMap* sentence, SentenceMap* sentence_pos, - SentenceUnkMap* sentence_unk_surface_forms, bool final) const { + TrainingCorpus* corpus, Sentence::SentenceMap* words, + Sentence::SentenceMap* sentence_pos, + Sentence::SentenceUnkMap* sentence_unk_surface_forms, bool final) const { // Store the sentence variables and clear them for the next sentence. corpus->sentences.push_back({}); - corpus->sentences.back().swap(*sentence); - corpus->sentences_pos.push_back({}); - corpus->sentences_pos.back().swap(*sentence_pos); + Sentence* sentence = &corpus->sentences.back(); + sentence->words.swap(*words); + sentence->poses.swap(*sentence_pos); if (!is_training) { - corpus->sentences_unk_surface_forms.push_back({}); - corpus->sentences_unk_surface_forms.back().swap( - *sentence_unk_surface_forms); + sentence->unk_surface_forms.swap(*sentence_unk_surface_forms); } + if (!final) { corpus->correct_act_sent.push_back({}); } @@ -206,9 +204,10 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( bool start_of_sentence = false; bool first = true; - SentenceMap sentence; - SentenceMap sentence_pos; - SentenceUnkMap sentence_unk_surface_forms; + // TODO: replace this code with simpler Sentence-based code. + Sentence::SentenceMap sentence; + Sentence::SentenceMap sentence_pos; + Sentence::SentenceUnkMap sentence_unk_surface_forms; corpus->correct_act_sent.push_back({}); // We'll need to make sure ROOT token has a consistent ID. diff --git a/parser/corpus.h b/parser/corpus.h index 22ad869..1b75ed8 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -195,18 +195,27 @@ class ConllUCorpusReader : public CorpusReader { }; +struct Sentence { + typedef std::map SentenceMap; + typedef std::map SentenceUnkMap; + + SentenceMap words; + SentenceMap poses; + SentenceUnkMap unk_surface_forms; + + size_t Size() const { + return words.size(); + } +}; + + class Corpus { public: // Store root tokens with unsigned ID -1 internally to make root come last // when iterating over a list of tokens in order of IDs. static constexpr unsigned ROOT_TOKEN_ID = -1; - typedef std::map SentenceMap; - typedef std::map SentenceUnkMap; - - std::vector sentences; - std::vector sentences_pos; - std::vector sentences_unk_surface_forms; + std::vector sentences; CorpusVocabulary* vocab; Corpus(CorpusVocabulary* vocab, const CorpusReader& reader, @@ -250,15 +259,15 @@ class TrainingCorpus : public Corpus { void RecordWord( const std::string& word, const std::string& pos, unsigned next_token_index, TrainingCorpus* corpus, - std::map* sentence, - std::map* sentence_pos, - std::map* sentence_unk_surface_forms) const; + Sentence::SentenceMap* sentence, + Sentence::SentenceMap* sentence_pos, + Sentence::SentenceUnkMap* sentence_unk_surface_forms) const; void RecordAction(const std::string& action, TrainingCorpus* corpus) const; - void RecordSentence(TrainingCorpus* corpus, SentenceMap* sentence, - SentenceMap* sentence_pos, - SentenceUnkMap* sentence_unk_surface_forms, + void RecordSentence(TrainingCorpus* corpus, Sentence::SentenceMap* words, + Sentence::SentenceMap* sentence_pos, + Sentence::SentenceUnkMap* sentence_unk_surface_forms, bool final = false) const; static inline unsigned UTF8Len(unsigned char x) { diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 512a093..fbfa9f2 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -172,17 +172,18 @@ bool LSTMParser::IsActionForbidden(const string& a, unsigned bsize, ParseTree LSTMParser::RecoverParseTree( - const map& sentence, const vector& actions, + const Sentence& sentence, const vector& actions, const vector& action_names, const vector& actions_to_arc_labels, double logprob, bool labeled) { ParseTree tree(sentence, labeled); - vector bufferi(sentence.size() + 1); + vector bufferi(sentence.Size() + 1); bufferi[0] = -999; vector stacki(1, -999); unsigned added_to_buffer = 0; - for (const auto& index_and_word_id : sentence) { + for (const auto& index_and_word_id : sentence.words) { // ROOT is set to -1, so it'll come last in a sequence of unsigned ints. - bufferi[sentence.size() - added_to_buffer++] = index_and_word_id.first; + bufferi[sentence.Size() - added_to_buffer++] = + index_and_word_id.first; } for (auto action : actions) { // loop over transitions for sentence const string& action_string = action_names[action]; @@ -225,9 +226,8 @@ ParseTree LSTMParser::RecoverParseTree( vector LSTMParser::LogProbParser( ComputationGraph* hg, - const map& raw_sent, // raw sentence - const map& sent, // sentence with OOVs replaced - const map& sent_pos, + const Sentence& raw_sent, // raw sentence + const Sentence::SentenceMap& sent, // sentence with OOVs replaced const vector& correct_actions, const vector& action_names, const vector& int_to_words, double* correct, Expression* final_parser_state) { @@ -280,12 +280,12 @@ vector LSTMParser::LogProbParser( vector args = {ib, w2l, w}; // learn embeddings if (options.use_pos) { // learn POS tag? - unsigned pos_id = sent_pos.find(token_index)->second; + unsigned pos_id = raw_sent.poses.find(token_index)->second; Expression p = lookup(*hg, p_p, pos_id); args.push_back(p2l); args.push_back(p); } - unsigned raw_word_id = raw_sent.find(token_index)->second; + unsigned raw_word_id = raw_sent.words.find(token_index)->second; if (p_t && pretrained.count(raw_word_id)) { // include pretrained vectors? Expression t = const_lookup(*hg, p_t, raw_word_id); args.push_back(t2l); @@ -498,8 +498,8 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, random_shuffle(order.begin(), order.end()); } tot_seen += 1; - const map& sentence = corpus.sentences[order[si]]; - map tsentence(sentence); + const Sentence& sentence = corpus.sentences[order[si]]; + Sentence::SentenceMap tsentence(sentence.words); if (options.unk_strategy == 1) { for (auto& index_and_id : tsentence) { // use reference to overwrite if (corpus.singletons.count(index_and_id.second) @@ -508,11 +508,9 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, } } } - const map& sentence_pos = - corpus.sentences_pos[order[si]]; const vector& actions = corpus.correct_act_sent[order[si]]; ComputationGraph hg; - LogProbParser(&hg, sentence, tsentence, sentence_pos, actions, + LogProbParser(&hg, sentence, tsentence, actions, corpus.vocab->actions, corpus.vocab->int_to_words, &correct); double lp = as_scalar(hg.incremental_forward()); @@ -547,11 +545,9 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, double total_heads = 0; auto t_start = chrono::high_resolution_clock::now(); for (unsigned sii = 0; sii < dev_size; ++sii) { - const map& sentence = dev_corpus.sentences[sii]; - const map& sentence_pos = - dev_corpus.sentences_pos[sii]; + const Sentence& sentence = dev_corpus.sentences[sii]; - ParseTree hyp = Parse(sentence, sentence_pos, vocab, false); + ParseTree hyp = Parse(sentence, vocab, false); llh += hyp.logprob; const vector& actions = dev_corpus.correct_act_sent[sii]; @@ -561,7 +557,7 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, trs += actions.size(); correct_heads += ComputeCorrect(ref, hyp); - total_heads += sentence.size() - 1; // -1 to account for ROOT + total_heads += sentence.Size() - 1; // -1 to account for ROOT } auto t_end = chrono::high_resolution_clock::now(); @@ -584,26 +580,24 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, // TODO: fix this so that correct actually does something sometimes vector LSTMParser::LogProbParser( - const map& sentence, - const map& sentence_pos, const CorpusVocabulary& vocab, + const Sentence& sentence, const CorpusVocabulary& vocab, ComputationGraph *cg, Expression* final_parser_state) { - map tsentence(sentence); // sentence with OOVs replaced + Sentence::SentenceMap tsentence(sentence.words); // sentence w/ OOVs replaced for (auto& index_and_id : tsentence) { // use reference to overwrite if (!vocab.int_to_training_word[index_and_id.second]) { index_and_id.second = kUNK; } } - return LogProbParser(cg, sentence, tsentence, sentence_pos, - vector(), vocab.actions, - vocab.int_to_words, nullptr, final_parser_state); + return LogProbParser(cg, sentence, tsentence, vector(), + vocab.actions, vocab.int_to_words, nullptr, + final_parser_state); } -ParseTree LSTMParser::Parse(const map& sentence, - const map& sentence_pos, +ParseTree LSTMParser::Parse(const Sentence& sentence, const CorpusVocabulary& vocab, bool labeled) { ComputationGraph cg; - vector pred = LogProbParser(sentence, sentence_pos, vocab, &cg); + vector pred = LogProbParser(sentence, vocab, &cg); double lp = as_scalar(cg.incremental_forward()); return RecoverParseTree(sentence, pred, vocab.actions, vocab.actions_to_arc_labels, labeled, lp); @@ -623,15 +617,11 @@ void LSTMParser::DoTest(const Corpus& corpus, bool evaluate, auto t_start = chrono::high_resolution_clock::now(); unsigned corpus_size = corpus.sentences.size(); for (unsigned sii = 0; sii < corpus_size; ++sii) { - const map& sentence = corpus.sentences[sii]; - const map& sentence_pos = corpus.sentences_pos[sii]; - const map& sentence_unk_str = - corpus.sentences_unk_surface_forms[sii]; - ParseTree hyp = Parse(sentence, sentence_pos, vocab, true); + const Sentence& sentence = corpus.sentences[sii]; + ParseTree hyp = Parse(sentence, vocab, true); if (output_parses) { - OutputConll(sentence, sentence_pos, sentence_unk_str, - corpus.vocab->int_to_words, corpus.vocab->int_to_pos, - corpus.vocab->words_to_int, hyp); + OutputConll(sentence, corpus.vocab->int_to_words, + corpus.vocab->int_to_pos, corpus.vocab->words_to_int, hyp); } if (evaluate) { @@ -648,7 +638,7 @@ void LSTMParser::DoTest(const Corpus& corpus, bool evaluate, trs += actions.size(); llh += hyp.logprob; correct_heads += ComputeCorrect(ref, hyp); - total_heads += sentence.size() - 1; // -1 to account for ROOT + total_heads += sentence.Size() - 1; // -1 to account for ROOT } } @@ -667,29 +657,28 @@ void LSTMParser::DoTest(const Corpus& corpus, bool evaluate, } -void LSTMParser::OutputConll(const map& sentence, - const map& pos, - const map& sentence_unk_strings, +void LSTMParser::OutputConll(const Sentence& sentence, const vector& int_to_words, const vector& int_to_pos, const map& words_to_int, const ParseTree& tree) { const unsigned int unk_word = words_to_int.find(CorpusVocabulary::UNK)->second; - for (const auto& token_index_and_word : sentence) { + for (const auto& token_index_and_word : sentence.words) { unsigned token_index = token_index_and_word.first; unsigned word_id = token_index_and_word.second; if (token_index == Corpus::ROOT_TOKEN_ID) // don't output anything for ROOT continue; - auto unk_strs_iter = sentence_unk_strings.find(token_index); - assert(unk_strs_iter != sentence_unk_strings.end() && + auto unk_strs_iter = sentence.unk_surface_forms.find(token_index); + assert(unk_strs_iter != sentence.unk_surface_forms.end() && ((word_id == unk_word && unk_strs_iter->second.size() > 0) || (word_id != unk_word && unk_strs_iter->second.size() == 0 && int_to_words.size() > word_id))); string wit = (unk_strs_iter->second.size() > 0) ? unk_strs_iter->second : int_to_words[word_id]; - const string& pos_tag = int_to_pos[pos.find(token_index)->second]; + const string& pos_tag = int_to_pos[ + sentence.poses.find(token_index)->second]; unsigned parent = tree.GetParent(token_index); if (parent == Corpus::ROOT_TOKEN_ID) parent = 0; diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 9f90f48..ef71451 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -70,10 +70,10 @@ class ParseTree { public: static std::string NO_LABEL; // Barebones representation of a parse tree. - const std::map& sentence; + const Sentence& sentence; double logprob; - ParseTree(const std::map& sentence, bool labeled = true) : + ParseTree(const Sentence& sentence, bool labeled = true) : sentence(sentence), logprob(0), arc_labels( labeled ? new std::map : nullptr) {} @@ -180,13 +180,12 @@ class LSTMParser { static bool IsActionForbidden(const std::string& a, unsigned bsize, unsigned ssize, const std::vector& stacki); - ParseTree Parse(const std::map& sentence, - const std::map& sentence_pos, + ParseTree Parse(const Sentence& sentence, const CorpusVocabulary& vocab, bool labeled); // take a vector of actions and return a parse tree ParseTree RecoverParseTree( - const std::map& sentence, + const Sentence& sentence, const std::vector& actions, const std::vector& action_names, const std::vector& actions_to_arc_labels, double logprob = 0, @@ -207,9 +206,8 @@ class LSTMParser { // Used for testing. Replaces OOV with UNK. std::vector LogProbParser( - const std::map& sentence, - const std::map& sentence_pos, - const CorpusVocabulary& vocab, cnn::ComputationGraph *cg, + const Sentence& sentence, const CorpusVocabulary& vocab, + cnn::ComputationGraph *cg, cnn::expr::Expression* final_parser_state = nullptr); void LoadPretrainedWords(const std::string& words_path); @@ -226,9 +224,8 @@ class LSTMParser { // OOV in the parser training data. std::vector LogProbParser( cnn::ComputationGraph* hg, - const std::map& raw_sent, // raw sentence - const std::map& sent, // sentence with OOVs replaced - const std::map& sent_pos, + const Sentence& sentence, // raw sentence + const Sentence::SentenceMap& sent, // sentence with OOVs replaced const std::vector& correct_actions, const std::vector& action_names, const std::vector& int_to_words, double* correct, @@ -238,9 +235,9 @@ class LSTMParser { inline unsigned ComputeCorrect(const ParseTree& ref, const ParseTree& hyp) const { - assert(ref.sentence.size() == hyp.sentence.size()); + assert(ref.sentence.Size() == hyp.sentence.Size()); unsigned correct_count = 0; - for (const auto& token_index_and_word : ref.sentence) { + for (const auto& token_index_and_word : ref.sentence.words) { unsigned i = token_index_and_word.first; if (i != Corpus::ROOT_TOKEN_ID && ref.GetParent(i) == hyp.GetParent(i)) ++correct_count; @@ -287,9 +284,7 @@ class LSTMParser { void DoTest(const Corpus& corpus, bool evaluate, bool output_parses); - static void OutputConll(const std::map& sentence, - const std::map& pos, - const std::map& sentence_unk_strings, + static void OutputConll(const Sentence& sentence, const std::vector& int_to_words, const std::vector& int_to_pos, const std::map& words_to_int, From 6a90c5e5e49596a3dbedfc92e033432a300ce011 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Mon, 6 Feb 2017 13:47:28 -0500 Subject: [PATCH 23/88] Replaced some map find() calls with at() calls --- parser/lstm-parser.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index fbfa9f2..bb0c4d3 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -280,12 +280,12 @@ vector LSTMParser::LogProbParser( vector args = {ib, w2l, w}; // learn embeddings if (options.use_pos) { // learn POS tag? - unsigned pos_id = raw_sent.poses.find(token_index)->second; + unsigned pos_id = raw_sent.poses.at(token_index); Expression p = lookup(*hg, p_p, pos_id); args.push_back(p2l); args.push_back(p); } - unsigned raw_word_id = raw_sent.words.find(token_index)->second; + unsigned raw_word_id = raw_sent.words.at(token_index); if (p_t && pretrained.count(raw_word_id)) { // include pretrained vectors? Expression t = const_lookup(*hg, p_t, raw_word_id); args.push_back(t2l); @@ -662,8 +662,7 @@ void LSTMParser::OutputConll(const Sentence& sentence, const vector& int_to_pos, const map& words_to_int, const ParseTree& tree) { - const unsigned int unk_word = - words_to_int.find(CorpusVocabulary::UNK)->second; + const unsigned int unk_word = words_to_int.at(CorpusVocabulary::UNK); for (const auto& token_index_and_word : sentence.words) { unsigned token_index = token_index_and_word.first; unsigned word_id = token_index_and_word.second; @@ -677,8 +676,7 @@ void LSTMParser::OutputConll(const Sentence& sentence, int_to_words.size() > word_id))); string wit = (unk_strs_iter->second.size() > 0) ? unk_strs_iter->second : int_to_words[word_id]; - const string& pos_tag = int_to_pos[ - sentence.poses.find(token_index)->second]; + const string& pos_tag = int_to_pos[sentence.poses.at(token_index)]; unsigned parent = tree.GetParent(token_index); if (parent == Corpus::ROOT_TOKEN_ID) parent = 0; From 014cd2655b23440c84a6bd858e703e4a7726513e Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Mon, 27 Feb 2017 14:24:55 -0500 Subject: [PATCH 24/88] Removed old comment --- parser/lstm-parser.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index bb0c4d3..8244032 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -578,7 +578,6 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, } -// TODO: fix this so that correct actually does something sometimes vector LSTMParser::LogProbParser( const Sentence& sentence, const CorpusVocabulary& vocab, ComputationGraph *cg, Expression* final_parser_state) { From 95ab82f668cce6b9aa8418bc53fc11d6a20017b2 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 1 Mar 2017 13:47:32 -0500 Subject: [PATCH 25/88] Added checks for files being provided and successfully opened Also added kUNK to the corpus class --- parser/corpus.cc | 10 ++++++++++ parser/corpus.h | 7 +++++-- parser/lstm-parser-driver.cc | 12 ++++++++++++ parser/lstm-parser.h | 4 ++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 4f04952..2ab2442 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -33,6 +33,11 @@ void ConllUCorpusReader::ReadSentences(const string& file, Sentence::SentenceMap current_sentence_pos; ifstream conll_file(file); + if (!conll_file) { + cerr << "Unable to open corpus file " << file << "; aborting" << endl; + abort(); + } + unsigned unk_word_symbol = corpus->vocab->GetWord(CorpusVocabulary::UNK); unsigned root_symbol = corpus->vocab->GetWord(CorpusVocabulary::ROOT); unsigned root_pos_symbol = corpus->vocab->GetPOS(CorpusVocabulary::ROOT); @@ -197,6 +202,11 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( cerr << "Loading " << (is_training ? "training" : "dev") << " corpus from " << file << "..." << endl; ifstream actions_file(file); + if (!actions_file) { + cerr << "Unable to open actions file " << file << "; aborting" << endl; + abort(); + } + string line; CorpusVocabulary* vocab = corpus->vocab; diff --git a/parser/corpus.h b/parser/corpus.h index 1b75ed8..5a3c7a8 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -38,9 +38,11 @@ class CorpusVocabulary { std::vector actions; std::vector actions_to_arc_labels; + unsigned kUNK; + CorpusVocabulary() : int_to_training_word({true, true}) { AddEntry(BAD0, &words_to_int, &int_to_words); - AddEntry(UNK, &words_to_int, &int_to_words); + kUNK = AddEntry(UNK, &words_to_int, &int_to_words); AddEntry(BAD0, &chars_to_int, &int_to_chars); } @@ -51,7 +53,8 @@ class CorpusVocabulary { words_to_int(other.words_to_int), int_to_words(other.int_to_words), int_to_training_word(other.int_to_training_word), pos_to_int(other.pos_to_int), int_to_pos(other.int_to_pos), - chars_to_int(other.chars_to_int), int_to_chars(other.int_to_chars) {} + chars_to_int(other.chars_to_int), int_to_chars( + other.int_to_chars), kUNK(other.kUNK) {} inline unsigned CountPOS() { return pos_to_int.size(); } inline unsigned CountWords() { return words_to_int.size(); } diff --git a/parser/lstm-parser-driver.cc b/parser/lstm-parser-driver.cc index e5aaf91..6ed78fa 100644 --- a/parser/lstm-parser-driver.cc +++ b/parser/lstm-parser-driver.cc @@ -119,6 +119,18 @@ int main(int argc, char** argv) { cerr << "No model specified for testing!" << endl; abort(); } + if (train && !load_model) { + if (!conf.count("words")) { + cerr << "Can't train without word vectors! Please provide --words." + << endl; + abort(); + } + if (!conf.count("training_data")) { + cerr << "Can't train without training data! Please provide" + " --training_data" << endl; + abort(); + } + } const string words = load_model ? "" : conf["words"].as(); unique_ptr parser; diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index ef71451..1410525 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -161,6 +161,10 @@ class LSTMParser { std::cerr << "Loading model from " << model_path << "..."; auto t_start = std::chrono::high_resolution_clock::now(); std::ifstream model_file(model_path.c_str(), std::ios::binary); + if (!model_file) { + std::cerr << "Unable to open model file; aborting" << std::endl; + abort(); + } eos::portable_iarchive archive(model_file); archive >> *this; auto t_end = std::chrono::high_resolution_clock::now(); From 0b9684bcc84aba417f655d08a6d1ac7e5966ba38 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 1 Mar 2017 13:52:26 -0500 Subject: [PATCH 26/88] Abstracted out much of the LSTM transition tagger architecture (In preparation for reusing the same architecture for another tagger that is not a parser.) --- parser/CMakeLists.txt | 2 +- parser/lstm-parser.cc | 366 +++++++++++------------------- parser/lstm-parser.h | 73 +++--- parser/lstm-transition-tagger.cpp | 124 ++++++++++ parser/lstm-transition-tagger.h | 82 +++++++ 5 files changed, 390 insertions(+), 257 deletions(-) create mode 100644 parser/lstm-transition-tagger.cpp create mode 100644 parser/lstm-transition-tagger.h diff --git a/parser/CMakeLists.txt b/parser/CMakeLists.txt index 3ac3352..1217810 100644 --- a/parser/CMakeLists.txt +++ b/parser/CMakeLists.txt @@ -1,7 +1,7 @@ PROJECT(lstm-parser:parser) CMAKE_MINIMUM_REQUIRED(VERSION 2.8) -ADD_LIBRARY(lstm-parser-core lstm-parser.cc corpus.cc) +ADD_LIBRARY(lstm-parser-core lstm-parser.cc corpus.cc lstm-transition-tagger.cpp) target_link_libraries(lstm-parser-core cnn ${Boost_LIBRARIES}) ADD_EXECUTABLE(lstm-parse lstm-parser-driver.cc) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 8244032..764fbea 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -14,7 +14,6 @@ #include "cnn/model.h" #include "cnn/tensor.h" -#include "eos/portable_archive.hpp" using namespace cnn::expr; @@ -121,7 +120,6 @@ void LSTMParser::FinalizeVocab() { LSTMParser::LSTMParser(const ParserOptions& poptions, const string& pretrained_words_path, bool finalize) : options(poptions), - kUNK(vocab.GetOrAddWord(vocab.UNK)), kROOT_SYMBOL(vocab.GetOrAddWord(vocab.ROOT)), stack_lstm(options.layers, options.lstm_input_dim, options.hidden_dim, &model), @@ -143,13 +141,16 @@ LSTMParser::LSTMParser(const ParserOptions& poptions, } -bool LSTMParser::IsActionForbidden(const string& a, unsigned bsize, - unsigned ssize, const vector& stacki) { +bool LSTMParser::IsActionForbidden(const string& a, const TaggerState& state) { + const ParserState& real_state = static_cast(state); + unsigned ssize = real_state.stack.size(); + unsigned bsize = real_state.buffer.size(); + if (a[1] == 'W' && ssize < 3) return true; if (a[1] == 'W') { - int top = stacki[stacki.size() - 1]; - int sec = stacki[stacki.size() - 2]; + int top = real_state.stacki[real_state.stacki.size() - 1]; + int sec = real_state.stacki[real_state.stacki.size() - 2]; if (sec > top) return true; } @@ -224,234 +225,156 @@ ParseTree LSTMParser::RecoverParseTree( } -vector LSTMParser::LogProbParser( - ComputationGraph* hg, - const Sentence& raw_sent, // raw sentence +cnn::expr::Expression LSTMParser::GetActionProbabilities( + const TaggerState& state) { + // p_t = pbias + S * slstm + B * blstm + A * alstm + Expression p_t = affine_transform( + {GetParamExpr(p_pbias), GetParamExpr(p_S), stack_lstm.back(), + GetParamExpr(p_B), buffer_lstm.back(), GetParamExpr(p_A), + action_lstm.back()}); + Expression nlp_t = rectify(p_t); + // r_t = abias + p2a * nlp + Expression r_t = affine_transform( + {GetParamExpr(p_abias), GetParamExpr(p_p2a), nlp_t}); + return r_t; +} + + +void LSTMParser::DoAction(unsigned action, + const vector& action_names, + TaggerState* state, ComputationGraph* cg) { + ParserState* real_state = static_cast(state); + // add current action to action LSTM + Expression action_e = lookup(*cg, p_a, action); + action_lstm.add_input(action_e); + + // get relation embedding from action (TODO: convert to rel from action?) + Expression relation = lookup(*cg, p_r, action); + + // do action + const string& action_string = action_names[action]; + const char ac = action_string[0]; + const char ac2 = action_string[1]; + + if (ac == 'S' && ac2 == 'H') { // SHIFT + assert(real_state->buffer.size() > 1); // dummy symbol means > 1 (not >= 1) + real_state->stack.push_back(real_state->buffer.back()); + stack_lstm.add_input(real_state->buffer.back()); + real_state->buffer.pop_back(); + buffer_lstm.rewind_one_step(); + real_state->stacki.push_back(real_state->bufferi.back()); + real_state->bufferi.pop_back(); + } else if (ac == 'S' && ac2 == 'W') { //SWAP --- Miguel + assert(real_state->stack.size() > 2); // dummy symbol means > 2 (not >= 2) + + Expression toki, tokj; + unsigned ii = 0, jj = 0; + tokj = real_state->stack.back(); + jj = real_state->stacki.back(); + real_state->stack.pop_back(); + real_state->stacki.pop_back(); + + toki = real_state->stack.back(); + ii = real_state->stacki.back(); + real_state->stack.pop_back(); + real_state->stacki.pop_back(); + + real_state->buffer.push_back(toki); + real_state->bufferi.push_back(ii); + + stack_lstm.rewind_one_step(); + stack_lstm.rewind_one_step(); + + buffer_lstm.add_input(real_state->buffer.back()); + + real_state->stack.push_back(tokj); + real_state->stacki.push_back(jj); + + stack_lstm.add_input(real_state->stack.back()); + } else { // LEFT or RIGHT + assert(real_state->stack.size() > 2); // dummy symbol means > 2 (not >= 2) + assert(ac == 'L' || ac == 'R'); + Expression dep, head; + unsigned depi = 0, headi = 0; + (ac == 'R' ? dep : head) = real_state->stack.back(); + (ac == 'R' ? depi : headi) = real_state->stacki.back(); + real_state->stack.pop_back(); + real_state->stacki.pop_back(); + (ac == 'R' ? head : dep) = real_state->stack.back(); + (ac == 'R' ? headi : depi) = real_state->stacki.back(); + real_state->stack.pop_back(); + real_state->stacki.pop_back(); + // composed = cbias + H * head + D * dep + R * relation + Expression composed = affine_transform({GetParamExpr(p_cbias), + GetParamExpr(p_H), head, GetParamExpr(p_D), dep, GetParamExpr(p_R), + relation}); + Expression nlcomposed = tanh(composed); + stack_lstm.rewind_one_step(); + stack_lstm.rewind_one_step(); + stack_lstm.add_input(nlcomposed); + real_state->stack.push_back(nlcomposed); + real_state->stacki.push_back(headi); + } +} + + +LSTMTransitionTagger::TaggerState* LSTMParser::InitializeParserState( + cnn::ComputationGraph* cg, + const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const vector& correct_actions, const vector& action_names, - const vector& int_to_words, double* correct, - Expression* final_parser_state) { - // TODO: break up this function? - assert(finalized); - vector results; - const bool build_training_graph = correct_actions.size() > 0; - - stack_lstm.new_graph(*hg); - buffer_lstm.new_graph(*hg); - action_lstm.new_graph(*hg); + const std::vector& correct_actions, + const std::vector& action_names) { + stack_lstm.new_graph(*cg); + buffer_lstm.new_graph(*cg); + action_lstm.new_graph(*cg); stack_lstm.start_new_sequence(); buffer_lstm.start_new_sequence(); action_lstm.start_new_sequence(); - // variables in the computation graph representing the parameters - Expression pbias = parameter(*hg, p_pbias); - Expression H = parameter(*hg, p_H); - Expression D = parameter(*hg, p_D); - Expression R = parameter(*hg, p_R); - Expression cbias = parameter(*hg, p_cbias); - Expression S = parameter(*hg, p_S); - Expression B = parameter(*hg, p_B); - Expression A = parameter(*hg, p_A); - Expression ib = parameter(*hg, p_ib); - Expression w2l = parameter(*hg, p_w2l); - Expression p2l; - if (options.use_pos) - p2l = parameter(*hg, p_p2l); - Expression t2l; - if (p_t2l) - t2l = parameter(*hg, p_t2l); - Expression p2a = parameter(*hg, p_p2a); - Expression abias = parameter(*hg, p_abias); - Expression action_start = parameter(*hg, p_action_start); - - action_lstm.add_input(action_start); - - // variables representing word embeddings (possibly including POS info) - vector buffer(sent.size() + 1); - vector bufferi(sent.size() + 1); // position of the words in the sentence - // precompute buffer representation from left to right + action_lstm.add_input(GetParamExpr(p_action_start)); + + ParserState* state = new ParserState; + state->buffer.resize(raw_sent.Size() + 1); + state->bufferi.resize(raw_sent.Size() + 1); + state->stack.push_back(parameter(*cg, p_stack_guard)); + state->stacki.push_back(-999); + // drive dummy symbol on stack through LSTM + stack_lstm.add_input(state->stack.back()); + + // precompute buffer representation from left to right unsigned added_to_buffer = 0; for (const auto& index_and_word_id : sent) { unsigned token_index = index_and_word_id.first; unsigned word_id = index_and_word_id.second; assert(word_id < vocab.CountWords()); - Expression w = lookup(*hg, p_w, word_id); + Expression w = lookup(*cg, p_w, word_id); - vector args = {ib, w2l, w}; // learn embeddings - if (options.use_pos) { // learn POS tag? + vector args = {GetParamExpr(p_ib), GetParamExpr(p_w2l), + w}; // learn embeddings + if (options.use_pos) { // learn POS tag? unsigned pos_id = raw_sent.poses.at(token_index); - Expression p = lookup(*hg, p_p, pos_id); - args.push_back(p2l); + Expression p = lookup(*cg, p_p, pos_id); + args.push_back(GetParamExpr(p_p2l)); args.push_back(p); } unsigned raw_word_id = raw_sent.words.at(token_index); - if (p_t && pretrained.count(raw_word_id)) { // include pretrained vectors? - Expression t = const_lookup(*hg, p_t, raw_word_id); - args.push_back(t2l); + if (p_t && pretrained.count(raw_word_id)) { // include pretrained vectors? + Expression t = const_lookup(*cg, p_t, raw_word_id); + args.push_back(GetParamExpr(p_t2l)); args.push_back(t); } - buffer[sent.size() - added_to_buffer] = rectify(affine_transform(args)); - bufferi[sent.size() - added_to_buffer] = token_index; + state->buffer[sent.size() - added_to_buffer] = rectify(affine_transform(args)); + state->bufferi[sent.size() - added_to_buffer] = token_index; added_to_buffer++; } // dummy symbol to represent the empty buffer - buffer[0] = parameter(*hg, p_buffer_guard); - bufferi[0] = -999; - for (auto& b : buffer) + state->buffer[0] = parameter(*cg, p_buffer_guard); + state->bufferi[0] = -999; + for (auto& b : state->buffer) buffer_lstm.add_input(b); - vector stack; // variables representing subtree embeddings - vector stacki; // position of words in the sentence of head of subtree - stack.push_back(parameter(*hg, p_stack_guard)); - stacki.push_back(-999); // not used for anything - // drive dummy symbol on stack through LSTM - stack_lstm.add_input(stack.back()); - vector log_probs; - unsigned action_count = 0; // incremented at each prediction - Expression p_t; // declared outside to allow access later - while (stack.size() > 2 || buffer.size() > 1) { - // get list of possible actions for the current parser state - vector current_valid_actions; - for (unsigned action = 0; action < n_possible_actions; ++action) { - if (IsActionForbidden(action_names[action], buffer.size(), stack.size(), - stacki)) - continue; - current_valid_actions.push_back(action); - } - - // p_t = pbias + S * slstm + B * blstm + A * almst - p_t = affine_transform( - {pbias, S, stack_lstm.back(), B, buffer_lstm.back(), A, - action_lstm.back()}); - Expression nlp_t = rectify(p_t); - // r_t = abias + p2a * nlp - Expression r_t = affine_transform({abias, p2a, nlp_t}); - - // adist = log_softmax(r_t, current_valid_actions) - Expression adiste = log_softmax(r_t, current_valid_actions); - vector adist = as_vector(hg->incremental_forward()); - double best_score = adist[current_valid_actions[0]]; - unsigned best_a = current_valid_actions[0]; - for (unsigned i = 1; i < current_valid_actions.size(); ++i) { - if (adist[current_valid_actions[i]] > best_score) { - best_score = adist[current_valid_actions[i]]; - best_a = current_valid_actions[i]; - } - } - unsigned action = best_a; - // If we have reference actions (for training), use the reference action. - if (build_training_graph) { - action = correct_actions[action_count]; - if (correct && best_a == action) { - (*correct)++; - } - } - ++action_count; - log_probs.push_back(pick(adiste, action)); - results.push_back(action); - - // add current action to action LSTM - Expression action_e = lookup(*hg, p_a, action); - action_lstm.add_input(action_e); - - // get relation embedding from action (TODO: convert to rel from action?) - Expression relation = lookup(*hg, p_r, action); - - // do action - const string& action_string = action_names[action]; - const char ac = action_string[0]; - const char ac2 = action_string[1]; - - if (ac == 'S' && ac2 == 'H') { // SHIFT - assert(buffer.size() > 1); // dummy symbol means > 1 (not >= 1) - stack.push_back(buffer.back()); - stack_lstm.add_input(buffer.back()); - buffer.pop_back(); - buffer_lstm.rewind_one_step(); - stacki.push_back(bufferi.back()); - bufferi.pop_back(); - } else if (ac == 'S' && ac2 == 'W') { //SWAP --- Miguel - assert(stack.size() > 2); // dummy symbol means > 2 (not >= 2) - - Expression toki, tokj; - unsigned ii = 0, jj = 0; - tokj = stack.back(); - jj = stacki.back(); - stack.pop_back(); - stacki.pop_back(); - - toki = stack.back(); - ii = stacki.back(); - stack.pop_back(); - stacki.pop_back(); - - buffer.push_back(toki); - bufferi.push_back(ii); - - stack_lstm.rewind_one_step(); - stack_lstm.rewind_one_step(); - - buffer_lstm.add_input(buffer.back()); - - stack.push_back(tokj); - stacki.push_back(jj); - - stack_lstm.add_input(stack.back()); - } else { // LEFT or RIGHT - assert(stack.size() > 2); // dummy symbol means > 2 (not >= 2) - assert(ac == 'L' || ac == 'R'); - Expression dep, head; - unsigned depi = 0, headi = 0; - (ac == 'R' ? dep : head) = stack.back(); - (ac == 'R' ? depi : headi) = stacki.back(); - stack.pop_back(); - stacki.pop_back(); - (ac == 'R' ? head : dep) = stack.back(); - (ac == 'R' ? headi : depi) = stacki.back(); - stack.pop_back(); - stacki.pop_back(); - // composed = cbias + H * head + D * dep + R * relation - Expression composed = affine_transform({cbias, H, head, D, dep, R, - relation}); - Expression nlcomposed = tanh(composed); - stack_lstm.rewind_one_step(); - stack_lstm.rewind_one_step(); - stack_lstm.add_input(nlcomposed); - stack.push_back(nlcomposed); - stacki.push_back(headi); - } - } - assert(stack.size() == 2); // guard symbol, root - assert(stacki.size() == 2); - assert(buffer.size() == 1); // guard symbol - assert(bufferi.size() == 1); - Expression tot_neglogprob = -sum(log_probs); - assert(tot_neglogprob.pg != nullptr); - - if (final_parser_state) { - *final_parser_state = p_t; - } - return results; -} - - -void LSTMParser::SaveModel(const string& model_fname, bool softlink_created) { - ofstream out_file(model_fname); - eos::portable_oarchive archive(out_file); - archive << *this; - cerr << "Model saved." << endl; - // Create a soft link to the most recent model in order to make it - // easier to refer to it in a shell script. - if (!softlink_created) { - string softlink = "latest_model.params"; - - if (system((string("rm -f ") + softlink).c_str()) == 0 - && system(("ln -s " + model_fname + " " + softlink).c_str()) == 0) { - cerr << "Created " << softlink << " as a soft link to " << model_fname - << " for convenience." << endl; - } - } + return state; } @@ -504,13 +427,13 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, for (auto& index_and_id : tsentence) { // use reference to overwrite if (corpus.singletons.count(index_and_id.second) && cnn::rand01() < unk_prob) { - index_and_id.second = kUNK; + index_and_id.second = vocab.kUNK; } } } const vector& actions = corpus.correct_act_sent[order[si]]; ComputationGraph hg; - LogProbParser(&hg, sentence, tsentence, actions, + LogProbTagger(&hg, sentence, tsentence, actions, corpus.vocab->actions, corpus.vocab->int_to_words, &correct); double lp = as_scalar(hg.incremental_forward()); @@ -578,25 +501,10 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, } -vector LSTMParser::LogProbParser( - const Sentence& sentence, const CorpusVocabulary& vocab, - ComputationGraph *cg, Expression* final_parser_state) { - Sentence::SentenceMap tsentence(sentence.words); // sentence w/ OOVs replaced - for (auto& index_and_id : tsentence) { // use reference to overwrite - if (!vocab.int_to_training_word[index_and_id.second]) { - index_and_id.second = kUNK; - } - } - return LogProbParser(cg, sentence, tsentence, vector(), - vocab.actions, vocab.int_to_words, nullptr, - final_parser_state); -} - - ParseTree LSTMParser::Parse(const Sentence& sentence, const CorpusVocabulary& vocab, bool labeled) { ComputationGraph cg; - vector pred = LogProbParser(sentence, vocab, &cg); + vector pred = LogProbTagger(sentence, vocab, &cg); double lp = as_scalar(cg.incremental_forward()); return RecoverParseTree(sentence, pred, vocab.actions, vocab.actions_to_arc_labels, labeled, lp); diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 1410525..0976f22 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -20,6 +20,7 @@ #include "cnn/rnn.h" #include "corpus.h" #include "eos/portable_archive.hpp" +#include "lstm-transition-tagger.h" namespace lstm_parser { @@ -112,17 +113,15 @@ class ParseTree { }; -class LSTMParser { +class LSTMParser : LSTMTransitionTagger { public: // TODO: make some of these members non-public ParserOptions options; CorpusVocabulary vocab; cnn::Model model; - bool finalized; std::unordered_map> pretrained; unsigned n_possible_actions; - const unsigned kUNK; const unsigned kROOT_SYMBOL; cnn::LSTMBuilder stack_lstm; // (layers, input, hidden, trainer) @@ -156,7 +155,6 @@ class LSTMParser { bool finalize=true); explicit LSTMParser(const std::string& model_path) : - kUNK(vocab.GetOrAddWord(vocab.UNK)), kROOT_SYMBOL(vocab.GetOrAddWord(vocab.ROOT)) { std::cerr << "Loading model from " << model_path << "..."; auto t_start = std::chrono::high_resolution_clock::now(); @@ -176,13 +174,19 @@ class LSTMParser { template explicit LSTMParser(Archive* archive) : - kUNK(vocab.GetOrAddWord(vocab.UNK)), kROOT_SYMBOL(vocab.GetOrAddWord(vocab.ROOT)) { *archive >> *this; } - static bool IsActionForbidden(const std::string& a, unsigned bsize, - unsigned ssize, const std::vector& stacki); + virtual bool IsActionForbidden(const std::string& a, + const TaggerState& state) override; + + virtual cnn::expr::Expression GetActionProbabilities(const TaggerState& state) + override; + + virtual void DoAction(unsigned action, + const std::vector& action_names, + TaggerState* state, cnn::ComputationGraph* cg) override; ParseTree Parse(const Sentence& sentence, const CorpusVocabulary& vocab, bool labeled); @@ -208,34 +212,45 @@ class LSTMParser { DoTest(corpus, true, output_parses); } - // Used for testing. Replaces OOV with UNK. - std::vector LogProbParser( - const Sentence& sentence, const CorpusVocabulary& vocab, - cnn::ComputationGraph *cg, - cnn::expr::Expression* final_parser_state = nullptr); - void LoadPretrainedWords(const std::string& words_path); void FinalizeVocab(); protected: - // *** if correct_actions is empty, this runs greedy decoding *** - // returns parse actions for input sentence (in training just returns the - // reference) - // OOV handling: raw_sent will have the actual words - // sent will have words replaced by appropriate UNK tokens - // this lets us use pretrained embeddings, when available, for words that were - // OOV in the parser training data. - std::vector LogProbParser( - cnn::ComputationGraph* hg, - const Sentence& sentence, // raw sentence + struct ParserState : public TaggerState { + std::vector buffer; + std::vector bufferi; // position of the words in the sentence + std::vector stack; // subtree embeddings + std::vector stacki; // word position in sentence of head of subtree + + ~ParserState() { + assert(stack.size() == 2); // guard symbol, root + assert(stacki.size() == 2); + assert(buffer.size() == 1); // guard symbol + assert(bufferi.size() == 1); + } + }; + + virtual std::vector GetParameters() override { + std::vector all_params {p_pbias, p_H, p_D, p_R, p_cbias, + p_S, p_B, p_A, p_ib, p_w2l, p_p2a, p_abias, p_action_start}; + if (options.use_pos) + all_params.push_back(p_p2l); + if (p_t2l) + all_params.push_back(p_t2l); + return all_params; + } + + virtual TaggerState* InitializeParserState( + cnn::ComputationGraph* cg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced const std::vector& correct_actions, - const std::vector& action_names, - const std::vector& int_to_words, double* correct, - cnn::expr::Expression* final_parser_state = nullptr); + const std::vector& action_names) override; - void SaveModel(const std::string& model_fname, bool softlink_created); + virtual bool ShouldTerminate(const TaggerState& state) override { + const ParserState& real_state = static_cast(state); + return real_state.stack.size() <= 2 && real_state.buffer.size() <= 1; + } inline unsigned ComputeCorrect(const ParseTree& ref, const ParseTree& hyp) const { @@ -249,6 +264,10 @@ class LSTMParser { return correct_count; } + virtual void DoSave(eos::portable_oarchive& archive) override { + archive << *this; + } + private: friend class boost::serialization::access; diff --git a/parser/lstm-transition-tagger.cpp b/parser/lstm-transition-tagger.cpp new file mode 100644 index 0000000..b647b12 --- /dev/null +++ b/parser/lstm-transition-tagger.cpp @@ -0,0 +1,124 @@ +#include "lstm-transition-tagger.h" + +#include +#include +#include + +#include "cnn/expr.h" +#include "cnn/model.h" +#include "eos/portable_archive.hpp" + +using namespace std; +using namespace cnn; +using namespace cnn::expr; + +namespace lstm_parser { + + +void LSTMTransitionTagger::SaveModel(const string& model_fname, + bool softlink_created) { + ofstream out_file(model_fname); + eos::portable_oarchive archive(out_file); + DoSave(archive); + cerr << "Model saved." << endl; + // Create a soft link to the most recent model in order to make it + // easier to refer to it in a shell script. + if (!softlink_created) { + string softlink = "latest_model.params"; + + if (system((string("rm -f ") + softlink).c_str()) == 0 + && system(("ln -s " + model_fname + " " + softlink).c_str()) == 0) { + cerr << "Created " << softlink << " as a soft link to " << model_fname + << " for convenience." << endl; + } + } +} + + +vector LSTMTransitionTagger::LogProbTagger( + const Sentence& sentence, const CorpusVocabulary& vocab, + ComputationGraph *cg, Expression* final_parser_state) { + Sentence::SentenceMap tsentence(sentence.words); // sentence w/ OOVs replaced + for (auto& index_and_id : tsentence) { // use reference to overwrite + if (!vocab.int_to_training_word[index_and_id.second]) { + index_and_id.second = vocab.kUNK; + } + } + return LogProbTagger(cg, sentence, tsentence, vector(), + vocab.actions, vocab.int_to_words, nullptr, + final_parser_state); +} + + +vector LSTMTransitionTagger::LogProbTagger( + ComputationGraph* cg, + const Sentence& raw_sent, // raw sentence + const Sentence::SentenceMap& sent, // sentence with OOVs replaced + const vector& correct_actions, const vector& action_names, + const vector& int_to_words, double* correct, + Expression* final_parser_state) { + assert(finalized); + vector results; + const bool build_training_graph = correct_actions.size() > 0; + + // variables in the computation graph representing the parameters + for (Parameters *params : GetParameters()) { + param_expressions[params] = parameter(*cg, params); + } + + unique_ptr state(InitializeParserState(cg, raw_sent, sent, + correct_actions, + action_names)); + + vector log_probs; + unsigned action_count = 0; // incremented at each prediction + Expression p_t; // declared outside to allow access later + while (!ShouldTerminate(*state)) { + // Get list of possible actions for the current parser state. + vector current_valid_actions; + for (unsigned action = 0; action < action_names.size(); ++action) { + if (IsActionForbidden(action_names[action], *state)) + continue; + current_valid_actions.push_back(action); + } + + Expression r_t = GetActionProbabilities(*state); + // adist = log_softmax(r_t, current_valid_actions) + Expression adiste = log_softmax(r_t, current_valid_actions); + vector adist = as_vector(cg->incremental_forward()); + double best_score = adist[current_valid_actions[0]]; + unsigned best_a = current_valid_actions[0]; + for (unsigned i = 1; i < current_valid_actions.size(); ++i) { + if (adist[current_valid_actions[i]] > best_score) { + best_score = adist[current_valid_actions[i]]; + best_a = current_valid_actions[i]; + } + } + unsigned action = best_a; + // If we have reference actions (for training), use the reference action. + if (build_training_graph) { + action = correct_actions[action_count]; + if (correct && best_a == action) { + (*correct)++; + } + } + ++action_count; + log_probs.push_back(pick(adiste, action)); + results.push_back(action); + + DoAction(action, action_names, state.get(), cg); + } + + Expression tot_neglogprob = -sum(log_probs); + assert(tot_neglogprob.pg != nullptr); + + if (final_parser_state) { + *final_parser_state = p_t; + } + param_expressions.clear(); + return results; +} + + + +} /* namespace lstm_parser */ diff --git a/parser/lstm-transition-tagger.h b/parser/lstm-transition-tagger.h new file mode 100644 index 0000000..e2de232 --- /dev/null +++ b/parser/lstm-transition-tagger.h @@ -0,0 +1,82 @@ +#ifndef LSTM_PARSER_PARSER_LSTM_TRANSITION_TAGGER_H_ +#define LSTM_PARSER_PARSER_LSTM_TRANSITION_TAGGER_H_ + +#include +#include +#include + +#include "cnn/expr.h" +#include "cnn/model.h" +#include "corpus.h" + +namespace eos { +class portable_oarchive; +} + +namespace lstm_parser { + +class LSTMTransitionTagger { +public: + LSTMTransitionTagger() : finalized(false) {} + virtual ~LSTMTransitionTagger() {} + +protected: + struct TaggerState {}; + + bool finalized; + std::map param_expressions; + + inline cnn::expr::Expression GetParamExpr(cnn::ParametersBase* params) { + return param_expressions.at(params); + } + + virtual std::vector GetParameters() = 0; + + virtual TaggerState* InitializeParserState( + cnn::ComputationGraph* hg, const Sentence& raw_sent, + const Sentence::SentenceMap& sent, // sentence with OOVs replaced + const std::vector& correct_actions, + const std::vector& action_names) = 0; + + virtual cnn::expr::Expression GetActionProbabilities( + const TaggerState& state) = 0; + + virtual bool ShouldTerminate(const TaggerState& state) = 0; + + virtual bool IsActionForbidden(const std::string& action_name, + const TaggerState& state) = 0; + + virtual void DoAction(unsigned action, + const std::vector& action_names, + TaggerState* state, cnn::ComputationGraph* cg) = 0; + + virtual void DoSave(eos::portable_oarchive& archive) = 0; + + void SaveModel(const std::string& model_fname, bool softlink_created); + + // Used for testing. Replaces OOV with UNK. + std::vector LogProbTagger( + const Sentence& sentence, const CorpusVocabulary& vocab, + cnn::ComputationGraph *cg, + cnn::expr::Expression* final_parser_state = nullptr); + + // *** if correct_actions is empty, this runs greedy decoding *** + // returns parse actions for input sentence (in training just returns the + // reference) + // OOV handling: raw_sent will have the actual words + // sent will have words replaced by appropriate UNK tokens + // this lets us use pretrained embeddings, when available, for words that were + // OOV in the parser training data. + std::vector LogProbTagger( + cnn::ComputationGraph* hg, + const Sentence& sentence, // raw sentence + const Sentence::SentenceMap& sent, // sentence with OOVs replaced + const std::vector& correct_actions, + const std::vector& action_names, + const std::vector& int_to_words, double* correct, + cnn::expr::Expression* final_parser_state = nullptr); +}; + +} /* namespace lstm_parser */ + +#endif /* LSTM_PARSER_PARSER_LSTM_TRANSITION_TAGGER_H_ */ From ee8966ef7cc725f15827e5799c343e50e6f0b6d5 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 1 Mar 2017 14:11:03 -0500 Subject: [PATCH 27/88] Fixed handling of -w flag --- parser/lstm-parser-driver.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/parser/lstm-parser-driver.cc b/parser/lstm-parser-driver.cc index 6ed78fa..ddd7eba 100644 --- a/parser/lstm-parser-driver.cc +++ b/parser/lstm-parser-driver.cc @@ -120,11 +120,6 @@ int main(int argc, char** argv) { abort(); } if (train && !load_model) { - if (!conf.count("words")) { - cerr << "Can't train without word vectors! Please provide --words." - << endl; - abort(); - } if (!conf.count("training_data")) { cerr << "Can't train without training data! Please provide" " --training_data" << endl; @@ -132,7 +127,8 @@ int main(int argc, char** argv) { } } - const string words = load_model ? "" : conf["words"].as(); + const string words = + load_model || !conf.count("words") ? "" : conf["words"].as(); unique_ptr parser; if (load_model) { parser.reset(new LSTMParser(conf["model"].as())); From f8c2aca8c1bbed07991600c29b3513c9f7c4d096 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 1 Mar 2017 14:18:13 -0500 Subject: [PATCH 28/88] Lint --- parser/lstm-parser.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 0976f22..7ae9ddb 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -168,7 +168,8 @@ class LSTMParser : LSTMTransitionTagger { auto t_end = std::chrono::high_resolution_clock::now(); auto ms_passed = std::chrono::duration(t_end - t_start).count(); - std::cerr << "done. (Loading took " << ms_passed << " milliseconds.)" << std::endl; + std::cerr << "done. (Loading took " << ms_passed << " milliseconds.)" + << std::endl; } From 1b49452e0efe411f86e339e0543ef5164a06519a Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 1 Mar 2017 14:47:16 -0500 Subject: [PATCH 29/88] A bit of rearranging within the inheritance hierarchy --- parser/lstm-parser.cc | 10 +++------- parser/lstm-parser.h | 31 ++++++++++++++----------------- parser/lstm-transition-tagger.cpp | 8 ++++++++ parser/lstm-transition-tagger.h | 21 ++++++++++++++++----- 4 files changed, 41 insertions(+), 29 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 764fbea..82418ce 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -64,10 +64,7 @@ void LSTMParser::LoadPretrainedWords(const string& words_path) { } -void LSTMParser::FinalizeVocab() { - if (finalized) - return; - +void LSTMParser::InitializeNetworkParameters() { // Now that the vocab is ready to be finalized, we can set all the network // parameters. unsigned action_size = vocab.CountActions() + 1; @@ -112,8 +109,6 @@ void LSTMParser::FinalizeVocab() { p_p = nullptr; p_p2l = nullptr; } - - finalized = true; } @@ -141,7 +136,8 @@ LSTMParser::LSTMParser(const ParserOptions& poptions, } -bool LSTMParser::IsActionForbidden(const string& a, const TaggerState& state) { +bool LSTMParser::IsActionForbidden(const string& a, + const TaggerState& state) const { const ParserState& real_state = static_cast(state); unsigned ssize = real_state.stack.size(); unsigned bsize = real_state.buffer.size(); diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 7ae9ddb..3c9db7e 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -113,12 +113,9 @@ class ParseTree { }; -class LSTMParser : LSTMTransitionTagger { +class LSTMParser : public LSTMTransitionTagger { public: - // TODO: make some of these members non-public ParserOptions options; - CorpusVocabulary vocab; - cnn::Model model; std::unordered_map> pretrained; unsigned n_possible_actions; @@ -179,16 +176,6 @@ class LSTMParser : LSTMTransitionTagger { *archive >> *this; } - virtual bool IsActionForbidden(const std::string& a, - const TaggerState& state) override; - - virtual cnn::expr::Expression GetActionProbabilities(const TaggerState& state) - override; - - virtual void DoAction(unsigned action, - const std::vector& action_names, - TaggerState* state, cnn::ComputationGraph* cg) override; - ParseTree Parse(const Sentence& sentence, const CorpusVocabulary& vocab, bool labeled); @@ -215,8 +202,6 @@ class LSTMParser : LSTMTransitionTagger { void LoadPretrainedWords(const std::string& words_path); - void FinalizeVocab(); - protected: struct ParserState : public TaggerState { std::vector buffer; @@ -248,11 +233,23 @@ class LSTMParser : LSTMTransitionTagger { const std::vector& correct_actions, const std::vector& action_names) override; - virtual bool ShouldTerminate(const TaggerState& state) override { + virtual void InitializeNetworkParameters() override; + + virtual bool ShouldTerminate(const TaggerState& state) const override { const ParserState& real_state = static_cast(state); return real_state.stack.size() <= 2 && real_state.buffer.size() <= 1; } + virtual bool IsActionForbidden(const std::string& a, + const TaggerState& state) const override; + + virtual cnn::expr::Expression GetActionProbabilities(const TaggerState& state) + override; + + virtual void DoAction(unsigned action, + const std::vector& action_names, + TaggerState* state, cnn::ComputationGraph* cg) override; + inline unsigned ComputeCorrect(const ParseTree& ref, const ParseTree& hyp) const { assert(ref.sentence.Size() == hyp.sentence.Size()); diff --git a/parser/lstm-transition-tagger.cpp b/parser/lstm-transition-tagger.cpp index b647b12..e04142b 100644 --- a/parser/lstm-transition-tagger.cpp +++ b/parser/lstm-transition-tagger.cpp @@ -35,6 +35,14 @@ void LSTMTransitionTagger::SaveModel(const string& model_fname, } +void LSTMTransitionTagger::FinalizeVocab() { + if (finalized) + return; + InitializeNetworkParameters(); + finalized = true; +} + + vector LSTMTransitionTagger::LogProbTagger( const Sentence& sentence, const CorpusVocabulary& vocab, ComputationGraph *cg, Expression* final_parser_state) { diff --git a/parser/lstm-transition-tagger.h b/parser/lstm-transition-tagger.h index e2de232..bade12f 100644 --- a/parser/lstm-transition-tagger.h +++ b/parser/lstm-transition-tagger.h @@ -17,15 +17,25 @@ namespace lstm_parser { class LSTMTransitionTagger { public: + // TODO: this really shouldn't be public... + CorpusVocabulary vocab; + LSTMTransitionTagger() : finalized(false) {} virtual ~LSTMTransitionTagger() {} + void FinalizeVocab(); + protected: struct TaggerState {}; bool finalized; std::map param_expressions; + cnn::Model model; + + LSTMTransitionTagger(const CorpusVocabulary& vocab) + : vocab(vocab), finalized(false) {} + inline cnn::expr::Expression GetParamExpr(cnn::ParametersBase* params) { return param_expressions.at(params); } @@ -41,10 +51,10 @@ class LSTMTransitionTagger { virtual cnn::expr::Expression GetActionProbabilities( const TaggerState& state) = 0; - virtual bool ShouldTerminate(const TaggerState& state) = 0; + virtual bool ShouldTerminate(const TaggerState& state) const = 0; virtual bool IsActionForbidden(const std::string& action_name, - const TaggerState& state) = 0; + const TaggerState& state) const = 0; virtual void DoAction(unsigned action, const std::vector& action_names, @@ -52,6 +62,8 @@ class LSTMTransitionTagger { virtual void DoSave(eos::portable_oarchive& archive) = 0; + virtual void InitializeNetworkParameters() = 0; + void SaveModel(const std::string& model_fname, bool softlink_created); // Used for testing. Replaces OOV with UNK. @@ -61,12 +73,11 @@ class LSTMTransitionTagger { cnn::expr::Expression* final_parser_state = nullptr); // *** if correct_actions is empty, this runs greedy decoding *** - // returns parse actions for input sentence (in training just returns the - // reference) + // returns actions for input sentence (in training just returns the reference) // OOV handling: raw_sent will have the actual words // sent will have words replaced by appropriate UNK tokens // this lets us use pretrained embeddings, when available, for words that were - // OOV in the parser training data. + // OOV in the training data. std::vector LogProbTagger( cnn::ComputationGraph* hg, const Sentence& sentence, // raw sentence From 818eac8c035727916c1fa117317c4da58c7e9fb8 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 1 Mar 2017 15:25:58 -0500 Subject: [PATCH 30/88] Updated ShouldTerminate interface --- parser/lstm-parser.h | 4 +++- parser/lstm-transition-tagger.cpp | 2 +- parser/lstm-transition-tagger.h | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 3c9db7e..b598452 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -235,7 +235,9 @@ class LSTMParser : public LSTMTransitionTagger { virtual void InitializeNetworkParameters() override; - virtual bool ShouldTerminate(const TaggerState& state) const override { + virtual bool ShouldTerminate( + const TaggerState& state, const Sentence& raw_sent, + const Sentence::SentenceMap& sent) const override { const ParserState& real_state = static_cast(state); return real_state.stack.size() <= 2 && real_state.buffer.size() <= 1; } diff --git a/parser/lstm-transition-tagger.cpp b/parser/lstm-transition-tagger.cpp index e04142b..978846b 100644 --- a/parser/lstm-transition-tagger.cpp +++ b/parser/lstm-transition-tagger.cpp @@ -81,7 +81,7 @@ vector LSTMTransitionTagger::LogProbTagger( vector log_probs; unsigned action_count = 0; // incremented at each prediction Expression p_t; // declared outside to allow access later - while (!ShouldTerminate(*state)) { + while (!ShouldTerminate(*state, raw_sent, sent)) { // Get list of possible actions for the current parser state. vector current_valid_actions; for (unsigned action = 0; action < action_names.size(); ++action) { diff --git a/parser/lstm-transition-tagger.h b/parser/lstm-transition-tagger.h index bade12f..0870123 100644 --- a/parser/lstm-transition-tagger.h +++ b/parser/lstm-transition-tagger.h @@ -51,7 +51,9 @@ class LSTMTransitionTagger { virtual cnn::expr::Expression GetActionProbabilities( const TaggerState& state) = 0; - virtual bool ShouldTerminate(const TaggerState& state) const = 0; + virtual bool ShouldTerminate(const TaggerState& state, + const Sentence& raw_sent, + const Sentence::SentenceMap& sent) const = 0; virtual bool IsActionForbidden(const std::string& action_name, const TaggerState& state) const = 0; From 2030e2057b3a8305c9c7e2feeebaadef4e3257f4 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 1 Mar 2017 17:35:55 -0500 Subject: [PATCH 31/88] Minor API and code cleanup --- parser/lstm-parser.cc | 24 ++++++++++++------------ parser/lstm-parser.h | 3 ++- parser/lstm-transition-tagger.cpp | 2 +- parser/lstm-transition-tagger.h | 6 ++---- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 82418ce..fd1045e 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -136,22 +136,24 @@ LSTMParser::LSTMParser(const ParserOptions& poptions, } -bool LSTMParser::IsActionForbidden(const string& a, +bool LSTMParser::IsActionForbidden(const unsigned action, + const vector& action_names, const TaggerState& state) const { + const string& action_name = action_names[action]; const ParserState& real_state = static_cast(state); unsigned ssize = real_state.stack.size(); unsigned bsize = real_state.buffer.size(); - if (a[1] == 'W' && ssize < 3) + if (action_name[1] == 'W' && ssize < 3) return true; - if (a[1] == 'W') { + if (action_name[1] == 'W') { int top = real_state.stacki[real_state.stacki.size() - 1]; int sec = real_state.stacki[real_state.stacki.size() - 2]; if (sec > top) return true; } - bool is_shift = (a[0] == 'S' && a[1] == 'H'); + bool is_shift = (action_name[0] == 'S' && action_name[1] == 'H'); bool is_reduce = !is_shift; if (is_shift && bsize == 1) return true; @@ -162,7 +164,7 @@ bool LSTMParser::IsActionForbidden(const string& a, is_shift) return true; // only attach left to ROOT - if (bsize == 1 && ssize == 3 && a[0] == 'R') + if (bsize == 1 && ssize == 3 && action_name[0] == 'R') return true; return false; } @@ -221,8 +223,7 @@ ParseTree LSTMParser::RecoverParseTree( } -cnn::expr::Expression LSTMParser::GetActionProbabilities( - const TaggerState& state) { +Expression LSTMParser::GetActionProbabilities(const TaggerState& state) { // p_t = pbias + S * slstm + B * blstm + A * alstm Expression p_t = affine_transform( {GetParamExpr(p_pbias), GetParamExpr(p_S), stack_lstm.back(), @@ -236,8 +237,7 @@ cnn::expr::Expression LSTMParser::GetActionProbabilities( } -void LSTMParser::DoAction(unsigned action, - const vector& action_names, +void LSTMParser::DoAction(unsigned action, const vector& action_names, TaggerState* state, ComputationGraph* cg) { ParserState* real_state = static_cast(state); // add current action to action LSTM @@ -315,11 +315,11 @@ void LSTMParser::DoAction(unsigned action, LSTMTransitionTagger::TaggerState* LSTMParser::InitializeParserState( - cnn::ComputationGraph* cg, + ComputationGraph* cg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const std::vector& correct_actions, - const std::vector& action_names) { + const vector& correct_actions, + const vector& action_names) { stack_lstm.new_graph(*cg); buffer_lstm.new_graph(*cg); action_lstm.new_graph(*cg); diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index b598452..9b9d731 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -242,7 +242,8 @@ class LSTMParser : public LSTMTransitionTagger { return real_state.stack.size() <= 2 && real_state.buffer.size() <= 1; } - virtual bool IsActionForbidden(const std::string& a, + virtual bool IsActionForbidden(const unsigned action, + const std::vector& action_names, const TaggerState& state) const override; virtual cnn::expr::Expression GetActionProbabilities(const TaggerState& state) diff --git a/parser/lstm-transition-tagger.cpp b/parser/lstm-transition-tagger.cpp index 978846b..c5f9cc9 100644 --- a/parser/lstm-transition-tagger.cpp +++ b/parser/lstm-transition-tagger.cpp @@ -85,7 +85,7 @@ vector LSTMTransitionTagger::LogProbTagger( // Get list of possible actions for the current parser state. vector current_valid_actions; for (unsigned action = 0; action < action_names.size(); ++action) { - if (IsActionForbidden(action_names[action], *state)) + if (IsActionForbidden(action, action_names, *state)) continue; current_valid_actions.push_back(action); } diff --git a/parser/lstm-transition-tagger.h b/parser/lstm-transition-tagger.h index 0870123..3af1b7b 100644 --- a/parser/lstm-transition-tagger.h +++ b/parser/lstm-transition-tagger.h @@ -33,9 +33,6 @@ class LSTMTransitionTagger { cnn::Model model; - LSTMTransitionTagger(const CorpusVocabulary& vocab) - : vocab(vocab), finalized(false) {} - inline cnn::expr::Expression GetParamExpr(cnn::ParametersBase* params) { return param_expressions.at(params); } @@ -55,7 +52,8 @@ class LSTMTransitionTagger { const Sentence& raw_sent, const Sentence::SentenceMap& sent) const = 0; - virtual bool IsActionForbidden(const std::string& action_name, + virtual bool IsActionForbidden(const unsigned action, + const std::vector& action_names, const TaggerState& state) const = 0; virtual void DoAction(unsigned action, From 751d3afd62c87ee60a48a2e008d3810ac9af33ad Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 1 Mar 2017 19:05:43 -0500 Subject: [PATCH 32/88] Better code structure for unknown word replacement --- parser/lstm-transition-tagger.cpp | 26 +++++++++++++++++--------- parser/lstm-transition-tagger.h | 20 ++++++++++++-------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/parser/lstm-transition-tagger.cpp b/parser/lstm-transition-tagger.cpp index c5f9cc9..b8706d5 100644 --- a/parser/lstm-transition-tagger.cpp +++ b/parser/lstm-transition-tagger.cpp @@ -42,19 +42,27 @@ void LSTMTransitionTagger::FinalizeVocab() { finalized = true; } - -vector LSTMTransitionTagger::LogProbTagger( - const Sentence& sentence, const CorpusVocabulary& vocab, - ComputationGraph *cg, Expression* final_parser_state) { - Sentence::SentenceMap tsentence(sentence.words); // sentence w/ OOVs replaced - for (auto& index_and_id : tsentence) { // use reference to overwrite +Sentence::SentenceMap LSTMTransitionTagger::ReplaceUnknowns( + const Sentence& sentence, const CorpusVocabulary& vocab) { + Sentence::SentenceMap tsentence(sentence.words); // sentence w/ OOVs replaced + for (auto& index_and_id : tsentence) { + // use reference to overwrite if (!vocab.int_to_training_word[index_and_id.second]) { index_and_id.second = vocab.kUNK; } } - return LogProbTagger(cg, sentence, tsentence, vector(), - vocab.actions, vocab.int_to_words, nullptr, - final_parser_state); + return tsentence; +} + +vector LSTMTransitionTagger::LogProbTagger( + const Sentence& sentence, const CorpusVocabulary& vocab, + ComputationGraph *cg, bool replace_unknowns, + Expression* final_parser_state) { + return LogProbTagger( + cg, sentence, + replace_unknowns ? ReplaceUnknowns(sentence, vocab) : sentence.words, + vector(), vocab.actions, vocab.int_to_words, nullptr, + final_parser_state); } diff --git a/parser/lstm-transition-tagger.h b/parser/lstm-transition-tagger.h index 3af1b7b..12df1ee 100644 --- a/parser/lstm-transition-tagger.h +++ b/parser/lstm-transition-tagger.h @@ -25,15 +25,22 @@ class LSTMTransitionTagger { void FinalizeVocab(); + // Used for testing. Replaces OOV with UNK. + std::vector LogProbTagger( + const Sentence& sentence, const CorpusVocabulary& vocab, + cnn::ComputationGraph *cg, + bool replace_unknowns = true, + cnn::expr::Expression* final_parser_state = nullptr); + protected: struct TaggerState {}; bool finalized; - std::map param_expressions; + std::map param_expressions; cnn::Model model; - inline cnn::expr::Expression GetParamExpr(cnn::ParametersBase* params) { + inline cnn::expr::Expression GetParamExpr(cnn::Parameters* params) { return param_expressions.at(params); } @@ -66,12 +73,6 @@ class LSTMTransitionTagger { void SaveModel(const std::string& model_fname, bool softlink_created); - // Used for testing. Replaces OOV with UNK. - std::vector LogProbTagger( - const Sentence& sentence, const CorpusVocabulary& vocab, - cnn::ComputationGraph *cg, - cnn::expr::Expression* final_parser_state = nullptr); - // *** if correct_actions is empty, this runs greedy decoding *** // returns actions for input sentence (in training just returns the reference) // OOV handling: raw_sent will have the actual words @@ -86,6 +87,9 @@ class LSTMTransitionTagger { const std::vector& action_names, const std::vector& int_to_words, double* correct, cnn::expr::Expression* final_parser_state = nullptr); + + Sentence::SentenceMap ReplaceUnknowns(const Sentence& sentence, + const CorpusVocabulary& vocab); }; } /* namespace lstm_parser */ From 543eb3833b55f90c7eb798185403e36a485d57fc Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 1 Mar 2017 19:15:01 -0500 Subject: [PATCH 33/88] Made vocab not a public member --- parser/lstm-parser-driver.cc | 11 ++++++----- parser/lstm-transition-tagger.h | 8 ++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/parser/lstm-parser-driver.cc b/parser/lstm-parser-driver.cc index ddd7eba..09a6fb3 100644 --- a/parser/lstm-parser-driver.cc +++ b/parser/lstm-parser-driver.cc @@ -153,7 +153,7 @@ int main(int argc, char** argv) { } signal(SIGINT, signal_callback_handler); - ParserTrainingCorpus training_corpus(&parser->vocab, + ParserTrainingCorpus training_corpus(parser->GetVocab(), conf["training_data"].as(), true); parser->FinalizeVocab(); @@ -161,8 +161,8 @@ int main(int argc, char** argv) { << endl; // OOV words will be replaced by UNK tokens dev_corpus.reset( - new ParserTrainingCorpus(&parser->vocab, conf["dev_data"].as(), - false)); + new ParserTrainingCorpus(parser->GetVocab(), + conf["dev_data"].as(), false)); ostringstream os; os << "parser_" << (parser->options.use_pos ? "pos" : "nopos") @@ -190,7 +190,7 @@ int main(int argc, char** argv) { cerr << "Evaluating model on " << conf["dev_data"].as() << endl; if (!train) { // Didn't already load dev corpus for training dev_corpus.reset( - new ParserTrainingCorpus(&parser->vocab, + new ParserTrainingCorpus(parser->GetVocab(), conf["dev_data"].as(), false)); } parser->Evaluate(*dev_corpus); @@ -213,7 +213,8 @@ int main(int argc, char** argv) { << endl; abort(); } - Corpus test_corpus(&parser->vocab, *reader, conf["test_data"].as()); + Corpus test_corpus(parser->GetVocab(), *reader, + conf["test_data"].as()); parser->Test(test_corpus); } } diff --git a/parser/lstm-transition-tagger.h b/parser/lstm-transition-tagger.h index 12df1ee..e6ec5c0 100644 --- a/parser/lstm-transition-tagger.h +++ b/parser/lstm-transition-tagger.h @@ -17,8 +17,6 @@ namespace lstm_parser { class LSTMTransitionTagger { public: - // TODO: this really shouldn't be public... - CorpusVocabulary vocab; LSTMTransitionTagger() : finalized(false) {} virtual ~LSTMTransitionTagger() {} @@ -32,6 +30,11 @@ class LSTMTransitionTagger { bool replace_unknowns = true, cnn::expr::Expression* final_parser_state = nullptr); + const lstm_parser::CorpusVocabulary& GetVocab() const { return vocab; } + + // TODO: arrange things such that we don't need to expose this? + lstm_parser::CorpusVocabulary* GetVocab() { return &vocab; } + protected: struct TaggerState {}; @@ -39,6 +42,7 @@ class LSTMTransitionTagger { std::map param_expressions; cnn::Model model; + CorpusVocabulary vocab; inline cnn::expr::Expression GetParamExpr(cnn::Parameters* params) { return param_expressions.at(params); From 61142afbb4500fbb3273caf6fc0274301e0d3fd4 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 5 Mar 2017 18:08:36 -0500 Subject: [PATCH 34/88] Renamed to neural transition tagger; cleaned up API a bit Particularly regarding tagger state objects --- parser/CMakeLists.txt | 2 +- parser/lstm-parser.cc | 13 +++------- parser/lstm-parser.h | 17 ++++++++----- ...agger.cpp => neural-transition-tagger.cpp} | 14 +++++------ ...on-tagger.h => neural-transition-tagger.h} | 25 ++++++++++--------- 5 files changed, 36 insertions(+), 35 deletions(-) rename parser/{lstm-transition-tagger.cpp => neural-transition-tagger.cpp} (91%) rename parser/{lstm-transition-tagger.h => neural-transition-tagger.h} (81%) diff --git a/parser/CMakeLists.txt b/parser/CMakeLists.txt index 1217810..0077cab 100644 --- a/parser/CMakeLists.txt +++ b/parser/CMakeLists.txt @@ -1,7 +1,7 @@ PROJECT(lstm-parser:parser) CMAKE_MINIMUM_REQUIRED(VERSION 2.8) -ADD_LIBRARY(lstm-parser-core lstm-parser.cc corpus.cc lstm-transition-tagger.cpp) +ADD_LIBRARY(lstm-parser-core lstm-parser.cc corpus.cc neural-transition-tagger.cpp) target_link_libraries(lstm-parser-core cnn ${Boost_LIBRARIES}) ADD_EXECUTABLE(lstm-parse lstm-parser-driver.cc) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index fd1045e..cd52ccf 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -314,7 +314,7 @@ void LSTMParser::DoAction(unsigned action, const vector& action_names, } -LSTMTransitionTagger::TaggerState* LSTMParser::InitializeParserState( +NeuralTransitionTagger::TaggerState* LSTMParser::InitializeParserState( ComputationGraph* cg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced @@ -327,15 +327,10 @@ LSTMTransitionTagger::TaggerState* LSTMParser::InitializeParserState( buffer_lstm.start_new_sequence(); action_lstm.start_new_sequence(); + Expression stack_guard = GetParamExpr(p_stack_guard); + ParserState* state = new ParserState(raw_sent, sent, stack_guard); action_lstm.add_input(GetParamExpr(p_action_start)); - - ParserState* state = new ParserState; - state->buffer.resize(raw_sent.Size() + 1); - state->bufferi.resize(raw_sent.Size() + 1); - state->stack.push_back(parameter(*cg, p_stack_guard)); - state->stacki.push_back(-999); - // drive dummy symbol on stack through LSTM - stack_lstm.add_input(state->stack.back()); + stack_lstm.add_input(stack_guard); // precompute buffer representation from left to right unsigned added_to_buffer = 0; diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 9b9d731..7e082a5 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -20,7 +20,7 @@ #include "cnn/rnn.h" #include "corpus.h" #include "eos/portable_archive.hpp" -#include "lstm-transition-tagger.h" +#include "neural-transition-tagger.h" namespace lstm_parser { @@ -113,7 +113,7 @@ class ParseTree { }; -class LSTMParser : public LSTMTransitionTagger { +class LSTMParser : public NeuralTransitionTagger { public: ParserOptions options; @@ -209,6 +209,12 @@ class LSTMParser : public LSTMTransitionTagger { std::vector stack; // subtree embeddings std::vector stacki; // word position in sentence of head of subtree + ParserState(const Sentence& raw_sentence, + const Sentence::SentenceMap& sentence, Expression stack_guard) + : TaggerState {raw_sentence, sentence}, buffer(raw_sentence.Size() + 1), + bufferi(raw_sentence.Size() + 1), stack( {stack_guard}), + stacki( {-999}) {} + ~ParserState() { assert(stack.size() == 2); // guard symbol, root assert(stacki.size() == 2); @@ -219,7 +225,8 @@ class LSTMParser : public LSTMTransitionTagger { virtual std::vector GetParameters() override { std::vector all_params {p_pbias, p_H, p_D, p_R, p_cbias, - p_S, p_B, p_A, p_ib, p_w2l, p_p2a, p_abias, p_action_start}; + p_S, p_B, p_A, p_ib, p_w2l, p_p2a, p_abias, p_action_start, + p_stack_guard}; if (options.use_pos) all_params.push_back(p_p2l); if (p_t2l) @@ -235,9 +242,7 @@ class LSTMParser : public LSTMTransitionTagger { virtual void InitializeNetworkParameters() override; - virtual bool ShouldTerminate( - const TaggerState& state, const Sentence& raw_sent, - const Sentence::SentenceMap& sent) const override { + virtual bool ShouldTerminate(const TaggerState& state) const override { const ParserState& real_state = static_cast(state); return real_state.stack.size() <= 2 && real_state.buffer.size() <= 1; } diff --git a/parser/lstm-transition-tagger.cpp b/parser/neural-transition-tagger.cpp similarity index 91% rename from parser/lstm-transition-tagger.cpp rename to parser/neural-transition-tagger.cpp index b8706d5..39af31f 100644 --- a/parser/lstm-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -1,4 +1,4 @@ -#include "lstm-transition-tagger.h" +#include "neural-transition-tagger.h" #include #include @@ -15,7 +15,7 @@ using namespace cnn::expr; namespace lstm_parser { -void LSTMTransitionTagger::SaveModel(const string& model_fname, +void NeuralTransitionTagger::SaveModel(const string& model_fname, bool softlink_created) { ofstream out_file(model_fname); eos::portable_oarchive archive(out_file); @@ -35,14 +35,14 @@ void LSTMTransitionTagger::SaveModel(const string& model_fname, } -void LSTMTransitionTagger::FinalizeVocab() { +void NeuralTransitionTagger::FinalizeVocab() { if (finalized) return; InitializeNetworkParameters(); finalized = true; } -Sentence::SentenceMap LSTMTransitionTagger::ReplaceUnknowns( +Sentence::SentenceMap NeuralTransitionTagger::ReplaceUnknowns( const Sentence& sentence, const CorpusVocabulary& vocab) { Sentence::SentenceMap tsentence(sentence.words); // sentence w/ OOVs replaced for (auto& index_and_id : tsentence) { @@ -54,7 +54,7 @@ Sentence::SentenceMap LSTMTransitionTagger::ReplaceUnknowns( return tsentence; } -vector LSTMTransitionTagger::LogProbTagger( +vector NeuralTransitionTagger::LogProbTagger( const Sentence& sentence, const CorpusVocabulary& vocab, ComputationGraph *cg, bool replace_unknowns, Expression* final_parser_state) { @@ -66,7 +66,7 @@ vector LSTMTransitionTagger::LogProbTagger( } -vector LSTMTransitionTagger::LogProbTagger( +vector NeuralTransitionTagger::LogProbTagger( ComputationGraph* cg, const Sentence& raw_sent, // raw sentence const Sentence::SentenceMap& sent, // sentence with OOVs replaced @@ -89,7 +89,7 @@ vector LSTMTransitionTagger::LogProbTagger( vector log_probs; unsigned action_count = 0; // incremented at each prediction Expression p_t; // declared outside to allow access later - while (!ShouldTerminate(*state, raw_sent, sent)) { + while (!ShouldTerminate(*state)) { // Get list of possible actions for the current parser state. vector current_valid_actions; for (unsigned action = 0; action < action_names.size(); ++action) { diff --git a/parser/lstm-transition-tagger.h b/parser/neural-transition-tagger.h similarity index 81% rename from parser/lstm-transition-tagger.h rename to parser/neural-transition-tagger.h index e6ec5c0..f6d08ff 100644 --- a/parser/lstm-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -1,5 +1,5 @@ -#ifndef LSTM_PARSER_PARSER_LSTM_TRANSITION_TAGGER_H_ -#define LSTM_PARSER_PARSER_LSTM_TRANSITION_TAGGER_H_ +#ifndef LSTM_PARSER_PARSER_NEURAL_TRANSITION_TAGGER_H_ +#define LSTM_PARSER_PARSER_NEURAL_TRANSITION_TAGGER_H_ #include #include @@ -15,11 +15,11 @@ class portable_oarchive; namespace lstm_parser { -class LSTMTransitionTagger { +class NeuralTransitionTagger { public: - LSTMTransitionTagger() : finalized(false) {} - virtual ~LSTMTransitionTagger() {} + NeuralTransitionTagger() : finalized(false) {} + virtual ~NeuralTransitionTagger() {} void FinalizeVocab(); @@ -30,13 +30,16 @@ class LSTMTransitionTagger { bool replace_unknowns = true, cnn::expr::Expression* final_parser_state = nullptr); - const lstm_parser::CorpusVocabulary& GetVocab() const { return vocab; } + const CorpusVocabulary& GetVocab() const { return vocab; } // TODO: arrange things such that we don't need to expose this? - lstm_parser::CorpusVocabulary* GetVocab() { return &vocab; } + CorpusVocabulary* GetVocab() { return &vocab; } protected: - struct TaggerState {}; + struct TaggerState { + const Sentence& raw_sentence; + const Sentence::SentenceMap& sentence; + }; bool finalized; std::map param_expressions; @@ -59,9 +62,7 @@ class LSTMTransitionTagger { virtual cnn::expr::Expression GetActionProbabilities( const TaggerState& state) = 0; - virtual bool ShouldTerminate(const TaggerState& state, - const Sentence& raw_sent, - const Sentence::SentenceMap& sent) const = 0; + virtual bool ShouldTerminate(const TaggerState& state) const = 0; virtual bool IsActionForbidden(const unsigned action, const std::vector& action_names, @@ -98,4 +99,4 @@ class LSTMTransitionTagger { } /* namespace lstm_parser */ -#endif /* LSTM_PARSER_PARSER_LSTM_TRANSITION_TAGGER_H_ */ +#endif /* LSTM_PARSER_PARSER_NEURAL_TRANSITION_TAGGER_H_ */ From 788f7c39b38960ce87d49297c45d53a8834d05f2 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 5 Mar 2017 19:02:11 -0500 Subject: [PATCH 35/88] Formatting --- parser/corpus.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parser/corpus.h b/parser/corpus.h index 5a3c7a8..54aa2f8 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -53,8 +53,8 @@ class CorpusVocabulary { words_to_int(other.words_to_int), int_to_words(other.int_to_words), int_to_training_word(other.int_to_training_word), pos_to_int(other.pos_to_int), int_to_pos(other.int_to_pos), - chars_to_int(other.chars_to_int), int_to_chars( - other.int_to_chars), kUNK(other.kUNK) {} + chars_to_int(other.chars_to_int), int_to_chars(other.int_to_chars), + kUNK(other.kUNK) {} inline unsigned CountPOS() { return pos_to_int.size(); } inline unsigned CountWords() { return words_to_int.size(); } From 1d518e32f515d462edbd0326c9f0faa80bd7458a Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Mon, 6 Mar 2017 13:13:42 -0500 Subject: [PATCH 36/88] Got rid of dumb corpus copy constructor --- parser/corpus.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/parser/corpus.h b/parser/corpus.h index 54aa2f8..a905ba5 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -46,16 +46,6 @@ class CorpusVocabulary { AddEntry(BAD0, &chars_to_int, &int_to_chars); } - // Copy constructor: Copy everything except action-related stuff, on the - // assumption that we're copying the vocabulary for use in another task with - // different actions. - CorpusVocabulary(const CorpusVocabulary& other) : - words_to_int(other.words_to_int), int_to_words(other.int_to_words), - int_to_training_word(other.int_to_training_word), - pos_to_int(other.pos_to_int), int_to_pos(other.int_to_pos), - chars_to_int(other.chars_to_int), int_to_chars(other.int_to_chars), - kUNK(other.kUNK) {} - inline unsigned CountPOS() { return pos_to_int.size(); } inline unsigned CountWords() { return words_to_int.size(); } inline unsigned CountChars() { return chars_to_int.size(); } From 14636cb4aace1944b8ade145c9d39489fbe45de1 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 7 Mar 2017 15:08:33 -0500 Subject: [PATCH 37/88] Attempted to marginally reduce memory usage --- parser/neural-transition-tagger.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 39af31f..7ce4870 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -39,6 +39,13 @@ void NeuralTransitionTagger::FinalizeVocab() { if (finalized) return; InitializeNetworkParameters(); + // Give up memory we don't need. + vocab.actions.shrink_to_fit(); + vocab.actions_to_arc_labels.shrink_to_fit(); + vocab.int_to_chars.shrink_to_fit(); + vocab.int_to_pos.shrink_to_fit(); + vocab.int_to_training_word.shrink_to_fit(); + vocab.int_to_words.shrink_to_fit(); finalized = true; } From 90076a8b6b5b6bb18c13dcfa5fcc84f7742b2f3e Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 7 Mar 2017 15:29:36 -0500 Subject: [PATCH 38/88] Changed CNN build to always optimize, even for debug builds --- cnn/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cnn/CMakeLists.txt b/cnn/CMakeLists.txt index e8408b4..58173ae 100644 --- a/cnn/CMakeLists.txt +++ b/cnn/CMakeLists.txt @@ -1,6 +1,10 @@ project(cnn) cmake_minimum_required(VERSION 2.8 FATAL_ERROR) +if(NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_BUILD_TYPE RelWithDebInfo) +endif(NOT CMAKE_BUILD_TYPE) + set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # CNN uses Eigen which exploits modern CPU architectures. To get the From 069189e98d91ced9fe9f07366d3410406c26d7be Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 8 Mar 2017 17:07:06 -0500 Subject: [PATCH 39/88] Made it easier to print Sentence objects --- parser/corpus.cc | 4 ++-- parser/corpus.h | 32 +++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 2ab2442..85acb24 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -49,7 +49,7 @@ void ConllUCorpusReader::ReadSentences(const string& file, current_sentence_pos[Corpus::ROOT_TOKEN_ID] = root_pos_symbol; current_sentence_unk_surface_forms[Corpus::ROOT_TOKEN_ID] = ""; - corpus->sentences.push_back({}); + corpus->sentences.emplace_back(*corpus->vocab); corpus->sentences.back().words.swap(current_sentence); corpus->sentences.back().poses.swap(current_sentence_pos); corpus->sentences.back().unk_surface_forms.swap( @@ -183,7 +183,7 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordSentence( Sentence::SentenceMap* sentence_pos, Sentence::SentenceUnkMap* sentence_unk_surface_forms, bool final) const { // Store the sentence variables and clear them for the next sentence. - corpus->sentences.push_back({}); + corpus->sentences.emplace_back(*corpus->vocab); Sentence* sentence = &corpus->sentences.back(); sentence->words.swap(*words); sentence->poses.swap(*sentence_pos); diff --git a/parser/corpus.h b/parser/corpus.h index a905ba5..4caaa9a 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -188,19 +189,48 @@ class ConllUCorpusReader : public CorpusReader { }; -struct Sentence { +class Sentence; +inline std::ostream& operator<<(std::ostream& os, const Sentence& sentence); + +class Sentence { +public: typedef std::map SentenceMap; typedef std::map SentenceUnkMap; + Sentence(const CorpusVocabulary& vocab) : vocab(vocab) {} + SentenceMap words; SentenceMap poses; SentenceUnkMap unk_surface_forms; + const CorpusVocabulary& vocab; size_t Size() const { return words.size(); } + + std::string AsString() const { + std::ostringstream oss; + oss << *this; + return oss.str(); + } }; +inline std::ostream& operator<<(std::ostream& os, const Sentence&sentence) { + for (auto &index_and_word_id : sentence.words) { + unsigned index = index_and_word_id.first; + unsigned word_id = index_and_word_id.second; + unsigned pos_id = sentence.poses.at(index); + auto unk_iter = sentence.unk_surface_forms.find(index); + os << (unk_iter == sentence.unk_surface_forms.end() ? + sentence.vocab.int_to_words.at(word_id) : unk_iter->second) + << '/' << sentence.vocab.int_to_pos.at(pos_id); + if (index != sentence.words.rend()->first) { + os << ' '; + } + } + return os; +} + class Corpus { public: From bcfe82d8819766bbd1c02d2cdb15338c76493e9e Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 8 Mar 2017 18:00:32 -0500 Subject: [PATCH 40/88] Fixed nasty bug with handling of completely unknown words --- parser/neural-transition-tagger.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 7ce4870..150dc47 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -54,7 +54,8 @@ Sentence::SentenceMap NeuralTransitionTagger::ReplaceUnknowns( Sentence::SentenceMap tsentence(sentence.words); // sentence w/ OOVs replaced for (auto& index_and_id : tsentence) { // use reference to overwrite - if (!vocab.int_to_training_word[index_and_id.second]) { + if (index_and_id.second >= vocab.int_to_training_word.size() + || !vocab.int_to_training_word[index_and_id.second]) { index_and_id.second = vocab.kUNK; } } From ae363f45de4675750dacac0aafdfc597d4e725c0 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 9 Mar 2017 13:29:23 -0500 Subject: [PATCH 41/88] More sensible storage for correct actions in corpus reading --- parser/corpus.cc | 25 ++++++++++++++----------- parser/corpus.h | 5 +++-- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 85acb24..acd99a5 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -155,17 +155,18 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordWord( void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( - const string& action, TrainingCorpus* corpus) const { + const string& action, TrainingCorpus* corpus, + vector* correct_actions) const { CorpusVocabulary* vocab = corpus->vocab; auto action_iter = find(vocab->actions.begin(), vocab->actions.end(), action); if (action_iter != vocab->actions.end()) { unsigned action_index = distance(vocab->actions.begin(), action_iter); - corpus->correct_act_sent.back().push_back(action_index); + correct_actions->push_back(action_index); } else { // A not-previously-seen action if (is_training) { vocab->actions.push_back(action); unsigned action_index = vocab->actions.size() - 1; - corpus->correct_act_sent.back().push_back(action_index); + correct_actions->push_back(action_index); vocab->actions_to_arc_labels.push_back(vocab->GetLabelForAction(action)); } else { // TODO: right now, new actions which haven't been observed in @@ -181,19 +182,21 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( void TrainingCorpus::OracleTransitionsCorpusReader::RecordSentence( TrainingCorpus* corpus, Sentence::SentenceMap* words, Sentence::SentenceMap* sentence_pos, - Sentence::SentenceUnkMap* sentence_unk_surface_forms, bool final) const { + Sentence::SentenceUnkMap* sentence_unk_surface_forms, + vector* correct_actions) const { // Store the sentence variables and clear them for the next sentence. corpus->sentences.emplace_back(*corpus->vocab); Sentence* sentence = &corpus->sentences.back(); sentence->words.swap(*words); sentence->poses.swap(*sentence_pos); + corpus->correct_act_sent.push_back({}); + corpus->correct_act_sent.back().swap(*correct_actions); + if (!is_training) { sentence->unk_surface_forms.swap(*sentence_unk_surface_forms); } - if (!final) { - corpus->correct_act_sent.push_back({}); - } + assert(corpus->correct_act_sent.size() == corpus->sentences.size()); } @@ -218,7 +221,7 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( Sentence::SentenceMap sentence; Sentence::SentenceMap sentence_pos; Sentence::SentenceUnkMap sentence_unk_surface_forms; - corpus->correct_act_sent.push_back({}); + vector correct_actions; // We'll need to make sure ROOT token has a consistent ID. // (Should get inlined; defined here for DRY purposes.) @@ -251,7 +254,7 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( if (!first) { // if first, first line is blank, but no sentence yet FixRootID(); RecordSentence(corpus, &sentence, &sentence_pos, - &sentence_unk_surface_forms); + &sentence_unk_surface_forms, &correct_actions); } start_of_sentence = true; continue; // don't update next_is_action_line @@ -300,7 +303,7 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( } while (iss); } } else { // next_is_action_line - RecordAction(line, corpus); + RecordAction(line, corpus, &correct_actions); start_of_sentence = false; } @@ -311,7 +314,7 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( if (sentence.size() > 0) { FixRootID(); RecordSentence(corpus, &sentence, &sentence_pos, - &sentence_unk_surface_forms, true); + &sentence_unk_surface_forms, &correct_actions); } actions_file.close(); diff --git a/parser/corpus.h b/parser/corpus.h index 4caaa9a..88f3e04 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -286,12 +286,13 @@ class TrainingCorpus : public Corpus { Sentence::SentenceMap* sentence_pos, Sentence::SentenceUnkMap* sentence_unk_surface_forms) const; - void RecordAction(const std::string& action, TrainingCorpus* corpus) const; + void RecordAction(const std::string& action, TrainingCorpus* corpus, + std::vector* correct_actions) const; void RecordSentence(TrainingCorpus* corpus, Sentence::SentenceMap* words, Sentence::SentenceMap* sentence_pos, Sentence::SentenceUnkMap* sentence_unk_surface_forms, - bool final = false) const; + std::vector* correct_actions) const; static inline unsigned UTF8Len(unsigned char x) { if (x < 0x80) return 1; From 21279da470f310a694288c8715948efd007aaacc Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 9 Mar 2017 13:37:45 -0500 Subject: [PATCH 42/88] Minor memory optimizations --- parser/corpus.cc | 2 ++ parser/corpus.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/parser/corpus.cc b/parser/corpus.cc index acd99a5..be682d8 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -82,6 +82,8 @@ void ConllUCorpusReader::ReadSentences(const string& file, current_sentence[token_index] = word_id; current_sentence_pos[token_index] = corpus->vocab->GetPOS(pos); } + + corpus->sentences.shrink_to_fit(); } diff --git a/parser/corpus.h b/parser/corpus.h index 88f3e04..2123738 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -333,6 +333,8 @@ class ParserTrainingCorpus : public TrainingCorpus { ParserTrainingCorpus* training_corpus = static_cast(corpus); LoadCorrectActions(file, training_corpus); + training_corpus->sentences.shrink_to_fit(); + training_corpus->correct_act_sent.shrink_to_fit(); } virtual ~OracleParseTransitionsReader() {}; From ff988c3c1b6f9e08c8283d7cad9a7208f69d6227 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 9 Mar 2017 14:27:32 -0500 Subject: [PATCH 43/88] Added assertion for too many training actions --- parser/neural-transition-tagger.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 150dc47..d629a5d 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -121,6 +121,7 @@ vector NeuralTransitionTagger::LogProbTagger( unsigned action = best_a; // If we have reference actions (for training), use the reference action. if (build_training_graph) { + assert(action_count < correct_actions.size()); action = correct_actions[action_count]; if (correct && best_a == action) { (*correct)++; From e2f4ceebd3e866c6a352df47d2d9bd8e5ec605aa Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 15 Mar 2017 20:33:54 -0400 Subject: [PATCH 44/88] Minor logging change --- parser/lstm-parser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 7e082a5..aa07aed 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -153,7 +153,7 @@ class LSTMParser : public NeuralTransitionTagger { explicit LSTMParser(const std::string& model_path) : kROOT_SYMBOL(vocab.GetOrAddWord(vocab.ROOT)) { - std::cerr << "Loading model from " << model_path << "..."; + std::cerr << "Loading parser model from " << model_path << "..."; auto t_start = std::chrono::high_resolution_clock::now(); std::ifstream model_file(model_path.c_str(), std::ios::binary); if (!model_file) { From b2cec9baad400f2581cc273fbbb86a72d3709add Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 16 Mar 2017 13:13:23 -0400 Subject: [PATCH 45/88] Simplified RecoverParseTree --- parser/lstm-parser.cc | 20 +++++++------------- parser/lstm-parser.h | 7 ++----- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index cd52ccf..9183110 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -171,9 +171,8 @@ bool LSTMParser::IsActionForbidden(const unsigned action, ParseTree LSTMParser::RecoverParseTree( - const Sentence& sentence, const vector& actions, - const vector& action_names, - const vector& actions_to_arc_labels, double logprob, bool labeled) { + const Sentence& sentence, const vector& actions, double logprob, + bool labeled) { ParseTree tree(sentence, labeled); vector bufferi(sentence.Size() + 1); bufferi[0] = -999; @@ -185,7 +184,7 @@ ParseTree LSTMParser::RecoverParseTree( index_and_word_id.first; } for (auto action : actions) { // loop over transitions for sentence - const string& action_string = action_names[action]; + const string& action_string = vocab.actions[action]; const char ac = action_string[0]; const char ac2 = action_string[1]; if (ac == 'S' && ac2 == 'H') { // SHIFT @@ -212,7 +211,7 @@ ParseTree LSTMParser::RecoverParseTree( (ac == 'R' ? headi : depi) = stacki.back(); stacki.pop_back(); stacki.push_back(headi); - tree.SetParent(depi, headi, actions_to_arc_labels[action]); + tree.SetParent(depi, headi, vocab.actions_to_arc_labels[action]); } } assert(bufferi.size() == 1); @@ -465,9 +464,7 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, llh += hyp.logprob; const vector& actions = dev_corpus.correct_act_sent[sii]; - ParseTree ref = RecoverParseTree( - sentence, actions, dev_corpus.vocab->actions, - dev_corpus.vocab->actions_to_arc_labels); + ParseTree ref = RecoverParseTree(sentence, actions); trs += actions.size(); correct_heads += ComputeCorrect(ref, hyp); @@ -497,8 +494,7 @@ ParseTree LSTMParser::Parse(const Sentence& sentence, ComputationGraph cg; vector pred = LogProbTagger(sentence, vocab, &cg); double lp = as_scalar(cg.incremental_forward()); - return RecoverParseTree(sentence, pred, vocab.actions, - vocab.actions_to_arc_labels, labeled, lp); + return RecoverParseTree(sentence, pred, labeled, lp); } @@ -530,9 +526,7 @@ void LSTMParser::DoTest(const Corpus& corpus, bool evaluate, const ParserTrainingCorpus& training_corpus = static_cast(corpus); const vector& actions = training_corpus.correct_act_sent[sii]; - ParseTree ref = RecoverParseTree(sentence, actions, corpus.vocab->actions, - corpus.vocab->actions_to_arc_labels, - true); + ParseTree ref = RecoverParseTree(sentence, actions, true); trs += actions.size(); llh += hyp.logprob; correct_heads += ComputeCorrect(ref, hyp); diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index aa07aed..4bceb3a 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -181,11 +181,8 @@ class LSTMParser : public NeuralTransitionTagger { // take a vector of actions and return a parse tree ParseTree RecoverParseTree( - const Sentence& sentence, - const std::vector& actions, - const std::vector& action_names, - const std::vector& actions_to_arc_labels, double logprob = 0, - bool labeled = false); + const Sentence& sentence, const std::vector& actions, + double logprob = 0, bool labeled = false); void Train(const ParserTrainingCorpus& corpus, const ParserTrainingCorpus& dev_corpus, const double unk_prob, From a88be9de70f8cc6fa4166912e36374393412e5a0 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 16 Mar 2017 13:15:24 -0400 Subject: [PATCH 46/88] Whitespace cleanup --- parser/lstm-parser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 4bceb3a..e3651f6 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -77,7 +77,7 @@ class ParseTree { ParseTree(const Sentence& sentence, bool labeled = true) : sentence(sentence), logprob(0), - arc_labels( labeled ? new std::map : nullptr) {} + arc_labels(labeled ? new std::map : nullptr) {} inline void SetParent(unsigned child_index, unsigned parent_index, const std::string& arc_label="") { From 4e269c8988237192e79bb18635b4ebe73e7b9e88 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 16 Mar 2017 13:32:55 -0400 Subject: [PATCH 47/88] Added function to check if a parse tree is labeled --- parser/lstm-parser.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index e3651f6..7f9ad6a 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -82,7 +82,7 @@ class ParseTree { inline void SetParent(unsigned child_index, unsigned parent_index, const std::string& arc_label="") { parents[child_index] = parent_index; - if (arc_labels) { + if (IsLabeled()) { (*arc_labels)[child_index] = arc_label; } } @@ -97,7 +97,7 @@ class ParseTree { } const inline std::string& GetArcLabel(unsigned child) const { - if (!arc_labels) + if (!IsLabeled()) return NO_LABEL; auto arc_label_iter = arc_labels->find(child); if (arc_label_iter == arc_labels->end()) { @@ -107,6 +107,8 @@ class ParseTree { } } + bool IsLabeled() const { return arc_labels.get(); } + private: std::map parents; std::unique_ptr> arc_labels; From f38c407d45c7a7ec306b5eacac307713c65a714f Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 16 Mar 2017 13:45:43 -0400 Subject: [PATCH 48/88] Made ParseTree wrap sentence reference to allow move assignment --- parser/lstm-parser.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 7f9ad6a..fed01df 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -67,17 +67,16 @@ struct ParserOptions { }; +// Barebones representation of a parse tree. class ParseTree { public: static std::string NO_LABEL; - // Barebones representation of a parse tree. - const Sentence& sentence; double logprob; ParseTree(const Sentence& sentence, bool labeled = true) : - sentence(sentence), logprob(0), - arc_labels(labeled ? new std::map : nullptr) {} + arc_labels(labeled ? new std::map : nullptr), + sentence(sentence) {} inline void SetParent(unsigned child_index, unsigned parent_index, const std::string& arc_label="") { @@ -87,6 +86,10 @@ class ParseTree { } } + const Sentence& GetSentence() const { + return sentence.get(); + } + const inline unsigned GetParent(unsigned child) const { auto parent_iter = parents.find(child); if (parent_iter == parents.end()) { @@ -112,6 +115,7 @@ class ParseTree { private: std::map parents; std::unique_ptr> arc_labels; + std::reference_wrapper sentence; }; @@ -259,9 +263,9 @@ class LSTMParser : public NeuralTransitionTagger { inline unsigned ComputeCorrect(const ParseTree& ref, const ParseTree& hyp) const { - assert(ref.sentence.Size() == hyp.sentence.Size()); + assert(ref.GetSentence().Size() == hyp.GetSentence().Size()); unsigned correct_count = 0; - for (const auto& token_index_and_word : ref.sentence.words) { + for (const auto& token_index_and_word : ref.GetSentence().words) { unsigned i = token_index_and_word.first; if (i != Corpus::ROOT_TOKEN_ID && ref.GetParent(i) == hyp.GetParent(i)) ++correct_count; From 481fdbaeb68c49223bc6c7048d080e8f01ac77f2 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 16 Mar 2017 14:01:53 -0400 Subject: [PATCH 49/88] Simplified tagger interface Stopped stupidly passing around stuff that the tagger already has access to via its own vocab object --- parser/corpus.cc | 12 ++++++------ parser/corpus.h | 8 ++++---- parser/lstm-parser.cc | 20 ++++++++------------ parser/lstm-parser.h | 9 +++------ parser/neural-transition-tagger.cpp | 24 ++++++++++-------------- parser/neural-transition-tagger.h | 19 ++++++------------- 6 files changed, 37 insertions(+), 55 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index be682d8..6636824 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -160,14 +160,14 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordAction( const string& action, TrainingCorpus* corpus, vector* correct_actions) const { CorpusVocabulary* vocab = corpus->vocab; - auto action_iter = find(vocab->actions.begin(), vocab->actions.end(), action); - if (action_iter != vocab->actions.end()) { - unsigned action_index = distance(vocab->actions.begin(), action_iter); + auto action_iter = find(vocab->action_names.begin(), vocab->action_names.end(), action); + if (action_iter != vocab->action_names.end()) { + unsigned action_index = distance(vocab->action_names.begin(), action_iter); correct_actions->push_back(action_index); } else { // A not-previously-seen action if (is_training) { - vocab->actions.push_back(action); - unsigned action_index = vocab->actions.size() - 1; + vocab->action_names.push_back(action); + unsigned action_index = vocab->action_names.size() - 1; correct_actions->push_back(action_index); vocab->actions_to_arc_labels.push_back(vocab->GetLabelForAction(action)); } else { @@ -323,7 +323,7 @@ void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( cerr << "done." << "\n"; if (is_training) { - for (auto a : vocab->actions) { + for (auto a : vocab->action_names) { vocab->actions_to_arc_labels.push_back(vocab->GetLabelForAction(a)); cerr << a << "\n"; } diff --git a/parser/corpus.h b/parser/corpus.h index 2123738..ed239a3 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -36,7 +36,7 @@ class CorpusVocabulary { StrToIntMap chars_to_int; std::vector int_to_chars; - std::vector actions; + std::vector action_names; std::vector actions_to_arc_labels; unsigned kUNK; @@ -50,7 +50,7 @@ class CorpusVocabulary { inline unsigned CountPOS() { return pos_to_int.size(); } inline unsigned CountWords() { return words_to_int.size(); } inline unsigned CountChars() { return chars_to_int.size(); } - inline unsigned CountActions() { return actions.size(); } + inline unsigned CountActions() { return action_names.size(); } inline unsigned GetWord(const std::string& word) const { auto word_iter = words_to_int.find(word); @@ -118,7 +118,7 @@ class CorpusVocabulary { ar & vocab->int_to_pos; ar & vocab->int_to_chars; ar & vocab->int_to_training_word; - ar & vocab->actions; + ar & vocab->action_names; } template @@ -152,7 +152,7 @@ class CorpusVocabulary { chars_to_int[int_to_chars[i]] = i; // ...and the arc labels. - for (const std::string& action : actions) { + for (const std::string& action : action_names) { actions_to_arc_labels.push_back(GetLabelForAction(action)); } } diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 9183110..c5232f4 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -137,9 +137,8 @@ LSTMParser::LSTMParser(const ParserOptions& poptions, bool LSTMParser::IsActionForbidden(const unsigned action, - const vector& action_names, const TaggerState& state) const { - const string& action_name = action_names[action]; + const string& action_name = vocab.action_names[action]; const ParserState& real_state = static_cast(state); unsigned ssize = real_state.stack.size(); unsigned bsize = real_state.buffer.size(); @@ -184,7 +183,7 @@ ParseTree LSTMParser::RecoverParseTree( index_and_word_id.first; } for (auto action : actions) { // loop over transitions for sentence - const string& action_string = vocab.actions[action]; + const string& action_string = vocab.action_names[action]; const char ac = action_string[0]; const char ac2 = action_string[1]; if (ac == 'S' && ac2 == 'H') { // SHIFT @@ -236,8 +235,8 @@ Expression LSTMParser::GetActionProbabilities(const TaggerState& state) { } -void LSTMParser::DoAction(unsigned action, const vector& action_names, - TaggerState* state, ComputationGraph* cg) { +void LSTMParser::DoAction(unsigned action, TaggerState* state, + ComputationGraph* cg) { ParserState* real_state = static_cast(state); // add current action to action LSTM Expression action_e = lookup(*cg, p_a, action); @@ -247,7 +246,7 @@ void LSTMParser::DoAction(unsigned action, const vector& action_names, Expression relation = lookup(*cg, p_r, action); // do action - const string& action_string = action_names[action]; + const string& action_string = vocab.action_names[action]; const char ac = action_string[0]; const char ac2 = action_string[1]; @@ -317,8 +316,7 @@ NeuralTransitionTagger::TaggerState* LSTMParser::InitializeParserState( ComputationGraph* cg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const vector& correct_actions, - const vector& action_names) { + const vector& correct_actions) { stack_lstm.new_graph(*cg); buffer_lstm.new_graph(*cg); action_lstm.new_graph(*cg); @@ -423,9 +421,7 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, } const vector& actions = corpus.correct_act_sent[order[si]]; ComputationGraph hg; - LogProbTagger(&hg, sentence, tsentence, actions, - corpus.vocab->actions, corpus.vocab->int_to_words, - &correct); + LogProbTagger(&hg, sentence, tsentence, actions, &correct); double lp = as_scalar(hg.incremental_forward()); if (lp < 0) { cerr << "Log prob < 0 on sentence " << order[si] << ": lp=" << lp @@ -492,7 +488,7 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, ParseTree LSTMParser::Parse(const Sentence& sentence, const CorpusVocabulary& vocab, bool labeled) { ComputationGraph cg; - vector pred = LogProbTagger(sentence, vocab, &cg); + vector pred = LogProbTagger(sentence, &cg); double lp = as_scalar(cg.incremental_forward()); return RecoverParseTree(sentence, pred, labeled, lp); } diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index fed01df..867de4b 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -240,8 +240,7 @@ class LSTMParser : public NeuralTransitionTagger { virtual TaggerState* InitializeParserState( cnn::ComputationGraph* cg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const std::vector& correct_actions, - const std::vector& action_names) override; + const std::vector& correct_actions) override; virtual void InitializeNetworkParameters() override; @@ -251,15 +250,13 @@ class LSTMParser : public NeuralTransitionTagger { } virtual bool IsActionForbidden(const unsigned action, - const std::vector& action_names, const TaggerState& state) const override; virtual cnn::expr::Expression GetActionProbabilities(const TaggerState& state) override; - virtual void DoAction(unsigned action, - const std::vector& action_names, - TaggerState* state, cnn::ComputationGraph* cg) override; + virtual void DoAction(unsigned action, TaggerState* state, + cnn::ComputationGraph* cg) override; inline unsigned ComputeCorrect(const ParseTree& ref, const ParseTree& hyp) const { diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index d629a5d..7a265ca 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -40,7 +40,7 @@ void NeuralTransitionTagger::FinalizeVocab() { return; InitializeNetworkParameters(); // Give up memory we don't need. - vocab.actions.shrink_to_fit(); + vocab.action_names.shrink_to_fit(); vocab.actions_to_arc_labels.shrink_to_fit(); vocab.int_to_chars.shrink_to_fit(); vocab.int_to_pos.shrink_to_fit(); @@ -50,7 +50,7 @@ void NeuralTransitionTagger::FinalizeVocab() { } Sentence::SentenceMap NeuralTransitionTagger::ReplaceUnknowns( - const Sentence& sentence, const CorpusVocabulary& vocab) { + const Sentence& sentence) { Sentence::SentenceMap tsentence(sentence.words); // sentence w/ OOVs replaced for (auto& index_and_id : tsentence) { // use reference to overwrite @@ -63,14 +63,12 @@ Sentence::SentenceMap NeuralTransitionTagger::ReplaceUnknowns( } vector NeuralTransitionTagger::LogProbTagger( - const Sentence& sentence, const CorpusVocabulary& vocab, - ComputationGraph *cg, bool replace_unknowns, + const Sentence& sentence, ComputationGraph *cg, bool replace_unknowns, Expression* final_parser_state) { return LogProbTagger( cg, sentence, - replace_unknowns ? ReplaceUnknowns(sentence, vocab) : sentence.words, - vector(), vocab.actions, vocab.int_to_words, nullptr, - final_parser_state); + replace_unknowns ? ReplaceUnknowns(sentence) : sentence.words, + vector(), nullptr, final_parser_state); } @@ -78,8 +76,7 @@ vector NeuralTransitionTagger::LogProbTagger( ComputationGraph* cg, const Sentence& raw_sent, // raw sentence const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const vector& correct_actions, const vector& action_names, - const vector& int_to_words, double* correct, + const vector& correct_actions, double* correct, Expression* final_parser_state) { assert(finalized); vector results; @@ -91,8 +88,7 @@ vector NeuralTransitionTagger::LogProbTagger( } unique_ptr state(InitializeParserState(cg, raw_sent, sent, - correct_actions, - action_names)); + correct_actions)); vector log_probs; unsigned action_count = 0; // incremented at each prediction @@ -100,8 +96,8 @@ vector NeuralTransitionTagger::LogProbTagger( while (!ShouldTerminate(*state)) { // Get list of possible actions for the current parser state. vector current_valid_actions; - for (unsigned action = 0; action < action_names.size(); ++action) { - if (IsActionForbidden(action, action_names, *state)) + for (unsigned action = 0; action < vocab.action_names.size(); ++action) { + if (IsActionForbidden(action, *state)) continue; current_valid_actions.push_back(action); } @@ -131,7 +127,7 @@ vector NeuralTransitionTagger::LogProbTagger( log_probs.push_back(pick(adiste, action)); results.push_back(action); - DoAction(action, action_names, state.get(), cg); + DoAction(action, state.get(), cg); } Expression tot_neglogprob = -sum(log_probs); diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index f6d08ff..baac0b4 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -25,8 +25,7 @@ class NeuralTransitionTagger { // Used for testing. Replaces OOV with UNK. std::vector LogProbTagger( - const Sentence& sentence, const CorpusVocabulary& vocab, - cnn::ComputationGraph *cg, + const Sentence& sentence, cnn::ComputationGraph *cg, bool replace_unknowns = true, cnn::expr::Expression* final_parser_state = nullptr); @@ -56,8 +55,7 @@ class NeuralTransitionTagger { virtual TaggerState* InitializeParserState( cnn::ComputationGraph* hg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const std::vector& correct_actions, - const std::vector& action_names) = 0; + const std::vector& correct_actions) = 0; virtual cnn::expr::Expression GetActionProbabilities( const TaggerState& state) = 0; @@ -65,12 +63,10 @@ class NeuralTransitionTagger { virtual bool ShouldTerminate(const TaggerState& state) const = 0; virtual bool IsActionForbidden(const unsigned action, - const std::vector& action_names, const TaggerState& state) const = 0; - virtual void DoAction(unsigned action, - const std::vector& action_names, - TaggerState* state, cnn::ComputationGraph* cg) = 0; + virtual void DoAction(unsigned action, TaggerState* state, + cnn::ComputationGraph* cg) = 0; virtual void DoSave(eos::portable_oarchive& archive) = 0; @@ -89,12 +85,9 @@ class NeuralTransitionTagger { const Sentence& sentence, // raw sentence const Sentence::SentenceMap& sent, // sentence with OOVs replaced const std::vector& correct_actions, - const std::vector& action_names, - const std::vector& int_to_words, double* correct, - cnn::expr::Expression* final_parser_state = nullptr); + double* correct, cnn::expr::Expression* final_parser_state = nullptr); - Sentence::SentenceMap ReplaceUnknowns(const Sentence& sentence, - const CorpusVocabulary& vocab); + Sentence::SentenceMap ReplaceUnknowns(const Sentence& sentence); }; } /* namespace lstm_parser */ From 33f51821cd8b7a9550a217a7459a669150128c5f Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 16 Mar 2017 14:22:23 -0400 Subject: [PATCH 50/88] More constness --- parser/lstm-parser.cc | 2 +- parser/lstm-parser.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index c5232f4..1754d8d 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -171,7 +171,7 @@ bool LSTMParser::IsActionForbidden(const unsigned action, ParseTree LSTMParser::RecoverParseTree( const Sentence& sentence, const vector& actions, double logprob, - bool labeled) { + bool labeled) const { ParseTree tree(sentence, labeled); vector bufferi(sentence.Size() + 1); bufferi[0] = -999; diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 867de4b..ad4376f 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -188,7 +188,7 @@ class LSTMParser : public NeuralTransitionTagger { // take a vector of actions and return a parse tree ParseTree RecoverParseTree( const Sentence& sentence, const std::vector& actions, - double logprob = 0, bool labeled = false); + double logprob = 0, bool labeled = false) const; void Train(const ParserTrainingCorpus& corpus, const ParserTrainingCorpus& dev_corpus, const double unk_prob, From 81f9ef798a575fe5eba91ae782b7b8ece41c4ff4 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Fri, 17 Mar 2017 00:40:02 -0400 Subject: [PATCH 51/88] Added copy/move constructors to ParseTree --- parser/lstm-parser.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index ad4376f..ba54586 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -78,7 +78,17 @@ class ParseTree { arc_labels(labeled ? new std::map : nullptr), sentence(sentence) {} - inline void SetParent(unsigned child_index, unsigned parent_index, + ParseTree(const ParseTree& other) + : logprob(other.logprob), parents(other.parents), + arc_labels(other.IsLabeled() ? + new std::map(*other.arc_labels) : nullptr), + sentence(other.sentence) {} + + ParseTree(ParseTree&& other) = default; + + ParseTree& operator=(ParseTree&& other) = default; + + void SetParent(unsigned child_index, unsigned parent_index, const std::string& arc_label="") { parents[child_index] = parent_index; if (IsLabeled()) { @@ -90,7 +100,7 @@ class ParseTree { return sentence.get(); } - const inline unsigned GetParent(unsigned child) const { + const unsigned GetParent(unsigned child) const { auto parent_iter = parents.find(child); if (parent_iter == parents.end()) { return Corpus::ROOT_TOKEN_ID; // This is the best guess we've got. @@ -99,7 +109,7 @@ class ParseTree { } } - const inline std::string& GetArcLabel(unsigned child) const { + const std::string& GetArcLabel(unsigned child) const { if (!IsLabeled()) return NO_LABEL; auto arc_label_iter = arc_labels->find(child); @@ -112,7 +122,7 @@ class ParseTree { bool IsLabeled() const { return arc_labels.get(); } -private: +protected: std::map parents; std::unique_ptr> arc_labels; std::reference_wrapper sentence; From b360d62edc223f1a53e171fa1fcd2f044877b938 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Fri, 17 Mar 2017 19:19:05 -0400 Subject: [PATCH 52/88] Const-ness change --- parser/lstm-parser.cc | 2 +- parser/lstm-parser.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 1754d8d..7c9a202 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -23,7 +23,7 @@ using namespace std; namespace lstm_parser { -string ParseTree::NO_LABEL = "ERROR"; +const string ParseTree::NO_LABEL("ERROR"); void LSTMParser::LoadPretrainedWords(const string& words_path) { diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index ba54586..7374df8 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -70,7 +70,8 @@ struct ParserOptions { // Barebones representation of a parse tree. class ParseTree { public: - static std::string NO_LABEL; + static const std::string NO_LABEL; + double logprob; ParseTree(const Sentence& sentence, bool labeled = true) : From 3e8a27503f1f3249e73a663c7ceaf0f456a0c19c Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Fri, 17 Mar 2017 21:03:08 -0400 Subject: [PATCH 53/88] Added root child to ParseTree representation --- parser/lstm-parser.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 7374df8..fd8eac5 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -77,13 +77,13 @@ class ParseTree { ParseTree(const Sentence& sentence, bool labeled = true) : logprob(0), arc_labels(labeled ? new std::map : nullptr), - sentence(sentence) {} + sentence(sentence), root_child(-1) {} ParseTree(const ParseTree& other) : logprob(other.logprob), parents(other.parents), arc_labels(other.IsLabeled() ? new std::map(*other.arc_labels) : nullptr), - sentence(other.sentence) {} + sentence(other.sentence), root_child(-1) {} ParseTree(ParseTree&& other) = default; @@ -95,6 +95,9 @@ class ParseTree { if (IsLabeled()) { (*arc_labels)[child_index] = arc_label; } + if (parent_index == Corpus::ROOT_TOKEN_ID) { + root_child = child_index; + } } const Sentence& GetSentence() const { @@ -121,12 +124,15 @@ class ParseTree { } } + const unsigned GetRootChild() const { return root_child; } + bool IsLabeled() const { return arc_labels.get(); } protected: std::map parents; std::unique_ptr> arc_labels; std::reference_wrapper sentence; + unsigned root_child; }; From 7599db133a6ec4af40ec6ab5426bab7fcf83509e Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Fri, 17 Mar 2017 22:58:36 -0400 Subject: [PATCH 54/88] Relinquish CNN memory when replacing a model --- parser/corpus.cc | 2 +- parser/lstm-parser.cc | 54 ++++++++++++++--------------- parser/lstm-parser.h | 12 +++---- parser/neural-transition-tagger.cpp | 2 ++ parser/neural-transition-tagger.h | 6 ++-- 5 files changed, 39 insertions(+), 37 deletions(-) diff --git a/parser/corpus.cc b/parser/corpus.cc index 6636824..d9291c9 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -204,7 +204,7 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordSentence( void ParserTrainingCorpus::OracleParseTransitionsReader::LoadCorrectActions( const string& file, ParserTrainingCorpus* corpus) const { - cerr << "Loading " << (is_training ? "training" : "dev") + cerr << "Loading " << (is_training ? "training" : "dev/test") << " corpus from " << file << "..." << endl; ifstream actions_file(file); if (!actions_file) { diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 7c9a202..f545f44 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -74,37 +74,37 @@ void LSTMParser::InitializeNetworkParameters() { if (!pretrained.empty()) { unsigned pretrained_dim = pretrained.begin()->second.size(); - p_t = model.add_lookup_parameters(vocab_size, {pretrained_dim}); + p_t = model->add_lookup_parameters(vocab_size, {pretrained_dim}); for (const auto& it : pretrained) p_t->Initialize(it.first, it.second); - p_t2l = model.add_parameters({options.lstm_input_dim, pretrained_dim}); + p_t2l = model->add_parameters({options.lstm_input_dim, pretrained_dim}); } else { p_t = nullptr; p_t2l = nullptr; } - p_w = model.add_lookup_parameters(vocab_size, {options.input_dim}); - p_a = model.add_lookup_parameters(action_size, {options.action_dim}); - p_r = model.add_lookup_parameters(action_size, {options.rel_dim}); - p_pbias = model.add_parameters({options.hidden_dim}); - p_A = model.add_parameters({options.hidden_dim, options.hidden_dim}); - p_B = model.add_parameters({options.hidden_dim, options.hidden_dim}); - p_S = model.add_parameters({options.hidden_dim, options.hidden_dim}); - p_H = model.add_parameters({options.lstm_input_dim, options.lstm_input_dim}); - p_D = model.add_parameters({options.lstm_input_dim, options.lstm_input_dim}); - p_R = model.add_parameters({options.lstm_input_dim, options.rel_dim}); - p_w2l = model.add_parameters({options.lstm_input_dim, options.input_dim}); - p_ib = model.add_parameters({options.lstm_input_dim}); - p_cbias = model.add_parameters({options.lstm_input_dim}); - p_p2a = model.add_parameters({action_size, options.hidden_dim}); - p_action_start = model.add_parameters({options.action_dim}); - p_abias = model.add_parameters({action_size}); - p_buffer_guard = model.add_parameters({options.lstm_input_dim}); - p_stack_guard = model.add_parameters({options.lstm_input_dim}); + p_w = model->add_lookup_parameters(vocab_size, {options.input_dim}); + p_a = model->add_lookup_parameters(action_size, {options.action_dim}); + p_r = model->add_lookup_parameters(action_size, {options.rel_dim}); + p_pbias = model->add_parameters({options.hidden_dim}); + p_A = model->add_parameters({options.hidden_dim, options.hidden_dim}); + p_B = model->add_parameters({options.hidden_dim, options.hidden_dim}); + p_S = model->add_parameters({options.hidden_dim, options.hidden_dim}); + p_H = model->add_parameters({options.lstm_input_dim, options.lstm_input_dim}); + p_D = model->add_parameters({options.lstm_input_dim, options.lstm_input_dim}); + p_R = model->add_parameters({options.lstm_input_dim, options.rel_dim}); + p_w2l = model->add_parameters({options.lstm_input_dim, options.input_dim}); + p_ib = model->add_parameters({options.lstm_input_dim}); + p_cbias = model->add_parameters({options.lstm_input_dim}); + p_p2a = model->add_parameters({action_size, options.hidden_dim}); + p_action_start = model->add_parameters({options.action_dim}); + p_abias = model->add_parameters({action_size}); + p_buffer_guard = model->add_parameters({options.lstm_input_dim}); + p_stack_guard = model->add_parameters({options.lstm_input_dim}); if (options.use_pos) { - p_p = model.add_lookup_parameters(pos_size, {options.pos_dim}); - p_p2l = model.add_parameters({options.lstm_input_dim, options.pos_dim}); + p_p = model->add_lookup_parameters(pos_size, {options.pos_dim}); + p_p2l = model->add_parameters({options.lstm_input_dim, options.pos_dim}); } else { p_p = nullptr; p_p2l = nullptr; @@ -117,11 +117,11 @@ LSTMParser::LSTMParser(const ParserOptions& poptions, options(poptions), kROOT_SYMBOL(vocab.GetOrAddWord(vocab.ROOT)), stack_lstm(options.layers, options.lstm_input_dim, options.hidden_dim, - &model), + model.get()), buffer_lstm(options.layers, options.lstm_input_dim, options.hidden_dim, - &model), + model.get()), action_lstm(options.layers, options.action_dim, options.hidden_dim, - &model) { + model.get()) { // First load words if needed before creating network parameters. // That will ensure that the vocab has the final number of words. if (!pretrained_words_path.empty()) { @@ -373,8 +373,8 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, bool softlink_created = false; int best_correct_heads = 0; unsigned status_every_i_iterations = 100; - SimpleSGDTrainer sgd(&model); - //MomentumSGDTrainer sgd(model); + SimpleSGDTrainer sgd(model.get()); + //MomentumSGDTrainer sgd(model.get()); sgd.eta_decay = 0.08; //sgd.eta_decay = 0.05; unsigned num_sentences = corpus.sentences.size(); diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index fd8eac5..84e7285 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -299,7 +299,7 @@ class LSTMParser : public NeuralTransitionTagger { ar & options; ar & vocab; ar & pretrained; - ar & model; + ar & *model; } template @@ -312,19 +312,19 @@ class LSTMParser : public NeuralTransitionTagger { ar & pretrained; // Don't finalize yet...we want to finalize once our model is initialized. - model = cnn::Model(); + model.reset(new cnn::Model); // Reset the LSTMs *before* reading in the network model, to make sure the // model knows how big it's supposed to be. stack_lstm = cnn::LSTMBuilder(options.layers, options.lstm_input_dim, - options.hidden_dim, &model); + options.hidden_dim, model.get()); buffer_lstm = cnn::LSTMBuilder(options.layers, options.lstm_input_dim, - options.hidden_dim, &model); + options.hidden_dim, model.get()); action_lstm = cnn::LSTMBuilder(options.layers, options.action_dim, - options.hidden_dim, &model); + options.hidden_dim, model.get()); FinalizeVocab(); // OK, now finalize. :) - ar & model; + ar & *model; } BOOST_SERIALIZATION_SPLIT_MEMBER(); diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 7a265ca..b886c3b 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -38,6 +38,8 @@ void NeuralTransitionTagger::SaveModel(const string& model_fname, void NeuralTransitionTagger::FinalizeVocab() { if (finalized) return; + if (!model.get()) + model.reset(new Model); InitializeNetworkParameters(); // Give up memory we don't need. vocab.action_names.shrink_to_fit(); diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index baac0b4..fd20ffa 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -17,8 +17,7 @@ namespace lstm_parser { class NeuralTransitionTagger { public: - - NeuralTransitionTagger() : finalized(false) {} + NeuralTransitionTagger() : finalized(false), model(new cnn::Model) {} virtual ~NeuralTransitionTagger() {} void FinalizeVocab(); @@ -43,7 +42,8 @@ class NeuralTransitionTagger { bool finalized; std::map param_expressions; - cnn::Model model; + // Store the model as a smart ptr so we can call its destructor when needed. + std::unique_ptr model; CorpusVocabulary vocab; inline cnn::expr::Expression GetParamExpr(cnn::Parameters* params) { From 415f0844d14ceb75214bc6b2e8297dd96d40c1f7 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sat, 18 Mar 2017 16:54:17 -0400 Subject: [PATCH 55/88] Fixed memory leak --- parser/lstm-parser.h | 6 +++--- parser/neural-transition-tagger.h | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 84e7285..684f0b0 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -231,9 +231,9 @@ class LSTMParser : public NeuralTransitionTagger { ParserState(const Sentence& raw_sentence, const Sentence::SentenceMap& sentence, Expression stack_guard) - : TaggerState {raw_sentence, sentence}, buffer(raw_sentence.Size() + 1), - bufferi(raw_sentence.Size() + 1), stack( {stack_guard}), - stacki( {-999}) {} + : TaggerState(raw_sentence, sentence), buffer(raw_sentence.Size() + 1), + bufferi(raw_sentence.Size() + 1), stack({stack_guard}), + stacki({-999}) {} ~ParserState() { assert(stack.size() == 2); // guard symbol, root diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index fd20ffa..0ef5514 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -35,8 +35,12 @@ class NeuralTransitionTagger { protected: struct TaggerState { + TaggerState(const Sentence& raw_sentence, + const Sentence::SentenceMap& sentence) + : raw_sentence(raw_sentence), sentence(sentence) {} const Sentence& raw_sentence; const Sentence::SentenceMap& sentence; + virtual ~TaggerState() {} }; bool finalized; From 0be98ffbaebe1d0d0100116d23ffadf530742244 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sat, 18 Mar 2017 17:30:29 -0400 Subject: [PATCH 56/88] Patched memory leak --- cnn/cnn/tensor.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cnn/cnn/tensor.h b/cnn/cnn/tensor.h index 0516fe7..599b97a 100644 --- a/cnn/cnn/tensor.h +++ b/cnn/cnn/tensor.h @@ -7,6 +7,7 @@ #include "cnn/dim.h" #include "cnn/random.h" #include "cnn/aligned-mem-pool.h" +#include "devices.h" #if HAVE_CUDA #include @@ -26,6 +27,7 @@ namespace cnn { #define EIGEN_BACKEND 1 typedef float real; +extern Device* default_device; // for allocating memory on a load struct Tensor { Tensor() = default; @@ -160,8 +162,12 @@ struct Tensor { float* vc = static_cast(std::malloc(d.size() * sizeof(float))); ar & boost::serialization::make_array(vc, d.size()); CUDA_CHECK(cudaMemcpyAsync(v, vc, d.size() * sizeof(float), cudaMemcpyHostToDevice)); + free(vc); #else - v = static_cast(_mm_malloc(d.size() * sizeof(float), 32)); + // UGLY HACK to avoid memory leak: node values and gradients don't get + // stored to disk; only parameters. So allocate memory for loading from the + // parameters pool. + v = static_cast(default_device->ps->allocate(d.size() * sizeof(float))); ar & boost::serialization::make_array(v, d.size()); #endif } From e25679a940c00c6b13ff06c295c0e9eb9c81a5e7 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sat, 18 Mar 2017 19:35:41 -0400 Subject: [PATCH 57/88] Fixed another memory leak --- cnn/cnn/model.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cnn/cnn/model.cc b/cnn/cnn/model.cc index 4bd35d4..7179e4d 100644 --- a/cnn/cnn/model.cc +++ b/cnn/cnn/model.cc @@ -160,6 +160,7 @@ void LookupParameters::clear() { Model::~Model() { for (auto p : all_params) delete p; + default_device->mem->free(gradient_norm_scratch); } void Model::project_weights(float radius) { From 5d2b8145e2670ff85551db04362f4c5e819064ce Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sat, 18 Mar 2017 21:29:49 -0400 Subject: [PATCH 58/88] Added WordForToken convenience function to Sentence --- parser/corpus.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/parser/corpus.h b/parser/corpus.h index ed239a3..5eac736 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -213,6 +213,12 @@ class Sentence { oss << *this; return oss.str(); } + + const std::string& WordForToken(unsigned token_id) const { + unsigned word_id = words.at(token_id); + return word_id == vocab.kUNK ? unk_surface_forms.at(token_id) + : vocab.int_to_words[word_id]; + } }; inline std::ostream& operator<<(std::ostream& os, const Sentence&sentence) { From 9f74cb09c303989a8fe7ef3bcafedff95b254e84 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sat, 18 Mar 2017 22:15:26 -0400 Subject: [PATCH 59/88] Made CNN initialization return the random seed --- cnn/cnn/init.cc | 4 +++- cnn/cnn/init.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cnn/cnn/init.cc b/cnn/cnn/init.cc index 4e0a1a3..915a246 100644 --- a/cnn/cnn/init.cc +++ b/cnn/cnn/init.cc @@ -30,7 +30,7 @@ static void RemoveArgs(int& argc, char**& argv, int& argi, int n) { assert(argc >= 0); } -void Initialize(int& argc, char**& argv, unsigned random_seed, bool shared_parameters) { +unsigned Initialize(int& argc, char**& argv, unsigned random_seed, bool shared_parameters) { vector gpudevices; #if HAVE_CUDA cerr << "[cnn] initializing CUDA\n"; @@ -88,6 +88,8 @@ void Initialize(int& argc, char**& argv, unsigned random_seed, bool shared_param kSCALAR_ONE = default_device->kSCALAR_ONE; kSCALAR_ZERO = default_device->kSCALAR_ZERO; cerr << "[cnn] memory allocation done.\n"; + + return random_seed; } void Cleanup() { diff --git a/cnn/cnn/init.h b/cnn/cnn/init.h index e9e8fef..80a4b28 100644 --- a/cnn/cnn/init.h +++ b/cnn/cnn/init.h @@ -3,7 +3,7 @@ namespace cnn { -void Initialize(int& argc, char**& argv, unsigned random_seed = 0, bool shared_parameters = false); +unsigned Initialize(int& argc, char**& argv, unsigned random_seed = 0, bool shared_parameters = false); void Cleanup(); } // namespace cnn From e27f33027adaf74c17ee536283601304a66cf679 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sat, 18 Mar 2017 22:59:53 -0400 Subject: [PATCH 60/88] A bit of cleanup --- parser/corpus.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/parser/corpus.h b/parser/corpus.h index 5eac736..c5a9d0a 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -208,12 +207,6 @@ class Sentence { return words.size(); } - std::string AsString() const { - std::ostringstream oss; - oss << *this; - return oss.str(); - } - const std::string& WordForToken(unsigned token_id) const { unsigned word_id = words.at(token_id); return word_id == vocab.kUNK ? unk_surface_forms.at(token_id) @@ -221,7 +214,7 @@ class Sentence { } }; -inline std::ostream& operator<<(std::ostream& os, const Sentence&sentence) { +inline std::ostream& operator<<(std::ostream& os, const Sentence& sentence) { for (auto &index_and_word_id : sentence.words) { unsigned index = index_and_word_id.first; unsigned word_id = index_and_word_id.second; From 5a2256753dcf2d52a223a5689da14b5beec004ce Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Mon, 20 Mar 2017 14:06:00 -0400 Subject: [PATCH 61/88] Minor memory management improvement --- cnn/cnn/model.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cnn/cnn/model.h b/cnn/cnn/model.h index 2e76194..ddb6258 100644 --- a/cnn/cnn/model.h +++ b/cnn/cnn/model.h @@ -103,6 +103,15 @@ struct LookupParameters : public ParametersBase { class Model { public: Model() : gradient_norm_scratch() {} + Model(const Model&) = delete; + Model(Model&& m) { + all_params = std::move(m.all_params); + lookup_params = std::move(m.lookup_params); + params = std::move(m.params); + // Free our scratch memory before claiming the other model's. + default_device->mem->free(gradient_norm_scratch); + gradient_norm_scratch = m.gradient_norm_scratch; + } ~Model(); float gradient_l2_norm() const; void reset_gradient(); From 478e3ffe4a0212ae51857af2878d5d0b3aac48c5 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Mon, 20 Mar 2017 23:01:21 -0400 Subject: [PATCH 62/88] Possible minor memory management improvement --- cnn/cnn/exec.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cnn/cnn/exec.cc b/cnn/cnn/exec.cc index bc8b799..4005ad4 100644 --- a/cnn/cnn/exec.cc +++ b/cnn/cnn/exec.cc @@ -10,6 +10,7 @@ ExecutionEngine::~ExecutionEngine() {} void SimpleExecutionEngine::invalidate() { num_nodes_evaluated = 0; + fxs->free(); } const Tensor& SimpleExecutionEngine::forward() { From 9c2be25a2369c49117ac7eb16f55657dfe1ced98 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Mon, 20 Mar 2017 23:05:32 -0400 Subject: [PATCH 63/88] Made main LogProbTagger public --- parser/lstm-parser.cc | 8 ++++---- parser/neural-transition-tagger.h | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index f545f44..a76aa03 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -420,15 +420,15 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, } } const vector& actions = corpus.correct_act_sent[order[si]]; - ComputationGraph hg; - LogProbTagger(&hg, sentence, tsentence, actions, &correct); - double lp = as_scalar(hg.incremental_forward()); + ComputationGraph cg; + LogProbTagger(&cg, sentence, tsentence, actions, &correct); + double lp = as_scalar(cg.incremental_forward()); if (lp < 0) { cerr << "Log prob < 0 on sentence " << order[si] << ": lp=" << lp << endl; assert(lp >= 0.0); } - hg.backward(); + cg.backward(); sgd.update(1.0); llh += lp; ++si; diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index 0ef5514..e927edf 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -28,6 +28,19 @@ class NeuralTransitionTagger { bool replace_unknowns = true, cnn::expr::Expression* final_parser_state = nullptr); + // *** if correct_actions is empty, this runs greedy decoding *** + // returns actions for input sentence (in training just returns the reference) + // OOV handling: raw_sent will have the actual words + // sent will have words replaced by appropriate UNK tokens + // this lets us use pretrained embeddings, when available, for words that were + // OOV in the training data. + std::vector LogProbTagger( + cnn::ComputationGraph* cg, + const Sentence& sentence, // raw sentence + const Sentence::SentenceMap& sent, // sentence with OOVs replaced + const std::vector& correct_actions, + double* correct, cnn::expr::Expression* final_parser_state = nullptr); + const CorpusVocabulary& GetVocab() const { return vocab; } // TODO: arrange things such that we don't need to expose this? @@ -78,19 +91,6 @@ class NeuralTransitionTagger { void SaveModel(const std::string& model_fname, bool softlink_created); - // *** if correct_actions is empty, this runs greedy decoding *** - // returns actions for input sentence (in training just returns the reference) - // OOV handling: raw_sent will have the actual words - // sent will have words replaced by appropriate UNK tokens - // this lets us use pretrained embeddings, when available, for words that were - // OOV in the training data. - std::vector LogProbTagger( - cnn::ComputationGraph* hg, - const Sentence& sentence, // raw sentence - const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const std::vector& correct_actions, - double* correct, cnn::expr::Expression* final_parser_state = nullptr); - Sentence::SentenceMap ReplaceUnknowns(const Sentence& sentence); }; From 4e470b9c29bcd4b3ef810c0b7ac348d5c911522f Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Mon, 20 Mar 2017 23:09:44 -0400 Subject: [PATCH 64/88] Minor parameter order change --- parser/lstm-parser.cc | 2 +- parser/neural-transition-tagger.cpp | 2 +- parser/neural-transition-tagger.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index a76aa03..97ba3a6 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -488,7 +488,7 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, ParseTree LSTMParser::Parse(const Sentence& sentence, const CorpusVocabulary& vocab, bool labeled) { ComputationGraph cg; - vector pred = LogProbTagger(sentence, &cg); + vector pred = LogProbTagger(&cg, sentence); double lp = as_scalar(cg.incremental_forward()); return RecoverParseTree(sentence, pred, labeled, lp); } diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index b886c3b..3409ede 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -65,7 +65,7 @@ Sentence::SentenceMap NeuralTransitionTagger::ReplaceUnknowns( } vector NeuralTransitionTagger::LogProbTagger( - const Sentence& sentence, ComputationGraph *cg, bool replace_unknowns, + ComputationGraph *cg, const Sentence& sentence, bool replace_unknowns, Expression* final_parser_state) { return LogProbTagger( cg, sentence, diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index e927edf..b3781da 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -24,7 +24,7 @@ class NeuralTransitionTagger { // Used for testing. Replaces OOV with UNK. std::vector LogProbTagger( - const Sentence& sentence, cnn::ComputationGraph *cg, + cnn::ComputationGraph *cg, const Sentence& sentence, bool replace_unknowns = true, cnn::expr::Expression* final_parser_state = nullptr); From 24d9e5cc4670d0c9d0739884e3c4577a9f4628b3 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Mon, 20 Mar 2017 23:24:16 -0400 Subject: [PATCH 65/88] Sensible default params for LogProbTagger --- parser/neural-transition-tagger.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index b3781da..fd2f4a4 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -38,8 +38,9 @@ class NeuralTransitionTagger { cnn::ComputationGraph* cg, const Sentence& sentence, // raw sentence const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const std::vector& correct_actions, - double* correct, cnn::expr::Expression* final_parser_state = nullptr); + const std::vector& correct_actions = std::vector(), + double* correct = nullptr, + cnn::expr::Expression* final_parser_state = nullptr); const CorpusVocabulary& GetVocab() const { return vocab; } From 88e1af0b3d202949104cdf6da387f8d522e4fdff Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Mon, 20 Mar 2017 23:36:10 -0400 Subject: [PATCH 66/88] Don't use reference actions in test, even if they're specified --- parser/lstm-parser.cc | 2 +- parser/neural-transition-tagger.cpp | 15 +++------------ parser/neural-transition-tagger.h | 8 +++++++- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 97ba3a6..095ac04 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -421,7 +421,7 @@ void LSTMParser::Train(const ParserTrainingCorpus& corpus, } const vector& actions = corpus.correct_act_sent[order[si]]; ComputationGraph cg; - LogProbTagger(&cg, sentence, tsentence, actions, &correct); + LogProbTagger(&cg, sentence, tsentence, true, actions, &correct); double lp = as_scalar(cg.incremental_forward()); if (lp < 0) { cerr << "Log prob < 0 on sentence " << order[si] << ": lp=" << lp diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 3409ede..487c078 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -64,25 +64,16 @@ Sentence::SentenceMap NeuralTransitionTagger::ReplaceUnknowns( return tsentence; } -vector NeuralTransitionTagger::LogProbTagger( - ComputationGraph *cg, const Sentence& sentence, bool replace_unknowns, - Expression* final_parser_state) { - return LogProbTagger( - cg, sentence, - replace_unknowns ? ReplaceUnknowns(sentence) : sentence.words, - vector(), nullptr, final_parser_state); -} - vector NeuralTransitionTagger::LogProbTagger( ComputationGraph* cg, const Sentence& raw_sent, // raw sentence const Sentence::SentenceMap& sent, // sentence with OOVs replaced + bool training, const vector& correct_actions, double* correct, Expression* final_parser_state) { assert(finalized); vector results; - const bool build_training_graph = correct_actions.size() > 0; // variables in the computation graph representing the parameters for (Parameters *params : GetParameters()) { @@ -117,8 +108,8 @@ vector NeuralTransitionTagger::LogProbTagger( } } unsigned action = best_a; - // If we have reference actions (for training), use the reference action. - if (build_training_graph) { + // If we're training, use the reference action. + if (training) { assert(action_count < correct_actions.size()); action = correct_actions[action_count]; if (correct && best_a == action) { diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index fd2f4a4..3e91d03 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -26,7 +26,12 @@ class NeuralTransitionTagger { std::vector LogProbTagger( cnn::ComputationGraph *cg, const Sentence& sentence, bool replace_unknowns = true, - cnn::expr::Expression* final_parser_state = nullptr); + cnn::expr::Expression* final_parser_state = nullptr) { + return LogProbTagger( + cg, sentence, + replace_unknowns ? ReplaceUnknowns(sentence) : sentence.words, + false, std::vector(), nullptr, final_parser_state); + } // *** if correct_actions is empty, this runs greedy decoding *** // returns actions for input sentence (in training just returns the reference) @@ -38,6 +43,7 @@ class NeuralTransitionTagger { cnn::ComputationGraph* cg, const Sentence& sentence, // raw sentence const Sentence::SentenceMap& sent, // sentence with OOVs replaced + bool training = false, const std::vector& correct_actions = std::vector(), double* correct = nullptr, cnn::expr::Expression* final_parser_state = nullptr); From c97a0f4b7287d58efabae580a0f10775ab2386d7 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 21 Mar 2017 00:07:26 -0400 Subject: [PATCH 67/88] *Do* still update the correct count if applicable, even in dev --- parser/neural-transition-tagger.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 487c078..e75fea9 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -72,6 +72,8 @@ vector NeuralTransitionTagger::LogProbTagger( bool training, const vector& correct_actions, double* correct, Expression* final_parser_state) { + if (training) + assert(!correct_actions.empty()); assert(finalized); vector results; @@ -108,13 +110,16 @@ vector NeuralTransitionTagger::LogProbTagger( } } unsigned action = best_a; - // If we're training, use the reference action. - if (training) { + + if (!correct_actions.empty()) { assert(action_count < correct_actions.size()); - action = correct_actions[action_count]; - if (correct && best_a == action) { + unsigned correct_action = correct_actions[action_count]; + if (correct && best_a == correct_action) { (*correct)++; } + // If we're training, use the reference action. + if (training) + action = correct_action; } ++action_count; log_probs.push_back(pick(adiste, action)); From 0e065e035c87ac9ab0afe76f21a3d6fb6f5e70c8 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 21 Mar 2017 14:07:29 -0400 Subject: [PATCH 68/88] Can now shrink a memory pool back down w/o clearing entirely --- cnn/cnn/aligned-mem-pool.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cnn/cnn/aligned-mem-pool.h b/cnn/cnn/aligned-mem-pool.h index 9a087b0..fa616f0 100644 --- a/cnn/cnn/aligned-mem-pool.h +++ b/cnn/cnn/aligned-mem-pool.h @@ -8,6 +8,8 @@ namespace cnn { class AlignedMemoryPool { public: + typedef size_t PoolState; + explicit AlignedMemoryPool(size_t cap, MemAllocator* a) : a(a) { sys_alloc(cap); zero_all(); @@ -36,6 +38,14 @@ class AlignedMemoryPool { bool is_shared() { return shared; } + + PoolState get_state() const { + return used; + } + + void restore_state(const PoolState& state) { + used = state; + } private: void sys_alloc(size_t cap) { capacity = a->round_up_align(cap); From 3662e8ee504ed5e87ad0042d9a015416933390ea Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 19 Apr 2017 23:54:56 -0400 Subject: [PATCH 69/88] Attempted to get GPU compilation working --- CMakeLists.txt | 6 +++--- cnn/CMakeLists.txt | 22 ++++++++++++++-------- cnn/cnn/CMakeLists.txt | 14 ++++++++++---- parser/CMakeLists.txt | 19 +++++++++++++++---- 4 files changed, 42 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a4e62ba..0ca9f46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ if(NOT CMAKE_BUILD_TYPE) endif(NOT CMAKE_BUILD_TYPE) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=c++14") enable_testing() @@ -30,6 +30,6 @@ include_directories(${EIGEN3_INCLUDE_DIR}) #configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h) -add_subdirectory(cnn/cnn) +add_subdirectory(cnn) # add_subdirectory(cnn/examples) -add_subdirectory(parser) +add_subdirectory(parser) \ No newline at end of file diff --git a/cnn/CMakeLists.txt b/cnn/CMakeLists.txt index 58173ae..17fc1ec 100644 --- a/cnn/CMakeLists.txt +++ b/cnn/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8 FATAL_ERROR) if(NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_BUILD_TYPE RelWithDebInfo) -endif(NOT CMAKE_BUILD_TYPE) +endif(NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) @@ -14,7 +14,7 @@ set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # 3. try compiler options like -march=native or other architecture # flags (the compiler does not always make the best configuration # decisions without help) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -funroll-loops -Wall -std=c++11 -Ofast -g -DEIGEN_FAST_MATH -march=native") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -funroll-loops -Wall -std=c++14 -Ofast -g -DEIGEN_FAST_MATH -march=native") enable_testing() @@ -67,9 +67,11 @@ else() endif() if(BACKEND MATCHES "^eigen$") - set(WITH_EIGEN_BACKEND 1) + set(WITH_CUDA_BACKEND 0 CACHE INTERNAL "" FORCE) + set(WITH_EIGEN_BACKEND 1 CACHE INTERNAL "" FORCE) elseif(BACKEND MATCHES "^cuda$") - set(WITH_CUDA_BACKEND 1) + set(WITH_CUDA_BACKEND 1 CACHE INTERNAL "" FORCE) + set(WITH_EIGEN_BACKEND 0 CACHE INTERNAL "" FORCE) else() message(SEND_ERROR "BACKEND must be eigen or cuda") endif() @@ -97,8 +99,12 @@ set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h) include_directories(${CMAKE_CURRENT_BINARY_DIR}) +option(CNN_CORE_ONLY "If off, won't build extra dirs like tests and examples" ON) + add_subdirectory(cnn) -add_subdirectory(tests) -add_subdirectory(examples) -add_subdirectory(rnnlm) -enable_testing() +if(NOT CNN_CORE_ONLY) + add_subdirectory(tests) + add_subdirectory(examples) + add_subdirectory(rnnlm) + enable_testing() +endif(NOT CNN_CORE_ONLY) \ No newline at end of file diff --git a/cnn/cnn/CMakeLists.txt b/cnn/cnn/CMakeLists.txt index bfa85d0..6f66321 100644 --- a/cnn/cnn/CMakeLists.txt +++ b/cnn/cnn/CMakeLists.txt @@ -69,6 +69,8 @@ set(cnn_library_HDRS training.h ) +option(CNN_SHARED "Whether to build CNN shared libs" OFF) + if(WITH_CUDA_BACKEND) list(APPEND cnn_library_SRCS cuda.cc) @@ -99,20 +101,24 @@ file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} tests/*.cc) # actual target: add_library(cnn STATIC ${cnn_library_SRCS} ${cnn_library_HDRS}) target_link_libraries(cnn ${LIBS}) -if(WITH_CUDA_BACKEND) +if(CNN_SHARED) + if(WITH_CUDA_BACKEND) add_library(gcnn_shared SHARED ${cnn_library_SRCS} ${cnn_library_HDRS}) target_link_libraries(gcnn_shared ${LIBS}) -else() + else() add_library(cnn_shared SHARED ${cnn_library_SRCS} ${cnn_library_HDRS}) target_link_libraries(cnn_shared ${LIBS}) -endif(WITH_CUDA_BACKEND) + endif(WITH_CUDA_BACKEND) +endif(CNN_SHARED) #add_library(cnn ${cnn_library_SRCS} ${cnn_library_HDRS} ${LIBS}) if(WITH_CUDA_BACKEND) set(CUDA_SEPARABLE_COMPILATION ON) list(APPEND CUDA_NVCC_FLAGS "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_52,code=sm_52;-gencode;arch=compute_52,code=compute_52;-std=c++11;-O2;-DVERBOSE;-Xcompiler;-fpic") SET(CUDA_PROPAGATE_HOST_FLAGS OFF) cuda_add_library(cnncuda STATIC gpu-ops.cu) - cuda_add_library(cnncuda_shared SHARED gpu-ops.cu) + if(CNN_SHARED) + cuda_add_library(cnncuda_shared SHARED gpu-ops.cu) + endif(CNN_SHARED) endif(WITH_CUDA_BACKEND) install(FILES ${cnn_library_HDRS} DESTINATION include/cnn) diff --git a/parser/CMakeLists.txt b/parser/CMakeLists.txt index 0077cab..80fee68 100644 --- a/parser/CMakeLists.txt +++ b/parser/CMakeLists.txt @@ -1,8 +1,19 @@ PROJECT(lstm-parser:parser) CMAKE_MINIMUM_REQUIRED(VERSION 2.8) -ADD_LIBRARY(lstm-parser-core lstm-parser.cc corpus.cc neural-transition-tagger.cpp) -target_link_libraries(lstm-parser-core cnn ${Boost_LIBRARIES}) - +add_library(lstm-parser-core STATIC lstm-parser.cc corpus.cc + neural-transition-tagger.cpp) ADD_EXECUTABLE(lstm-parse lstm-parser-driver.cc) -target_link_libraries(lstm-parse lstm-parser-core ${Boost_LIBRARIES}) + +if(WITH_CUDA_BACKEND) + add_dependencies(lstm-parser-core cnncuda) + target_link_libraries(lstm-parser-core cnncuda) + CUDA_ADD_CUBLAS_TO_TARGET(lstm-parser-core) + + add_dependencies(lstm-parse cnncuda) + target_link_libraries(lstm-parse cnncuda) + CUDA_ADD_CUBLAS_TO_TARGET(lstm-parse) +endif(WITH_CUDA_BACKEND) + +target_link_libraries(lstm-parser-core cnn ${Boost_LIBRARIES}) +target_link_libraries(lstm-parse lstm-parser-core ${Boost_LIBRARIES}) \ No newline at end of file From ae6f7c9777c5f53d13e2788d8ee0a5614eb719d2 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 20 Apr 2017 22:13:24 -0400 Subject: [PATCH 70/88] Unused final parser state -> expose arbitrary network states --- parser/lstm-parser.cc | 15 +++++++++++++-- parser/lstm-parser.h | 8 +++++--- parser/neural-transition-tagger.cpp | 12 +++++------- parser/neural-transition-tagger.h | 14 ++++++++------ 4 files changed, 31 insertions(+), 18 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 095ac04..61f9705 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -236,7 +236,8 @@ Expression LSTMParser::GetActionProbabilities(const TaggerState& state) { void LSTMParser::DoAction(unsigned action, TaggerState* state, - ComputationGraph* cg) { + ComputationGraph* cg, + vector* states_to_expose) { ParserState* real_state = static_cast(state); // add current action to action LSTM Expression action_e = lookup(*cg, p_a, action); @@ -309,6 +310,11 @@ void LSTMParser::DoAction(unsigned action, TaggerState* state, real_state->stack.push_back(nlcomposed); real_state->stacki.push_back(headi); } + + // After the last action, record the final tree state, if requested. + if (states_to_expose && ShouldTerminate(*real_state)) { + (*states_to_expose).back() = real_state->stack.back(); + } } @@ -316,7 +322,8 @@ NeuralTransitionTagger::TaggerState* LSTMParser::InitializeParserState( ComputationGraph* cg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const vector& correct_actions) { + const vector& correct_actions, + vector* states_to_expose) { stack_lstm.new_graph(*cg); buffer_lstm.new_graph(*cg); action_lstm.new_graph(*cg); @@ -362,6 +369,10 @@ NeuralTransitionTagger::TaggerState* LSTMParser::InitializeParserState( for (auto& b : state->buffer) buffer_lstm.add_input(b); + if (states_to_expose) { + states_to_expose->resize(1); + } + return state; } diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 684f0b0..2ff5bba 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -257,7 +257,8 @@ class LSTMParser : public NeuralTransitionTagger { virtual TaggerState* InitializeParserState( cnn::ComputationGraph* cg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const std::vector& correct_actions) override; + const std::vector& correct_actions, + std::vector* states_to_expose) override; virtual void InitializeNetworkParameters() override; @@ -272,8 +273,9 @@ class LSTMParser : public NeuralTransitionTagger { virtual cnn::expr::Expression GetActionProbabilities(const TaggerState& state) override; - virtual void DoAction(unsigned action, TaggerState* state, - cnn::ComputationGraph* cg) override; + virtual void DoAction( + unsigned action, TaggerState* state, cnn::ComputationGraph* cg, + std::vector* states_to_expose) override; inline unsigned ComputeCorrect(const ParseTree& ref, const ParseTree& hyp) const { diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index e75fea9..37b09ba 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -71,7 +71,7 @@ vector NeuralTransitionTagger::LogProbTagger( const Sentence::SentenceMap& sent, // sentence with OOVs replaced bool training, const vector& correct_actions, double* correct, - Expression* final_parser_state) { + vector* states_to_expose) { if (training) assert(!correct_actions.empty()); assert(finalized); @@ -82,8 +82,9 @@ vector NeuralTransitionTagger::LogProbTagger( param_expressions[params] = parameter(*cg, params); } - unique_ptr state(InitializeParserState(cg, raw_sent, sent, - correct_actions)); + unique_ptr state( + InitializeParserState(cg, raw_sent, sent, correct_actions, + states_to_expose)); vector log_probs; unsigned action_count = 0; // incremented at each prediction @@ -125,15 +126,12 @@ vector NeuralTransitionTagger::LogProbTagger( log_probs.push_back(pick(adiste, action)); results.push_back(action); - DoAction(action, state.get(), cg); + DoAction(action, state.get(), cg, states_to_expose); } Expression tot_neglogprob = -sum(log_probs); assert(tot_neglogprob.pg != nullptr); - if (final_parser_state) { - *final_parser_state = p_t; - } param_expressions.clear(); return results; } diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index 3e91d03..4e814db 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -26,11 +26,11 @@ class NeuralTransitionTagger { std::vector LogProbTagger( cnn::ComputationGraph *cg, const Sentence& sentence, bool replace_unknowns = true, - cnn::expr::Expression* final_parser_state = nullptr) { + std::vector* states_to_expose = nullptr) { return LogProbTagger( cg, sentence, replace_unknowns ? ReplaceUnknowns(sentence) : sentence.words, - false, std::vector(), nullptr, final_parser_state); + false, std::vector(), nullptr, states_to_expose); } // *** if correct_actions is empty, this runs greedy decoding *** @@ -46,7 +46,7 @@ class NeuralTransitionTagger { bool training = false, const std::vector& correct_actions = std::vector(), double* correct = nullptr, - cnn::expr::Expression* final_parser_state = nullptr); + std::vector* states_to_expose = nullptr); const CorpusVocabulary& GetVocab() const { return vocab; } @@ -79,7 +79,8 @@ class NeuralTransitionTagger { virtual TaggerState* InitializeParserState( cnn::ComputationGraph* hg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const std::vector& correct_actions) = 0; + const std::vector& correct_actions, + std::vector* states_to_expose) = 0; virtual cnn::expr::Expression GetActionProbabilities( const TaggerState& state) = 0; @@ -89,8 +90,9 @@ class NeuralTransitionTagger { virtual bool IsActionForbidden(const unsigned action, const TaggerState& state) const = 0; - virtual void DoAction(unsigned action, TaggerState* state, - cnn::ComputationGraph* cg) = 0; + virtual void DoAction( + unsigned action, TaggerState* state, cnn::ComputationGraph* cg, + std::vector* states_to_expose) = 0; virtual void DoSave(eos::portable_oarchive& archive) = 0; From 43dd29a7d9aa4062742d73867b4367822f740f0d Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 20 Apr 2017 23:32:28 -0400 Subject: [PATCH 71/88] Switched to map for states to expose --- parser/lstm-parser.cc | 11 +++-------- parser/lstm-parser.h | 5 ++--- parser/neural-transition-tagger.cpp | 5 ++--- parser/neural-transition-tagger.h | 13 +++++++------ 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 61f9705..0f90e75 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -237,7 +237,7 @@ Expression LSTMParser::GetActionProbabilities(const TaggerState& state) { void LSTMParser::DoAction(unsigned action, TaggerState* state, ComputationGraph* cg, - vector* states_to_expose) { + map* states_to_expose) { ParserState* real_state = static_cast(state); // add current action to action LSTM Expression action_e = lookup(*cg, p_a, action); @@ -313,7 +313,7 @@ void LSTMParser::DoAction(unsigned action, TaggerState* state, // After the last action, record the final tree state, if requested. if (states_to_expose && ShouldTerminate(*real_state)) { - (*states_to_expose).back() = real_state->stack.back(); + (*states_to_expose)["Tree"] = real_state->stack.back(); } } @@ -322,8 +322,7 @@ NeuralTransitionTagger::TaggerState* LSTMParser::InitializeParserState( ComputationGraph* cg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const vector& correct_actions, - vector* states_to_expose) { + const vector& correct_actions) { stack_lstm.new_graph(*cg); buffer_lstm.new_graph(*cg); action_lstm.new_graph(*cg); @@ -369,10 +368,6 @@ NeuralTransitionTagger::TaggerState* LSTMParser::InitializeParserState( for (auto& b : state->buffer) buffer_lstm.add_input(b); - if (states_to_expose) { - states_to_expose->resize(1); - } - return state; } diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 2ff5bba..90338e4 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -257,8 +257,7 @@ class LSTMParser : public NeuralTransitionTagger { virtual TaggerState* InitializeParserState( cnn::ComputationGraph* cg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const std::vector& correct_actions, - std::vector* states_to_expose) override; + const std::vector& correct_actions) override; virtual void InitializeNetworkParameters() override; @@ -275,7 +274,7 @@ class LSTMParser : public NeuralTransitionTagger { virtual void DoAction( unsigned action, TaggerState* state, cnn::ComputationGraph* cg, - std::vector* states_to_expose) override; + std::map* states_to_expose) override; inline unsigned ComputeCorrect(const ParseTree& ref, const ParseTree& hyp) const { diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 37b09ba..83336d4 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -71,7 +71,7 @@ vector NeuralTransitionTagger::LogProbTagger( const Sentence::SentenceMap& sent, // sentence with OOVs replaced bool training, const vector& correct_actions, double* correct, - vector* states_to_expose) { + map* states_to_expose) { if (training) assert(!correct_actions.empty()); assert(finalized); @@ -83,8 +83,7 @@ vector NeuralTransitionTagger::LogProbTagger( } unique_ptr state( - InitializeParserState(cg, raw_sent, sent, correct_actions, - states_to_expose)); + InitializeParserState(cg, raw_sent, sent, correct_actions)); vector log_probs; unsigned action_count = 0; // incremented at each prediction diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index 4e814db..553f6c1 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -24,9 +24,11 @@ class NeuralTransitionTagger { // Used for testing. Replaces OOV with UNK. std::vector LogProbTagger( - cnn::ComputationGraph *cg, const Sentence& sentence, + cnn::ComputationGraph *cg, + const Sentence& sentence, bool replace_unknowns = true, - std::vector* states_to_expose = nullptr) { + std::map* states_to_expose = + nullptr) { return LogProbTagger( cg, sentence, replace_unknowns ? ReplaceUnknowns(sentence) : sentence.words, @@ -46,7 +48,7 @@ class NeuralTransitionTagger { bool training = false, const std::vector& correct_actions = std::vector(), double* correct = nullptr, - std::vector* states_to_expose = nullptr); + std::map* states_to_expose = nullptr); const CorpusVocabulary& GetVocab() const { return vocab; } @@ -79,8 +81,7 @@ class NeuralTransitionTagger { virtual TaggerState* InitializeParserState( cnn::ComputationGraph* hg, const Sentence& raw_sent, const Sentence::SentenceMap& sent, // sentence with OOVs replaced - const std::vector& correct_actions, - std::vector* states_to_expose) = 0; + const std::vector& correct_actions) = 0; virtual cnn::expr::Expression GetActionProbabilities( const TaggerState& state) = 0; @@ -92,7 +93,7 @@ class NeuralTransitionTagger { virtual void DoAction( unsigned action, TaggerState* state, cnn::ComputationGraph* cg, - std::vector* states_to_expose) = 0; + std::map* states_to_expose) = 0; virtual void DoSave(eos::portable_oarchive& archive) = 0; From 7a02eb52bb3d79dbfdefaa8a49a32bd4c1ee5ef3 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 20 Apr 2017 23:32:39 -0400 Subject: [PATCH 72/88] Code formatting --- parser/lstm-parser.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 0f90e75..0cdae71 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -300,9 +300,11 @@ void LSTMParser::DoAction(unsigned action, TaggerState* state, real_state->stack.pop_back(); real_state->stacki.pop_back(); // composed = cbias + H * head + D * dep + R * relation - Expression composed = affine_transform({GetParamExpr(p_cbias), - GetParamExpr(p_H), head, GetParamExpr(p_D), dep, GetParamExpr(p_R), - relation}); + Expression composed = affine_transform( + {GetParamExpr(p_cbias), + GetParamExpr(p_H), head, + GetParamExpr(p_D), dep, + GetParamExpr(p_R), relation}); Expression nlcomposed = tanh(composed); stack_lstm.rewind_one_step(); stack_lstm.rewind_one_step(); @@ -358,7 +360,8 @@ NeuralTransitionTagger::TaggerState* LSTMParser::InitializeParserState( args.push_back(GetParamExpr(p_t2l)); args.push_back(t); } - state->buffer[sent.size() - added_to_buffer] = rectify(affine_transform(args)); + state->buffer[sent.size() - added_to_buffer] = rectify( + affine_transform(args)); state->bufferi[sent.size() - added_to_buffer] = token_index; added_to_buffer++; } From 9124b54b8abadbf68f354e76a90ddfe1b6d31747 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Mon, 24 Apr 2017 18:18:35 -0400 Subject: [PATCH 73/88] Expose tree node embeddings --- parser/lstm-parser.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 0cdae71..9b32cbd 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -311,6 +311,11 @@ void LSTMParser::DoAction(unsigned action, TaggerState* state, stack_lstm.add_input(nlcomposed); real_state->stack.push_back(nlcomposed); real_state->stacki.push_back(headi); + if (states_to_expose) { + // Once something is attached as a dependent, it will never again be + // modified, so cache its expression. + (*states_to_expose)[to_string(depi)] = dep; + } } // After the last action, record the final tree state, if requested. From 93e570833ab7551a76bd7320dc3be5417ba6e80e Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Wed, 26 Apr 2017 16:09:08 -0400 Subject: [PATCH 74/88] Exposed whether tagger is in training to subclass functions --- parser/neural-transition-tagger.cpp | 1 + parser/neural-transition-tagger.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 83336d4..dcac0f3 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -72,6 +72,7 @@ vector NeuralTransitionTagger::LogProbTagger( bool training, const vector& correct_actions, double* correct, map* states_to_expose) { + in_training = training; if (training) assert(!correct_actions.empty()); assert(finalized); diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index 553f6c1..502073f 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -17,7 +17,8 @@ namespace lstm_parser { class NeuralTransitionTagger { public: - NeuralTransitionTagger() : finalized(false), model(new cnn::Model) {} + NeuralTransitionTagger() : finalized(false), in_training(false), + model(new cnn::Model) {} virtual ~NeuralTransitionTagger() {} void FinalizeVocab(); @@ -66,6 +67,7 @@ class NeuralTransitionTagger { }; bool finalized; + bool in_training; // expose to virtual fns whether we're doing training std::map param_expressions; // Store the model as a smart ptr so we can call its destructor when needed. From bfcea60ef350d19c06d67b949e89993efb9fa457 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Thu, 27 Apr 2017 13:44:25 -0400 Subject: [PATCH 75/88] Minor formatting --- parser/lstm-parser.h | 2 +- parser/neural-transition-tagger.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index 90338e4..ba72f7e 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -323,7 +323,7 @@ class LSTMParser : public NeuralTransitionTagger { action_lstm = cnn::LSTMBuilder(options.layers, options.action_dim, options.hidden_dim, model.get()); - FinalizeVocab(); // OK, now finalize. :) + FinalizeVocab(); // OK, now finalize. :) (Also initializes network params.) ar & *model; } diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index dcac0f3..7e8c30a 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -51,6 +51,7 @@ void NeuralTransitionTagger::FinalizeVocab() { finalized = true; } + Sentence::SentenceMap NeuralTransitionTagger::ReplaceUnknowns( const Sentence& sentence) { Sentence::SentenceMap tsentence(sentence.words); // sentence w/ OOVs replaced @@ -137,5 +138,4 @@ vector NeuralTransitionTagger::LogProbTagger( } - } /* namespace lstm_parser */ From 5619f815210e36753ba85a9416e937e01fa95254 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 2 May 2017 13:03:24 -0400 Subject: [PATCH 76/88] Added optional ParseTree pointer to Sentence --- parser/corpus.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parser/corpus.h b/parser/corpus.h index c5a9d0a..e3ec3fd 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -191,17 +191,20 @@ class ConllUCorpusReader : public CorpusReader { class Sentence; inline std::ostream& operator<<(std::ostream& os, const Sentence& sentence); +class ParseTree; // forward declaration + class Sentence { public: typedef std::map SentenceMap; typedef std::map SentenceUnkMap; - Sentence(const CorpusVocabulary& vocab) : vocab(vocab) {} + Sentence(const CorpusVocabulary& vocab) : vocab(vocab), tree(nullptr) {} SentenceMap words; SentenceMap poses; SentenceUnkMap unk_surface_forms; const CorpusVocabulary& vocab; + ParseTree* tree; size_t Size() const { return words.size(); From d7902d911d6d980ef7447d21dc52353f6edd17a5 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 2 May 2017 22:45:52 -0400 Subject: [PATCH 77/88] Fixed assertion bug --- parser/neural-transition-tagger.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 7e8c30a..afa9929 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -114,7 +114,7 @@ vector NeuralTransitionTagger::LogProbTagger( unsigned action = best_a; if (!correct_actions.empty()) { - assert(action_count < correct_actions.size()); + assert(action_count < correct_actions.size() || !training); unsigned correct_action = correct_actions[action_count]; if (correct && best_a == correct_action) { (*correct)++; From 5653ab730f28c2f8167fa4b4b505f04bf846471f Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sat, 20 May 2017 22:33:18 -0400 Subject: [PATCH 78/88] Added num_values() to CNN LookupParameters --- cnn/cnn/model.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cnn/cnn/model.h b/cnn/cnn/model.h index ddb6258..b27dec9 100644 --- a/cnn/cnn/model.h +++ b/cnn/cnn/model.h @@ -61,6 +61,7 @@ struct LookupParameters : public ParametersBase { void squared_l2norm(float* sqnorm) const override; void g_squared_l2norm(float* sqnorm) const override; size_t size() const override; + size_t num_values() const { return values.size(); } void Initialize(unsigned index, const std::vector& val); void copy(const LookupParameters & val); From 372cdea496b64e7f232ead5c3401a7ddae59b4c0 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 23 May 2017 14:22:55 -0400 Subject: [PATCH 79/88] Made all TaggerStates modifiable by overriding subclass member fns --- parser/lstm-parser.cc | 8 ++++---- parser/lstm-parser.h | 8 ++++---- parser/neural-transition-tagger.cpp | 6 +++--- parser/neural-transition-tagger.h | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/parser/lstm-parser.cc b/parser/lstm-parser.cc index 9b32cbd..e33ed09 100644 --- a/parser/lstm-parser.cc +++ b/parser/lstm-parser.cc @@ -137,9 +137,9 @@ LSTMParser::LSTMParser(const ParserOptions& poptions, bool LSTMParser::IsActionForbidden(const unsigned action, - const TaggerState& state) const { + TaggerState* state) const { const string& action_name = vocab.action_names[action]; - const ParserState& real_state = static_cast(state); + const ParserState& real_state = static_cast(*state); unsigned ssize = real_state.stack.size(); unsigned bsize = real_state.buffer.size(); @@ -221,7 +221,7 @@ ParseTree LSTMParser::RecoverParseTree( } -Expression LSTMParser::GetActionProbabilities(const TaggerState& state) { +Expression LSTMParser::GetActionProbabilities(TaggerState* state) { // p_t = pbias + S * slstm + B * blstm + A * alstm Expression p_t = affine_transform( {GetParamExpr(p_pbias), GetParamExpr(p_S), stack_lstm.back(), @@ -319,7 +319,7 @@ void LSTMParser::DoAction(unsigned action, TaggerState* state, } // After the last action, record the final tree state, if requested. - if (states_to_expose && ShouldTerminate(*real_state)) { + if (states_to_expose && ShouldTerminate(real_state)) { (*states_to_expose)["Tree"] = real_state->stack.back(); } } diff --git a/parser/lstm-parser.h b/parser/lstm-parser.h index ba72f7e..e6a7e09 100644 --- a/parser/lstm-parser.h +++ b/parser/lstm-parser.h @@ -261,15 +261,15 @@ class LSTMParser : public NeuralTransitionTagger { virtual void InitializeNetworkParameters() override; - virtual bool ShouldTerminate(const TaggerState& state) const override { - const ParserState& real_state = static_cast(state); + virtual bool ShouldTerminate(TaggerState* state) const override { + const ParserState& real_state = static_cast(*state); return real_state.stack.size() <= 2 && real_state.buffer.size() <= 1; } virtual bool IsActionForbidden(const unsigned action, - const TaggerState& state) const override; + TaggerState* state) const override; - virtual cnn::expr::Expression GetActionProbabilities(const TaggerState& state) + virtual cnn::expr::Expression GetActionProbabilities(TaggerState* state) override; virtual void DoAction( diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index afa9929..60f1557 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -90,16 +90,16 @@ vector NeuralTransitionTagger::LogProbTagger( vector log_probs; unsigned action_count = 0; // incremented at each prediction Expression p_t; // declared outside to allow access later - while (!ShouldTerminate(*state)) { + while (!ShouldTerminate(state.get())) { // Get list of possible actions for the current parser state. vector current_valid_actions; for (unsigned action = 0; action < vocab.action_names.size(); ++action) { - if (IsActionForbidden(action, *state)) + if (IsActionForbidden(action, state.get())) continue; current_valid_actions.push_back(action); } - Expression r_t = GetActionProbabilities(*state); + Expression r_t = GetActionProbabilities(state.get()); // adist = log_softmax(r_t, current_valid_actions) Expression adiste = log_softmax(r_t, current_valid_actions); vector adist = as_vector(cg->incremental_forward()); diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index 502073f..7d4d375 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -86,12 +86,12 @@ class NeuralTransitionTagger { const std::vector& correct_actions) = 0; virtual cnn::expr::Expression GetActionProbabilities( - const TaggerState& state) = 0; + TaggerState* state) = 0; - virtual bool ShouldTerminate(const TaggerState& state) const = 0; + virtual bool ShouldTerminate(TaggerState* state) const = 0; virtual bool IsActionForbidden(const unsigned action, - const TaggerState& state) const = 0; + TaggerState* state) const = 0; virtual void DoAction( unsigned action, TaggerState* state, cnn::ComputationGraph* cg, From c8d7685903b9a45193d8781eddc177db239fa39a Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 4 Jun 2017 01:55:53 -0400 Subject: [PATCH 80/88] Made Sentences swappable (also more efficient GetWord default) --- parser/corpus.h | 21 ++++++++++++++------- parser/neural-transition-tagger.cpp | 2 +- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/parser/corpus.h b/parser/corpus.h index e3ec3fd..075810d 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -54,7 +54,7 @@ class CorpusVocabulary { inline unsigned GetWord(const std::string& word) const { auto word_iter = words_to_int.find(word); if (word_iter == words_to_int.end()) { - return words_to_int.find(CorpusVocabulary::UNK)->second; + return kUNK; } else { return word_iter->second; } @@ -198,12 +198,12 @@ class Sentence { typedef std::map SentenceMap; typedef std::map SentenceUnkMap; - Sentence(const CorpusVocabulary& vocab) : vocab(vocab), tree(nullptr) {} + Sentence(const CorpusVocabulary& vocab) : vocab(&vocab), tree(nullptr) {} SentenceMap words; SentenceMap poses; SentenceUnkMap unk_surface_forms; - const CorpusVocabulary& vocab; + const CorpusVocabulary* vocab; ParseTree* tree; size_t Size() const { @@ -212,8 +212,8 @@ class Sentence { const std::string& WordForToken(unsigned token_id) const { unsigned word_id = words.at(token_id); - return word_id == vocab.kUNK ? unk_surface_forms.at(token_id) - : vocab.int_to_words[word_id]; + return word_id == vocab->kUNK ? unk_surface_forms.at(token_id) + : vocab->int_to_words[word_id]; } }; @@ -224,8 +224,8 @@ inline std::ostream& operator<<(std::ostream& os, const Sentence& sentence) { unsigned pos_id = sentence.poses.at(index); auto unk_iter = sentence.unk_surface_forms.find(index); os << (unk_iter == sentence.unk_surface_forms.end() ? - sentence.vocab.int_to_words.at(word_id) : unk_iter->second) - << '/' << sentence.vocab.int_to_pos.at(pos_id); + sentence.vocab->int_to_words.at(word_id) : unk_iter->second) + << '/' << sentence.vocab->int_to_pos.at(pos_id); if (index != sentence.words.rend()->first) { os << ' '; } @@ -351,4 +351,11 @@ class ParserTrainingCorpus : public TrainingCorpus { } // namespace lstm_parser + +inline void swap(lstm_parser::Sentence& s1, lstm_parser::Sentence& s2) { + lstm_parser::Sentence tmp = std::move(s1); + s2 = std::move(s1); + s1 = std::move(tmp); +} + #endif diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 60f1557..016a6a3 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -23,7 +23,7 @@ void NeuralTransitionTagger::SaveModel(const string& model_fname, cerr << "Model saved." << endl; // Create a soft link to the most recent model in order to make it // easier to refer to it in a shell script. - if (!softlink_created) { + if (false) { string softlink = "latest_model.params"; if (system((string("rm -f ") + softlink).c_str()) == 0 From 8b6d31561e99c9a9eb8ace3601947244355eb5c9 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 4 Jun 2017 15:33:06 -0400 Subject: [PATCH 81/88] Fixed Sentence swap function --- parser/corpus.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parser/corpus.h b/parser/corpus.h index 075810d..2a6c1a9 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -354,8 +354,8 @@ class ParserTrainingCorpus : public TrainingCorpus { inline void swap(lstm_parser::Sentence& s1, lstm_parser::Sentence& s2) { lstm_parser::Sentence tmp = std::move(s1); - s2 = std::move(s1); - s1 = std::move(tmp); + s1 = std::move(s2); + s2 = std::move(tmp); } #endif From 7d0746440eba38d478675425c35ca820a48399c2 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sat, 16 Sep 2017 22:00:27 -0400 Subject: [PATCH 82/88] Updated test command line in README to latest flags --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 369808e..f0704ea 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ There is a pretrained model for English [here](http://www.cs.cmu.edu/~jdunietz/h Given a `test.conll` file formatted according to the [CoNLL data format](http://ilk.uvt.nl/conll/#dataformat): - parser/lstm-parse -m english_pos_2_32_100_20_100_12_20.params -t test.conll + parser/lstm-parse -m english_pos_2_32_100_20_100_12_20.params -T test.conll -s If you are not using the pretrained model, you will need to replace the `.params` argument with the name of your own trained model file. From 741dad6807663ea9141a3041ba12f505ed8281ea Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 3 Oct 2017 00:34:58 -0400 Subject: [PATCH 83/88] Deleted dead variable declaration --- parser/neural-transition-tagger.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 016a6a3..f1a9739 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -89,7 +89,6 @@ vector NeuralTransitionTagger::LogProbTagger( vector log_probs; unsigned action_count = 0; // incremented at each prediction - Expression p_t; // declared outside to allow access later while (!ShouldTerminate(state.get())) { // Get list of possible actions for the current parser state. vector current_valid_actions; From a4670beb8162fda00815f27efedd7ce2b052bed2 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Tue, 3 Oct 2017 12:22:15 -0400 Subject: [PATCH 84/88] Enabled GetActionProbabilities to signal that oracle action should be used --- parser/neural-transition-tagger.cpp | 54 +++++++++++++++++------------ parser/neural-transition-tagger.h | 4 +++ 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index f1a9739..6c4ddfc 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -14,9 +14,12 @@ using namespace cnn::expr; namespace lstm_parser { +const cnn::expr::Expression NeuralTransitionTagger::USE_ORACLE( + nullptr, cnn::VariableIndex(static_cast(-1))); + void NeuralTransitionTagger::SaveModel(const string& model_fname, - bool softlink_created) { + bool softlink_created) { ofstream out_file(model_fname); eos::portable_oarchive archive(out_file); DoSave(archive); @@ -99,31 +102,38 @@ vector NeuralTransitionTagger::LogProbTagger( } Expression r_t = GetActionProbabilities(state.get()); - // adist = log_softmax(r_t, current_valid_actions) - Expression adiste = log_softmax(r_t, current_valid_actions); - vector adist = as_vector(cg->incremental_forward()); - double best_score = adist[current_valid_actions[0]]; - unsigned best_a = current_valid_actions[0]; - for (unsigned i = 1; i < current_valid_actions.size(); ++i) { - if (adist[current_valid_actions[i]] > best_score) { - best_score = adist[current_valid_actions[i]]; - best_a = current_valid_actions[i]; + unsigned action; + if (r_t.pg == USE_ORACLE.pg && r_t.i == USE_ORACLE.i) { + assert(!correct_actions.empty() && action_count < correct_actions.size()); + action = correct_actions[action_count]; + // cerr << "Using oracle action: " << vocab.action_names[action] << endl; + } else { + // adist = log_softmax(r_t, current_valid_actions) + Expression adiste = log_softmax(r_t, current_valid_actions); + vector adist = as_vector(cg->incremental_forward()); + double best_score = adist[current_valid_actions[0]]; + unsigned best_a = current_valid_actions[0]; + for (unsigned i = 1; i < current_valid_actions.size(); ++i) { + if (adist[current_valid_actions[i]] > best_score) { + best_score = adist[current_valid_actions[i]]; + best_a = current_valid_actions[i]; + } } - } - unsigned action = best_a; - - if (!correct_actions.empty()) { - assert(action_count < correct_actions.size() || !training); - unsigned correct_action = correct_actions[action_count]; - if (correct && best_a == correct_action) { - (*correct)++; + action = best_a; + + if (!correct_actions.empty()) { + assert(action_count < correct_actions.size() || !training); + unsigned correct_action = correct_actions[action_count]; + if (correct && best_a == correct_action) { + (*correct)++; + } + // If we're training, use the reference action. + if (training) + action = correct_action; } - // If we're training, use the reference action. - if (training) - action = correct_action; + log_probs.push_back(pick(adiste, action)); } ++action_count; - log_probs.push_back(pick(adiste, action)); results.push_back(action); DoAction(action, state.get(), cg, states_to_expose); diff --git a/parser/neural-transition-tagger.h b/parser/neural-transition-tagger.h index 7d4d375..b4afa5c 100644 --- a/parser/neural-transition-tagger.h +++ b/parser/neural-transition-tagger.h @@ -66,6 +66,10 @@ class NeuralTransitionTagger { virtual ~TaggerState() {} }; + // Special network pseudo-node for signaling that an oracle action should + // be used. + static const cnn::expr::Expression USE_ORACLE; + bool finalized; bool in_training; // expose to virtual fns whether we're doing training std::map param_expressions; From 53756b2813c2c75cb48bb07ba9acc7f0bc9a1295 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sat, 13 Oct 2018 15:52:26 -0400 Subject: [PATCH 85/88] Allowed attaching metadata to Sentence objects --- parser/corpus.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/parser/corpus.h b/parser/corpus.h index 2a6c1a9..ad5dca6 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -198,6 +198,9 @@ class Sentence { typedef std::map SentenceMap; typedef std::map SentenceUnkMap; + // TODO: move correct_act_sent from corpus-level to here + struct SentenceMetadata {}; + Sentence(const CorpusVocabulary& vocab) : vocab(&vocab), tree(nullptr) {} SentenceMap words; @@ -205,6 +208,7 @@ class Sentence { SentenceUnkMap unk_surface_forms; const CorpusVocabulary* vocab; ParseTree* tree; + std::unique_ptr metadata; size_t Size() const { return words.size(); From 13f4e6b70c1f49b79ad77c8e0ba70cb3ddeca771 Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sat, 13 Oct 2018 20:57:12 -0400 Subject: [PATCH 86/88] Fixed Sentence printer for non-training corpora --- parser/corpus.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/parser/corpus.h b/parser/corpus.h index ad5dca6..0f1efc9 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -189,7 +189,7 @@ class ConllUCorpusReader : public CorpusReader { class Sentence; -inline std::ostream& operator<<(std::ostream& os, const Sentence& sentence); +inline std::ostream& operator<<(std::ostream& os, const Sentence& sent); class ParseTree; // forward declaration @@ -221,16 +221,17 @@ class Sentence { } }; -inline std::ostream& operator<<(std::ostream& os, const Sentence& sentence) { - for (auto &index_and_word_id : sentence.words) { +inline std::ostream& operator<<(std::ostream& os, const Sentence& sent) { + for (auto &index_and_word_id : sent.words) { unsigned index = index_and_word_id.first; unsigned word_id = index_and_word_id.second; - unsigned pos_id = sentence.poses.at(index); - auto unk_iter = sentence.unk_surface_forms.find(index); - os << (unk_iter == sentence.unk_surface_forms.end() ? - sentence.vocab->int_to_words.at(word_id) : unk_iter->second) - << '/' << sentence.vocab->int_to_pos.at(pos_id); - if (index != sentence.words.rend()->first) { + unsigned pos_id = sent.poses.at(index); + auto unk_iter = sent.unk_surface_forms.find(index); + os << (unk_iter == sent.unk_surface_forms.end() || unk_iter->second == "" + ? sent.vocab->int_to_words.at(word_id) + : unk_iter->second) + << '/' << sent.vocab->int_to_pos.at(pos_id); + if (index != sent.words.rend()->first) { os << ' '; } } From be9091d9a6cb109750f6542e5440efcf9f71043b Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 17 Nov 2019 00:05:44 -0500 Subject: [PATCH 87/88] Improved sentence metadata handling; create dir for model if needed --- CMakeLists.txt | 2 +- parser/corpus.cc | 4 +++- parser/corpus.h | 3 ++- parser/neural-transition-tagger.cpp | 7 +++++++ 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ca9f46..aa4712a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ if(DEFINED ENV{BOOST_ROOT}) set(Boost_NO_SYSTEM_PATHS ON) endif() set(Boost_REALPATH ON) -find_package(Boost COMPONENTS program_options serialization iostreams regex REQUIRED) +find_package(Boost COMPONENTS program_options serialization iostreams regex filesystem REQUIRED) include_directories(${Boost_INCLUDE_DIR}) set(LIBS ${LIBS} ${Boost_LIBRARIES}) diff --git a/parser/corpus.cc b/parser/corpus.cc index d9291c9..639963e 100644 --- a/parser/corpus.cc +++ b/parser/corpus.cc @@ -185,12 +185,14 @@ void TrainingCorpus::OracleTransitionsCorpusReader::RecordSentence( TrainingCorpus* corpus, Sentence::SentenceMap* words, Sentence::SentenceMap* sentence_pos, Sentence::SentenceUnkMap* sentence_unk_surface_forms, - vector* correct_actions) const { + vector* correct_actions, + Sentence::SentenceMetadata* metadata) const { // Store the sentence variables and clear them for the next sentence. corpus->sentences.emplace_back(*corpus->vocab); Sentence* sentence = &corpus->sentences.back(); sentence->words.swap(*words); sentence->poses.swap(*sentence_pos); + sentence->metadata.reset(metadata); corpus->correct_act_sent.push_back({}); corpus->correct_act_sent.back().swap(*correct_actions); diff --git a/parser/corpus.h b/parser/corpus.h index 0f1efc9..710fe2c 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -299,7 +299,8 @@ class TrainingCorpus : public Corpus { void RecordSentence(TrainingCorpus* corpus, Sentence::SentenceMap* words, Sentence::SentenceMap* sentence_pos, Sentence::SentenceUnkMap* sentence_unk_surface_forms, - std::vector* correct_actions) const; + std::vector* correct_actions, + Sentence::SentenceMetadata* metadata = nullptr) const; static inline unsigned UTF8Len(unsigned char x) { if (x < 0x80) return 1; diff --git a/parser/neural-transition-tagger.cpp b/parser/neural-transition-tagger.cpp index 6c4ddfc..11f8ec1 100644 --- a/parser/neural-transition-tagger.cpp +++ b/parser/neural-transition-tagger.cpp @@ -1,5 +1,6 @@ #include "neural-transition-tagger.h" +#include #include #include #include @@ -20,6 +21,12 @@ const cnn::expr::Expression NeuralTransitionTagger::USE_ORACLE( void NeuralTransitionTagger::SaveModel(const string& model_fname, bool softlink_created) { + boost::filesystem::path model_dir_path(model_fname); + model_dir_path.remove_filename(); + if (boost::filesystem::create_directories(model_dir_path)) { + cerr << "Created directory " << model_dir_path << endl; + } + ofstream out_file(model_fname); eos::portable_oarchive archive(out_file); DoSave(archive); From a951527ecf8f396eb77b2c450ec2a313a39237ba Mon Sep 17 00:00:00 2001 From: Jesse Dunietz Date: Sun, 17 Nov 2019 18:18:47 -0500 Subject: [PATCH 88/88] Updated WordForToken to allow iterator hinting for speed --- parser/corpus.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/parser/corpus.h b/parser/corpus.h index 710fe2c..b6aa496 100644 --- a/parser/corpus.h +++ b/parser/corpus.h @@ -215,7 +215,12 @@ class Sentence { } const std::string& WordForToken(unsigned token_id) const { - unsigned word_id = words.at(token_id); + return WordForToken(words.find(token_id), token_id); + } + + const std::string& WordForToken(SentenceMap::const_iterator words_iter, + unsigned token_id) const { + unsigned word_id = words_iter->second; return word_id == vocab->kUNK ? unk_surface_forms.at(token_id) : vocab->int_to_words[word_id]; }