From 2b57733d1a22a85bce29464cba895e1a82010c1d Mon Sep 17 00:00:00 2001 From: Louis Date: Thu, 10 Sep 2020 17:24:47 -0400 Subject: [PATCH 1/2] Fixed loading errors --- code/run_args_qa_thresh.py | 5 +++-- code/run_trigger_qa.py | 13 ++++++++----- code/script_args_qa_thresh.sh | 4 ++-- code/script_trigger_qa.sh | 10 +++++----- proc/scripts/data/ace-event/convert_examples.py | 14 +++++++++----- proc/scripts/data/ace-event/parse_ace_event.py | 5 +++-- 6 files changed, 30 insertions(+), 21 deletions(-) diff --git a/code/run_args_qa_thresh.py b/code/run_args_qa_thresh.py index 2412fe5..20f65d3 100644 --- a/code/run_args_qa_thresh.py +++ b/code/run_args_qa_thresh.py @@ -87,8 +87,9 @@ def read_ace_examples(input_file, is_training): """Read a ACE json file into a list of AceExample.""" examples = [] with open(input_file, "r", encoding='utf-8') as f: - for line in f: - example = json.loads(line) + print(input_file) + lines = json.load(f) + for example in lines: sentence, events, s_start = example["sentence"], example["event"], example["s_start"] example = AceExample(sentence=sentence, events=events, s_start=s_start) examples.append(example) diff --git a/code/run_trigger_qa.py b/code/run_trigger_qa.py index 90fe5dd..5796116 100644 --- a/code/run_trigger_qa.py +++ b/code/run_trigger_qa.py @@ -56,9 +56,10 @@ def create_vocab(self, files_list): self.category_to_index["None"] = 0 self.index_to_category[0] = "None" for file in files_list: + print(file) with open(file) as f: - for line in f: - example = json.loads(line) + lines = json.load(f) + for example in lines: events, sentence = example["event"], example["sentence"] if len(sentence) > self.max_sent_length: self.max_sent_length = len(sentence) for event in events: @@ -109,8 +110,8 @@ def read_ace_examples(nth_query, input_file, tokenizer, category_vocab, is_train examples = [] sentence_id = 0 with open(input_file, "r", encoding='utf-8') as f: - for line in f: - example = json.loads(line) + lines = json.load(f) + for example in 
lines: sentence, events, s_start = example["sentence"], example["event"], example["s_start"] offset_category = dict() for event in events: @@ -571,8 +572,10 @@ def main(args): for key in result: writer.write("%s = %s\n" % (key, str(result[key]))) with open(os.path.join(args.output_dir, "trigger_predictions.json"), "w") as writer: + to_write=[] for line in preds: - writer.write(json.dumps(line, default=int) + "\n") + to_write.append(line) + writer.write(json.dumps(to_write, default=int)) if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/code/script_args_qa_thresh.sh b/code/script_args_qa_thresh.sh index 529e845..e3a1ca4 100755 --- a/code/script_args_qa_thresh.sh +++ b/code/script_args_qa_thresh.sh @@ -19,10 +19,10 @@ python code/run_args_qa_thresh.py \ --do_train \ --do_eval \ --model bert-base-uncased \ - --train_file $ACE_DIR/toy.json \ + --train_file $ACE_DIR/train_convert.json \ --dev_file $ACE_PRE_DIR/trigger_predictions.json \ --test_file $ACE_PRE_DIR/trigger_predictions.json \ - --gold_file $ACE_DIR/toy.json \ + --gold_file $ACE_DIR/test_convert.json \ --train_batch_size 8 \ --eval_batch_size 8 \ --learning_rate 4e-5 \ diff --git a/code/script_trigger_qa.sh b/code/script_trigger_qa.sh index f747216..645cebd 100755 --- a/code/script_trigger_qa.sh +++ b/code/script_trigger_qa.sh @@ -8,13 +8,13 @@ echo " query 5 'verb' echo "==========================================================================================" python code/run_trigger_qa.py \ - --do_train \ --do_eval \ + --save_model \ --eval_test \ --model bert-base-uncased \ - --train_file $ACE_DIR/toy.json \ - --dev_file $ACE_DIR/toy.json \ - --test_file $ACE_DIR/toy.json \ + --train_file $ACE_DIR/train_convert.json \ + --dev_file $ACE_DIR/dev_convert.json \ + --test_file $ACE_DIR/test_convert.json \ --train_batch_size 8 \ --eval_batch_size 8 \ --eval_per_epoch 20 \ @@ -23,4 +23,4 @@ python code/run_trigger_qa.py \ --learning_rate 4e-5 \ --nth_query 5 \ --warmup_proportion 0.1 \ - 
\ No newline at end of file + diff --git a/proc/scripts/data/ace-event/convert_examples.py b/proc/scripts/data/ace-event/convert_examples.py index 958a5b5..e5df2ea 100644 --- a/proc/scripts/data/ace-event/convert_examples.py +++ b/proc/scripts/data/ace-event/convert_examples.py @@ -1,13 +1,15 @@ from os import path import json import collections +import sys -output_dir = "./data/ace-event/processed-data/json" +output_dir = "./data/ace-event/processed-data/default-settings/json" for fold in ["train", "dev", "test"]: g_convert = open(path.join(output_dir, fold + "_convert.json"), "w") + to_write = [] with open(path.join(output_dir, fold + ".json"), "r") as g: - for line in g: - line = json.loads(line) + lines = json.load(g) + for line in lines: sentences = line["sentences"] ner = line["ner"] relations = line["relations"] @@ -25,6 +27,8 @@ sentence_annotated["ner"] = ner sentence_annotated["relation"] = relation sentence_annotated["event"] = event - + # if sentence_annotated["s_start"]>5: - g_convert.write(json.dumps(sentence_annotated, default=int) + "\n") + to_write.append(sentence_annotated) + + g_convert.write(json.dumps(to_write, default=int)) diff --git a/proc/scripts/data/ace-event/parse_ace_event.py b/proc/scripts/data/ace-event/parse_ace_event.py index 75f82da..32dc8f7 100644 --- a/proc/scripts/data/ace-event/parse_ace_event.py +++ b/proc/scripts/data/ace-event/parse_ace_event.py @@ -738,7 +738,7 @@ def one_fold(fold, output_dir, heads_only=True, real_entities_only=True, include with open(path.join(split_path, fold + ".filelist")) as f: for line in f: doc_keys.append(line.strip()) - + to_file = [] with open(path.join(output_dir, fold + ".json"), "w") as g: for doc_key in doc_keys: annotation_path = path.join(doc_path, doc_key + ".apf.xml") @@ -746,7 +746,8 @@ def one_fold(fold, output_dir, heads_only=True, real_entities_only=True, include document = Document(annotation_path, text_path, doc_key, fold, heads_only, real_entities_only, include_pronouns) js = 
document.to_json() - g.write(json.dumps(js, default=int, indent = 4) + "\n") + to_file.append(js) + g.write(json.dumps(to_file, default=int, indent=4)) def main(): From 1237915b2695bb522b460fff565760bc9b8133e0 Mon Sep 17 00:00:00 2001 From: Louis Date: Fri, 18 Sep 2020 13:35:53 -0400 Subject: [PATCH 2/2] Just adding my own data pipeline --- .gitignore | 5 +- .vscode/settings.json | 3 + code/run_trigger_qa.py | 140 ++++++++++++++++++ code/script_trigger_qa.sh | 2 +- .../ace-event/processed-data/json/toy.json | 3 +- 5 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.gitignore b/.gitignore index 8834bba..6810244 100644 --- a/.gitignore +++ b/.gitignore @@ -133,4 +133,7 @@ dmypy.json *_output *.o* archive -code/script_trigger_debug.sh \ No newline at end of file +code/script_trigger_debug.sh + +.txt +proc/FFO/Stories/ \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..5ab6822 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "/usr/bin/python3.7" +} \ No newline at end of file diff --git a/code/run_trigger_qa.py b/code/run_trigger_qa.py index 5796116..68db4c6 100644 --- a/code/run_trigger_qa.py +++ b/code/run_trigger_qa.py @@ -27,6 +27,9 @@ from pytorch_pretrained_bert.tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize) +from spacy.lang.en import English # updated +nlp = English() +nlp.add_pipe(nlp.create_pipe('sentencizer')) # updated logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', @@ -104,6 +107,87 @@ def __init__(self, self.labels = labels +#Used when only sentences are available +def read_arb_examples(nth_query, input_files, tokenizer, category_vocab, is_training): + #Read arbitrary examples + features = [] + examples = [] + sentence_id = 0 + for dir in input_files: + with open(dir, "r", encoding='utf-8') as f: + raw_text = 
f.read() + doc = nlp(raw_text) + sentences = [sent.string.strip() for sent in doc.sents] + for sentence in sentences: + sentence = sentence.split() + + tokens = [] + segment_ids = [] + in_sentence = [] + labels = [] + + # add [CLS] + tokens.append("[CLS]") + segment_ids.append(0) + in_sentence.append(0) + + # add query + query = candidate_queries[nth_query] + for (i, token) in enumerate(query): + sub_tokens = tokenizer.tokenize(token) + tokens.append(sub_tokens[0]) + segment_ids.append(0) + in_sentence.append(0) + + # add [SEP] + tokens.append("[SEP]") + segment_ids.append(0) + in_sentence.append(0) + + # add sentence + for (i, token) in enumerate(sentence): + sub_tokens = tokenizer.tokenize(token) + tokens.append(sub_tokens[0]) + segment_ids.append(1) + in_sentence.append(1) + + # add [SEP] + tokens.append("[SEP]") + segment_ids.append(1) + in_sentence.append(0) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + while len(input_ids) < category_vocab.max_sent_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + in_sentence.append(0) + + # print(len(input_ids), category_vocab.max_sent_length) + assert len(input_ids) == category_vocab.max_sent_length + assert len(segment_ids) == category_vocab.max_sent_length + assert len(in_sentence) == category_vocab.max_sent_length + assert len(input_mask) == category_vocab.max_sent_length + + features.append( + InputFeatures( + # unique_id=unique_id, + # example_index=example_index, + sentence_id=sentence_id, + tokens=tokens, + # token_to_orig_map=token_to_orig_map, + # token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + in_sentence=in_sentence, + labels=labels)) + examples.append(sentence) + # if len(tokens) > 20 and sum(labels) > 0: + # import ipdb; ipdb.set_trace() + sentence_id += 1 + return examples, features def read_ace_examples(nth_query, input_file, tokenizer, category_vocab, 
is_training): """Read an ACE json file, transform to features""" features = [] @@ -198,7 +282,35 @@ def read_ace_examples(nth_query, input_file, tokenizer, category_vocab, is_train return examples, features +def infer(args, eval_examples, category_vocab, model, device, eval_dataloader): + # eval_examples, eval_features, na_prob_thresh=1.0, pred_only=False): + all_results = [] + model.eval() + + # get predictions + pred_triggers = dict() + for _, (sentence_id, input_ids, segmend_ids, in_sentence, input_mask) in enumerate(eval_dataloader): + input_ids = input_ids.to(device) + segmend_ids = segmend_ids.to(device) + input_mask = input_mask.to(device) + with torch.no_grad(): + logits = model(input_ids, token_type_ids = segmend_ids, attention_mask = input_mask) + for i, in_sent in enumerate(in_sentence): + logits_i = logits[i].detach().cpu() + _, tag_seq = torch.max(logits_i, 1) + tag_seq = tag_seq.tolist() + + decoded_tag_seg = [] + for idj, j in enumerate(in_sent): + if j: + decoded_tag_seg.append(category_vocab.index_to_category[tag_seq[idj]]) + sentence_triggers = [] + for offset, tag in enumerate(decoded_tag_seg): + if tag != "None": + sentence_triggers.append([offset, tag]) + pred_triggers[sentence_id[i]] = sentence_triggers + return pred_triggers def evaluate(args, eval_examples, category_vocab, model, device, eval_dataloader, pred_only=False): # eval_examples, eval_features, na_prob_thresh=1.0, pred_only=False): @@ -539,6 +651,33 @@ def main(args): writer.write("%s = %s\n" % (key, str(best_result[key]))) del model + if args.do_infer: + #To be updated later + files = ["proc/FFO/Stories/32_The_Snow-White_Heart.txt"] + eval_examples, eval_features = read_arb_examples(input_files=files, nth_query=args.nth_query, tokenizer=tokenizer, category_vocab=category_vocab, is_training=False) + all_sentence_id = torch.tensor([f.sentence_id for f in eval_features], dtype=torch.long) + all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) + 
all_segmend_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) + all_in_sentence = torch.tensor([f.in_sentence for f in eval_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) + + eval_data = TensorDataset(all_sentence_id, all_input_ids, all_segmend_ids, all_in_sentence, all_input_mask) + eval_dataloader = DataLoader(eval_data, batch_size=1) + + + model = BertForTriggerClassification.from_pretrained(args.output_dir, num_labels=len(category_vocab.index_to_category)) + if args.fp16: + model.half() + model.to(device) + preds = infer(args, eval_examples, category_vocab, model, device, eval_dataloader) + + + with open(os.path.join(args.output_dir, "trigger_predictions.json"), "w") as writer: + to_write=[] + for line in preds: + to_write.append(line) + writer.write(json.dumps(to_write, default=int)) + if args.do_eval: if args.eval_test: @@ -589,6 +728,7 @@ def main(args): help="How many times it evaluates on dev set per epoch") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") + parser.add_argument("--do_infer", action='store_true', help="Whether to run inference on a set of files given.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--eval_test", action='store_true', help='Wehther to run eval on the test set.') parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") diff --git a/code/script_trigger_qa.sh b/code/script_trigger_qa.sh index 645cebd..5635cf9 100755 --- a/code/script_trigger_qa.sh +++ b/code/script_trigger_qa.sh @@ -8,8 +8,8 @@ echo " query 5 'verb' echo "==========================================================================================" python code/run_trigger_qa.py \ + --do_infer \ 
--do_eval \ - --save_model \ --eval_test \ --model bert-base-uncased \ --train_file $ACE_DIR/train_convert.json \ diff --git a/proc/data/ace-event/processed-data/json/toy.json b/proc/data/ace-event/processed-data/json/toy.json index 669b112..141fa7a 100644 --- a/proc/data/ace-event/processed-data/json/toy.json +++ b/proc/data/ace-event/processed-data/json/toy.json @@ -1,2 +1,3 @@ {"sentence": ["Tom", "visited", "all", "their", "friends", "."], "s_start": 462, "ner": [[466, 466, "PER"]], "relation": [], "event": [[[463, "Contact.Meet"], [466, 466, "Entity"]]]} -{"sentence": ["Mary", "visited", "all", "her", "friends", "."], "s_start": 462, "ner": [[466, 466, "PER"]], "relation": [], "event": [[[463, "Contact.Meet"], [466, 466, "Entity"]]]} \ No newline at end of file +{"sentence": ["Mary", "visited", "all", "her", "friends", "."], "s_start": 462, "ner": [[466, 466, "PER"]], "relation": [], "event": [[[463, "Contact.Meet"], [466, 466, "Entity"]]]} +