From f8bd2f98ee6633d4a2bc503c04ab8fada5fc2ddd Mon Sep 17 00:00:00 2001 From: Yoshitomo Matsubara Date: Mon, 8 Jul 2019 23:36:13 -0700 Subject: [PATCH 1/6] fixed typos and bugs --- README.md | 11 ++++++----- triviaqa/build_span_corpus.py | 5 ++--- triviaqa/evidence_corpus.py | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index e482a53..1e8e3d9 100644 --- a/README.md +++ b/README.md @@ -92,17 +92,17 @@ Reader, type: test_open, step: 19332, em: 40.123, f1: 48.358 ## TriviaQA ### Data Preprocessing -The raw TriviaQA data is expected to be unzipped in `~/data/triviaqa`. Training +The raw TriviaQA data is expected to be unzipped in `data/triviaqa`. Training or testing in the unfiltered setting requires the unfiltered data to be -download to `~/data/triviaqa-unfiltered`. +download to `data/triviaqa-unfiltered`. ```bash -mkdir -p ~/data/triviaqa -cd ~/data/triviaqa +mkdir -p /data/triviaqa +cd /data/triviaqa wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz tar xf triviaqa-rc.tar.gz rm triviaqa-rc.tar.gz -cd ~/data +cd .. wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz tar xf triviaqa-unfiltered.tar.gz rm triviaqa-unfiltered.tar.gz @@ -170,6 +170,7 @@ python -m bert.run_triviaqa_wiki_full_e2e \ --do_train \ --do_dev \ --data_dir $DATA_DIR \ + --dev_file unfiltered-web-dev.json --train_batch_size 32 \ --learning_rate 3e-5 \ --num_train_epochs 2.0 \ diff --git a/triviaqa/build_span_corpus.py b/triviaqa/build_span_corpus.py index 6a6f92c..982944c 100644 --- a/triviaqa/build_span_corpus.py +++ b/triviaqa/build_span_corpus.py @@ -5,15 +5,14 @@ from itertools import islice from typing import List, Optional, Dict from os import mkdir -from os.path import join, exists, expanduser +from os.path import join, exists import bert.tokenization as tokenization from triviaqa.configurable import Configurable from triviaqa.read_data import iter_trivia_question, TriviaQaQuestion from triviaqa.evidence_corpus import TriviaQaEvidenceCorpusTxt from triviaqa.answer_detection import compute_answer_spans_par, FastNormalizedAnswerDetector -TRIVIA_QA = join(expanduser("~"), "data", "triviaqa") -TRIVIA_QA_UNFILTERED = join(expanduser("~"), "data", "triviaqa-unfiltered") +TRIVIA_QA = join("data", "triviaqa") def build_dataset(name: str, tokenizer, train_files: Dict[str, str], diff --git a/triviaqa/evidence_corpus.py b/triviaqa/evidence_corpus.py index 4069fe7..23a04d4 100644 --- a/triviaqa/evidence_corpus.py +++ b/triviaqa/evidence_corpus.py @@ -1,7 +1,7 @@ import argparse import re from os import walk, mkdir, makedirs -from os.path import relpath, join, exists, expanduser +from os.path import relpath, join, exists from typing import Set from tqdm import tqdm from typing import List @@ -10,7 +10,7 @@ from triviaqa.utils import split, flatten_iterable, group from triviaqa.read_data import normalize_wiki_filename -TRIVIA_QA = join(expanduser("~"), "data", "triviaqa") +TRIVIA_QA = join("data", "triviaqa") class MergeParagraphs(object): def __init__(self, max_tokens: int): From b7513093b2a7c38395a53be9ddc8ff6ce17967ef Mon Sep 17 00:00:00 2001 From: Yoshitomo Matsubara Date: Mon, 8 Jul 2019 23:50:13 -0700 Subject: [PATCH 2/6] tried different parent dir name --- triviaqa/build_span_corpus.py | 44 +++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/triviaqa/build_span_corpus.py b/triviaqa/build_span_corpus.py index 982944c..9bba63c 100644 --- a/triviaqa/build_span_corpus.py +++ b/triviaqa/build_span_corpus.py @@ -123,58 +123,58 @@ def __init__(self): def build_wiki_corpus(n_processes): build_dataset("wiki", tokenization.BasicTokenizer(do_lower_case=True), dict( - verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"), - dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"), - train=join(TRIVIA_QA, "qa", "wikipedia-train.json"), - test=join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json") + verified=join(TRIVIA_QA, "wiki", "verified-wikipedia-dev.json"), + dev=join(TRIVIA_QA, "wiki", "wikipedia-dev.json"), + train=join(TRIVIA_QA, "wiki", "wikipedia-train.json"), + test=join(TRIVIA_QA, "wiki", "wikipedia-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes) def build_web_corpus(n_processes): build_dataset("web", tokenization.BasicTokenizer(do_lower_case=True), dict( - verified=join(TRIVIA_QA, "qa", "verified-web-dev.json"), - dev=join(TRIVIA_QA, "qa", "web-dev.json"), - train=join(TRIVIA_QA, "qa", "web-train.json"), - test=join(TRIVIA_QA, "qa", "web-test-without-answers.json") + verified=join(TRIVIA_QA, "web", "verified-web-dev.json"), + dev=join(TRIVIA_QA, "web", "web-dev.json"), + train=join(TRIVIA_QA, "web", "web-train.json"), + test=join(TRIVIA_QA, "web", "web-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes) def build_unfiltered_corpus(n_processes): build_dataset("unfiltered", tokenization.BasicTokenizer(do_lower_case=True), dict( - dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"), - train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"), - test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json") + dev=join(TRIVIA_QA, "unfiltered", "unfiltered-web-dev.json"), + train=join(TRIVIA_QA, "unfiltered", "unfiltered-web-train.json"), + test=join(TRIVIA_QA, "unfiltered", "unfiltered-web-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes) def build_wiki_sample_corpus(n_processes): build_dataset("wiki-sample", tokenization.BasicTokenizer(do_lower_case=True), dict( - verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"), - dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"), - train=join(TRIVIA_QA, "qa", "wikipedia-train.json"), - test=join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json") + verified=join(TRIVIA_QA, "wiki-sample", "verified-wikipedia-dev.json"), + dev=join(TRIVIA_QA, "wiki-sample", "wikipedia-dev.json"), + train=join(TRIVIA_QA, "wiki-sample", "wikipedia-train.json"), + test=join(TRIVIA_QA, "wiki-sample", "wikipedia-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes, sample=20) def build_web_sample_corpus(n_processes): build_dataset("web-sample", tokenization.BasicTokenizer(do_lower_case=True), dict( - verified=join(TRIVIA_QA, "qa", "verified-web-dev.json"), - dev=join(TRIVIA_QA, "qa", "web-dev.json"), - train=join(TRIVIA_QA, "qa", "web-train.json"), - test=join(TRIVIA_QA, "qa", "web-test-without-answers.json") + verified=join(TRIVIA_QA, "web-sample", "verified-web-dev.json"), + dev=join(TRIVIA_QA, "web-sample", "web-dev.json"), + train=join(TRIVIA_QA, "web-sample", "web-train.json"), + test=join(TRIVIA_QA, "web-sample", "web-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes, sample=20) def build_unfiltered_sample_corpus(n_processes): build_dataset("unfiltered-sample", tokenization.BasicTokenizer(do_lower_case=True), dict( - dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"), - train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"), - test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json") + dev=join(TRIVIA_QA, "unfiltered-sample", "unfiltered-web-dev.json"), + train=join(TRIVIA_QA, "unfiltered-sample", "unfiltered-web-train.json"), + test=join(TRIVIA_QA, "unfiltered-sample", "unfiltered-web-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes, sample=20) From dcf77fa3862eade1fed7b0931a7dd3662f48389f Mon Sep 17 00:00:00 2001 From: Yoshitomo Matsubara Date: Tue, 9 Jul 2019 00:02:11 -0700 Subject: [PATCH 3/6] fixed a typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1e8e3d9..5a188f1 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ The raw TriviaQA data is expected to be unzipped in `data/triviaqa`. Training or testing in the unfiltered setting requires the unfiltered data to be download to `data/triviaqa-unfiltered`. ```bash -mkdir -p /data/triviaqa +mkdir -p data/triviaqa cd /data/triviaqa wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz tar xf triviaqa-rc.tar.gz From 58ca98d165f0bf8db80938ce6d2b11279c3fb06b Mon Sep 17 00:00:00 2001 From: Yoshitomo Matsubara Date: Tue, 9 Jul 2019 00:03:10 -0700 Subject: [PATCH 4/6] fixed a typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a188f1..d3fb622 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ or testing in the unfiltered setting requires the unfiltered data to be download to `data/triviaqa-unfiltered`. ```bash mkdir -p data/triviaqa -cd /data/triviaqa +cd data/triviaqa wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz tar xf triviaqa-rc.tar.gz rm triviaqa-rc.tar.gz From 9bb67f43b34ca2f3875f4a0550694aaabaf87e92 Mon Sep 17 00:00:00 2001 From: Yoshitomo Matsubara Date: Tue, 9 Jul 2019 00:23:17 -0700 Subject: [PATCH 5/6] fixed a typo --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d3fb622..717a118 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,7 @@ cd .. wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz tar xf triviaqa-unfiltered.tar.gz rm triviaqa-unfiltered.tar.gz +cd .. ``` First tokenize evidence documents by From dd6e8a31bdbd2462b4b3a1982157b94da480015a Mon Sep 17 00:00:00 2001 From: Yoshitomo Matsubara Date: Tue, 9 Jul 2019 23:44:15 -0700 Subject: [PATCH 6/6] fixed bugs and updated README --- README.md | 4 ++- triviaqa/build_span_corpus.py | 46 +++++++++++++++++------------------ 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 717a118..f8c8133 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,8 @@ Next, retrieve top-n paragraphs based on TF-IDF to construct the train and dev s ```shell python -m triviaqa.ablate_triviaqa_wiki --n_processes 8 --n_para_train 12 --n_para_dev 14 --n_para_test 14 --do_train --do_dev --do_test python -m triviaqa.ablate_triviaqa_unfiltered --n_processes 8 --n_para_train 12 --n_para_dev 14 --n_para_test 14 --do_train --do_dev --do_test +cp data/triviaqa/qa/wikipedia-dev.json data/triviaqa/wiki/ +cp data/triviaqa-unfiltered/unfiltered-web-dev.json data/triviaqa/unfiltered/ ``` ### Wikipedia Domain @@ -171,7 +173,7 @@ python -m bert.run_triviaqa_wiki_full_e2e \ --do_train \ --do_dev \ --data_dir $DATA_DIR \ - --dev_file unfiltered-web-dev.json + --dev_file unfiltered-web-dev.json \ --train_batch_size 32 \ --learning_rate 3e-5 \ --num_train_epochs 2.0 \ diff --git a/triviaqa/build_span_corpus.py b/triviaqa/build_span_corpus.py index 9bba63c..9217eb4 100644 --- a/triviaqa/build_span_corpus.py +++ b/triviaqa/build_span_corpus.py @@ -13,6 +13,7 @@ from triviaqa.answer_detection import compute_answer_spans_par, FastNormalizedAnswerDetector TRIVIA_QA = join("data", "triviaqa") +TRIVIA_QA_UNFILTERED = join("data", "triviaqa-unfiltered") def build_dataset(name: str, tokenizer, train_files: Dict[str, str], @@ -23,7 +24,6 @@ def build_dataset(name: str, tokenizer, train_files: Dict[str, str], mkdir(out_dir) file_map = {} # maps document_id -> filename - for name, filename in train_files.items(): print("Loading %s questions" % name) if sample is None: @@ -123,58 +123,58 @@ def __init__(self): def build_wiki_corpus(n_processes): build_dataset("wiki", tokenization.BasicTokenizer(do_lower_case=True), dict( - verified=join(TRIVIA_QA, "wiki", "verified-wikipedia-dev.json"), - dev=join(TRIVIA_QA, "wiki", "wikipedia-dev.json"), - train=join(TRIVIA_QA, "wiki", "wikipedia-train.json"), - test=join(TRIVIA_QA, "wiki", "wikipedia-test-without-answers.json") + verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"), + dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"), + train=join(TRIVIA_QA, "qa", "wikipedia-train.json"), + test=join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes) def build_web_corpus(n_processes): build_dataset("web", tokenization.BasicTokenizer(do_lower_case=True), dict( - verified=join(TRIVIA_QA, "web", "verified-web-dev.json"), - dev=join(TRIVIA_QA, "web", "web-dev.json"), - train=join(TRIVIA_QA, "web", "web-train.json"), - test=join(TRIVIA_QA, "web", "web-test-without-answers.json") + verified=join(TRIVIA_QA, "qa", "verified-web-dev.json"), + dev=join(TRIVIA_QA, "qa", "web-dev.json"), + train=join(TRIVIA_QA, "qa", "web-train.json"), + test=join(TRIVIA_QA, "qa", "web-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes) def build_unfiltered_corpus(n_processes): build_dataset("unfiltered", tokenization.BasicTokenizer(do_lower_case=True), dict( - dev=join(TRIVIA_QA, "unfiltered", "unfiltered-web-dev.json"), - train=join(TRIVIA_QA, "unfiltered", "unfiltered-web-train.json"), - test=join(TRIVIA_QA, "unfiltered", "unfiltered-web-test-without-answers.json") + dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"), + train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"), + test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes) def build_wiki_sample_corpus(n_processes): build_dataset("wiki-sample", tokenization.BasicTokenizer(do_lower_case=True), dict( - verified=join(TRIVIA_QA, "wiki-sample", "verified-wikipedia-dev.json"), - dev=join(TRIVIA_QA, "wiki-sample", "wikipedia-dev.json"), - train=join(TRIVIA_QA, "wiki-sample", "wikipedia-train.json"), - test=join(TRIVIA_QA, "wiki-sample", "wikipedia-test-without-answers.json") + verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"), + dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"), + train=join(TRIVIA_QA, "qa", "wikipedia-train.json"), + test=join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes, sample=20) def build_web_sample_corpus(n_processes): build_dataset("web-sample", tokenization.BasicTokenizer(do_lower_case=True), dict( - verified=join(TRIVIA_QA, "web-sample", "verified-web-dev.json"), - dev=join(TRIVIA_QA, "web-sample", "web-dev.json"), - train=join(TRIVIA_QA, "web-sample", "web-train.json"), - test=join(TRIVIA_QA, "web-sample", "web-test-without-answers.json") + verified=join(TRIVIA_QA, "qa", "verified-web-dev.json"), + dev=join(TRIVIA_QA, "qa", "web-dev.json"), + train=join(TRIVIA_QA, "qa", "web-train.json"), + test=join(TRIVIA_QA, "qa", "web-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes, sample=20) def build_unfiltered_sample_corpus(n_processes): build_dataset("unfiltered-sample", tokenization.BasicTokenizer(do_lower_case=True), dict( - dev=join(TRIVIA_QA, "unfiltered-sample", "unfiltered-web-dev.json"), - train=join(TRIVIA_QA, "unfiltered-sample", "unfiltered-web-train.json"), - test=join(TRIVIA_QA, "unfiltered-sample", "unfiltered-web-test-without-answers.json") + dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"), + train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"), + test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json") ), FastNormalizedAnswerDetector(), n_processes, sample=20)