diff --git a/README.md b/README.md
index e482a53..f8c8133 100644
--- a/README.md
+++ b/README.md
@@ -92,20 +92,21 @@ Reader, type: test_open, step: 19332, em: 40.123, f1: 48.358
 ## TriviaQA
 
 ### Data Preprocessing
-The raw TriviaQA data is expected to be unzipped in `~/data/triviaqa`. Training
+The raw TriviaQA data is expected to be unzipped in `data/triviaqa`. Training
 or testing in the unfiltered setting requires the unfiltered data to be
-download to `~/data/triviaqa-unfiltered`.
+downloaded to `data/triviaqa-unfiltered`.
 
 ```bash
-mkdir -p ~/data/triviaqa
-cd ~/data/triviaqa
+mkdir -p data/triviaqa
+cd data/triviaqa
 wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz
 tar xf triviaqa-rc.tar.gz
 rm triviaqa-rc.tar.gz
-cd ~/data
+cd ..
 wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz
 tar xf triviaqa-unfiltered.tar.gz
 rm triviaqa-unfiltered.tar.gz
+cd ..
 ```
 
 First tokenize evidence documents by
@@ -124,6 +125,8 @@ Next, retrieve top-n paragraphs based on TF-IDF to construct the train and dev s
 ```shell
 python -m triviaqa.ablate_triviaqa_wiki --n_processes 8 --n_para_train 12 --n_para_dev 14 --n_para_test 14 --do_train --do_dev --do_test
 python -m triviaqa.ablate_triviaqa_unfiltered --n_processes 8 --n_para_train 12 --n_para_dev 14 --n_para_test 14 --do_train --do_dev --do_test
+cp data/triviaqa/qa/wikipedia-dev.json data/triviaqa/wiki/
+cp data/triviaqa-unfiltered/unfiltered-web-dev.json data/triviaqa/unfiltered/
 ```
 
 ### Wikipedia Domain
@@ -170,6 +173,7 @@ python -m bert.run_triviaqa_wiki_full_e2e \
   --do_train \
   --do_dev \
   --data_dir $DATA_DIR \
+  --dev_file unfiltered-web-dev.json \
   --train_batch_size 32 \
   --learning_rate 3e-5 \
   --num_train_epochs 2.0 \
diff --git a/triviaqa/build_span_corpus.py b/triviaqa/build_span_corpus.py
index 6a6f92c..9217eb4 100644
--- a/triviaqa/build_span_corpus.py
+++ b/triviaqa/build_span_corpus.py
@@ -5,15 +5,15 @@ from itertools import islice
 from typing import List, Optional, Dict
 from os import mkdir
-from os.path import join, exists, expanduser
+from os.path import join, exists
 
 import bert.tokenization as tokenization
 from triviaqa.configurable import Configurable
 from triviaqa.read_data import iter_trivia_question, TriviaQaQuestion
 from triviaqa.evidence_corpus import TriviaQaEvidenceCorpusTxt
 from triviaqa.answer_detection import compute_answer_spans_par, FastNormalizedAnswerDetector
 
-TRIVIA_QA = join(expanduser("~"), "data", "triviaqa")
-TRIVIA_QA_UNFILTERED = join(expanduser("~"), "data", "triviaqa-unfiltered")
+TRIVIA_QA = join("data", "triviaqa")
+TRIVIA_QA_UNFILTERED = join("data", "triviaqa-unfiltered")
 
 
 def build_dataset(name: str, tokenizer, train_files: Dict[str, str],
@@ -24,7 +24,6 @@ def build_dataset(name: str, tokenizer, train_files: Dict[str, str],
     mkdir(out_dir)
 
     file_map = {}  # maps document_id -> filename
-
     for name, filename in train_files.items():
         print("Loading %s questions" % name)
         if sample is None:
diff --git a/triviaqa/evidence_corpus.py b/triviaqa/evidence_corpus.py
index 4069fe7..23a04d4 100644
--- a/triviaqa/evidence_corpus.py
+++ b/triviaqa/evidence_corpus.py
@@ -1,7 +1,7 @@
 import argparse
 import re
 from os import walk, mkdir, makedirs
-from os.path import relpath, join, exists, expanduser
+from os.path import relpath, join, exists
 from typing import Set
 from tqdm import tqdm
 from typing import List
@@ -10,7 +10,7 @@ from triviaqa.utils import split, flatten_iterable, group
 from triviaqa.read_data import normalize_wiki_filename
 
-TRIVIA_QA = join(expanduser("~"), "data", "triviaqa")
+TRIVIA_QA = join("data", "triviaqa")
 
 
 class MergeParagraphs(object):
     def __init__(self, max_tokens: int):