Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,20 +92,21 @@ Reader, type: test_open, step: 19332, em: 40.123, f1: 48.358

## TriviaQA
### Data Preprocessing
The raw TriviaQA data is expected to be unzipped in `~/data/triviaqa`. Training
The raw TriviaQA data is expected to be unzipped in `data/triviaqa`. Training
or testing in the unfiltered setting requires the unfiltered data to be
downloaded to `~/data/triviaqa-unfiltered`.
downloaded to `data/triviaqa-unfiltered`.
```bash
mkdir -p ~/data/triviaqa
cd ~/data/triviaqa
mkdir -p data/triviaqa
cd data/triviaqa
wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz
tar xf triviaqa-rc.tar.gz
rm triviaqa-rc.tar.gz

cd ~/data
cd ..
wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz
tar xf triviaqa-unfiltered.tar.gz
rm triviaqa-unfiltered.tar.gz
cd ..
```

First tokenize evidence documents by
Expand All @@ -124,6 +125,8 @@ Next, retrieve top-n paragraphs based on TF-IDF to construct the train and dev s
```shell
python -m triviaqa.ablate_triviaqa_wiki --n_processes 8 --n_para_train 12 --n_para_dev 14 --n_para_test 14 --do_train --do_dev --do_test
python -m triviaqa.ablate_triviaqa_unfiltered --n_processes 8 --n_para_train 12 --n_para_dev 14 --n_para_test 14 --do_train --do_dev --do_test
cp data/triviaqa/qa/wikipedia-dev.json data/triviaqa/wiki/
cp data/triviaqa-unfiltered/unfiltered-web-dev.json data/triviaqa/unfiltered/
```

### Wikipedia Domain
Expand Down Expand Up @@ -170,6 +173,7 @@ python -m bert.run_triviaqa_wiki_full_e2e \
--do_train \
--do_dev \
--data_dir $DATA_DIR \
--dev_file unfiltered-web-dev.json \
--train_batch_size 32 \
--learning_rate 3e-5 \
--num_train_epochs 2.0 \
Expand Down
7 changes: 3 additions & 4 deletions triviaqa/build_span_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
from itertools import islice
from typing import List, Optional, Dict
from os import mkdir
from os.path import join, exists, expanduser
from os.path import join, exists
import bert.tokenization as tokenization
from triviaqa.configurable import Configurable
from triviaqa.read_data import iter_trivia_question, TriviaQaQuestion
from triviaqa.evidence_corpus import TriviaQaEvidenceCorpusTxt
from triviaqa.answer_detection import compute_answer_spans_par, FastNormalizedAnswerDetector

TRIVIA_QA = join(expanduser("~"), "data", "triviaqa")
TRIVIA_QA_UNFILTERED = join(expanduser("~"), "data", "triviaqa-unfiltered")
TRIVIA_QA = join("data", "triviaqa")
TRIVIA_QA_UNFILTERED = join("data", "triviaqa-unfiltered")


def build_dataset(name: str, tokenizer, train_files: Dict[str, str],
Expand All @@ -24,7 +24,6 @@ def build_dataset(name: str, tokenizer, train_files: Dict[str, str],
mkdir(out_dir)

file_map = {} # maps document_id -> filename

for name, filename in train_files.items():
print("Loading %s questions" % name)
if sample is None:
Expand Down
4 changes: 2 additions & 2 deletions triviaqa/evidence_corpus.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse
import re
from os import walk, mkdir, makedirs
from os.path import relpath, join, exists, expanduser
from os.path import relpath, join, exists
from typing import Set
from tqdm import tqdm
from typing import List
Expand All @@ -10,7 +10,7 @@
from triviaqa.utils import split, flatten_iterable, group
from triviaqa.read_data import normalize_wiki_filename

TRIVIA_QA = join(expanduser("~"), "data", "triviaqa")
TRIVIA_QA = join("data", "triviaqa")

class MergeParagraphs(object):
def __init__(self, max_tokens: int):
Expand Down