forked from greenelab/pubtator
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexecute.sh
More file actions
25 lines (21 loc) · 762 Bytes
/
execute.sh
File metadata and controls
25 lines (21 loc) · 762 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Pipeline: download the PubTator bioconcepts dump, convert it to BioC XML,
# extract the tags into a TSV, then extract the hetnet tags from that TSV.
# Every path below is relative, so run this from the directory that
# contains scripts/.

# Exit on an error, and treat use of an unset variable as an error
set -o errexit
set -o nounset

# wget has to be able to create the log file named by --output-file, so
# download/ must exist before wget starts. data/ is created as well in case
# the Python scripts do not create their output directory themselves.
mkdir -p download data

# PubTator FTP download
readonly FTP_URL=ftp://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator

# --timestamping skips the transfer when the local copy is already current;
# wget's messages go to the .log file next to the download.
wget \
  --timestamping \
  --directory-prefix=download \
  --output-file=download/bioconcepts2pubtator_offsets.gz.log \
  "${FTP_URL}/bioconcepts2pubtator_offsets.gz"

# Convert pubtator format to BioC XML
python scripts/pubtator_to_xml.py \
  --documents download/bioconcepts2pubtator_offsets.gz \
  --output data/pubtator-docs.xml.xz

# Extract tags from the BioC XML to a TSV
python scripts/extract_tags.py \
  --input data/pubtator-docs.xml.xz \
  --output data/pubtator-tags.tsv.xz

# Extract hetnet tags from the pubtator tags
python scripts/hetnet_id_extractor.py \
  --input data/pubtator-tags.tsv.xz \
  --output data/pubtator-hetnet-tags.tsv.xz