diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c3d9b6d..db97202 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -2,6 +2,11 @@
 CHANGELOG
 =========
 
+Version 1.4.0
+-------------
+
+* added genbank downloader script
+
 Version 1.3.2
 -------------
 
diff --git a/bio_bits/genbank.py b/bio_bits/genbank.py
new file mode 100644
index 0000000..56f826b
--- /dev/null
+++ b/bio_bits/genbank.py
@@ -0,0 +1,94 @@
+'''
+Write the records of a GenBank file to stdout as FASTA, putting each
+record's country and collection date in the FASTA description.
+
+Usage:
+    genbank GENBANK_FILE
+'''
+import sys
+
+from Bio import SeqIO
+import dateparser
+from docopt import docopt
+
+
+def rec_qualifier(rec, key):
+    '''
+    Return the value stored under `key` by the first feature of `rec`
+    whose qualifiers contain that key.
+
+    :param rec: record whose `features` each have a `qualifiers` mapping
+    :param str key: qualifier name, e.g. 'collection_date' or 'country'
+    :return: the qualifier value, or None if no feature has `key`
+    '''
+    for feature in rec.features:
+        qualifiers = feature.qualifiers
+        # Test for the requested key itself; a feature that happens to
+        # have 'collection_date' is not guaranteed to have every other key.
+        if qualifiers and key in qualifiers:
+            return qualifiers[key]
+    return None
+
+
+def fix_date(d):
+    '''
+    Normalize a free-form date string to ISO format.
+
+    :param str d: date in any format dateparser understands
+    :return: the date formatted as YYYY-MM-DD
+    :raises ValueError: if dateparser cannot parse `d`
+    '''
+    date = dateparser.parse(d)
+    # dateparser.parse returns None instead of raising on bad input.
+    if date is None:
+        raise ValueError("could not parse date {!r}".format(d))
+    return date.strftime('%Y-%m-%d')
+
+
+def _first_value(rec, key):
+    '''Return the first value of qualifier `key`, or raise ValueError.'''
+    values = rec_qualifier(rec, key)
+    if not values:
+        raise ValueError(
+            "record {} has no '{}' qualifier".format(rec.id, key))
+    return values[0]
+
+
+def rec_to_fasta(rec):
+    '''
+    Set the description of `rec` to "|country|YYYY-MM-DD".
+
+    The qualifier values are lists, e.g. ['17-Feb-2010'] for
+    collection_date; only the first entry of each is used.
+
+    :param rec: record to annotate; it is modified in place
+    :return: the same record, ready to be written as FASTA
+    :raises ValueError: if the country or collection_date qualifier is
+        missing, or the collection date cannot be parsed
+    '''
+    raw_collection_date = _first_value(rec, 'collection_date')
+    country = _first_value(rec, 'country')
+    rec.description = "|{}|{}".format(country, fix_date(raw_collection_date))
+    return rec
+
+
+def run(infile):
+    '''
+    Convert every record in the GenBank file `infile` and write the
+    results to stdout in FASTA format.
+
+    :param str infile: path to a GenBank file
+    '''
+    with open(infile) as handle:
+        records = SeqIO.parse(handle, 'genbank')
+        # Records are parsed lazily, so the writing has to happen while
+        # the file is still open.
+        SeqIO.write((rec_to_fasta(rec) for rec in records),
+                    sys.stdout, 'fasta')
+
+
+def main():
+    '''Command line entry point; see the module docstring for usage.'''
+    raw_args = docopt(__doc__, version='Version 1.0')
+    run(raw_args['GENBANK_FILE'])
+    sys.exit(0)
diff --git a/requirements-pip.txt b/requirements-pip.txt
index 21186bf..8fea4e1 100644
--- a/requirements-pip.txt
+++ b/requirements-pip.txt
@@ -1,5 +1,6 @@
 argparse
 docopt
+dateparser
 future
 schema
 sh
diff --git a/setup.py b/setup.py
index 46e5069..109f230 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,7 @@
             'degen = bio_bits.degen:main',
             'plot_muts = bio_bits.plot_muts:main',
             'fasta = bio_bits.fasta:main',
+            'genbank = bio_bits.genbank:main',
             #'sequence_concat = bio_bits.sequence_concat:main',
             #'sequence_files_concat = bio_bits.sequence_files_concat:main',
             #'sequence_split = bio_bits_old.sequence_split:main',