Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 88 additions & 55 deletions access_genbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,18 @@
#-------------------------------

def build_search_term(gene_name, min_len=0, max_len=-1):
""" generate a search query for NCBI's nucleotide database (GenBank). Bio.Entrez does not provide filters such as sequence length and gene name
"""
Generate a search query for NCBI's nucleotide database (GenBank). Bio.Entrez does not provide filters such as sequence length and gene name
so a search query keywords must be used instead.
Used to make database queries in extract_database and download_database

Parameters
----------
gene_name: (str) name of the gene of interest
min_len: (int, optional) lower bound of the sequence length filter, by default 0
max_len: (int, optional) upper bound of the sequence length filter, by default -1 means no limit
Parameters:
gene_name(str): name of the gene of interest
min_len(int, optional): lower bound of the sequence length filter, by default 0
max_len(int, optional): upper bound of the sequence length filter, by default -1 means no limit

Returns
----------
term: (str) search query
Returns:
term(str): search query
"""
term = f"{gene_name}[Gene Name]"
if max_len == -1:
Expand All @@ -32,17 +31,16 @@ def build_search_term(gene_name, min_len=0, max_len=-1):


def extract_database(gene_name, seqlen_start=0, seqlen_stop=-1):
'''Request data on GenBank from NCBI (with Entrez module), with filtering options
'''
Request data on GenBank from NCBI (with Entrez module), with filtering options

Parameters
----------
gene_name: (str) name of the gene to extract from GenBank
seqlen_start: (int, optional) lower bound of the sequence length filter, by default 0
seqlen_stop: (int, optional) upper bound of the sequence length filter, by default -1 means no limit
Parameters:
gene_name(str): name of the gene to extract from GenBank
seqlen_start(int, optional): lower bound of the sequence length filter, by default 0
seqlen_stop(int, optional): upper bound of the sequence length filter, by default -1 means no limit

Returns
----------
seq_record: (str) raw data of all sequences
Returns:
seq_record(str): raw data of all sequences
'''
handle = Entrez.esearch(db= "nucleotide", term= build_search_term(gene_name, seqlen_start, seqlen_stop), retmax = 10000)
record = Entrez.read(handle)
Expand All @@ -60,59 +58,94 @@ def extract_database(gene_name, seqlen_start=0, seqlen_stop=-1):


def download_database(gene_name, seqlen_start=0, seqlen_stop=-1):
'''Download data to a .fasta file, with filtering options
'''
Download data to a .fasta file, with filtering options

Args:
gene_name: (str) name of the gene to extract from GenBank
seqlen_start: (int) lower bound of the sequence length filter
seqlen_stop: (int) upper bound of the sequence length filter
gene_name(str): name of the gene to extract from GenBank
seqlen_start(int): lower bound of the sequence length filter
seqlen_stop(int): upper bound of the sequence length filter

Returns:
data_path: (str) path to downloaded data .fasta file
data_path(str): path to downloaded data .fasta file
'''
data_path = f"{gene_name}[{seqlen_start},{seqlen_stop}].fasta"
with open(data_path, mode="w") as file:
file.write(extract_database(gene_name, seqlen_start, seqlen_stop))
return data_path

def download_sequence(species, gene_name, dst, start_length=None, stop_length= None, id = None, permissive_search = True):
def download_sequence_from_id(id: list, dst = None):
"""
download sequence from GenBank through the Entrez database using unique identifiers.

Args:
id (list): list of IDs of sequences to download from GenBank
dst(str): destination file path

Returns:
Bio.SeqIO.FastaIO.FastaIterator: iterator over the records corresponding to the input ids

"""
handle = Entrez.efetch(db="nucleotide", id=id, retmode = "fasta", rettype = "fasta")
record_parser = SeqIO.parse(handle, "fasta")
if dst != None:
with open(dst, mode="a") as writer:
SeqIO.write(record_parser,writer,"fasta")
return SeqIO.parse(dst,"fasta")
return record_parser

def get_species_from_ids(ids:list):
"""
download sequence from GenBank through the Entrez database.
Return the species from GenBank entries with certain ids.

Parameters:
----------
species(str): name of species
gene_name(str): name of gene
dst(str,Path-like): destination file path
start_length(int): minimum length of sequence
stop_length(int): maximum length of sequence
id(list): list of NCBi ids of sequences to download. If provided, overrides gene_name and species.
permissive_search(bool, default = True): when True, if Advanced NCBI query returns nothing, replace it with a less precise general query.
ids: list of ids to search

Returns:
dictionary that matches ids to organism names

"""
species_dict= dict()
for id in ids:
handle = Entrez.efetch(db="nucleotide", id=id, retmode = "gb", rettype = "gb")
records = SeqIO.parse(handle,"genbank")
one_record = next(records)
species_dict[id]=one_record.annotations["organism"]
return species_dict

def download_sequence(species, gene_name, dst = None, start_length=None, stop_length= None, permissive_search = True):
"""
Download sequence from GenBank through the Entrez database.

Parameters:
species(str): name of species
gene_name(str): name of gene
dst(str,Path-like): destination file path, if None does not write to a file, only returns
start_length(int): minimum length of sequence
stop_length(int): maximum length of sequence
id(list): list of NCBi ids of sequences to download. If provided, overrides gene_name and species.
permissive_search(bool, default = True): when True, if Advanced NCBI query returns nothing, replace it with a less precise general query.

Returns:
Bio.SeqRecord.SeqRecord: sequence found from GenBank

"""

if id == None:
search_term = f"{gene_name}[Gene Name] AND {species}[Organism]"
if start_length!= None or stop_length!= None:
search_term += f" {start_length}:{stop_length}[Sequence Length]"

search_term = f"{gene_name}[Gene Name] AND {species}[Organism]"
if start_length!= None or stop_length!= None:
search_term += f" {start_length}:{stop_length}[Sequence Length]"
handle = Entrez.esearch(db = "nucleotide", term = search_term, retmax = 1, idtype = "acc")
search_result = Entrez.read(handle)
handle.close()
id = search_result["IdList"]
if len(id)==0 and permissive_search:
search_term = f"{gene_name} {species} {start_length}:{stop_length}[Sequence Length]"
handle = Entrez.esearch(db = "nucleotide", term = search_term, retmax = 1, idtype = "acc")
search_result = Entrez.read(handle)
handle.close()
id = search_result["IdList"]
n=0
for i in id:
n+=1
if n==0 and permissive_search:
search_term = f"{gene_name} {species} {start_length}:{stop_length}[Sequence Length]"
handle = Entrez.esearch(db = "nucleotide", term = search_term, retmax = 1, idtype = "acc")
search_result = Entrez.read(handle)
handle.close()
id = search_result["IdList"]
n=0
for i in id:
n+=1
if n==1:
handle = Entrez.efetch(db="nucleotide", id=id, retmode = "fasta", rettype = "fasta")
sequence = handle.read()
handle.close()
with open(dst, mode="a") as writer:
writer.write(sequence)
if len(id)==1:
return next(download_sequence_from_id(id,dst))
else:
raise ValueError("Your search did not return any results")
34 changes: 34 additions & 0 deletions create_fake_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from access_genbank import download_sequence_from_id
from generate_fastq import generate_fastq, multiplex
import os
import os.path as ospath
import argparse
from Bio import SeqIO

def main():

destination_folder = "test_fake4"
ids= ["ON409953.1", "AF288129.1", "AF288126.1", "AF288125.1" ]
multiplexed = True



destination_folder_path= ospath.join("data", destination_folder)
if not ospath.exists(destination_folder_path):
os.makedirs(destination_folder_path)

record_parser = download_sequence_from_id(ids)

for id, record in zip(ids,record_parser):
new_demultiplexed_fastq_path = ospath.join(destination_folder_path, str(id)+".fastq")
generate_fastq(record.seq, 100, new_demultiplexed_fastq_path)
print("left for loop")
if multiplexed:
multiplex_path=ospath.join(destination_folder_path,"multiplexed.fastq")
multiplex([destination_folder_path],multiplex_path)

for record in SeqIO.parse(multiplex_path,"fastq"):
print(record)

if __name__ == "__main__":
main()
Loading