diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 358c50e..7b3a484 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -10,10 +10,12 @@ NEW FEATURES: - Getting gene neighborhoods and overlaying on the tree - Getting basic information about related genes - Getting BLAST support for a gene family. + - Building an organism tree from a selection of core genes Note you must have already built the database before running the GUI. Using the gui requires you to have access to an X11 server and the easygui Python package. It should be considered highly experimental / breakable at this point. -- OrthoMCL wrapper now takes as input an arbitrary table of BLAST results from the ITEP database +- OrthoMCL wrapper now takes as input an arbitrary table of BLAST results from the ITEP database instead of trying to + run OrthoMCL with everything. Use db_getBlastResultsBetweenSpecificOrganisms.py to generate the input table. - makeCoreClusterAnalysisTree.py now optionally can be used to analyze presence-absence patterns relative to sister clades, rather than relative to all organisms in the cluster run. See help text for details. - db_makeClusterComparisonTable.py - identify genes in the same cluster as a reference across cluster runs and provide @@ -21,6 +23,8 @@ NEW FEATURES: - db_getEquivalentGenesInOrganism.py - Given a list of query genes, a run ID and an organism, finds all genes in the provided organism that are in the same cluster as the query genes and returns a conversion table. - Make a diagram of neighbors for a single gene +- Most table-generating functions support use of a --header argument (use this only for the last command in a pipeline).The header contains descriptiosn of the rows' contents. + Exceptions include scripts that add columns to an existing table and functions that always produce headers (e.g. db_getPresenceAbsenceTable.py) BUG FIXES: - Check for biopython version no longer fails due to letters in the version number. Thanks to Matt Richards for reporting. @@ -37,6 +41,8 @@ BUG FIXES: - Made SourceMe.sh more robust to spaces in the current PATH definition. Thanks to drjcthrash for the inspiration. - Improved performance of run ID listing and organism links. - Fixed bug in definition of distinctorgs view (and turned it into a table). +- Improved concurrency of multiple scripts. Removed all temporary tables in sqlite except one (in db_getBlastResultsBetweenSpecificOrganisms) for performance reasons. + Some scripts now require explicit declaration of an output location which did not before, to remove confusion on where files end up. VIRTUAL MACHINE FIXES: - Added FastTreeMP to virtual machines diff --git a/checkForDependencies.sh b/checkForDependencies.sh index ebe6ecd..5061931 100755 --- a/checkForDependencies.sh +++ b/checkForDependencies.sh @@ -22,6 +22,7 @@ command -v blastp > /dev/null 2>&1 || { echo "ERROR: Unable to find NCBI BLAST+ which is required"; echo "It can be found at ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/" echo "Note that the \"blast\" in aptitude is NOT BLAST+ - you must have e.g. the blastp executable" + echo "Also note that to use RPSBLAST you will need a recent enough version. 2.28 is known to work." echo ""; $STATUS=1; } @@ -56,6 +57,7 @@ if [ $? -ne 0 ]; then echo "" echo "ERROR: Unable to find required package Biopython (Bio). This package is needed for many IO and graphics operations in ITEP" echo "It can be downloaded from http://biopython.org or installed (via setuptools) with sudo easy_install -f http://biopython.org/DIST/ biopython" + echo "Newer versions are less likely to have issues with parsing Genbank files." echo "" $STATUS=1; fi @@ -98,14 +100,14 @@ command -v CompareToBootstrap.pl > /dev/null 2>&1 || { command -v FastTreeMP > /dev/null 2>&1 || { echo ""; echo "WARNING: Unable to find FastTreeMP with the default program name - will need to specify the actual program name to"; - echo "use the wrapper script for FastTree"; + echo "use the wrapper script for FastTree (The default name is FastTreeMP)"; echo ""; } command -v Gblocks > /dev/null 2>&1 || { echo ""; echo "WARNING: Unable to find Gblocks with default program name - will need to specify the actual program name to "; - echo "use the wrapper script for GBlocks"; + echo "use the wrapper script for GBlocks (the default name is Gblocks)."; echo "If you don't have it, it can be found at http://molevol.cmima.csic.es/castresana/Gblocks.html"; echo ""; } @@ -147,7 +149,8 @@ command -v perl > /dev/null 2>&1 || { command -v 'raxmlHPC-PTHREADS' > /dev/null 2>&1 || { echo ""; echo "WARNING: Unable to find raxml with the default program name - will need to specify the actual program name to"; - echo "use the wrapper script for RaxML"; + echo "use the wrapper script for RaxML. The default program name is raxmlHPC-PTHREADS"; + echo ""; echo "If you dont have RaxML it can be downloaded at http://www.exelixis-lab.org/ or checked out of github" echo "using git clone git@github.com:stamatak/standard-RAxML (requires a github account and SSH key)." echo ""; @@ -194,4 +197,12 @@ if [ $? -ne 0 ]; then echo "" fi +python -c 'import xlwt' +if [ $? -ne 0 ]; then + echo "" + echo "WARNING: Unable to find the Python package xlwt. This package is needed to export to Excel (most scripts do not need this)." + echo "It can be found at https://pypi.python.org/pypi/xlwt or installed via pip install xlwt" + echo "" +fi + exit ${STATUS}; diff --git a/gui/FetchOrganismFromGenbank.py b/gui/FetchOrganismFromGenbank.py new file mode 100644 index 0000000..a113a9d --- /dev/null +++ b/gui/FetchOrganismFromGenbank.py @@ -0,0 +1,103 @@ +#!/usr/bin/python + +''' This file contains some functions to make it easier to automate fetching genomes from Entrez ''' + +import easygui +import optparse +import sys +from Bio import Entrez + +from GuiBase import * + +class NcbiError(GuiError): + pass + +class OrganismImporter(GuiBase): + ''' + You must specify an email so NCBI can nag you if you use too much of their resources. + + This class contains functions for users to identify Genbank files to download. + The class functions download the selected files and places them in the specified location. + ''' + def __init__(self, email): + Entrez.email = email + return + def _checkForEntrezError(self, record): + ''' Raise specific errors if Entrez raised them ''' + if "ErrorList" in record: + errorstr = "" + for key in record["ErrorList"]: + if len(record["ErrorList"][key]) > 0: + errorstr += "Error %s: %s\n" %(key, record["ErrorList"][key]) + raise NcbiError("Error querying database: \n\n %s \n " %(errorstr)) + return + def searchForOrganism(self, orgname): + ''' Look for organisms matching the specified organism string in the Genome database. ''' + RETMAX=100 + handle = Entrez.esearch(db="genome", term="%s[organism]" %(orgname), retmax=RETMAX) + record = Entrez.read(handle) + self._checkForEntrezError(record) + print record + + idList = record["IdList"] + # Note - we could run into trouble here if idList is too long. + # should break it up. + handle = Entrez.efetch(db="genome", id=idList, rettype = "docsum", retmax=RETMAX) + records = Entrez.read(handle) + results_tuples = [] + for record in records: + self._checkForEntrezError(record) + genome_id = record["Id"] + assembly_id = record["AssemblyID"] + organism_name = record["Organism_Name"] + results_tuples.append( (genome_id, assembly_id, organism_name) ) + return results_tuples + def selectOrganism(self, results_tuples): + ''' Ask user for which of the found organisms he or she wants. ''' + choices = [] + for tup in results_tuples: + stri = "%s || %s || %s" %(tup[2], tup[0], tup[1]) + choices.append(stri) + chosen_list = easygui.multchoicebox(msg="Select one or more organisms.\n Format is organism name || Genome ID || Assembly ID. \n", + title = "Select organisms. ", choices=choices) + if chosen_list is None or len(chosen_list) == 0: + raise UserCancelError("User cancelled the operation or did not select any organisms!") + + genome_ids = [] + for choice in chosen_list: + spl = choice.split("||") + # We only need the assembly IDs + genome_id = spl[1].replace(" ", "") + genome_ids.append(genome_id) + return genome_ids + def downloadGenbank(self, genome_ids): + ''' Download Genbank files for a genome with specified Genome ID ''' + # Ask where are we going to save the files? + # Get nucleotide IDs (elink to nuccore) + # Efetch them [Note - will need to do batching here]. Get docsum first. + # Check the ID ("caption" field) and match to the appropriate database. + # Make sure "ReplacedBy" is empty. + # Efetch again but get genbank (gb) this time. + # Save the files with name = a sanitized version of "Title" + + +if __name__ == "__main__": + + usage = "%prog organism_name email" + description = """A UI for searching for and downloading Genbank files for a single organism or group of organisms + from the NCBI Genome database. Note that the search is done by species, and that the Genbank files for individual strains +will need to be separated and concatenated after this script is done running.""" + + parser = optparse.OptionParser(usage=usage, description=description) + (options, args) = parser.parse_args() + + # FIXME: These should be inputted into the UI. + if len(args) < 2: + sys.stderr.write("ERROR: Email address and organism name are required arguments.\n") + exit(2) + + orgstring = args[0] + email = args[1] + importer = OrganismImporter(email) + found_organisms = importer.searchForOrganism(orgstring) + selected_organisms = importer.selectOrganism(found_organisms) diff --git a/gui/PhyloTrees.py b/gui/PhyloTrees.py new file mode 100644 index 0000000..4fed5ed --- /dev/null +++ b/gui/PhyloTrees.py @@ -0,0 +1,216 @@ +#!/usr/bin/python + +''' + +This script contains UI functions helpful for building +(relatively quick and dirty) phylogenetic trees for organisms +and analyzing them. + + +Usage: +ui = PhyloUI() +ui.start() + +''' + +import easygui +import operator +import os +import sqlite3 +import sys + +from ete2 import Tree + +from GuiBase import * +from ClusterFuncs import * +from CoreGeneFunctions import * + +class PhyloUI(GuiBase): + def __init__(self, cur): + self.sqlite_cursor = cur + return + def start(self): + tasks = [ 'Build an organism tree', + 'Analyze an organism tree' ] + msg = "Please choose a task" + task = easygui.choicebox(msg, 'Select a task', tasks) + + if task == 'Build an organism tree': + tb = TreeBuilder(self.sqlite_cursor) + tb.buildOrganismTree() + pass + elif task == 'Analyze an organism tree': + ta = OrganismTreeAnalyzer(self.sqlite_cursor) + ta.chooseTreeAnalysis() + else: + raise UserCancelError("Unknown or cancelled operation") + +class OrganismTreeAnalyzer(PhyloUI): + def _getTreeFile(): + raise GuiError("This is not yet implemented. Sorry!") + return + def _make_hr_tree(): + raise GuiError("This is not yet implemented. Sorry!") + return + def _reroot_tree(): + raise GuiError("This is not yet implemented. Sorry!") + # Get a list of human-readable leaf names to reroot to (if possible) + # Should make a function in lib/ that can make this dictionary from original to sanitized human-readable names. + # I seem to need it a lot. + # Ask for a new root + # Translate to original ID + # Reroot the tree + # Return the rerooted tree + return + def _perform_pa_analysis(): + raise GuiError("This is not yet implemented. Sorry!") + return +### Public functions + def __init__(self, cur): + PhyloUI.__init__(self, cur) + self.treefile = self._getTreeFile() + self.tree = Tree(self.treefile) + def chooseTreeAnalysis(): + tasks = [ 'Create human-readable tree', + 'Reroot tree', + 'Generate excel file and presence absence analysis diagram' ] + # User can specify an arbitrary number of analyses + while 1: + task = easygui.choicebox(msg, 'Select an analysis', tasks) + if task == 'Reroot tree': + self._reroot_tree() + self._save_file_dialogs(extension="nwk") + elif task == 'Create human-readable tree': + self._make_hr_tree() + elif task == 'Generate excel file and presence absence analysis diagram': + self._perform_pa_analysis() + else: + raise UserCancelError("User cancelled operation") + +class TreeBuilder(PhyloUI): + def __init__(self, cur): + PhyloUI.__init__(self, cur) + def _get_runid(self): + ''' This is a slightly different function from the one in singleGeneAnalysis since we aren't limiting it to runs containing a specific gene + Maybe at some point I'll merge them. ''' + sys.stderr.write("WARNING: _get_runid is not yet implemented. I hardcoded something.\n") + return "orthomcl_Methanosarcina_typestrains_I_1.5000_c_-5_50" + def buildOrganismTree(self): + ''' + Build an organism tree for a specific cluster run. + Optionally pick a subset of (core) clusters to built it based on. + ''' + + # Set up results locations + results_location = self._get_directory() + if results_location is None: + raise GuiError("You must specify a location to save results files!") + if not os.path.exists(results_location): + os.makedirs(results_location) + fasta_dir = os.path.join(results_location, "fasta_files") + if not os.path.exists(fasta_dir): + os.makedirs(fasta_dir) + alignment_dir = os.path.join(results_location, "alignments") + if not os.path.exists(alignment_dir): + os.makedirs(alignment_dir) + trimmed_dir = os.path.join(results_location, "trimmed_alignments") + if not os.path.exists(trimmed_dir): + os.makedirs(trimmed_dir) + + # Now we need a list of core genes. + runid = self._get_runid() + sys.stderr.write("Getting a list of core genes in the specified cluster run. Please be patient...\n") + orglist = getOrganismsInClusterRun(runid, self.sqlite_cursor) + # ALL and UNIQUE - the genes must have representatives in every organism in the cluster run and have exactly one copy per organism. + cluster_runs = findGenesByOrganismList(orglist, runid, cl = None, all_org = True, uniq_org = True, pct_cutoff = None, outgroup = None) + + # The user can choose to only make a tree from a subset of the clusters if he or she wants. + # Subset is the default because it is way faster to compute. + msg = "Do you want to make a tree from ALL core genes or only a SUBSET of them? (Note that using ALL takes a lot of time)" + choices = ("Subset", "All") + choice = easygui.buttonbox(msg=msg, title='', choices=choices) + if choice is None: + raise UserCancelError("User cancelled the operation") + + if choice == "Subset": + # Make a subset + # Build a list of the clusters with annotation + # User selects which one(s) he or she wants + ann2id = {} + sys.stderr.write("Populating list of clusters from which to choose...\n") + for cr in cluster_runs: + runid = cr[0] + clusterid = cr[1] + ann = findRepresentativeAnnotation(runid, clusterid, self.sqlite_cursor) + ann += "(ID %s)" %(clusterid) + ann2id[ann] = (runid, clusterid) + msg = """Please select one or more of the following clusters to include (click to select and click OK when finished)""" + choices = ann2id.keys() + chosen = easygui.multchoicebox(msg=msg, choices=choices) + final_cluster_set = [] + for choice in chosen: + final_cluster_set.append(ann2id[choice]) + pass + elif choice == "All": + # No: Use all of them. + final_cluster_set = cluster_runs + + # Make fasta files + sys.stderr.write("Creating FASTA files for your chosen clusters...\n") + for cr in final_cluster_set: + geneinfo = getClusterGeneInfo(cr[0], cr[1], self.sqlite_cursor) + filename = os.path.join(fasta_dir, cr[0] + "_" + cr[1] + ".faa") + fid = open(filename, "w") + for info in geneinfo: + fid.write(">%s %s\n%s\n" %(info[0], info[9], info[11])) + fid.close() + # Align the fasta files. + sys.stderr.write("Creating protein alignments using MAFFT...\n") + fasta_files = [ f for f in os.listdir(fasta_dir) if os.path.isfile(os.path.join(fasta_dir,f)) ] + for f in fasta_files: + fasta_path = os.path.join(fasta_dir, f) + # Same name as the input file but different location. + alignment_path = os.path.join(alignment_dir, f) + cmd = "mafft --auto \"%s\" > \"%s\"" %(fasta_path, alignment_path) + print cmd + os.system(cmd) + + # Trim the alignments. + alignment_files = [ f for f in os.listdir(alignment_dir) if os.path.isfile(os.path.join(alignment_dir,f)) ] + + # For now I use conservative trimming. + sys.stderr.write("Trimming alignments (using Gblocks)...\n") + for f in alignment_files: + alignment_path = os.path.join(alignment_dir, f) + trimmed_path = os.path.join(trimmed_dir, f) + cmd = "cat \"%s\" | Gblocks_wrapper.py -r > \"%s\"" %(alignment_path, trimmed_path) + print cmd + os.system(cmd) + + # Concatenate the trimmed alignments. + sys.stderr.write("Concatenating the trimmed alignments...\n") + cat_aln_path = os.path.join(results_location, "Concatenated_alignment.faa") + cmd = "catAlignments.py \"%s\" > \"%s\"" %(alignment_dir, cat_aln_path) + print cmd + os.system(cmd) + + # Build a tree with organism IDs. + sys.stderr.write("Building a tree from the concatenated alignment...\n") + tree_path = os.path.join(results_location, "Concatenated_protein_tree.nwk") + cmd = "cat \"%s\" | FastTreeMP -wag -gamma > \"%s\"" %(cat_aln_path, tree_path) + print cmd + os.system(cmd) + + # Done. + sys.stderr.write("""Done. Final tree location: %s . +Use the UI or replaceOrgsWithAbbrev.py to put organism names on the tree leaves.\n""" %(tree_path)) + + + return tree_path + +if __name__ == "__main__": + con = sqlite3.connect(locateDatabase()) + cur = con.cursor() + ui = PhyloUI(cur) + ui.start() + con.close() diff --git a/gui/SingleGeneAnalysis.py b/gui/SingleGeneAnalysis.py index 216a812..5ee81c6 100644 --- a/gui/SingleGeneAnalysis.py +++ b/gui/SingleGeneAnalysis.py @@ -155,7 +155,7 @@ def _get_conserved_domains(self): tmpdir = tempfile.mkdtemp() f1 = subprocess.Popen(["echo", self.accumulated_data['ITEP_id'] ], stdout=subprocess.PIPE ) # Limiting hits to 8 is probably more reasonable than applying a generic Evalue cutoff. - f2 = subprocess.Popen(["db_displayExternalClusterHits.py", "--maxhits", "15", "--outdir", tmpdir, "--showevalue" ], stdin=f1.stdout) + f2 = subprocess.Popen(["db_displayExternalClusterHits.py", "-o", tmpdir, "--maxhits", "15", "--showevalue" ], stdin=f1.stdout) f1.stdout.close() # Allow echo to receive a SIGPIPE if the second command exits before the first. f2.communicate() outfiles = [ f for f in os.listdir(tmpdir) if os.path.isfile(os.path.join(tmpdir,f)) ] @@ -268,6 +268,7 @@ def _display_crude_neighborhood_tree(self): if output_file is not None: output_file = output_file[0:len(output_file)-4] + # Create tree (nwk_file, nwk_fname) = self._createTemporaryFile(delete=True) cluster = self._getClusterId() @@ -278,7 +279,6 @@ def _display_crude_neighborhood_tree(self): # View tree with neighborhoods second_cmd = 'db_makeNeighborhoodTree.py -p %s -r %s -d -l' %(nwk_fname, self.accumulated_data['runid']) -# second_cmd = 'db_makeNeighborhoodTree.py -p %s -r %s -d' %(nwk_fname, self.accumulated_data['runid']) if output_file is not None: second_cmd += " -o %s --png" %(output_file) @@ -350,12 +350,14 @@ def _setUpGeneInfo(self, alias): # Support either sanitized or unsanitized versions. itep_id = unsanitizeGeneId(alias) geneinfo = getGeneInfo( [ itep_id ], self.sqlite_cursor) + # Not an ITEP ID. if len(geneinfo) == 0: + alias = alias.upper() alias_file = locateAliasesFile() alias2gene = {} for line in open(locateAliasesFile()): spl = line.strip("\r\n").split("\t") - alias2gene[spl[1]] = spl[0] + alias2gene[spl[1].upper()] = spl[0] if alias not in alias2gene: raise NoGeneError("Sorry, we could not find gene ID %s in the database or in our aliases file. It might not be in this database.\n" %(alias)) itep_id = alias2gene[alias] diff --git a/lib/ClusterFuncs.py b/lib/ClusterFuncs.py index 7fb7456..7197f54 100755 --- a/lib/ClusterFuncs.py +++ b/lib/ClusterFuncs.py @@ -8,7 +8,6 @@ import os import sqlite3 import sys -import tempfile from FileLocator import * from sanitizeString import * diff --git a/lib/ClusterGraph.py b/lib/ClusterGraph.py index b38a30e..9058a9c 100755 --- a/lib/ClusterGraph.py +++ b/lib/ClusterGraph.py @@ -70,7 +70,6 @@ def makeNetworkObjectFromBlastResults( blastres, score_method, cutoff, cur ): genelist = list(set(querygenes + targetgenes)) for gene in genelist: geneinfo = getGeneInfo( [ gene ], cur ) - print gene, geneinfo # Not sure if the string sanitizing will be necessary. G.add_node(gene, organism=geneinfo[0][1], annotation=geneinfo[0][9]) diff --git a/lib/GuiBase.py b/lib/GuiBase.py index 3c4e30e..83efbb2 100644 --- a/lib/GuiBase.py +++ b/lib/GuiBase.py @@ -58,6 +58,30 @@ def _success_dialog(self, filename): ''' Dialog box for successful saving of a file.''' easygui.msgbox(msg = "Successfully saved results to file %s" %(filename) ) return + def _get_directory(self): + ''' Dialog box for getting a directory to which to save files. ''' + return easygui.diropenbox(msg="Where would you like to save the results files?", title="Choose a directory", default=None) + def _get_file_name(self, extension="txt"): + ''' + + This returns the name of a file the user wants to save as + or None if the user cancels or does not name a file. + + ''' + filename = easygui.filesavebox(msg = "Where do you want to save the file (extension %s will automatically be added)?" %(extension)) + # User cancelled. + if filename is None: + return None + filename = filename + "." + extension + # Check for file existence and ask if it is OK to overwrite the file. + if os.path.exists(filename): + ok_to_overwrite = easygui.buttonbox(msg="File %s already exists. Overwrite?" %(filename), choices = ("No", "Yes") ) + if ok_to_overwrite == "Yes": + return filename + else: + return None + else: + return filename def _save_file_dialogs(self, extension = "txt"): ''' Dialogs asking users to save file, sanity checks for existence of file, etc. @@ -70,18 +94,8 @@ def _save_file_dialogs(self, extension = "txt"): # If user cancels it defaults to the FIRST choice. We want default to be NO so I reverse the default of choices here. saveornot = easygui.buttonbox(msg="Do you want to save results to a file?", choices = ("No", "Yes") ) if saveornot == "Yes": - filename = easygui.filesavebox(msg = "Where do you want to save the file (extension %s will automatically be added)?" %(extension)) - if filename is None: - return None - filename = filename + "." + extension - if os.path.exists(filename): - ok_to_overwrite = easygui.buttonbox(msg="File %s already exists. Overwrite?" %(filename), choices = ("No", "Yes") ) - if ok_to_overwrite == "Yes": - return filename - else: - return None - else: - return filename + filename = self._get_file_name(extension=extension) + return filename else: return None def _print_readable_table(self, rows, header=True, separator = '|'): diff --git a/src/Gblocks_wrapper.py b/src/Gblocks_wrapper.py index 87006ae..fefe554 100755 --- a/src/Gblocks_wrapper.py +++ b/src/Gblocks_wrapper.py @@ -23,7 +23,9 @@ from Bio import AlignIO from Bio import SeqIO -usage = "%prog [options] < Fasta_alignment > Fasta_alignment_filtered" +usage = """%prog [options] < Fasta_alignment > Fasta_alignment_filtered + +Output: Trimmed FASTA alignment file""" description="Run GBLOCKS with specified parameter values. GBLOCKS is a program to filter low-quality sections out of a multiple alignment" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-p", "--program", help="Name or location of your GBLOCKS program (D: Gblocks)", action="store", type="str", dest="program", default="Gblocks") diff --git a/src/blastResultsToDistanceMatrix.py b/src/blastResultsToDistanceMatrix.py index 18ac2a7..97a4599 100755 --- a/src/blastResultsToDistanceMatrix.py +++ b/src/blastResultsToDistanceMatrix.py @@ -4,7 +4,9 @@ okmethods = ["pctid", "logevalue", "minbit", "maxbit"] -usage = "%prog [options] < BLAST_results > distance_matrix" +usage = """%prog [options] < BLAST_results > distance_matrix + +Output: Query genes on rows and target genes on columns. Each entry is BLAST distance between query and target.""" description = """Turn a table of BLAST results into a distance matrix. The distance matrix will be suitable for conversion into a heatmap with plotHeatmap.py diff --git a/src/db_TBlastN_wrapper.py b/src/db_TBlastN_wrapper.py index 4465277..34ff2cb 100755 --- a/src/db_TBlastN_wrapper.py +++ b/src/db_TBlastN_wrapper.py @@ -3,15 +3,20 @@ # Wrapper script for TBLASTN verification of presence\absence # across different genomes -import fileinput, os, optparse, random, sqlite3, sys +import fileinput, os, optparse, random, sqlite3, sys, tempfile from FileLocator import * +from ClusterFuncs import * -TABLEDESCRIPTION = """queryid, querylen, subcontig, organism, tblaststart, tblastend, tblastlen, queryoverlappct, evalue, bitscore, hitframe, - strandedString, targetgeneid, targetannotation, targetgenelen, targetoverlappct, TBLASTN_hitID""" +header = [ 'queryid', 'querylen', 'subcontig', 'organism', + 'tblaststart', 'tblastend', 'tblastlen', 'queryoverlappct', + 'evalue', 'bitscore', 'hitframe', 'strandedString', 'targetgeneid', + 'targetannotation', 'targetgenelen', 'targetoverlappct', + 'tBLASTn_hitID' ] usage = """%prog (-d|-f|-o) [options] < Protein_ids > Tblastn_table -Output: """ + TABLEDESCRIPTION +Output: """ + " ".join(header) + description = """Attempts to run TBLASTN and identify missing genes. It identifies called genes that match the hit location and also tries to find genes on the opposite strand that conflict. @@ -22,7 +27,7 @@ parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-o", "--organism", help="Organism ID to BLAST against.", action="store", type="str", dest="org", default=None) -parser.add_option("-f", "--orgfile", help="File of organism IDs to BLAST against (use this option if you want to test against multiple organisms", action="store", type="str", dest="orgfile", default=None) +parser.add_option("-f", "--orgfile", help="File of organism IDs to BLAST against (use this option if you want to test against multiple organisms)", action="store", type="str", dest="orgfile", default=None) parser.add_option("-d", "--db", help="BLAST database to use for BLASTing (use this option if you already have generated a blast database)", action="store", type="str", dest="db", default=None) parser.add_option("-c", "--cutoff", help="E-value cutoff for TBLASTN (D=1E-5)", action="store", type="float", dest="cutoff", default=1E-5) parser.add_option("-t", "--translation", help="Translation table number for TBLASTN (D=11 - bacteria, archaea and plant plastids)", action="store", type="int", dest="translation", default=11) @@ -32,6 +37,8 @@ parser.add_option("-a", "--calledgeneoverlap", help="Cutoff for overlap of called genes (D: 1% - if the called gene overlaps with the hit with less than 1% of its length it is ignored", action="store", type="float", dest="calledgeneoverlap", default=1) parser.add_option("-k", "--keep", help="Keep temporary files (D: Delete them)", action="store_true", dest="keep", default=False) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() ############# @@ -52,6 +59,7 @@ sys.stderr.write("ERROR: Cannot specify more than one of -o, -f, and -d (input options)\n") exit(2) + ############# # Get a list of input protein IDs ############# @@ -66,24 +74,25 @@ # Get protein sequences from the specified gene IDs # and put them into a FASTA file for input into TBLASTN -rn = random.randint(0, 2**30) -qfile = "%d.faa" %(rn) -fid = open(qfile, "w") -q = "SELECT geneid, aaseq FROM processed WHERE geneid = ?;" -for pid in pids: - cur.execute(q, (pid, )) - for res in cur: - fid.write(">%s\n%s\n" %(str(res[0]), str(res[1]))) -fid.close() + +sys.stderr.write("Getting query genes..\n") +qfid = tempfile.NamedTemporaryFile(delete=False) +qfile = qfid.name +geneinfo = getGeneInfo(pids, cur) +for info in geneinfo: + qfid.write(">%s\n%s\n" %(info[0], info[11])) +qfid.close() # Compile a BLAST database to search against. # The way we do this depends a bit on how the function was called. # If an organism ID is passed... just find all the contigs from the database # and compile them. organism = None +db = None if options.org is not None: - db = "%d.fna" %(rn) - fid = open(db, "w") + sys.stderr.write("Getting contigs for specified organism %s...\n" %(options.org)) + fid = tempfile.NamedTemporaryFile(delete=False) + db = fid.name q = """SELECT contigs.contig_mod, contigs.seq FROM contigs WHERE contigs.organismid = ?""" cur.execute(q, (options.org, ) ) @@ -99,9 +108,10 @@ os.system("makeblastdb -dbtype nucl -in %s > /dev/null" %(db)) # If a file of organism IDs was specified, do the same thing but just do it for all of the specified IDs elif options.orgfile is not None: - db = "%d.fna" %(rn) + sys.stderr.write("Getting contigs for organisms in specified file %s...\n" %(options.orgfile)) + fid_out = tempfile.NamedTemporaryFile(delete=False) + db = fid_out.name oc = options.oc - 1 - fid_out = open(db, "w") q = """SELECT contigs.contig_mod, contigs.seq FROM contigs WHERE contigs.organismid = ?""" for line in open(options.orgfile, "r"): @@ -126,7 +136,8 @@ # Run TBLASTN # Note - -soft_masking true is needed to help prevent two hits to a continuous protein due to filtering of low-complexity regions -ofile = "%d.out" %(rn) +outfid = tempfile.NamedTemporaryFile(delete=False) +ofile = outfid.name cmd = "tblastn -db %s -soft_masking true -evalue %1.1e -query %s -out %s -db_gencode %d -outfmt \"6 qseqid sseqid sstart send evalue bitscore sframe\" " %(db, options.cutoff, qfile, ofile, options.translation) sys.stderr.write("Now executing TBLASTN with command: \n%s\n" %(cmd)) os.system(cmd) @@ -135,6 +146,10 @@ q = "SELECT ABS(genestart - geneend) FROM processed WHERE geneid = ?;" q2 = "SELECT organism FROM organisms INNER JOIN contigs ON contigs.organismid = organisms.organismid WHERE contigs.contig_mod = ?;" q3 = "SELECT geneid, genestart, geneend, annotation FROM processed WHERE contig_mod = ? AND MIN(genestart, geneend) <= MAX(?,?) AND MIN(?,?) <= MAX(genestart, geneend);" + +if options.header: + print "\t".join(header) + # For each TBLASTN hit... for line in open(ofile, "r"): spl = line.strip("\r\n").split("\t") @@ -204,8 +219,15 @@ # Clean up if not options.keep: - os.system("rm %d.*" %(rn)) - -sys.stderr.write(TABLEDESCRIPTION + "\n") + if options.db is None: + os.remove(db) + os.remove(qfile) + os.remove(ofile) +else: + sys.stderr.write("The temporary files were not removed on request and are the following:\n") + sys.stderr.write("tBLASTn target BLAST database: %s\n" %(db)) + sys.stderr.write("Query file: %s\n" %(qfile)) + sys.stderr.write("BLAST raw result file: %s\n" %(ofile)) + sys.stderr.write("It is recommeneded that the user move the file(s) they want to another directory for further analysis, as temp files are deleted periodically from the OS.\n") con.close() diff --git a/src/db_bidirectionalBestHits.py b/src/db_bidirectionalBestHits.py index 758d179..29a90d9 100755 --- a/src/db_bidirectionalBestHits.py +++ b/src/db_bidirectionalBestHits.py @@ -17,16 +17,21 @@ okmethods = [ "evalue", "maxbit", "minbit" ] -usage="""%prog [options] > BBH_table""" -description = "Return a list of bidirectional best blast hits based on the specified scoring criteria. Output table has (tab-delimited): Query gene, target gene, query genome, forward score, backward score" +header = [ "Query_gene", "Target_gene", "Query_organism", "Forward_score", "Backward_score" ] +usage="""%prog [options] > BBH_table + +Output: """ + " ".join(header) +description = "Return a list of bidirectional best blast hits based on the specified scoring criteria." parser = optparse.OptionParser(description=description, usage=usage) parser.add_option("-m", "--method", help="Scoring metric to use to define best hit (D=evalue). Defined methods: %s" %(" ".join(okmethods)), action="store", type="str", dest="method", default="evalue") parser.add_option("-r", "--runid", help="Get bidirectional best BLAST hits for organisms in this cluster run only (D: Get them for all organisms in the database)", action="store", type="str", dest="runid", default=None) -parser.add_option("-f", "--orgfile", help="File containing s list of organisms to which to limit search. Use \"-\" for stdin. Cannot use with -r. (D: Get hits between all organisms in the database)", +parser.add_option("-f", "--orgfile", help="File containing a list of organism names to which to limit search. Use \"-\" for stdin. Cannot use with -r. (D: Get hits between all organisms in the database)", action="store", type="str", dest="orgfile", default=None) parser.add_option("-o", "--orgcol", help="Column number for organism ids (ignored unless -f is specified), starting from 1 (D=1)", action="store", type="int", dest="oc", default=1) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() oc = options.oc - 1 @@ -103,6 +108,9 @@ # Look at all the best queries and see if the target gene paired with the same organism # has a best hit as the query. sys.stderr.write("Calculating bidirectional bests...\n") + +if options.header: + print "\t".join(header) for pair in Best_pairs: forward_hit = Best_pairs[pair] diff --git a/src/db_displayExternalClusterHits.py b/src/db_displayExternalClusterHits.py index 5aa70ec..7d38498 100755 --- a/src/db_displayExternalClusterHits.py +++ b/src/db_displayExternalClusterHits.py @@ -70,22 +70,23 @@ def getRpsBlastForQueryGenes(querygene_list, evalue, cur, database = "all"): from ClusterFuncs import * from FileLocator import * -usage = "%prog [options] < gene_id_list" +usage = "%prog -o outdir [options] < gene_id_list" description = """ Use Biopython to display the hits to external clusters (via RPSBLAST) mapped onto a protein or a list of proteins. You have to run setup_step4.sh before this function will work. + +Output directory is a required argument. """ parser = optparse.OptionParser(usage=usage, description=description) +parser.add_option("-o", "--outdir", help="Output directory (REQUIRED)", action="store", type="str", dest="outdir", default=None) parser.add_option("-d", "--database", help="Only display hits for this database (e.g. PFAM, COG, cd, ...) (D: Show all of them in separate rows above the gene)", action="store", type="str", dest="database", default="all") parser.add_option("-e", "--evalue", help="E-value cutoff for hits to display (D:1E-5)", action="store", type="float", dest="evalue", default=1E-5) parser.add_option("-g", "--geneocl", help="Column number for gene ID, starting from 1 (D=1)", action = "store", type="int", dest="gc", default=1) -parser.add_option("-o", "--outdir", help="Output directory for all of the PNG files for the input genes (D: externalHitGraphics)", - action="store", type="str", dest="outdir", default="externalHitGraphics") parser.add_option("-s", "--showevalue", help="Show E-value for RPSBLAST hits along with the names (D: Show names only)", action="store_true", dest="showevalue", default=False) parser.add_option("-m", "--maxhits", help="Maximum number of hits to display on the figure (D:Not limited). Try this if the figure gets too messy.", @@ -93,7 +94,7 @@ def getRpsBlastForQueryGenes(querygene_list, evalue, cur, database = "all"): (options, args) = parser.parse_args() if options.outdir is None: - sys.stderr.write("ERROR: output directory (-o) is a required argument\n") + sys.stderr.write("ERROR: Output directory (positional) is a required argument.\n") exit(2) # Make it not fail if the directory doesn't exist already. diff --git a/src/db_displayTree.py b/src/db_displayTree.py index 84cd5b2..2b3c826 100755 --- a/src/db_displayTree.py +++ b/src/db_displayTree.py @@ -16,9 +16,11 @@ #################################### # Lets read input arguments first. #################################### -usage = "%prog (-s -b basename|-p -b basename|-n -b basename|-d) [options] Newick_file" +usage = """%prog (-s -b basename|-p -b basename|-n -b basename|-d) [options] Newick_file + +Output: Depending on user option, either a PNG, SVG or altered Newick file, or just display the tree on the screen""" description="""Display a tree with annotations and specified root and formats. -There is no default activity; one of -s, -p, -n, or -d must be specified..""" +There is no default activity; one of -s, -p, -n, or -d must be specified.""" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-d", "--display", help="Display result", action="store_true", dest="display", default=False) parser.add_option("-s", "--savesvg", help="Convert the file to svg (requires -b)", action="store_true", dest="savesvg", default=False) diff --git a/src/db_findBadMutationsFromTblastn.py b/src/db_findBadMutationsFromTblastn.py index 3251548..a7bfb4b 100755 --- a/src/db_findBadMutationsFromTblastn.py +++ b/src/db_findBadMutationsFromTblastn.py @@ -9,8 +9,8 @@ from FileLocator import * from getSequenceRegion import * -header = [ "contig", "query_gene" ] -usage = """%prog [options] < TBLASTN_output > bad_mutation_list""" +usage = """%prog [options] < TBLASTN_output > bad_mutation_list +""" description = """Given a TBLASTN output from db_TBlastN_wrapper, attempt to automatically identify frameshifts and fragments of the same diff --git a/src/db_findClustersByOrganismList.py b/src/db_findClustersByOrganismList.py index db77fad..a0d0e6b 100755 --- a/src/db_findClustersByOrganismList.py +++ b/src/db_findClustersByOrganismList.py @@ -13,11 +13,11 @@ usage="""%prog (-a|-n|-p|-s|-y) [options] run_id < organism_names > cluster_run_id_list Output: """ + " ".join(header) -description="""Find clusters with a paritcular quality relative to the list of organisms you specified. -Note: To find conserved gene clusters use -a -To find core gene clusters for a particular group, use both -a and -u -To find core genes only in a parituclar group (to the exclusion of all the others in that cluster run), use -a, -u, and -s -To find clusers that exclude all the specified organisms use -n +description="""Find clusters with a paritcular quality relative to a specified list of organisms: +To find conserved gene clusters use -a. +To find core gene clusters for a particular group, use both -a and -u. +To find core genes only in a parituclar group (to the exclusion of all the others in that cluster run), use -a, -u, and -s. +To find clusers that exclude all the specified organisms use -n. Using only -u or various contradictory combinations will result in an error. """ parser = optparse.OptionParser(usage=usage, description=description) @@ -31,6 +31,8 @@ parser.add_option("-p", "--pct_cutoff", help="Percentage of organisms in the specified list of organisms that must have a member to be included (incompatible with -a, -y, and -n but can be used with -s)", action="store", type="float", dest="pct_cutoff", default=None) #parser.add_option("-m", "--minorgs", help="Minimum number of organisms in clusters to be included (D=no minimum)", action="store", type="int", dest="minorg", default=None) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() if not len(args) == 1: @@ -70,5 +72,7 @@ pct_cutoff = options.pct_cutoff ) +if options.header: + print "\t".join(header) for cr in clusterrun_list: print "%s\t%s" %(cr[0], cr[1]) diff --git a/src/db_getAlignmentBetweenGenes.py b/src/db_getAlignmentBetweenGenes.py index 4a8fa31..4a1b2a8 100755 --- a/src/db_getAlignmentBetweenGenes.py +++ b/src/db_getAlignmentBetweenGenes.py @@ -5,7 +5,7 @@ import fileinput, optparse, os, random, sqlite3, sys from FileLocator import * -usage="%prog [options] < gene_list > alignment" +usage="%prog [options] < gene_list > fasta_alignment" description="""Get a quick and dirty (whole-gene) alignment between a set of genes of interest piped from stdin. Runs the alignment using mafft with the --auto and --reorder flags. diff --git a/src/db_getAllBlastResults.py b/src/db_getAllBlastResults.py index a696e28..d9e85d5 100755 --- a/src/db_getAllBlastResults.py +++ b/src/db_getAllBlastResults.py @@ -8,9 +8,17 @@ import optparse from FileLocator import * -usage="%prog > all_blast_results" +header = [ 'Query_gene', 'Target_gene', 'Percent_identity', 'HSP_length', 'Percent_mismatch', + 'Gap_opens', 'Query_start', 'Query_end', 'Target_start', 'Target_end', 'E_value', + 'Bit_score', 'Query_selfbit', 'Target_selfbit' ] + +usage="""%prog > all_blast_results + +Output: """ + " ".join(header) description="Print all blast results available in the database (without further filtering)" parser = optparse.OptionParser(usage=usage, description=description) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args)=parser.parse_args() con = sqlite3.connect(locateDatabase()) @@ -19,6 +27,8 @@ # Generate a list of blast results with query matching one of the desiredgenes cur.execute("""SELECT * FROM blastres_selfbit;""") +if options.header: + print "\t".join(header) for l in cur: s = list(l) stri = "\t".join(str(t) for t in s) diff --git a/src/db_getAllClusterRuns.py b/src/db_getAllClusterRuns.py index 46501ce..5ad3a0b 100755 --- a/src/db_getAllClusterRuns.py +++ b/src/db_getAllClusterRuns.py @@ -12,9 +12,15 @@ from ClusterFuncs import * from FileLocator import * -usage = "%prog > run_id_list" +header = "runid" + +usage = """%prog > run_id_list + +Output: """ + " ".join(header) description = "Return list of all run IDs from the database" parser = optparse.OptionParser(usage=usage, description=description) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() con = sqlite3.connect(locateDatabase()) @@ -22,6 +28,8 @@ runs = getAllClusterRuns(cur) +if options.header: + print header print "\n".join(runs) con.close() diff --git a/src/db_getAllClustersSpecRun.py b/src/db_getAllClustersSpecRun.py index 366d4b6..ecdb5a8 100755 --- a/src/db_getAllClustersSpecRun.py +++ b/src/db_getAllClustersSpecRun.py @@ -11,10 +11,16 @@ import fileinput, sqlite3, optparse, os from FileLocator import * -usage="%prog [options] < run_ids > cluster_runs" +header = [ "runid", "clusterid", "geneid" ] + +usage="""%prog [options] < run_ids > cluster_runs + +Output: """ + " ".join(header) description="Given a set of run IDs (from stdin), returns all cluster IDs associated with that run ID" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-r", "--runcol", help="Column number for run ID (start from 1, D=1)", action="store", type="int", dest="rc", default=1) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() rc = options.rc - 1 @@ -22,6 +28,8 @@ con = sqlite3.connect(locateDatabase()) cur = con.cursor() +if options.header: + print "\t".join(header) for line in fileinput.input("-"): spl = line.strip('\r\n').split("\t") runid = spl[rc] diff --git a/src/db_getBlastResultsBetweenSpecificGenes.py b/src/db_getBlastResultsBetweenSpecificGenes.py index cefa29f..2cb3181 100755 --- a/src/db_getBlastResultsBetweenSpecificGenes.py +++ b/src/db_getBlastResultsBetweenSpecificGenes.py @@ -11,11 +11,21 @@ from FileLocator import * from ClusterFuncs import * -usage = "%prog [options] < gene_ids > blast_results" +header = [ 'Query_gene', 'Target_gene', 'Percent_identity', 'HSP_length', 'Percent_mismatch', + 'Gap_opens', 'Query_start', 'Query_end', 'Target_start', 'Target_end', 'E_value', + 'Bit_score', 'Query_selfbit', 'Target_selfbit' ] + +usage = """%prog [options] < gene_ids > blast_results + +Output: """ + " ".join(header) + description = "Given list of genes to match, returns a list of BLAST results between genes in the list only" + parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-g", "--gcolumn", help="Column number (start from 1) for gene ID", action="store", type="int", dest="genecolumn", default=1) parser.add_option("-n", "--blastn", help="Get BLASTN results (D: BLASTP results)", action="store_true", dest="blastn", default=False) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() gc = options.genecolumn - 1 @@ -28,6 +38,8 @@ geneids.append(spl[gc]) blastres = getBlastResultsBetweenSpecificGenes(geneids, cur, blastn=options.blastn) +if options.header: + print "\t".join(header) for res in blastres: print "\t".join(res) diff --git a/src/db_getBlastResultsBetweenSpecificOrganisms.py b/src/db_getBlastResultsBetweenSpecificOrganisms.py index f2741bf..c372caa 100755 --- a/src/db_getBlastResultsBetweenSpecificOrganisms.py +++ b/src/db_getBlastResultsBetweenSpecificOrganisms.py @@ -15,14 +15,25 @@ import sqlite3, optparse from FileLocator import * -usage = "%prog \"Organism 1\" \"Organism 2\" ... > blast_results" +header = [ 'Query_gene', 'Target_gene', 'Percent_identity', 'HSP_length', 'Percent_mismatch', + 'Gap_opens', 'Query_start', 'Query_end', 'Target_start', 'Target_end', 'E_value', + 'Bit_score', 'Query_selfbit', 'Target_selfbit' ] + + +usage = """%prog \"Organism 1\" \"Organism 2\" ... > blast_results + +Output: """ + " ".join(header) + description = """Given list of organism names to match, returns a list of BLAST results between organisms matching any of those keywords.""" + parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-s", "--strict", help="Require exact name matches for organisms (D: Partial name matches are OK)", action="store_true", dest="strict", default=False) parser.add_option("-n", "--blastn", help="Return BLASTN hits rather than BLASTP hits. (D: BLASTP)", action="store_true", dest="blastn", default=False) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() # Do we want exact matches or LIKE %org%? @@ -75,6 +86,8 @@ cur.execute(cmd) +if options.header: + print "\t".join(header) for l in cur: s = list(l) stri = "\t".join(str(t) for t in s) diff --git a/src/db_getBlastResultsContainingGenes.py b/src/db_getBlastResultsContainingGenes.py index f8a40dc..3beeb60 100755 --- a/src/db_getBlastResultsContainingGenes.py +++ b/src/db_getBlastResultsContainingGenes.py @@ -13,7 +13,14 @@ from ClusterFuncs import * from FileLocator import * -usage = "%prog [options] < gene_ids > blast_results" +header = [ 'Query_gene', 'Target_gene', 'Percent_identity', 'HSP_length', 'Percent_mismatch', + 'Gap_opens', 'Query_start', 'Query_end', 'Target_start', 'Target_end', 'E_value', + 'Bit_score', 'Query_selfbit', 'Target_selfbit' ] + +usage = """%prog [options] < gene_ids > blast_results + +Output: """ + " ".join(header) + description = "Given list of genes to match, returns a list of BLAST results containing any gene ID in your list as either a query or a target (for blast results only BETWEEN the query genes, see db_getBlastResultsBetweenSpecificGenes.py)" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-g", "--gcolumn", help="Column number (start from 1) for gene ID", action="store", type="int", dest="genecolumn", default=1) @@ -21,6 +28,8 @@ parser.add_option("-n", "--blastn", help="Base the results on BLASTN instead of BLASTP (D: BLASTP)", action="store_true", dest="blastn", default=False) parser.add_option("-o", "--onlyquery", help="Only return results for which one of the specified genes is the query (by default returns results whether it is a query or a target", action="store_true", dest="only_query", default=False) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() gc = options.genecolumn - 1 @@ -38,6 +47,8 @@ blastres = getBlastResultsContainingGenes(genelist, cur, cutoff=options.cutoff, blastn=options.blastn, only_query=options.only_query) +if options.header: + print "\t".join(header) for blast in blastres: print "\t".join(blast) diff --git a/src/db_getClusterGeneInformation.py b/src/db_getClusterGeneInformation.py index 0a50436..2871bbd 100755 --- a/src/db_getClusterGeneInformation.py +++ b/src/db_getClusterGeneInformation.py @@ -29,10 +29,10 @@ # Get input arguments -headers = [ "organism_name", "contig_id", "start", "stop", "strand", "strandnum", "annotation", "DNA_seq", "AA_seq", "run_id", "cluster_id" ] +header = [ "organism_name", "contig_id", "start", "stop", "strand", "strandnum", "annotation", "DNA_seq", "AA_seq", "run_id", "cluster_id" ] usage = """%prog [options] < runid_clusterid_table > cluster_gene_info -Output table: """ + " ".join(headers) +Output table: """ + " ".join(header) description = """Given a list of run ID / cluster ID pairs (one pair in each row of the input table), get a list of gene information for each gene in each of the input clusters. The results include @@ -42,6 +42,8 @@ parser.add_option("-r", "--rcolumn", help="Column number (start from 1) for run ID", action="store", type="int", dest="runcolumn", default=1) parser.add_option("-c", "--ccolumn", help="Column number (start from 1) for cluster ID", action="store", type="int", dest="clustercolumn", default=2) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() rc = options.runcolumn - 1 # Convert to Pythonic indexes @@ -56,6 +58,8 @@ con = sqlite3.connect(locateDatabase()) cur = con.cursor() +if options.header: + print "\t".join(header) for cr in clusterruns: geneinfo = getClusterGeneInfo(cr[0], cr[1], cur) for info in geneinfo: diff --git a/src/db_getClustersContainingGenes.py b/src/db_getClustersContainingGenes.py index 83b46ad..56b4ef6 100755 --- a/src/db_getClustersContainingGenes.py +++ b/src/db_getClustersContainingGenes.py @@ -12,11 +12,18 @@ from FileLocator import * from ClusterFuncs import * -usage = "%prog [options] < gene_ids > clusters_containing_genes" -description = "Given a list of gene IDs, gets a list of clusters containing those genes (in all run IDs)" +header = [ "Run_id", "Cluster_id", "Provided_Gene_id" ] + +usage = """%prog [options] < gene_ids > clusters_containing_genes + +Output: """ + " ".join(header) +description = """Given a list of gene IDs, gets a list of clusters containing those genes (in all run IDs). +Only returns the genes you specified (not all genes in those clusters).""" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-g", "--gcolumn", help="Column number (start from 1) for gene ID", action="store", type="int", dest="genecolumn", default=1) parser.add_option("-r", "--runid", help="Restrict results to the given run ID", action="store", type="str", dest="runid", default=None) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() gc = options.genecolumn - 1 # Convert to Pythonic indexes con = sqlite3.connect(locateDatabase()) @@ -32,6 +39,8 @@ desiredGenes.append("fig|%s" %(spl[gc])) res = getClustersContainingGenes(desiredGenes, cur, runid=options.runid) +if options.header: + print "\t".join(header) for tup in res: print "\t".join(tup) diff --git a/src/db_getClustersWithAnnotation.py b/src/db_getClustersWithAnnotation.py index 9ef48a1..22774ea 100755 --- a/src/db_getClustersWithAnnotation.py +++ b/src/db_getClustersWithAnnotation.py @@ -20,10 +20,17 @@ import sqlite3, fileinput, optparse, sys from FileLocator import * -usage = "%prog [options] \"Annotation 1\" \"Annotation 2\" ... < run_ids > clusters_with_genes_containing_annotation_words" -description = "Given list of run IDs, returns a list of genes and clusters containing given word(s) in the annotation - separate inputs are combined with OR statements" +header = [ "Runid", "Cluster_id", "Matching_gene", "Matching_annotation" ] +usage = """%prog [options] \"Annotation 1\" \"Annotation 2\" ... < run_ids > clusters_with_genes_containing_annotation_words + +Output: """ + " ".join(header) +description = """Given list of run IDs, returns a list of genes and clusters containing at least one of the given word(s) in the annotation. +Note that only the genes matching the annotation are returned, not all of the genes in the clusters. Use db_getGenesInClusters.py to get +all of the genes in a list of clusters.""" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-r", "--runcol", help="Column number for run ID, starting from 1 (D=1)", action="store", type="int", dest="rc", default=1) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() if len(args) < 1: @@ -50,6 +57,8 @@ query = query + "OR " query = query + ");" +if options.header: + print "\t".join(header) for line in fileinput.input("-"): # Note - "+" for tuples is the concatination operator spl = line.strip('\r\n').split("\t") diff --git a/src/db_getClustersWithNumGenes.py b/src/db_getClustersWithNumGenes.py index e24e610..c29b262 100755 --- a/src/db_getClustersWithNumGenes.py +++ b/src/db_getClustersWithNumGenes.py @@ -11,11 +11,16 @@ import fileinput, sqlite3, optparse, sys from FileLocator import * -usage = "%prog -n numgenes [options] < run_ids > clusters_with_specified_num_genes" +header = [ "run_id", "cluster_id" ] +usage = """%prog -n numgenes [options] < run_ids > clusters_with_specified_num_genes + +Output: """ + " ".join(header) description = "Get all of the clusters with the specified number of genes in the specified cluster runs" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-r", "--rcolumn", help="Column number (start from 1) for run ID", action="store", type="int", dest="runcolumn", default=1) parser.add_option("-n", "--numcluster", help="Desired number of genes in each cluster to extract", action="store", type="int", dest="numcluster", default=None) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() rc = options.runcolumn - 1 # Convert to Pythonic indexes @@ -29,11 +34,12 @@ con = sqlite3.connect(locateDatabase()) cur = con.cursor() +if options.header: + print "\t".join(header) for line in fileinput.input("-"): spl = line.strip('\r\n').split("\t") cur.execute("""SELECT runid, clusterid FROM clusters WHERE clusters.runid = ? GROUP BY clusterid HAVING count(*) = ?;""", (spl[rc], int(nc))) - for l in cur: s = list(l) stri = "\t".join(str(t) for t in s) diff --git a/src/db_getContigSeqs.py b/src/db_getContigSeqs.py index 6eb7300..1063842 100755 --- a/src/db_getContigSeqs.py +++ b/src/db_getContigSeqs.py @@ -8,7 +8,10 @@ from FileLocator import * from ClusterFuncs import * -usage = "%prog [options] < contig_ids > contig_sequences" +header = [ "contig", "nucleotide_sequence" ] +usage = """%prog [options] < contig_ids > contig_sequences + +Output : """ + " ".join(header) + "\n" + """Alternative output: FASTA file""" description = """Get the complete DNA sequence for contigs with specified IDs.""" parser = optparse.OptionParser(usage=usage, description=description) @@ -16,6 +19,8 @@ action="store_true", dest="fasta", default=False) parser.add_option("-c", "--contigcol", help="Column number for contig ID, starting from 1 (D;1)", action="store", dest="cc", default=1) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() cc = options.cc - 1 @@ -29,6 +34,8 @@ cur = con.cursor() contigdict = getContigSequence(cur, contiglist) +if options.header: + print "\t".join(header) for contig in contigdict: if options.fasta: print ">%s\n%s\n" %(contig, contigdict[contig]) diff --git a/src/db_getContigs.py b/src/db_getContigs.py index 6fdacd7..5f1c581 100755 --- a/src/db_getContigs.py +++ b/src/db_getContigs.py @@ -7,7 +7,11 @@ from FileLocator import * from ClusterFuncs import * -usage = "%prog [options] > contig_ids" +header = "contig_id" + +usage = """%prog [options] > contig_ids + +Output: """ + header description = """Get contig IDs. By default returns ALL contig IDs. Optionally return contigs only for specific organisms. @@ -17,6 +21,8 @@ parser.add_option("-i", "--organismid", help="Only return contigs for organisms matching this organism ID", action="store", type="str", dest="organismid", default=None) parser.add_option("-o", "--organism", help="Only return contigs with the specified organism name", action="store", type="str", dest="organism", default=None) parser.add_option("-s", "--sanitized", help="Specify this if the organism's name is sanitized", action="store", type="str", dest="sanitized", default=False) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() con = sqlite3.connect(locateDatabase()) @@ -27,6 +33,8 @@ contiglist = getContigIds(cur, orgid=options.organismid, orgname=options.organism, issanitized=options.sanitized) +if options.header: + print header for contig in contiglist: print contig diff --git a/src/db_getEquivalentGenesInOrganism.py b/src/db_getEquivalentGenesInOrganism.py new file mode 100755 index 0000000..2839343 --- /dev/null +++ b/src/db_getEquivalentGenesInOrganism.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +import fileinput +import optparse +import sqlite3 +import sys + +from FileLocator import * +from ClusterFuncs import * + +header = [ 'Original_gene', 'Equivalent_genes' ] + +usage = """ +%prog runID -n 'organism_name' < gene_list > equivalent_genes +%prog runID -i 'organism_id' < gene_list > equivalent_genes + +Output: """ + " ".join(header) + +description = """Given a list of genes and an organism name or ID, identifies all genes +in the same cluster as the target genes in the specified organism. + +Returns a two-column table containing the list of genes in the first column +and the equivalent gene IDs in the other organism separated by semicolons in the second. +Note that the order of the output is not necessarily the same as the order of the input. +""" + +parser = optparse.OptionParser(usage=usage, description=description) +parser.add_option("-n", "--name", help="Name of target organism.", action="store", type="str", dest="orgname", default=None) +parser.add_option("-i", "--id", help="Organism ID for target organism.", action="store", type="str", dest="orgid", default=None) +parser.add_option("-g", "--genecol", help="Column number for gene Id starting from 1 (D:1)", action="store", type="int", dest="gc", default=1) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) +(options, args) = parser.parse_args() + +if len(args) < 1: + sys.stderr.write("ERROR: run ID is a required argument.\n") + exit(2) + +runid = args[0] + +gc = options.gc - 1 + +genelist = set() +for line in fileinput.input("-"): + spl = line.strip("\r\n").split("\t") + genelist.add(spl[gc]) + +con = sqlite3.connect(locateDatabase()) +cur = con.cursor() + +equivalent_dict = getEquivalentGenesInOrganism( genelist, runid, cur, orgid=options.orgid, orgname=options.orgname ) + +if options.header: + print "\t".join(header) +for gene in equivalent_dict: + print "\t".join( [ gene, ";".join(equivalent_dict[gene]) ] ) + +con.close() diff --git a/src/db_getExternalClusterGroups.py b/src/db_getExternalClusterGroups.py index 42df207..b4bc64a 100755 --- a/src/db_getExternalClusterGroups.py +++ b/src/db_getExternalClusterGroups.py @@ -3,11 +3,15 @@ import fileinput, optparse, sqlite3, sys from FileLocator import * +header = [ "query_gene", "CDD_id", "pct_identity", "alignment_len", "mismatches", "gapopens", "querystart", "queryend", "substart", "subend", "E-value", "bitscore", "Domain_ID" ] + ok_databases = ["all", "cd", "cog", "pfam", "tigr", "prk", "smart"] usage = """%prog [options] < gene_id_list > External_cluster_ids -Supported databases: """ + " ".join(ok_databases) +Supported databases: """ + " ".join(ok_databases) + """ + +Output: """ + " ".join(header) description = """Given a list of gene IDs, identifies the external cluster IDs that are homologous to those gene IDs as determined by RPSBLAST. By default @@ -21,7 +25,8 @@ parser.add_option("-a", "--adddescription", help="Add external cluster description to the table (D: dont)", action="store_true", dest="adddescription", default=False) parser.add_option("-n", "--addname", help="Add external cluster name to the table (D: dont)", action="store_true", dest="addname", default=False) parser.add_option("-g", "--genecol", help="Column number for gene ID starting from 1 (D: 1)", action="store", type="int", dest="gc", default=1) - +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() gc = options.gc - 1 @@ -44,6 +49,14 @@ INNER JOIN external_clusters ON external_clusters.cdd_id = rpsblast_results.cdd_id WHERE rpsblast_results.querygene = ? %s """ %(addn, addd, addl) +if options.header: + head = "\t".join(header) + if options.addname: + head += "\t" + "Domain_name" + if options.adddescription: + head += "\t" + "Domain_description" + print head + for line in fileinput.input("-"): spl = line.strip("\r\n").split("\t") geneid = spl[gc] diff --git a/src/db_getExternalClustersById.py b/src/db_getExternalClustersById.py index 65011e6..021af5e 100755 --- a/src/db_getExternalClustersById.py +++ b/src/db_getExternalClustersById.py @@ -7,11 +7,16 @@ from FileLocator import * -usage = "%prog [options] < external_clusterids > descriptions" +header = [ 'CDD_ID', 'Domain_ID', 'Domain_name', 'Description', 'Alignment_length' ] +usage = """%prog [options] < external_clusterids > descriptions + +Output: """ + " ".join(header) description = "Get descriptions of external clusters described by the specified external clusterIDs" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-c", "--column", help="Column number (starting from 1) for the external cluster ID (D=1)", action="store", type="int", dest="idc", default=1) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() idc = options.idc - 1 @@ -20,6 +25,8 @@ cur = con.cursor() query = """SELECT * FROM external_clusters WHERE external_clusterid = ?""" +if options.header: + print "\t".join(header) for line in fileinput.input("-"): spl = line.strip("\r\n").split("\t") eid = spl[idc] diff --git a/src/db_getGeneInformation.py b/src/db_getGeneInformation.py index 2a08726..c3de853 100755 --- a/src/db_getGeneInformation.py +++ b/src/db_getGeneInformation.py @@ -19,11 +19,11 @@ import fileinput, sqlite3, optparse from FileLocator import * -headers = [ "organism_name", "contig_id", "start", "stop", "strand", "strandnum", "annotation", "DNA_seq", "AA_seq" ] +header = [ "organism_name", "contig_id", "start", "stop", "strand", "strandnum", "annotation", "DNA_seq", "AA_seq" ] usage="""%prog [options] < gene_ids > gene_info -Output table: """ + " ".join(headers) +Output table: """ + " ".join(header) description="""Given a list of gene IDs, get their gene info, including annotations, contig, organism, strand, and sequences. Start is the first nucleotide of the start codon (e.g. A in ATG) @@ -34,6 +34,8 @@ parser.add_option("-g", "--gcolumn", help="Column number (start from 1) for gene ID", action="store", type="int", dest="genecolumn", default=1) parser.add_option("-a", "--add", help="Add gene information to the end of the existing file (D: only return the gene information)", action="store_true", dest="keep", default=False) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() gc = options.genecolumn - 1 # Convert to Pythonic indexes @@ -41,6 +43,8 @@ con = sqlite3.connect(locateDatabase()) cur = con.cursor() +if options.header: + print "\t".join(header) for line in fileinput.input("-"): spl = line.strip('\r\n').split("\t") diff --git a/src/db_getGenesWithAnnotation.py b/src/db_getGenesWithAnnotation.py index 0bf32da..dda7648 100755 --- a/src/db_getGenesWithAnnotation.py +++ b/src/db_getGenesWithAnnotation.py @@ -9,10 +9,18 @@ from FileLocator import * from ClusterFuncs import * -usage = "%prog \"Annotation 1\" \"Annotation 2\" ... > [Gene_id_list]" +header = [ 'Gene_id', 'Annotation' ] + +usage = """%prog \"Annotation 1\" \"Annotation 2\" ... > [Gene_id_list] + +Output: """ + " ".join(header) + description = """Get a list of genes in the database matching at least one of the specified annotations (Note - does not have to match ALL of them)""" + parser = optparse.OptionParser(usage=usage, description=description) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() if len(args) == 0: @@ -37,6 +45,8 @@ cur.execute(query, teststr) +if options.header: + print "\t".join(header) for l in cur: s = list(l) stri = "\t".join(str(t) for t in s) diff --git a/src/db_getOrganismsInCluster.py b/src/db_getOrganismsInCluster.py index 972e2cd..645d83a 100755 --- a/src/db_getOrganismsInCluster.py +++ b/src/db_getOrganismsInCluster.py @@ -4,11 +4,17 @@ from FileLocator import * from ClusterFuncs import * -usage="""%prog [options] < runid_clusterid_pair > organism_list""" +header = [ "organism_name" ] + +usage="""%prog [options] < runid_clusterid_pair > organism_list + +Output: """ + " ".join(header) description="Get a list of organisms included in the specified run / cluster ID pair" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-r", "--runcol", help="Column number for run ID starting from 1 (D=1)", action="store", type="int", dest="rc", default=1) parser.add_option("-c", "--clustercol", help="Column number for cluster ID starting from 1 (D=2)", action="store", type="int", dest="cc", default=2) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() rc = options.rc - 1 @@ -19,6 +25,7 @@ numlines = 0 mystr = "" + for line in fileinput.input("-"): numlines = numlines+1 if numlines > 1: @@ -30,6 +37,8 @@ orglist = getOrganismsInCluster(runid, clusterid, cur) stri = "\n".join(orglist) +if options.header: + print "\t".join(header) print stri con.close() diff --git a/src/db_getOrganismsInClusterRun.py b/src/db_getOrganismsInClusterRun.py index 7e46702..029505a 100755 --- a/src/db_getOrganismsInClusterRun.py +++ b/src/db_getOrganismsInClusterRun.py @@ -4,11 +4,17 @@ from FileLocator import * from ClusterFuncs import * -usage="%prog [options] < runid > organism_list" +header = [ 'runid', 'organism_name' ] + +usage="""%prog [options] < runid > organism_list + +Output: """ + " ".join(header) description="""Get a list of organisms included in each piped-in cluster run (Note - the results are most useful if you only provide ONE)""" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-r", "--runcol", help="Column number for run ID starting from 1 (D=1)", action="store", type="int", dest="rc", default=1) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() rc = options.rc - 1 @@ -16,6 +22,8 @@ con = sqlite3.connect(locateDatabase()) cur = con.cursor() +if options.header: + print "\t".join(header) for line in fileinput.input("-"): spl = line.strip("\r\n").split("\t") runid = spl[rc] diff --git a/src/db_getPresenceAbsenceTable.py b/src/db_getPresenceAbsenceTable.py index 9d06860..f8fbeae 100755 --- a/src/db_getPresenceAbsenceTable.py +++ b/src/db_getPresenceAbsenceTable.py @@ -9,10 +9,11 @@ from FileLocator import * from sanitizeString import * -usage="%prog [options] > presence_absence_table" -description="""Generates a presence - absence table (or slices thereof) based on -the one automatically loaded as part of setup_step2.sh. -Default activity is to dump the database as is (pegs).""" +usage="""%prog [options] > presence_absence_table + +Output: Run_id Cluster_id Representative_annotation (organism1_PA) (organism2_PA) ... """ +description="""Generates a presence - absence table (or slices thereof) based on pre-computed clusters. +Default activity is to print all available presence-absence from all clusters and all cluster runs.""" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-n", "--number", help="Rather than printing PEGs, print the number of representatives in each organism (D: Prints pegs)", action="store_true", dest="number", default=False) diff --git a/src/db_getSequencesFromBlastResults.py b/src/db_getSequencesFromBlastResults.py index f2bb92d..d636a5f 100755 --- a/src/db_getSequencesFromBlastResults.py +++ b/src/db_getSequencesFromBlastResults.py @@ -4,7 +4,9 @@ from FileLocator import * from getSequenceRegion import * -usage = "%prog [options] < table > table_with_sequences" +usage = """%prog [options] < table > table_with_sequences + +Output: Same table as put in, but with sequences added on another column.""" description = """Identify the sequence of the HSP based on the provided ID (contig or gene) and the start and stop locations of the HSP. @@ -69,6 +71,7 @@ startcol = options.startcol - 1 endcol = options.endcol - 1 +couldBeHeader = True for line in fileinput.input("-"): spl = line.strip("\r\n").split("\t") try: @@ -79,9 +82,14 @@ sys.stderr.write("ERROR: Input table does not have enough columns for the specified column numbers for ID, start and stop location\n") exit(2) except ValueError: - sys.stderr.write("ERROR: The specified column numbers for start and stop locations do not appear to have integers in them - check that the specified column numbers are correct\n") - exit(2) - + if couldBeHeader: + # If the user asked for a header row in a middle table and then passed it here, we don't want to crash due to that. But we only expect one header row. + couldBeHeader = False + continue + else: + sys.stderr.write("ERROR: The specified column numbers for start and stop locations do not appear to have integers in them - check that the specified column numbers are correct\n") + exit(2) + cur.execute(q, (myid,)) for rec in cur: startingSeq = str(rec[0]) diff --git a/src/db_getUpstreamRegion.py b/src/db_getUpstreamRegion.py index 2db6d4a..ab02b1b 100755 --- a/src/db_getUpstreamRegion.py +++ b/src/db_getUpstreamRegion.py @@ -8,10 +8,10 @@ # For sequence transposing from Bio import Seq -headers = [ "geneid", "status", "upstream_sequence" ] +header = [ "geneid", "status", "upstream_sequence" ] usage = """%prog [options] < geneids > geneids_with_upstream -Output table: """ + " ".join(headers) +Output table: """ + " ".join(header) description="""Get the upstream nucleotide sequence of the given set of genes. Requires you to have the contigs loaded into the database. It is careful to only print sequences up to the next called gene or gap (N) in the sequence unless you @@ -37,6 +37,8 @@ parser.add_option("-l", "--othergenelength", help="If allowing gene overlaps, still ignore called genes less than this length (in nucleotides) within the upstream region (D=0 - cut off after ANY gene)", action="store", type="int", dest="othergenelength", default=0) parser.add_option("-g", "--genecolumn", help="Column number for gene ID in input, starting from 1 (D=1)", action="store", type="int", dest="gc", default=1) parser.add_option("-i", "--ingene", help="Number of nucleotides WITHIN the gene to gather in addition to the upstream region (D=3 - i.e. grab the start codon only)", action="store", type="int", dest="ingene", default=3) +parser.add_option("--header", help="Specify to add header to the output file (useful if you want to take the results and put into Excel or similar programs)", + action="store_true", default=False) (options, args) = parser.parse_args() # Read stdin to get gene IDs @@ -67,6 +69,8 @@ AND ABS(genestart - geneend) > ?""" NUMINGENE = options.ingene +if options.header: + print "\t".join(header) for gene in geneids: # Where is this gene located? cur.execute(q1, (gene, ) ) diff --git a/src/db_makeClusterAlignment.py b/src/db_makeClusterAlignment.py index c366497..58701dc 100755 --- a/src/db_makeClusterAlignment.py +++ b/src/db_makeClusterAlignment.py @@ -20,7 +20,9 @@ valid_methods = ['mafft_linsi', 'mafft_einsi', 'mafft_ginsi', 'mafft_default', 'clustalw_default'] -usage="%prog -m Method (-n|-t|-g|-p) [options] < Cluster_RunID > Final_alignment" +usage="""%prog -m Method (-n|-t|-g|-p) [options] < Cluster_RunID > Final_alignment + +Output: FASTA alignment""" description="""Generates a new alignment from a piped-in pair of run\cluster ID """ parser = optparse.OptionParser(usage=usage, description=description) ### Input options @@ -82,11 +84,11 @@ faa = "%d.faa" %(rann) alncmd = "" if options.method == 'mafft_linsi': - alncmd = "mafft --maxiterate 1000 --localpair %s > %s 2> /dev/null" %(fasta, faa) + alncmd = "mafft --anysymbol --maxiterate 1000 --localpair %s > %s 2> /dev/null" %(fasta, faa) if options.method == 'mafft_einsi': - alncmd = "mafft --maxiterate 1000 --genafpair %s > %s 2> /dev/null" %(fasta, faa) + alncmd = "mafft --anysymbol --maxiterate 1000 --genafpair %s > %s 2> /dev/null" %(fasta, faa) if options.method == 'mafft_ginsi': - alncmd = "mafft --maxiterate 1000 --globalpair %s > %s 2> /dev/null" %(fasta, faa) + alncmd = "mafft --anysymbol --maxiterate 1000 --globalpair %s > %s 2> /dev/null" %(fasta, faa) if options.method == 'mafft_default': alncmd = "mafft %s > %s 2> /dev/null" %(fasta, faa) if options.method == 'clustalw_default': diff --git a/src/db_makeClusterComparisonTable.py b/src/db_makeClusterComparisonTable.py index 3e7acc1..144b791 100755 --- a/src/db_makeClusterComparisonTable.py +++ b/src/db_makeClusterComparisonTable.py @@ -10,7 +10,13 @@ from sanitizeString import * from ClusterFuncs import * -usage = """%prog [options] focus_gene < gene_ids > cluster_comparison_table""" +usage = """%prog [options] focus_gene < gene_ids > cluster_comparison_table + +Output: Gene_id (Comp_runid1) (Comp_runid2) ... + +The entries are 1 (yes) if gene "Gene_id" is in the same cluster as the +focus gene in the specified cluster run, 0 (no) if it is not and -1 (NA) if +the gene is not present in the cluster run.""" description = ''' Given a list of gene IDs and a focus gene, identifies all of the clusters (across all cluster runs in the database) that contain the gene. Then identifies if the @@ -49,6 +55,7 @@ res = getClustersContainingGenes( [ focusgene ], cur) # Just in case there is more than one cluster per run clusterrun_to_genes = {} +focus_runids = set() for result in res: runid = result[0] clusterid = result[1] @@ -56,6 +63,12 @@ continue geneids = getGenesInCluster(runid, clusterid, cur) clusterrun_to_genes[(runid, clusterid)] = set(geneids) + focus_runids.add(runid) + +# Get the organisms in each run which we are comparing here. +focus_run_to_org = {} +for runid in focus_runids: + focus_run_to_org[runid] = getOrganismsInClusterRun(runid, cur) # Read in the remainder of the gene IDs geneids = set() @@ -99,10 +112,23 @@ else: printstr = "1" else: - if options.yesno: - printstr = "no" + # Is the organism valid? + geneinfo = getGeneInfo([gene], cur) + one_geneinfo = geneinfo[0] + organism = one_geneinfo[1] + if organism not in focus_run_to_org[clusterrun[0]]: + if options.yesno: + printstr = "NA" + else: + printstr = "-1" else: - printstr = "0" + if options.yesno: + printstr = "no" + else: + printstr = "0" + pass + pass + pass row.append(printstr) print "\t".join(row) diff --git a/src/db_makeClusterGraph.py b/src/db_makeClusterGraph.py index 21cffa7..d2b543c 100755 --- a/src/db_makeClusterGraph.py +++ b/src/db_makeClusterGraph.py @@ -11,7 +11,9 @@ symmetric_methods, unsymmetric_methods = getValidBlastScoreMethods() all_methods = symmetric_methods + unsymmetric_methods -usage = """ %prog -m method -u cutoff [options] < cluster_runid_pairs """ +usage = """ %prog -m method -u cutoff -o outdir [options] < cluster_runid_pairs + +Output: .gml file (import into Cytoscape or other graph visualization software)""" description = """From a given cluster/runID pair, calcualte and export a GML file that can be opened in Cytoscape or similar viewers to visualize the cluster. By default, it creates GML files with name "runid_clusterid.gml" . If you specify your own @@ -24,14 +26,15 @@ parser.add_option("-r", "--runcol", help="Column number for Run ID starting from 1 (D:1)", action="store", type="int", dest="rc", default=1) parser.add_option("-c", "--clusterid", help="Column number for Cluster ID starting from 1 (D: 2)", action="store", type="int", dest="cc", default=2) parser.add_option("-n", "--blastn", help="Use BLASTN instead of BLASTP for scoring results (D: BLASTP)", action="store_true", dest="blastn", default=False) +parser.add_option("-o", "--outdir", help="Output directory for GML files", action="store", type="str", dest="outdir", default=None) (options, args) = parser.parse_args() # Convert to pythonic indexes rc = options.rc - 1 cc = options.cc - 1 -if options.method is None or options.cutoff is None: - sys.stderr.write("ERROR: both method (-m) and cutoff (-u) are required arguments\n") +if options.method is None or options.cutoff is None or options.outdir is None: + sys.stderr.write("ERROR: method (-m), cutoff (-u), and output directory (-o) are required arguments\n") exit(2) cluster_runs = [] @@ -50,6 +53,8 @@ blast_str = "blastn" else: blast_str = "blastp" - exportGraphToGML(G, "%s_%s_%s_%1.2f_%s.gml" %(runid, clusterid, options.method, options.cutoff, blast_str ) ) + if not os.path.exists(options.outdir): + os.makedirs(options.outdir) + exportGraphToGML(G, os.path.join(options.outdir,"%s_%s_%s_%1.2f_%s.gml" %(runid, clusterid, options.method, options.cutoff, blast_str ) ) ) con.close() diff --git a/src/db_makeGraphFromBlastResults.py b/src/db_makeGraphFromBlastResults.py index 95aa084..f1bc355 100755 --- a/src/db_makeGraphFromBlastResults.py +++ b/src/db_makeGraphFromBlastResults.py @@ -11,7 +11,9 @@ symmetric_methods, unsymmetric_methods = getValidBlastScoreMethods() all_methods = symmetric_methods + unsymmetric_methods -usage = """ %prog -m method -u cutoff [options] < BLAST_results """ +usage = """ %prog -m method -u cutoff [options] < BLAST_results + +Output: .gml file (import into Cytoscape or other graph visualization software) """ description = """Generate a GML file for all edges in a BLAST results table. Currently implemented methods: %s """ %(" ".join(all_methods) ) diff --git a/src/db_makeNeighborhoodDiagram.py b/src/db_makeNeighborhoodDiagram.py index 28f01de..4696d2b 100755 --- a/src/db_makeNeighborhoodDiagram.py +++ b/src/db_makeNeighborhoodDiagram.py @@ -10,18 +10,24 @@ from FileLocator import * from sanitizeString import * -usage = "%prog runid [options] < gene_ids" +usage = """%prog runid [options] < gene_ids + +Output: One PNG file for each specified gene.""" description = """Saves neighborhood diagrams for genes in a specified directory. The run ID is used to color code the neighborhood diagram. The color code is not necessarily consistent across different input genes but is internally consistent for neighborhoods of a given gene.""" parser = optparse.OptionParser(usage=usage, description=description) -parser.add_option("-d", "--directory", help="Directory in which to save neighborhood diagrams. Default is 'geneNeighborhoods'", action="store", - dest="directory", type="str", default='geneNeighborhoods') +parser.add_option("-d", "--directory", help="Directory in which to save neighborhood diagrams (REQUIRED)", action="store", + dest="directory", type="str", default=None) parser.add_option("-l", "--labeltype", help="Type of label to use. Valid types are 'aliases' or 'clusterid' (D: aliases)", action="store", dest="labeltype", type="str", default="aliases") parser.add_option("-g", "--genecol", help="Column number for gene IDs starting from 1 (D: 1)", action="store", dest="gc", type="int", default=1) (options, args) = parser.parse_args() +if options.directory is None: + sys.stderr.write("Output directory (-d) is a required argument.\n") + exit(2) + gc = options.gc - 1 if len(args) < 1: diff --git a/src/db_makeNeighborhoodTree.py b/src/db_makeNeighborhoodTree.py index 4d95c27..165092c 100755 --- a/src/db_makeNeighborhoodTree.py +++ b/src/db_makeNeighborhoodTree.py @@ -178,6 +178,8 @@ def treelegend(ts, getcolor, greyout, clusterrunid, cur): col = cnum%colorcols cnum += 1 samplefunc = findRepresentativeAnnotation(clusterrunid, str(cluster), cur) + if samplefunc is None: + samplefunc = "NONE (probably a singleton excluded in an orthomcl run)" text = treelegendtext(str(cluster) + ": " + samplefunc[0:63], color) #placement of this legend ts.legend.add_face(text, column=col) @@ -185,35 +187,41 @@ def treelegend(ts, getcolor, greyout, clusterrunid, cur): return ts if __name__ == "__main__": - ''' - Given a protein tree, make a figure containing that tree with neighborhoods overlayed onto the tree. - ''' - usage="%prog -p protein_tree -r runid [options]" + usage=""" +%prog -p protein_tree -r runid -d [options] +%prog -p protein_tree -r runid -b basename --svg [options] +%prog -p protein_tree -r runid -b basename --png [options] + + Output: PNG or SVG, or just display the tree on the screen. You can specify more than one of -d, --svg or --png if you wish.""" description="""Generates a tree with gene regions""" parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-r", "--runid", help="Run ID (required)", action="store", type="str", dest="clusterrunid", default=None) parser.add_option("-p", "--prottree", help="Protein tree (required)", action="store", type="str", dest="treeinfile", default=None) + parser.add_option("-o", "--outfile", help="Base name for output file (required if you want to save the file)", action="store", type="str", dest="outfile", default=None) parser.add_option("-t", "--treetitle", help="Tree title (D: same as run ID)", action="store", type="str", dest="gene", default=None) - parser.add_option("-o", "--outfile", help="Base name for output file (D: Same as input tree)", action="store", type="str", dest="outfile", default=None) parser.add_option("-d", "--display", help="Display the resulting tree (D: Don't display, just save)", action="store_true", dest="display", default=False) parser.add_option("-c", "--cutoff", help="Number of members of a cluster below which a gene is greyed out (D: 3 - 2 or less are greyed out)", action="store", type="int", dest="cutoff", default=3) parser.add_option("-l", "--label", help="Add labels to the genes. The labels are the cluster IDs for the clusters in which the genes are found in the specified cluster run (D: Dont because its very messy)", action="store_true", dest="label", default=False) parser.add_option("--png", help="Save high-quality PNG and SVG images (with base name specified by -o or by default, with the same name as input file)", action="store_true", dest="savepng", default=False) + parser.add_option("--svg", help="Save an SVG image of the final tree.", action="store_true", dest="savesvg", default=False) (options,args) = parser.parse_args() if options.treeinfile is None: sys.stderr.write("ERROR: -p (protein input tree) is required\n") exit(2) - # James - how did you plan on using muliple clusters? Doesnt make a lot of sense to me. if options.clusterrunid is None: - sys.stderr.write("ERROR - -r (runid) is a required argument\n") + sys.stderr.write("ERROR: -r (runid) is a required argument\n") + exit(2) + + if options.outfile is None and (options.savepng or options.savesvg): + sys.stderr.write("ERROR: -o (output file base name) is a required argument if --png or --svg is specified") exit(2) - if options.outfile is None: - options.outfile = options.treeinfile + if not (options.savepng or options.savesvg or options.display): + sys.stderr.write("ERROR: Must specify one of -d (display), --svg (save SVG file) or --png (save PNG file)") clusterrunid = options.clusterrunid treeinfile = options.treeinfile @@ -270,10 +278,11 @@ def treelegend(ts, getcolor, greyout, clusterrunid, cur): t, ts = prettifyTree(t, title = gene + " cluster regions", show_bootstraps = False, ts=ts) - if options.savepng: + if options.savepng or options.savesvg: os.system("rm test.svg 2> /dev/null") t.render("%s.svg" %(options.outfile), tree_style=ts) - os.system("convert -trim -depth 32 -background transparent %s.svg %s.png" %(options.outfile, options.outfile)) + if options.savepng: + os.system("convert -trim -depth 32 -background transparent %s.svg %s.png" %(options.outfile, options.outfile)) if options.display: t.show(tree_style=ts) diff --git a/src/db_replaceGeneNameWithAnnotation.py b/src/db_replaceGeneNameWithAnnotation.py index 719f58b..67541eb 100755 --- a/src/db_replaceGeneNameWithAnnotation.py +++ b/src/db_replaceGeneNameWithAnnotation.py @@ -4,13 +4,17 @@ from FileLocator import * from sanitizeString import * -usage="%prog [options] < infile > outfile" -description="""Look for things that look like gene IDs (fig|#.#.peg.#) in the input file -and replace them with annotation and\or organism name, properly sanitized to work in a Newick file. -Also works if the input has sanitized gene IDs (fig_#_#_peg_#) +usage=""" +%prog [options] < infile > outfile Replace with organism only: use -o Replace with annotation only: use -a -Replace with organism and annotation and keep original (sanitized) gene id: use -a -o -k""" +Replace with organism and annotation and keep original (sanitized) gene id: use -a -o -k +""" + +description="""Look for things that look like gene IDs (fig|#.#.peg.#) in the input file +and replace them with annotation and\or organism name, properly sanitized to work in a Newick file. +Also works if the input has sanitized gene IDs (fig_#_#_peg_#). """ + parser = optparse.OptionParser(usage=usage, description=description) parser.add_option("-a", "--annote", help="Include annotation (D: False)", action="store_true", dest="ann", default=False) parser.add_option("-o", "--organism", help="Include organism as part of the annotation (D: False)", action="store_true", dest="org", default=False) diff --git a/src/getRepresentativesOfCluster.sh b/src/getRepresentativesOfCluster.sh index 2d97263..5f5fac1 100755 --- a/src/getRepresentativesOfCluster.sh +++ b/src/getRepresentativesOfCluster.sh @@ -8,6 +8,9 @@ if [ $# -ne 2 ]; then echo "" echo "Description: Given a run and cluster ID, identify representatives" echo "according to the MyRAST function svr_representative_sequences." + echo "" + echo "Usage depends on installation of MyRAST on your system." + echo "" echo "WARNING: At the moment this is not a finished script and will likely be heavily" echo "modified or removed." exit 0; diff --git a/src/internal/builddb_3.sql b/src/internal/builddb_3.sql index 0f21170..3941bfc 100644 --- a/src/internal/builddb_3.sql +++ b/src/internal/builddb_3.sql @@ -26,6 +26,7 @@ CREATE TABLE contigs( /* If this one isn't unique we're in trouble */ CREATE UNIQUE INDEX contigcontigs ON contigs(contig_mod); +CREATE INDEX contigorgs ON contigs(organismid); CREATE TABLE tblastn( "queryid" TEXT, diff --git a/src/makeCoreClusterAnalysisTree.py b/src/makeCoreClusterAnalysisTree.py index f982459..3b2a374 100755 --- a/src/makeCoreClusterAnalysisTree.py +++ b/src/makeCoreClusterAnalysisTree.py @@ -17,7 +17,12 @@ description=""" Generate an ETE tree with internal node labels corresponding to the number of clusters conserved in the nodes beneath it (conservation being defined by a variery of options -below). The input MUST be a Newick file with organism IDs REPLACED with their names. +below). The input can be a Newick file with organism IDs REPLACED with their names or with the +IDs (possibly fig|[id]) as leaves. If the tree has organism IDs you should specify -i so that +the program processes it correctly. + +If the specified tree has IDs on the leaves, the names are replaced AFTER trying to reroot. +So you should specify an ID for the organism with -r The function alternatively (or in addition) exports a XLS file with sheet names equal to the node number printed on the tree containing the cluster, runid pair and a representative annotation from each cluster @@ -25,6 +30,9 @@ """ parser = optparse.OptionParser(usage=usage, description=description) +parser.add_option("-i", "--ids", + help="Select if input ID has organism IDs and not names.", + action="store_true", dest="ids", default=False) parser.add_option("-d", "--display", help="Display tree", action="store_true", dest="display", default=False) @@ -86,7 +94,6 @@ sys.stderr.write("ERROR: At least one of -a, -u, -s, -n, -y is required\n") exit(2) - if options.savepng: options.savesvg = True @@ -113,6 +120,12 @@ if options.reroot_org is not None: t = rerootEteTree(t, root_leaf_part = options.reroot_org) +if options.ids: + for node in t.iter_leaves(): + # The node may or may not have fig| in front of it. + name = node.name.replace("fig|", "") + node.name = sanitizeString(organismIdToName(name, cur), False) + # Make something pretty out of it. # We don't want bootstraps here since they just make the tree more cluttered. t, ts = prettifyTree(t, show_bootstraps = False) diff --git a/src/orthoMclWrapper.py b/src/orthoMclWrapper.py index 655d249..4aeea08 100755 --- a/src/orthoMclWrapper.py +++ b/src/orthoMclWrapper.py @@ -332,8 +332,8 @@ def getWantedOption(line, optionfinder, cmdlineoption): # Check if the output file for the specified settings already exists. # Otherwise why bother re-doing all of this? - -expectedfilename = "orthomcl_all_I_%1.4f_c_%d_%d" %(options.inflation, options.evalcut, options.pctcut) +base = os.path.basename(blastresfile) +expectedfilename = "orthomcl_%s_I_%1.4f_c_%d_%d" %(base, options.inflation, options.evalcut, options.pctcut) expectedoutput = os.path.join(os.path.dirname(locateDatabase()), "..", "clusters", expectedfilename) if not (options.forcereload or reloadDb): try: