mattb112885 · ghost · May 28, 2014 · May 28, 2014 · May 28, 2014 · May 29, 2014
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
@@ -10,17 +10,21 @@ NEW FEATURES:
     - Getting gene neighborhoods and overlaying on the tree
     - Getting basic information about related genes
     - Getting BLAST support for a gene family.
+    - Building an organism tree from a selection of core genes
   Note you must have already built the database before running the GUI. Using the gui requires you to have
   access to an X11 server and the easygui Python package. It should be considered highly experimental / breakable
   at this point.
-- OrthoMCL wrapper now takes as input an arbitrary table of BLAST results from the ITEP database
+- OrthoMCL wrapper now takes as input an arbitrary table of BLAST results from the ITEP database instead of trying to
+  run OrthoMCL with everything. Use db_getBlastResultsBetweenSpecificOrganisms.py to generate the input table.
 - makeCoreClusterAnalysisTree.py now optionally can be used to analyze presence-absence patterns relative to sister clades,
   rather than relative to all organisms in the cluster run. See help text for details.
 - db_makeClusterComparisonTable.py - identify genes in the same cluster as a reference across cluster runs and provide
   a table in a suitable format for displaying with db_displayTree.py
 - db_getEquivalentGenesInOrganism.py - Given a list of query genes, a run ID and an organism, finds all genes in the provided organism that are in the
   same cluster as the query genes and returns a conversion table.
 - Make a diagram of neighbors for a single gene
+- Most table-generating functions support use of a --header argument (use this only for the last command in a pipeline). The header contains descriptions of the rows' contents.
+  Exceptions include scripts that add columns to an existing table and functions that always produce headers (e.g. db_getPresenceAbsenceTable.py)
 
 BUG FIXES:
 - Check for biopython version no longer fails due to letters in the version number. Thanks to Matt Richards for reporting.
@@ -37,6 +41,8 @@ BUG FIXES:
 - Made SourceMe.sh more robust to spaces in the current PATH definition. Thanks to drjcthrash for the inspiration.
 - Improved performance of run ID listing and organism links.
 - Fixed bug in definition of distinctorgs view (and turned it into a table).
+- Improved concurrency of multiple scripts. Removed all temporary tables in sqlite except one (in db_getBlastResultsBetweenSpecificOrganisms) for performance reasons.
+  Some scripts that previously did not require an explicit output location now do, to remove confusion about where files end up.
 
 VIRTUAL MACHINE FIXES:
 - Added FastTreeMP to virtual machines

diff --git a/checkForDependencies.sh b/checkForDependencies.sh
@@ -22,6 +22,7 @@ command -v blastp > /dev/null 2>&1 || {
     echo "ERROR: Unable to find NCBI BLAST+ which is required";
     echo "It can be found at ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/"
     echo "Note that the \"blast\" in aptitude is NOT BLAST+ - you must have e.g. the blastp executable"
+    echo "Also note that to use RPSBLAST you will need a recent enough version. 2.28 is known to work."
     echo "";
     STATUS=1;
 }
@@ -56,6 +57,7 @@ if [ $? -ne 0 ]; then
     echo ""
     echo "ERROR: Unable to find required package Biopython (Bio). This package is needed for many IO and graphics operations in ITEP"
     echo "It can be downloaded from http://biopython.org or installed (via setuptools) with sudo easy_install -f http://biopython.org/DIST/ biopython"
+    echo "Newer versions are less likely to have issues with parsing Genbank files."
     echo ""
     STATUS=1;
 fi
@@ -98,14 +100,14 @@ command -v CompareToBootstrap.pl > /dev/null 2>&1 || {
 command -v FastTreeMP > /dev/null 2>&1 || {
     echo "";
     echo "WARNING: Unable to find FastTreeMP with the default program name - will need to specify the actual program name to";
-    echo "use the wrapper script for FastTree";
+    echo "use the wrapper script for FastTree (The default name is FastTreeMP)";
     echo "";
 }
 
 command -v Gblocks > /dev/null 2>&1 || {
     echo "";
     echo "WARNING: Unable to find Gblocks with default program name - will need to specify the actual program name to ";
-    echo "use the wrapper script for GBlocks";
+    echo "use the wrapper script for GBlocks (the default name is Gblocks).";
     echo "If you don't have it, it can be found at http://molevol.cmima.csic.es/castresana/Gblocks.html";
     echo "";
 }
@@ -147,7 +149,8 @@ command -v perl > /dev/null 2>&1 || {
 command -v 'raxmlHPC-PTHREADS' > /dev/null 2>&1 || {
     echo "";
     echo "WARNING: Unable to find raxml with the default program name - will need to specify the actual program name to";
-    echo "use the wrapper script for RaxML";
+    echo "use the wrapper script for RaxML. The default program name is raxmlHPC-PTHREADS";
+    echo "";
     echo "If you dont have RaxML it can be downloaded at http://www.exelixis-lab.org/ or checked out of github"
     echo "using git clone git@github.com:stamatak/standard-RAxML (requires a github account and SSH key)."
     echo "";
@@ -194,4 +197,12 @@ if [ $? -ne 0 ]; then
     echo ""
 fi
 
+# Optional dependency check: a missing xlwt only produces a warning; STATUS is not modified here.
+python -c 'import xlwt'
+if [ $? -ne 0 ]; then
+    echo ""
+    echo "WARNING: Unable to find the Python package xlwt. This package is needed to export to Excel (most scripts do not need this)."
+    echo "It can be found at https://pypi.python.org/pypi/xlwt or installed via pip install xlwt"
+    echo ""
+fi
+
 exit ${STATUS};
diff --git a/gui/FetchOrganismFromGenbank.py b/gui/FetchOrganismFromGenbank.py
@@ -0,0 +1,103 @@
+#!/usr/bin/python
+
+''' This file contains some functions to make it easier to automate fetching genomes from Entrez '''
+
+import easygui
+import optparse
+import sys
+from Bio import Entrez
+
+from GuiBase import *
+
+class NcbiError(GuiError):
+    pass
+
+class OrganismImporter(GuiBase):
+    ''' 
+    You must specify an email so NCBI can nag you if you use too much of their resources.
+
+    This class contains functions for users to identify Genbank files to download.
+    The class functions download the selected files and places them in the specified location.
+    '''
+    def __init__(self, email):
+        Entrez.email = email
+        return
+    def _checkForEntrezError(self, record):
+        ''' Raise specific errors if Entrez raised them '''
+        if "ErrorList" in record:
+            errorstr = ""
+            for key in record["ErrorList"]:
+                if len(record["ErrorList"][key]) > 0:
+                    errorstr += "Error %s: %s\n" %(key, record["ErrorList"][key])
+            raise NcbiError("Error querying database: \n\n %s \n " %(errorstr))
+        return
+    def searchForOrganism(self, orgname):
+        ''' Look for organisms matching the specified organism string in the Genome database. '''
+        RETMAX=100
+        handle = Entrez.esearch(db="genome", term="%s[organism]" %(orgname), retmax=RETMAX)
+        record = Entrez.read(handle)
+        self._checkForEntrezError(record)
+        print record
+
+        idList = record["IdList"]
+        # Note - we could run into trouble here if idList is too long.
+        # should break it up.
+        handle = Entrez.efetch(db="genome", id=idList, rettype = "docsum", retmax=RETMAX)
+        records = Entrez.read(handle)
+        results_tuples = []
+        for record in records:
+            self._checkForEntrezError(record)
+            genome_id = record["Id"]
+            assembly_id = record["AssemblyID"]
+            organism_name = record["Organism_Name"]
+            results_tuples.append( (genome_id, assembly_id, organism_name) )
+        return results_tuples
+    def selectOrganism(self, results_tuples):
+        ''' Ask user for which of the found organisms he or she wants. '''
+        choices = []
+        for tup in results_tuples:
+            stri = "%s || %s || %s" %(tup[2], tup[0], tup[1])
+            choices.append(stri)
+        chosen_list = easygui.multchoicebox(msg="Select one or more organisms.\n Format is organism name || Genome ID || Assembly ID. \n", 
+                                             title = "Select organisms. ", choices=choices)
+        if chosen_list is None or len(chosen_list) == 0:
+            raise UserCancelError("User cancelled the operation or did not select any organisms!")
+
+        genome_ids = []
+        for choice in chosen_list:
+            spl = choice.split("||")
+            # We only need the assembly IDs
+            genome_id = spl[1].replace(" ", "")
+            genome_ids.append(genome_id)
+        return genome_ids
+    def downloadGenbank(self, genome_ids):
+        ''' Download Genbank files for a genome with specified Genome ID '''
+        # Ask where are we going to save the files?
+        # Get nucleotide IDs (elink to nuccore)
+        # Efetch them [Note - will need to do batching here]. Get docsum first.
+        # Check the ID ("caption" field) and match to the appropriate database.
+        # Make sure "ReplacedBy" is empty.
+        # Efetch again but get genbank (gb) this time.
+        # Save the files with name = a sanitized version of "Title"
+
+
+if __name__ == "__main__":
+
+    usage = "%prog organism_name email"
+    description = """A UI for searching for and downloading Genbank files for a single organism or group of organisms
+ from the NCBI Genome database. Note that the search is done by species, and that the Genbank files for individual strains  
+will need to be separated and concatenated after this script is done running."""
+
+    parser = optparse.OptionParser(usage=usage, description=description)
+    (options, args) = parser.parse_args()
+
+    # FIXME: These should be inputted into the UI.
+    if len(args) < 2:
+        sys.stderr.write("ERROR: Email address and organism name are required arguments.\n")
+        exit(2)
+
+    orgstring = args[0]
+    email = args[1]
+    importer = OrganismImporter(email)
+    found_organisms = importer.searchForOrganism(orgstring)
+    selected_organisms = importer.selectOrganism(found_organisms)