diff --git a/pom.xml b/pom.xml index cdff170..acb56fd 100644 --- a/pom.xml +++ b/pom.xml @@ -83,10 +83,16 @@ ${powermock.version} test + uk.ac.ebi.uniprot japi - 1.0.31 + 1.0.38 org.hamcrest @@ -94,7 +100,6 @@ 2.1 test - diff --git a/src/main/java/org/reactome/release/orthopairs/Main.java b/src/main/java/org/reactome/release/orthopairs/Main.java index ef6cacc..ae10868 100644 --- a/src/main/java/org/reactome/release/orthopairs/Main.java +++ b/src/main/java/org/reactome/release/orthopairs/Main.java @@ -47,7 +47,7 @@ public static void main( String[] args ) throws IOException, ParseException, Ser String pathToSpeciesConfig = props.getProperty("pathToSpeciesConfig", "src/main/resources/Species.json"); String pantherQfOFilename = props.getProperty("pantherQfOFilename", "QfO_Genome_Orthologs.tar.gz"); String pantherHCOPFilename = props.getProperty("pantherHCOPFilename", "Orthologs_HCOP.tar.gz"); - +// String uniprotQueryBatchSize = props.getProperty("uniprotQueryBatchSize", "100"); if (releaseNumber.isEmpty()) { logger.fatal("Please populate config.properties file with releaseNumber"); throw new IllegalStateException("No releaseNumber attribute in config.properties"); diff --git a/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java b/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java index 225512a..e9bb613 100644 --- a/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java +++ b/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java @@ -12,17 +12,22 @@ import uk.ac.ebi.uniprot.dataservice.client.uniprot.UniProtService; import uk.ac.ebi.uniprot.dataservice.query.Query; +import java.net.SocketTimeoutException; + import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; +import java.time.Duration; import java.util.*; public class UniProtGeneNamesRetriever { + private static final int MAX_NUM_ATTEMPTS = 10; private static final Logger logger = LogManager.getLogger(); - private static final int MAX_UNIPROT_BATCH_QUERY_SIZE = 250; + // TODO: Make this value configurable. + private static final int MAX_UNIPROT_BATCH_QUERY_SIZE = 100; /** * Queries the UniProt mapping service through their Java API library. All Uniprot accession IDs are taken from the Panther @@ -129,25 +134,58 @@ public static Set retrieveGeneNamesFromUniProt(List> partiti int count = 0; Set uniprotAccessionsToGeneNames = new HashSet<>(); for (Set uniprotIdentifierPartition : partitionedUniProtIds) { + int currentAttemptNum = 0; // Build UniProt API query from Set of 250 UniProt identifiers. Query query = UniProtQueryBuilder.accessions(uniprotIdentifierPartition); // Perform UniProt API query to retrieve gene names associated with identifiers. - QueryResult> uniprotEntries = uniprotService.getGenes(query); - - while (uniprotEntries.hasNext()) { - count++; - // Get Gene object returned from UniProt. - UniProtComponent geneObject = uniprotEntries.next(); - if (!geneObject.getComponent().isEmpty()) { - // Iterate through all Gene components in the response. - for (Gene geneComponent : geneObject.getComponent()) { - // Tab-separate UniProt accession ID and its associated gene name, and then store these in the Set that will be returned. - uniprotAccessionsToGeneNames.add(geneObject.getAccession().toString() + "\t" + geneComponent.getGeneName().toString() + "\n"); + QueryResult> uniprotEntries = null; + while (currentAttemptNum < MAX_NUM_ATTEMPTS && uniprotEntries == null) + { + try + { + currentAttemptNum++; + uniprotEntries = uniprotService.getGenes(query); + while (uniprotEntries.hasNext()) { + count++; + // Get Gene object returned from UniProt. + UniProtComponent geneObject = uniprotEntries.next(); + if (!geneObject.getComponent().isEmpty()) { + // Iterate through all Gene components in the response. + for (Gene geneComponent : geneObject.getComponent()) { + // Tab-separate UniProt accession ID and its associated gene name, and then store these in the Set that will be returned. + uniprotAccessionsToGeneNames.add(geneObject.getAccession().toString() + "\t" + geneComponent.getGeneName().toString() + "\n"); + } + } + + if (count % 1000 == 0) { + logger.info(count + " UniProt identifiers have been queried for gene names"); + } } } - - if (count % 1000 == 0) { - logger.info(count + " UniProt identifiers have been queried for gene names"); + catch (ServiceException e) + { + // Log the exception right away, just in case any code in the exception handler fails and we don't get a chance to log it later. + logger.error(e); + // If the exception was caused by a Timeout, then we want to retry. + boolean timeoutFound = false; + int i = 0; + while (!timeoutFound && i < e.getStackTrace().length) + { + // don't be too specific - there are other classes for timeouts. ANY type timeout should trigger a wait-retry. + timeoutFound = e.getStackTrace()[i].getClassName().toUpperCase().contains("TIMEOUT"); + i++; + } + // If a timeout was found, sleep for a bit and then retry. + if (timeoutFound) + { + long sleepAmt = Duration.ofSeconds(currentAttemptNum * 2L).toMillis(); + logger.warn("A timeout exception was caught while trying to connect to the UniProt service after {} attempts. A retry will be performed after {} milliseconds", currentAttemptNum, sleepAmt); + Thread.sleep(sleepAmt); + } + else + { + logger.error("ServiceException caught while trying to communicate with UniProt: " + e.getMessage(), e); + } } } }