From 3d29ba0ec3313e0b769f8a62dae90970cb48b331 Mon Sep 17 00:00:00 2001 From: SolomonShorser-OICR Date: Fri, 21 May 2021 11:31:10 -0400 Subject: [PATCH 1/4] Update dependency. --- pom.xml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index cdff170..acb56fd 100644 --- a/pom.xml +++ b/pom.xml @@ -83,10 +83,16 @@ ${powermock.version} test + uk.ac.ebi.uniprot japi - 1.0.31 + 1.0.38 org.hamcrest @@ -94,7 +100,6 @@ 2.1 test - From c8c99cb49b7ec0b4746d499d52f38811b083eb50 Mon Sep 17 00:00:00 2001 From: SolomonShorser-OICR Date: Fri, 21 May 2021 11:32:01 -0400 Subject: [PATCH 2/4] Retry if UniProt throws a timeout exception. --- .../orthopairs/UniProtGeneNamesRetriever.java | 71 ++++++++++++++----- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java b/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java index 225512a..d0f0aed 100644 --- a/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java +++ b/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java @@ -12,15 +12,19 @@ import uk.ac.ebi.uniprot.dataservice.client.uniprot.UniProtService; import uk.ac.ebi.uniprot.dataservice.query.Query; +import java.net.SocketTimeoutException; + import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; +import java.time.Duration; import java.util.*; public class UniProtGeneNamesRetriever { + private static final int MAX_NUM_ATTEMPTS = 10; private static final Logger logger = LogManager.getLogger(); private static final int MAX_UNIPROT_BATCH_QUERY_SIZE = 250; @@ -129,27 +133,60 @@ public static Set retrieveGeneNamesFromUniProt(List> partiti int count = 0; Set uniprotAccessionsToGeneNames = new HashSet<>(); for (Set uniprotIdentifierPartition : partitionedUniProtIds) { + int currentAttemptNum = 0; // 
Build UniProt API query from Set of 250 UniProt identifiers. Query query = UniProtQueryBuilder.accessions(uniprotIdentifierPartition); // Perform UniProt API query to retrieve gene names associated with identifiers. - QueryResult> uniprotEntries = uniprotService.getGenes(query); - - while (uniprotEntries.hasNext()) { - count++; - // Get Gene object returned from UniProt. - UniProtComponent geneObject = uniprotEntries.next(); - if (!geneObject.getComponent().isEmpty()) { - // Iterate through all Gene components in the response. - for (Gene geneComponent : geneObject.getComponent()) { - // Tab-separate UniProt accession ID and its associated gene name, and then store these in the Set that will be returned. - uniprotAccessionsToGeneNames.add(geneObject.getAccession().toString() + "\t" + geneComponent.getGeneName().toString() + "\n"); - } - } - - if (count % 1000 == 0) { - logger.info(count + " UniProt identifiers have been queried for gene names"); + QueryResult> uniprotEntries = null; + while (currentAttemptNum < MAX_NUM_ATTEMPTS && uniprotEntries == null) + { + try + { + currentAttemptNum++; + uniprotEntries = uniprotService.getGenes(query); + while (uniprotEntries.hasNext()) { + count++; + // Get Gene object returned from UniProt. + UniProtComponent geneObject = uniprotEntries.next(); + if (!geneObject.getComponent().isEmpty()) { + // Iterate through all Gene components in the response. + for (Gene geneComponent : geneObject.getComponent()) { + // Tab-separate UniProt accession ID and its associated gene name, and then store these in the Set that will be returned. 
+ uniprotAccessionsToGeneNames.add(geneObject.getAccession().toString() + "\t" + geneComponent.getGeneName().toString() + "\n"); + } + } + + if (count % 1000 == 0) { + logger.info(count + " UniProt identifiers have been queried for gene names"); + } + } } - } + catch (ServiceException e) + { + // Log the exception right away, just in case any code in the exception handler fails and we don't get a chance to log it later. + logger.error(e); + // If the exception was caused by a Timeout, then we want to retry. + boolean timeoutFound = false; + int i = 0; + while (!timeoutFound && i < e.getStackTrace().length) + { + // don't be too specific - there are other classes for timeouts. ANY type timeout should trigger a wait-retry. + timeoutFound = e.getStackTrace()[i].getClassName().toUpperCase().contains("TIMEOUT"); + i++; + } + // If a timeout was found, sleep for a bit and then retry. + if (timeoutFound) + { + long sleepAmt = Duration.ofSeconds(currentAttemptNum * 2L).toMillis(); + logger.warn("A timeout exception was caught while trying to connect to the UniProt service after {} attempts. 
A retry will be performed after {} milliseconds", currentAttemptNum, sleepAmt); + Thread.sleep(sleepAmt); + } + else + { + logger.error("ServiceException caught while trying to communicate with UniProt: " + e.getMessage(), e); + } + } + } } uniprotService.stop(); From 385117ae540dc40f5436630cee310ac2cc8bfad5 Mon Sep 17 00:00:00 2001 From: SolomonShorser-OICR Date: Fri, 21 May 2021 11:36:41 -0400 Subject: [PATCH 3/4] fix whitespace *sigh* --- .../orthopairs/UniProtGeneNamesRetriever.java | 92 +++++++++---------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java b/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java index d0f0aed..3f9a4e6 100644 --- a/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java +++ b/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java @@ -133,60 +133,60 @@ public static Set retrieveGeneNamesFromUniProt(List> partiti int count = 0; Set uniprotAccessionsToGeneNames = new HashSet<>(); for (Set uniprotIdentifierPartition : partitionedUniProtIds) { - int currentAttemptNum = 0; + int currentAttemptNum = 0; // Build UniProt API query from Set of 250 UniProt identifiers. Query query = UniProtQueryBuilder.accessions(uniprotIdentifierPartition); // Perform UniProt API query to retrieve gene names associated with identifiers. QueryResult> uniprotEntries = null; - while (currentAttemptNum < MAX_NUM_ATTEMPTS && uniprotEntries == null) - { + while (currentAttemptNum < MAX_NUM_ATTEMPTS && uniprotEntries == null) + { try { - currentAttemptNum++; - uniprotEntries = uniprotService.getGenes(query); - while (uniprotEntries.hasNext()) { - count++; - // Get Gene object returned from UniProt. - UniProtComponent geneObject = uniprotEntries.next(); - if (!geneObject.getComponent().isEmpty()) { - // Iterate through all Gene components in the response. 
- for (Gene geneComponent : geneObject.getComponent()) { - // Tab-separate UniProt accession ID and its associated gene name, and then store these in the Set that will be returned. - uniprotAccessionsToGeneNames.add(geneObject.getAccession().toString() + "\t" + geneComponent.getGeneName().toString() + "\n"); - } - } - - if (count % 1000 == 0) { - logger.info(count + " UniProt identifiers have been queried for gene names"); - } - } + currentAttemptNum++; + uniprotEntries = uniprotService.getGenes(query); + while (uniprotEntries.hasNext()) { + count++; + // Get Gene object returned from UniProt. + UniProtComponent geneObject = uniprotEntries.next(); + if (!geneObject.getComponent().isEmpty()) { + // Iterate through all Gene components in the response. + for (Gene geneComponent : geneObject.getComponent()) { + // Tab-separate UniProt accession ID and its associated gene name, and then store these in the Set that will be returned. + uniprotAccessionsToGeneNames.add(geneObject.getAccession().toString() + "\t" + geneComponent.getGeneName().toString() + "\n"); + } + } + + if (count % 1000 == 0) { + logger.info(count + " UniProt identifiers have been queried for gene names"); + } + } } - catch (ServiceException e) + catch (ServiceException e) { - // Log the exception right away, just in case any code in the exception handler fails and we don't get a chance to log it later. - logger.error(e); - // If the exception was caused by a Timeout, then we want to retry. - boolean timeoutFound = false; - int i = 0; - while (!timeoutFound && i < e.getStackTrace().length) - { - // don't be too specific - there are other classes for timeouts. ANY type timeout should trigger a wait-retry. - timeoutFound = e.getStackTrace()[i].getClassName().toUpperCase().contains("TIMEOUT"); - i++; - } - // If a timeout was found, sleep for a bit and then retry. 
- if (timeoutFound) - { - long sleepAmt = Duration.ofSeconds(currentAttemptNum * 2L).toMillis(); - logger.warn("A timeout exception was caught while trying to connect to the UniProt service after {} attempts. A retry will be performed after {} milliseconds", currentAttemptNum, sleepAmt); - Thread.sleep(sleepAmt); - } - else - { - logger.error("ServiceException caught while trying to communicate with UniProt: " + e.getMessage(), e); - } - } - } + // Log the exception right away, just in case any code in the exception handler fails and we don't get a chance to log it later. + logger.error(e); + // If the exception was caused by a Timeout, then we want to retry. + boolean timeoutFound = false; + int i = 0; + while (!timeoutFound && i < e.getStackTrace().length) + { + // don't be too specific - there are other classes for timeouts. ANY type timeout should trigger a wait-retry. + timeoutFound = e.getStackTrace()[i].getClassName().toUpperCase().contains("TIMEOUT"); + i++; + } + // If a timeout was found, sleep for a bit and then retry. + if (timeoutFound) + { + long sleepAmt = Duration.ofSeconds(currentAttemptNum * 2L).toMillis(); + logger.warn("A timeout exception was caught while trying to connect to the UniProt service after {} attempts. A retry will be performed after {} milliseconds", currentAttemptNum, sleepAmt); + Thread.sleep(sleepAmt); + } + else + { + logger.error("ServiceException caught while trying to communicate with UniProt: " + e.getMessage(), e); + } + } + } } uniprotService.stop(); From 54c80e4da444ca9fe1ebd6efc5469a8b3cb26ce2 Mon Sep 17 00:00:00 2001 From: SolomonShorser-OICR Date: Fri, 21 May 2021 11:48:04 -0400 Subject: [PATCH 4/4] Reduce batch size. 
--- src/main/java/org/reactome/release/orthopairs/Main.java | 2 +- .../reactome/release/orthopairs/UniProtGeneNamesRetriever.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/reactome/release/orthopairs/Main.java b/src/main/java/org/reactome/release/orthopairs/Main.java index ef6cacc..ae10868 100644 --- a/src/main/java/org/reactome/release/orthopairs/Main.java +++ b/src/main/java/org/reactome/release/orthopairs/Main.java @@ -47,7 +47,7 @@ public static void main( String[] args ) throws IOException, ParseException, Ser String pathToSpeciesConfig = props.getProperty("pathToSpeciesConfig", "src/main/resources/Species.json"); String pantherQfOFilename = props.getProperty("pantherQfOFilename", "QfO_Genome_Orthologs.tar.gz"); String pantherHCOPFilename = props.getProperty("pantherHCOPFilename", "Orthologs_HCOP.tar.gz"); - +// String uniprotQueryBatchSize = props.getProperty("uniprotQueryBatchSize", "100"); if (releaseNumber.isEmpty()) { logger.fatal("Please populate config.properties file with releaseNumber"); throw new IllegalStateException("No releaseNumber attribute in config.properties"); diff --git a/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java b/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java index 3f9a4e6..e9bb613 100644 --- a/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java +++ b/src/main/java/org/reactome/release/orthopairs/UniProtGeneNamesRetriever.java @@ -26,7 +26,8 @@ public class UniProtGeneNamesRetriever { private static final int MAX_NUM_ATTEMPTS = 10; private static final Logger logger = LogManager.getLogger(); - private static final int MAX_UNIPROT_BATCH_QUERY_SIZE = 250; + // TODO: Make this value configurable. + private static final int MAX_UNIPROT_BATCH_QUERY_SIZE = 100; /** * Queries the UniProt mapping service through their Java API library. All Uniprot accession IDs are taken from the Panther