-
Notifications
You must be signed in to change notification settings - Fork 11
Open
Description
I'm getting classification data using this package for a very large set of chemicals (>70k), so I wrote a custom function that caches results with saveRDS() every minute and can restart from the saved file. This is extremely useful because the ClassyFire API is very slow: for my dataset the pull is estimated to take >70 hrs, at an average of ~4.7 seconds per chemical. This is ~8x slower than the NTP's API, by the way.
It seems that this package is not being updated regularly, so for fellow data miners, feel free to use my custom function below to implement caching and progress updates!
The inputs are a vector of InChIKeys (`InChI_Keys`) and the path of a cache file in your repo (`cache_file`). Set `resume = TRUE` to restart from the cached file. Cheers!
#' Query ClassyFire classifications with on-disk caching and progress output
#'
#' Calls `get_classification()` once per identifier, collects the results in a
#' single data frame, checkpoints that data frame to `cache_file` roughly once
#' per minute, and prints coloured progress / ETA messages. A later call with
#' `resume = TRUE` skips every identifier already present in the cache.
#'
#' `get_classification()` is called unqualified, so it must be available on
#' the search path (presumably via `library(classyfireR)` -- confirm for your
#' setup). `dplyr` and `crayon` must be installed; they are not attached.
#'
#' @param InChI_Keys Character vector of InChIKeys to query. Duplicates, `NA`
#'   and empty strings are dropped before querying.
#' @param cache_file Path of the `.rds` file used for checkpoints and for the
#'   final result. Its directory is created if it does not exist.
#' @param resume If `TRUE` and `cache_file` exists, start from the cached
#'   results and query only identifiers that are not in it yet.
#' @return A data frame with one row per classification level of each
#'   successfully classified chemical (columns of the `classification` slot
#'   plus `description` and `InChI_key`), and a single `InChI_key`-only row
#'   for each chemical that returned no classification. Identifiers whose
#'   query raised an error are reported but not recorded, so a resumed run
#'   retries them.
get_classification_custom_function <- function(InChI_Keys,
                                               cache_file = "data/classifications_cache.rds",
                                               resume = FALSE) {
  # Append a list of per-chemical data frames to the accumulated results.
  # Binding once per checkpoint (instead of once per chemical) avoids
  # re-copying the whole table on every iteration.
  combine_results <- function(existing, new_rows) {
    pieces <- new_rows
    if (is.data.frame(existing)) {
      pieces <- c(list(existing), new_rows)
    }
    if (length(pieces) == 0L) {
      return(existing)
    }
    dplyr::bind_rows(pieces)
  }

  all_results <- NULL
  pending <- list() # results fetched since the last merge into all_results
  queried_ids <- character(0)

  # Load cached results if resuming
  if (resume && file.exists(cache_file)) {
    cat("Resuming from cached results...\n")
    cached <- readRDS(cache_file)
    if (is.data.frame(cached) && "InChI_key" %in% names(cached)) {
      all_results <- cached
      queried_ids <- unique(as.character(cached$InChI_key))
    } else if (length(cached) > 0L) {
      stop("Cache file '", cache_file,
           "' does not contain a data frame with an 'InChI_key' column.",
           call. = FALSE)
    }
    cat(sprintf("Number of chemicals in cached results: %d\n", length(queried_ids)))
  }

  # Exclude unusable and already queried identifiers (setdiff also de-duplicates)
  requested_ids <- as.character(InChI_Keys)
  requested_ids <- requested_ids[!is.na(requested_ids) & nzchar(requested_ids)]
  remaining_ids <- setdiff(requested_ids, queried_ids)
  n_remaining <- length(remaining_ids)
  cat(sprintf("Number of chemicals remaining to query: %d\n", n_remaining))
  cat(crayon::blue(sprintf("Starting API query from ClassyFire for %d chemicals.\n", n_remaining)))

  # Make sure checkpoints can be written before spending time on queries
  cache_dir <- dirname(cache_file)
  if (!dir.exists(cache_dir)) {
    dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
  }

  # If the run is aborted (error or user interrupt), keep what was fetched
  # since the last checkpoint instead of losing it.
  completed <- FALSE
  on.exit(
    if (!completed && length(pending) > 0L) {
      try(saveRDS(combine_results(all_results, pending), cache_file), silent = TRUE)
    },
    add = TRUE
  )

  processing_times <- numeric(n_remaining)
  start_time <- Sys.time() # Start time for the entire process
  last_save_time <- start_time # Track the last time results were saved

  for (i in seq_along(remaining_ids)) {
    chemical_id <- remaining_ids[i]
    single_start_time <- Sys.time()

    # Query the ClassyFire API; a failing request must not end the whole run
    fetched <- tryCatch(
      list(ok = TRUE, value = get_classification(chemical_id)),
      error = function(e) {
        cat(crayon::red(sprintf("Query failed for %s: %s\n",
                                chemical_id, conditionMessage(e))))
        list(ok = FALSE, value = NULL)
      }
    )

    if (fetched$ok) {
      result <- fetched$value
      classification <- NULL
      if (!is.null(result)) {
        classification <- as.data.frame(result@classification)
      }
      if (!is.null(classification) && nrow(classification) > 0L) {
        # Verbose description, normalised to one value so it can be recycled
        description <- result@description
        if (length(description) == 0L) {
          description <- NA_character_
        } else if (length(description) > 1L) {
          description <- paste(description, collapse = " ")
        }
        row <- dplyr::mutate(classification,
                             description = description,
                             InChI_key = chemical_id)
      } else {
        # No classification available: keep the key so it is not re-queried
        row <- data.frame(InChI_key = chemical_id, stringsAsFactors = FALSE)
      }
      pending[[length(pending) + 1L]] <- row
    }

    # Processing time for this chemical
    current_time <- Sys.time()
    processing_times[i] <- as.numeric(
      difftime(current_time, single_start_time, units = "secs")
    )

    # Save results every 1 minute
    if (as.numeric(difftime(current_time, last_save_time, units = "mins")) >= 1) {
      cat(crayon::yellow(sprintf("Saving intermediate results at chemical %d...\n", i)))
      all_results <- combine_results(all_results, pending)
      pending <- list()
      saveRDS(all_results, cache_file)
      cat(crayon::yellow(sprintf("Total size of the file so far: %s\n",
                                 format(utils::object.size(all_results), units = "auto"))))
      last_save_time <- current_time
    }

    # Report progress at multiples of 10 chemicals
    if (i %% 10 == 0) {
      # Moving average over (at most) the last 1,000 chemicals
      window_start <- max(1L, i - 999L)
      avg_time <- mean(processing_times[window_start:i])
      estimated_time <- avg_time * (n_remaining - i)
      cat(crayon::magenta(sprintf("%d chemicals queried so far...\n", i)))
      cat(crayon::blue(sprintf("Average time per chemical (last 1,000): %.2f seconds\n", avg_time)))
      cat(crayon::blue(sprintf("Estimated time remaining: %.2f seconds (%.2f minutes; %.2f hours)\n",
                               estimated_time, estimated_time / 60, estimated_time / 3600)))
    }
  }

  # Merge whatever was fetched after the last checkpoint
  all_results <- combine_results(all_results, pending)
  pending <- list()
  if (is.null(all_results)) {
    # Nothing cached and nothing fetched: still return a well-formed table
    all_results <- data.frame(InChI_key = character(0), stringsAsFactors = FALSE)
  }

  # Report total time taken
  total_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  cat(crayon::blue(sprintf("Total time taken: %.2f seconds (%.2f minutes; %.2f hours)\n",
                           total_time, total_time / 60, total_time / 3600)))

  # Count the chemicals of this run with a successful classification. The
  # Classification column is absent when no query has ever succeeded.
  successful_data_count <- 0L
  if ("Classification" %in% names(all_results)) {
    is_success <- all_results$InChI_key %in% remaining_ids &
      !is.na(all_results$Classification)
    successful_data_count <- length(unique(all_results$InChI_key[is_success]))
  }
  cat(crayon::cyan(sprintf("%d Successful classifications of %d chemicals queried\n",
                           successful_data_count, n_remaining)))
  total_count_RDS <- length(unique(all_results$InChI_key))
  cat(crayon::cyan(sprintf("%d Total chemicals in saved classification RDS file\n",
                           total_count_RDS)))

  # Save final results
  saveRDS(all_results, cache_file)
  completed <- TRUE
  cat(crayon::yellow("Final results saved to cache file!\n"))
  all_results
}
Metadata
Assignees
Labels
No labels