Skip to content

Suggestion to integrate caching and progress w/example provided =) #57

@ScottCoffin

Description

@ScottCoffin

I'm getting classification data using this package for a very large set of chemicals (>70k), so I wrote a custom function that caches results with `saveRDS()` every 1 minute and can restart from the saved file. This is extremely useful, as their API is SLOOOOOWWWW!! For my dataset, it's estimated that it will take >70 hrs to pull, with an average of ~4.7 seconds per chemical. This is ~8x slower than the NTP's API, btw.

It seems that this package is not being updated regularly, so for fellow data miners, feel free to use my custom function below to implement caching and progress updates!

Input is a list of InChI_Keys and a cache file location in your repo. Just set `resume = TRUE` to resume from the cached file! Cheers!

#' Query ClassyFire classifications with on-disk caching and progress reports
#'
#' Calls `get_classification()` once per InChIKey, collects the results in a
#' single data frame and periodically writes that data frame to `cache_file`
#' so that a long run can be resumed later.
#'
#' `get_classification()` must be in scope when this function is called
#' (presumably via `library(classyfireR)` -- confirm against your setup). The
#' dplyr and crayon packages must be installed; they are called through `::`
#' and are not attached.
#'
#' @param InChI_Keys Character vector (or list) of InChIKeys to query. `NA`
#'   entries are dropped and duplicates are queried only once.
#' @param cache_file Path of the `.rds` file used to store results. Its
#'   directory is created if it does not exist.
#' @param resume If `TRUE` and `cache_file` exists, the cached results are
#'   loaded and keys already present in its `InChI_key` column are skipped.
#' @param save_interval_mins Minimum number of minutes between two
#'   intermediate saves.
#' @param progress_every Print a progress report every this many chemicals.
#' @return A data frame with the cached rows (when resuming) followed by the
#'   rows obtained in this run. A `NULL` result from `get_classification()`
#'   is recorded as a row holding only `InChI_key`.
get_classification_custom_function <- function(InChI_Keys,
                                               cache_file = "data/classifications_cache.rds",
                                               resume = FALSE,
                                               save_interval_mins = 1,
                                               progress_every = 10) {

  stopifnot(
    is.character(cache_file), length(cache_file) == 1L,
    is.numeric(save_interval_mins), length(save_interval_mins) == 1L,
    !is.na(save_interval_mins), save_interval_mins >= 0,
    is.numeric(progress_every), length(progress_every) == 1L,
    !is.na(progress_every), progress_every >= 1
  )

  # Accept a list as well as a vector; NA keys cannot be queried.
  InChI_Keys <- as.character(unlist(InChI_Keys, use.names = FALSE))
  InChI_Keys <- InChI_Keys[!is.na(InChI_Keys)]

  # Create the cache directory up front so the first save (possibly minutes
  # into the run) cannot fail because the folder is missing.
  cache_dir <- dirname(cache_file)
  if (!dir.exists(cache_dir)) {
    dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
  }

  all_results <- NULL          # rows already merged (and written to disk)
  pending <- list()            # rows collected since the last merge
  queried_ids <- character(0)  # keys already present in the cache

  # Load cached results if resuming
  if (isTRUE(resume) && file.exists(cache_file)) {
    cat("Resuming from cached results...\n")
    all_results <- readRDS(cache_file)
    if (!is.data.frame(all_results)) {
      stop("Cache file does not contain a data frame: ", cache_file,
           call. = FALSE)
    }
    queried_ids <- unique(as.character(all_results[["InChI_key"]]))
    cat(sprintf("Number of chemicals in cached results: %d\n",
                length(queried_ids)))
  }

  # Exclude already queried chemicals (setdiff also removes duplicates)
  remaining_ids <- setdiff(InChI_Keys, queried_ids)
  n_remaining <- length(remaining_ids)
  cat(sprintf("Number of chemicals remaining to query: %d\n", n_remaining))

  cat(crayon::blue(sprintf(
    "Starting API query from ClassyFire for %d chemicals.\n", n_remaining
  )))

  # If the run is interrupted or a query throws, write whatever has been
  # collected since the last save instead of losing it.
  finished <- FALSE
  on.exit({
    if (!finished && length(pending) > 0) {
      saveRDS(dplyr::bind_rows(all_results, pending), cache_file)
      cat("Run stopped early; partial results saved to cache file.\n")
    }
  }, add = TRUE)

  processing_times <- numeric(n_remaining)
  last_save_time <- Sys.time()
  start_time <- Sys.time()  # Start time for the entire process

  for (i in seq_along(remaining_ids)) {
    chemical_id <- remaining_ids[i]

    # Measure the time for this query
    single_start_time <- Sys.time()
    result <- get_classification(chemical_id)

    if (!is.null(result)) {
      # Verbose description, repeated on every classification row
      description <- result@description
      row <- dplyr::mutate(
        as.data.frame(result@classification),
        description = description,
        InChI_key = chemical_id
      )
    } else {
      # Keep a placeholder row so the key counts as "queried" on resume
      row <- data.frame(InChI_key = chemical_id)
    }
    # Collect rows in a list and bind them in bulk; binding the full table
    # on every iteration is quadratic in the number of chemicals.
    pending[[length(pending) + 1L]] <- row

    processing_times[i] <- as.numeric(
      difftime(Sys.time(), single_start_time, units = "secs")
    )

    # Save intermediate results once the save interval has elapsed
    current_time <- Sys.time()
    elapsed_mins <- as.numeric(
      difftime(current_time, last_save_time, units = "mins")
    )
    if (elapsed_mins >= save_interval_mins) {
      cat(crayon::yellow(sprintf(
        "Saving intermediate results at chemical %d...\n", i
      )))
      all_results <- dplyr::bind_rows(all_results, pending)
      pending <- list()
      saveRDS(all_results, cache_file)
      # Reports the in-memory size of the results object
      cat(crayon::yellow(sprintf(
        "Total size of the file so far: %s\n",
        format(utils::object.size(all_results), units = "auto")
      )))
      last_save_time <- current_time
    }

    # Report progress at multiples of `progress_every` chemicals
    if (i %% progress_every == 0) {
      # Moving average over (at most) the last 1,000 chemicals
      avg_time <- mean(processing_times[max(1L, i - 999L):i])
      estimated_time <- avg_time * (n_remaining - i)

      cat(crayon::magenta(sprintf("%d chemicals queried so far...\n", i)))
      cat(crayon::blue(sprintf(
        "Average time per chemical (last 1,000): %.2f seconds\n", avg_time
      )))
      cat(crayon::blue(sprintf(
        "Estimated time remaining: %.2f seconds (%.2f minutes; %.2f hours)\n",
        estimated_time, estimated_time / 60, estimated_time / 3600
      )))
    }
  } # end for-loop

  # Merge and save BEFORE computing the summary, so a problem in the
  # reporting code can never cost collected data.
  all_results <- dplyr::bind_rows(all_results, pending)
  pending <- list()
  saved <- n_remaining > 0L
  if (saved) {
    saveRDS(all_results, cache_file)
  }
  finished <- TRUE

  # Report total time taken
  total_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  cat(crayon::blue(sprintf(
    "Total time taken: %.2f seconds (%.2f minutes; %.2f hours)\n",
    total_time, total_time / 60, total_time / 3600
  )))

  # Count the chemicals of this run with a successful classification. The
  # `Classification` column is absent when no query has ever succeeded.
  result_keys <- all_results[["InChI_key"]]
  successful_data_count <- 0L
  if ("Classification" %in% names(all_results)) {
    is_success <- result_keys %in% remaining_ids &
      !is.na(all_results[["Classification"]])
    successful_data_count <- length(unique(result_keys[is_success]))
  }

  cat(crayon::cyan(sprintf(
    "%d Successful classifications of %d chemicals queried\n",
    successful_data_count, n_remaining
  )))
  cat(crayon::cyan(sprintf(
    "%d Total chemicals in saved classification RDS file\n",
    length(unique(result_keys))
  )))

  if (saved) {
    cat(crayon::yellow("Final results saved to cache file!\n"))
  } else {
    # Nothing was queried: leave any existing cache file untouched rather
    # than rewriting it (or overwriting it with an empty table).
    cat(crayon::yellow("Nothing new to save; cache file left untouched.\n"))
  }

  all_results
} # close function

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions