Changes from all commits (83 commits)
782cc93 update db script (cdiener, Mar 3, 2025)
805683a update db script (cdiener, Mar 3, 2025)
1d2252c add the curated IDs (cdiener, Mar 17, 2025)
010b3dd add the curated IDs (cdiener, Mar 17, 2025)
b407a18 swich to single downloads (cdiener, Mar 18, 2025)
ad31b08 no duplicate matches, switch to single gb downloads (cdiener, Mar 18, 2025)
9e2c458 fixes from cluster (cdiener, Mar 18, 2025)
7fa555d fixes from cluster (cdiener, Mar 18, 2025)
6ba1d04 several fixes (cdiener, Mar 18, 2025)
f48bd1b local fixes (Mar 18, 2025)
39c9031 move logs (cdiener, Mar 18, 2025)
1db29f9 only download unique (Mar 18, 2025)
44dd942 fix download script (cdiener, Mar 18, 2025)
3465218 skip problems (Mar 18, 2025)
5090479 fix download (cdiener, Mar 18, 2025)
9b8e13c avoid looking for nucleotide sequences for nonredundant genera (cdiener, Mar 19, 2025)
da5b922 some fixes (Mar 19, 2025)
9ac0568 add scores beforehand (cdiener, Mar 20, 2025)
1f1e4c5 add scores beforehand (cdiener, Mar 20, 2025)
f536e5b add scores beforehand (cdiener, Mar 20, 2025)
36e0d0f fix nucleoide logger (cdiener, Mar 20, 2025)
2e46374 fix nucleoide logger (cdiener, Mar 20, 2025)
9327d03 fix nucleoide logger (cdiener, Mar 20, 2025)
db0f1ab add decoys (cdiener, Mar 20, 2025)
a1126a2 fix decoy add (cdiener, Apr 3, 2025)
12411d9 major refactor to build_kraken, now cleanly resumable (cdiener, Apr 8, 2025)
08072e3 major refactor to build_kraken, now cleanly resumable (cdiener, Apr 8, 2025)
5189543 fixes to merging (Apr 3, 2025)
78a24bb more refactor (cdiener, Apr 8, 2025)
86af94c add ftp option to kraken db downloads (Apr 9, 2025)
60c4a6e further improvements to build (Apr 24, 2025)
b046ff7 make compatible with new NCBI ranks (cdiener, Apr 17, 2025)
30447cc several fixes to pipeline (cdiener, May 5, 2025)
bb0684f remove old prefix (cdiener, May 5, 2025)
759f56e add rsync option to database download to enable https downloads (cdiener, Apr 9, 2025)
580a100 fix scoring (cdiener, Oct 31, 2025)
c98f0a6 changes to matching and downloading (cdiener, Nov 3, 2025)
52c311d fix match (cdiener, Nov 3, 2025)
1a9bd9c fix match (cdiener, Nov 3, 2025)
c448541 fix match (cdiener, Nov 3, 2025)
85a25d4 fix match (cdiener, Nov 3, 2025)
da68d5a fix match (cdiener, Nov 3, 2025)
6e6a3a0 fix match (cdiener, Nov 3, 2025)
1acbc45 fix match (cdiener, Nov 3, 2025)
9a825bf fix summary (cdiener, Nov 4, 2025)
7bd3aa5 fix summary (cdiener, Nov 4, 2025)
1b85040 provide more info (cdiener, Nov 4, 2025)
6758c5d provide more info (cdiener, Nov 4, 2025)
e532ddd provide more info (cdiener, Nov 4, 2025)
4927ae0 provide more info (cdiener, Nov 4, 2025)
76021d9 handle missing fields in summary (cdiener, Nov 4, 2025)
750ace4 handle missing fields in summary (cdiener, Nov 4, 2025)
b42f778 remove redundant entries (cdiener, Nov 4, 2025)
16ae788 remove redundant entries (cdiener, Nov 4, 2025)
133d053 remove redundant entries (cdiener, Nov 4, 2025)
c27a185 fix redundant entries (cdiener, Nov 4, 2025)
0333006 fix redundant entries (cdiener, Nov 4, 2025)
773aaad add mem (cdiener, Oct 31, 2025)
c9208e0 some small fixes (cdiener, Nov 17, 2025)
50ba29c allow more mem for preprocessing (cdiener, Nov 24, 2025)
4ea06a5 check for errors too (cdiener, Nov 24, 2025)
165561d handle insufficient reads for bracken (cdiener, Nov 24, 2025)
3592df7 handle insufficient reads for bracken (cdiener, Nov 24, 2025)
0060d61 add Dockerfile (cdiener, Dec 2, 2025)
3d30a26 add image build (cdiener, Dec 2, 2025)
bc7c499 see if this works (cdiener, Dec 2, 2025)
a0884f4 see if this works (cdiener, Dec 2, 2025)
8fda5e0 see if this works (cdiener, Dec 2, 2025)
883e965 see if this works (cdiener, Dec 2, 2025)
a34fde0 see if this works (cdiener, Dec 2, 2025)
a5814aa see if this works (cdiener, Dec 2, 2025)
9805439 fix paths (cdiener, Dec 2, 2025)
7c6a011 add patch (cdiener, Dec 2, 2025)
2f9743d add patch (cdiener, Dec 2, 2025)
d9a6a62 add patch (cdiener, Dec 2, 2025)
954268d add patch (cdiener, Dec 2, 2025)
fb850f6 add patch (cdiener, Dec 2, 2025)
4a9cfeb add patch (cdiener, Dec 2, 2025)
c3ea9d6 fix copy (cdiener, Dec 2, 2025)
33ce339 change entrypoint (cdiener, Dec 3, 2025)
45d98e9 add install script (cdiener, Dec 3, 2025)
90b1a84 update quant as well (cdiener, Jan 29, 2026)
eabc1df Merge branch 'main' into feature/new_db (cdiener, Jan 29, 2026)
37 changes: 37 additions & 0 deletions .github/workflows/image.yml
@@ -0,0 +1,37 @@
name: Build and push the MEDI image

on:
  push:
    branches:
      - "**"
    tags:
      - '[0-9]+.[0-9]+.[0-9]+'
  pull_request:

env:
  IMAGE_NAME: cdiener/medi

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.IMAGE_NAME }}
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Build and push
        id: docker_build
        uses: docker/build-push-action@v5
        with:
          push: ${{ contains(github.ref, 'tag') }}
          tags: ${{ steps.meta.outputs.tags }}
          annotations: ${{ steps.meta.outputs.annotations }}
      - name: Image digest
        run: echo ${{ steps.docker_build.outputs.digest }}
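
Note on the push condition: because github.ref only contains the substring "tag" for tag refs (refs/tags/1.2.3), the image is built on every branch push and pull request but only pushed to Docker Hub for version tags, assuming no branch name itself contains "tag".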
18 changes: 18 additions & 0 deletions Dockerfile
@@ -0,0 +1,18 @@
FROM docker.io/condaforge/miniforge3:latest

RUN mkdir /tmp/medi /tmp/medi/bin

COPY medi.yml Makefile patches/*.patch /tmp/medi/

RUN mamba env create -n medi -f /tmp/medi/medi.yml && \
    . ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate medi && \
    cd /tmp/medi && make report && mv /tmp/medi/bin/kraken2-report /bin && \
    patch ${CONDA_DIR}/envs/medi/share/kraken2-2.1.3-4/libexec/build_kraken2_db.sh /tmp/medi/build.patch && \
    patch ${CONDA_DIR}/envs/medi/share/kraken2-2.1.3-4/libexec/download_genomic_library.sh /tmp/medi/download_genomic.patch && \
    conda clean --tarballs --index-cache --packages --yes && \
    find ${CONDA_DIR} -follow -type f -name '*.a' -delete && \
    find ${CONDA_DIR} -follow -type f -name '*.pyc' -delete && \
    conda clean --force-pkgs-dirs --all --yes && \
    rm -rf /tmp/medi

ENTRYPOINT ["mamba", "run", "-n", "medi"]
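
Because the entrypoint wraps its arguments in mamba run -n medi, any command passed to the container executes inside the bundled environment, e.g. docker run --rm cdiener/medi kraken2 --version (a hypothetical invocation; the tool only resolves if it is installed via medi.yml).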
113 changes: 75 additions & 38 deletions bin/download.R
@@ -7,11 +7,16 @@ library(futile.logger)
library(Biostrings)
library(R.utils)

MAX_SEQLENGTH = 2e9

args <- commandArgs(trailingOnly = TRUE)

matches <- fread(args[1])
threads <- as.numeric(args[2])
group <- args[2]
out_folder <- args[3]
target_id <- args[4]
rsync <- as.logical(args[5])


if (is.null(getOption("reutils.api.key"))) {
  rate <- 0.9
@@ -21,9 +26,14 @@ if (is.null(getOption("reutils.api.key"))) {

dir.create(out_folder, recursive = TRUE, showWarnings = FALSE)

ncbi_rsync <- function(url, out) {
  rsync_url <- gsub("https://", "rsync://", url)
  ret <- system2("rsync", c("--no-motd", rsync_url, out))
ncbi_download <- function(url, out, rsync = TRUE) {
  if (rsync) {
    rsync_url <- gsub("https://", "rsync://", url)
    ret <- system2("rsync", c("--no-motd", rsync_url, out))
  } else {
    https_url <- gsub("ftp://", "https://", url)
    ret <- system2("wget", c("-c", "-O", out, https_url))
  }
  Sys.chmod(out, "0755")
  return(ret)
}
@@ -36,7 +46,7 @@ download_genome <- function(hit, out_dir="sequences") {
  for (i in 0:7) {
    if (file.exists(hit$filename)) unlink(hit$filename)
    ret <- tryCatch(
      ncbi_rsync(hit$url, hit$filename),
      ncbi_download(hit$url, hit$filename, rsync),
      error = function(e) return(1),
      warning = function(e) return(1)
    )
@@ -47,6 +57,7 @@
    flog.error("Failed downloading %s :(", hit$url)
    stop()
  }
  flog.info("Downloaded genome for assembly %s...", id)
  fa <- readDNAStringSet(hit$filename)
  short_names <- tstrsplit(names(fa), "\\s+")[[1]]
  names(fa) <- paste0(short_names, "_", 1:length(short_names),
@@ -55,33 +66,49 @@ download_genome <- function(hit, out_dir="sequences") {
  writeXStringSet(fa, hit$filename, compress = "gzip")
  hit$num_records <- length(fa)
  hit$seqlength <- as.double(sum(width(fa)))
  flog.info("Downloaded genome for assembly %s...", id)

  return(hit)
}

download_sequences <- function(hits, taxid, out_dir="sequences") {
  hits <- copy(hits)
  filename <- file.path(out_dir, paste0(as.character(taxid), ".fna"))
  flog.info("Downloading sequences for taxon %s...", taxid)
  for (i in 0:7) {
    Sys.sleep(1/rate + 2^i)
    if (file.exists(filename)) unlink(filename)
    post <- epost(unique(hits$id), db = "nuccore")
    Sys.sleep(1/rate)
    fetch <- suppressMessages(
fragmented_efetch <- function(hits, taxid, filename) {
  hits[, "group" := floor(cumsum(seqlength) / MAX_SEQLENGTH)]
  if (file.exists(filename)) unlink(filename)
  flog.info("Downloading sequences for taxon %s [fragmented into %d groups]...",
            taxid, hits[, uniqueN(group)])

  for (g in unique(hits$group)) {
    for (i in 0:7) {
      Sys.sleep(1/rate + 2^i)
      post <- epost(hits[group == g, id], db = "nuccore")
      Sys.sleep(1/rate)
      fetch <- suppressMessages(
        efetch(post, db = "nuccore",
               rettype = "fasta", retmode = "text")
    )
    if (length(getError(fetch)) == 1) {
      write(content(fetch), filename)
      if (file.exists(filename) && grepl(">", content(fetch))) {
        break
      )
      if (length(getError(fetch)) == 1) {
        write(content(fetch), filename, append=TRUE)
        if (file.exists(filename) && grepl(">", content(fetch))) {
          done <- TRUE
          break
        }
      }
    }
    if (i == 7) {
      if (file.exists(filename)) unlink(filename)
      done <- FALSE
      break
    }
  }
  if (!file.exists(filename) || !grepl(">", content(fetch))) {
    flog.error("Failed downloading %s. UIDs=%s) :(", taxid, paste(unique(hits$id), collpase=", "))
    print(post)

  return(done)
}

download_sequences <- function(hits, taxid, out_dir="sequences") {
  hits <- copy(hits) %>% unique(by="id")
  filename <- file.path(out_dir, paste0(as.character(taxid), ".fna"))
  done <- fragmented_efetch(hits, taxid, filename)
  if (!done) {
    flog.error("Failed downloading %s. UIDs=%s :(", taxid, paste(unique(hits$id), collapse=", "))
    stop()
  }
  hit <- hits[1]
@@ -100,26 +127,36 @@ download_sequences <- function(hits, taxid, out_dir="sequences") {
}

# Download additional contigs
if (any(matches$db == "nucleotide")) {
if ((args[2] == "nucleotide") && (any(matches$db == "nucleotide"))) {
  contigs <- matches[
    db == "nucleotide",
    download_sequences(.SD, matched_taxid[1]),
    by = "matched_taxid"]
  flog.info("Downloaded contigs for %d additional taxa.", nrow(contigs))
  contigs[, "orig_taxid" := NULL]
  fwrite(contigs, "nucleotide.csv")
}

# Download full genomes
gb <- matches[db == "genbank"]
flog.info("Downloading %d genomes with %d threads.", gb[, uniqueN(id)], threads)
dls <- parallel::mclapply(
  gb[, unique(id)],
  function(i) download_genome(gb[id == i]),
  mc.cores=threads
)
genomes <- rbindlist(dls)
flog.info("Downloaded %d full genomes.", nrow(genomes))

manifest <- rbind(genomes, contigs)
manifest[, "orig_taxid" := NULL]
fwrite(manifest, "manifest.csv")
if (args[2] == "genbank") {
  target = args[4]
  gb <- matches[db == "genbank"]
  flog.info("Downloading genome %s.", target)
  report <- download_genome(gb[id == target])
  flog.info(
    "Found %d records summing to %.3g Mbp.",
    report$num_records,
    report$seqlength / 1e6
  )

  report[, "orig_taxid" := NULL]
  fwrite(report, paste0(target, ".csv"))
}

if (args[2] == "decoys") {
  flog.info("Downloading %d additional decoys.", nrow(matches))
  decoys <- matches[, download_genome(.SD, out_folder), by = "id"]
  flog.info("Downloaded genomes for %d decoys summing to %.2g Mbp.",
            nrow(decoys), decoys[, sum(seqlength) / 1e6])
  fwrite(decoys, "decoys.csv")
}
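
The refactored script now runs once per mode, selected by args[2]: args[1] is the match table, args[2] one of "nucleotide", "genbank", or "decoys", args[3] the output folder, args[4] the target assembly ID, and args[5] whether to download via rsync. A hypothetical genbank invocation (invented file names and accession) would be Rscript bin/download.R matches.csv genbank sequences GCF_000123456.1 TRUE.

The batching rule in fragmented_efetch bins records by cumulative sequence length so that each efetch request stays near the 2 Gbp cap. A minimal sketch of just that rule, assuming toy IDs and lengths:

library(data.table)

MAX_SEQLENGTH <- 2e9

# Five hypothetical records of 0.8 Gbp each.
hits <- data.table(id = c("A", "B", "C", "D", "E"), seqlength = rep(8e8, 5))

# Same rule as fragmented_efetch: bin by cumulative length.
hits[, group := floor(cumsum(seqlength) / MAX_SEQLENGTH)]

# cumsum / 2e9 = 0.4, 0.8, 1.2, 1.6, 2.0 -> groups 0, 0, 1, 1, 2,
# so the download loop issues three batches instead of one oversized request.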
8 changes: 4 additions & 4 deletions bin/food_mapping.R
@@ -59,17 +59,17 @@ energy <- function(ab) {
  )
}

food <- fread(file.path(args[1], "Food.csv"))
food <- fread(args[3])
content <- fread(file.path(args[1], "Content.csv"))
compounds <- fread(file.path(args[1], "Compound.csv"))
nutrients <- fread(file.path(args[1], "Nutrient.csv"))
matched <- unique(fread(args[2])[,
  .(orig_taxid, db, rank, matched_taxid, kingdom, phylum,
    class, order, family, genus, species)])

food <- food[!is.na(ncbi_taxonomy_id),
food <- food[!is.na(revised_taxonomy_id),
  .(food_id = id, wikipedia_id, food_group, food_subgroup,
    ncbi_taxonomy_id)]
    revised_taxonomy_id)]
content <- content[!is.na(standard_content), .(
  compound_id = source_id, food_id, orig_content, orig_min, orig_max,
  orig_unit, standard_content, preparation_type, source_type
@@ -105,7 +105,7 @@ nutrients <- rbind(
)
compounds <- rbind(compounds, nutrients, use.names = TRUE, fill = TRUE)
food_matches <- food[
  matched, on = c(ncbi_taxonomy_id = "orig_taxid"), nomatch = 0]
  matched, on = c(revised_taxonomy_id = "orig_taxid"), nomatch = 0]
fwrite(food_matches, "food_matches.csv")
food_matches <- content[food_matches, on = "food_id", nomatch = 0]
food_matches <- compounds[
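
For reference, the taxon join above uses data.table's X[i, on = ..., nomatch = 0] inner-join form. A minimal sketch with toy tables, assuming hypothetical IDs and values:

library(data.table)

# Foods keyed by the revised NCBI taxonomy ID.
food <- data.table(
  food_id = 1:3,
  food_group = c("Fruits", "Vegetables", "Animal foods"),
  revised_taxonomy_id = c(4081, 4113, 9913)
)

# Taxon matches produced by the upstream matching step.
matched <- data.table(orig_taxid = c(4081, 9913), matched_taxid = c(4081, 9913))

# on = c(revised_taxonomy_id = "orig_taxid") equates food$revised_taxonomy_id
# with matched$orig_taxid; nomatch = 0 keeps only foods with a taxon match.
# Here food_id 1 and 3 survive, with matched_taxid appended.
food_matches <- food[matched, on = c(revised_taxonomy_id = "orig_taxid"), nomatch = 0]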