Changes from all commits (83 commits)
782cc93 update db script (cdiener, Mar 3, 2025)
805683a update db script (cdiener, Mar 3, 2025)
1d2252c add the curated IDs (cdiener, Mar 17, 2025)
010b3dd add the curated IDs (cdiener, Mar 17, 2025)
b407a18 swich to single downloads (cdiener, Mar 18, 2025)
ad31b08 no duplicate matches, switch to single gb downloads (cdiener, Mar 18, 2025)
9e2c458 fixes from cluster (cdiener, Mar 18, 2025)
7fa555d fixes from cluster (cdiener, Mar 18, 2025)
6ba1d04 several fixes (cdiener, Mar 18, 2025)
f48bd1b local fixes (Mar 18, 2025)
39c9031 move logs (cdiener, Mar 18, 2025)
1db29f9 only download unique (Mar 18, 2025)
44dd942 fix download script (cdiener, Mar 18, 2025)
3465218 skip problems (Mar 18, 2025)
5090479 fix download (cdiener, Mar 18, 2025)
9b8e13c avoid looking for nucleotide sequences for nonredundant genera (cdiener, Mar 19, 2025)
da5b922 some fixes (Mar 19, 2025)
9ac0568 add scores beforehand (cdiener, Mar 20, 2025)
1f1e4c5 add scores beforehand (cdiener, Mar 20, 2025)
f536e5b add scores beforehand (cdiener, Mar 20, 2025)
36e0d0f fix nucleoide logger (cdiener, Mar 20, 2025)
2e46374 fix nucleoide logger (cdiener, Mar 20, 2025)
9327d03 fix nucleoide logger (cdiener, Mar 20, 2025)
db0f1ab add decoys (cdiener, Mar 20, 2025)
a1126a2 fix decoy add (cdiener, Apr 3, 2025)
12411d9 major refactor to build_kraken, now cleanly resumable (cdiener, Apr 8, 2025)
08072e3 major refactor to build_kraken, now cleanly resumable (cdiener, Apr 8, 2025)
5189543 fixes to merging (Apr 3, 2025)
78a24bb more refactor (cdiener, Apr 8, 2025)
86af94c add ftp option to kraken db downloads (Apr 9, 2025)
60c4a6e further improvements to build (Apr 24, 2025)
b046ff7 make compatible with new NCBI ranks (cdiener, Apr 17, 2025)
30447cc several fixes to pipeline (cdiener, May 5, 2025)
bb0684f remove old prefix (cdiener, May 5, 2025)
759f56e add rsync option to database download to enable https downloads (cdiener, Apr 9, 2025)
580a100 fix scoring (cdiener, Oct 31, 2025)
c98f0a6 changes to matching and downloading (cdiener, Nov 3, 2025)
52c311d fix match (cdiener, Nov 3, 2025)
1a9bd9c fix match (cdiener, Nov 3, 2025)
c448541 fix match (cdiener, Nov 3, 2025)
85a25d4 fix match (cdiener, Nov 3, 2025)
da68d5a fix match (cdiener, Nov 3, 2025)
6e6a3a0 fix match (cdiener, Nov 3, 2025)
1acbc45 fix match (cdiener, Nov 3, 2025)
9a825bf fix summary (cdiener, Nov 4, 2025)
7bd3aa5 fix summary (cdiener, Nov 4, 2025)
1b85040 provide more info (cdiener, Nov 4, 2025)
6758c5d provide more info (cdiener, Nov 4, 2025)
e532ddd provide more info (cdiener, Nov 4, 2025)
4927ae0 provide more info (cdiener, Nov 4, 2025)
76021d9 handle missing fields in summary (cdiener, Nov 4, 2025)
750ace4 handle missing fields in summary (cdiener, Nov 4, 2025)
b42f778 remove redundant entries (cdiener, Nov 4, 2025)
16ae788 remove redundant entries (cdiener, Nov 4, 2025)
133d053 remove redundant entries (cdiener, Nov 4, 2025)
c27a185 fix redundant entries (cdiener, Nov 4, 2025)
0333006 fix redundant entries (cdiener, Nov 4, 2025)
773aaad add mem (cdiener, Oct 31, 2025)
c9208e0 some small fixes (cdiener, Nov 17, 2025)
50ba29c allow more mem for preprocessing (cdiener, Nov 24, 2025)
4ea06a5 check for errors too (cdiener, Nov 24, 2025)
165561d handle insufficient reads for bracken (cdiener, Nov 24, 2025)
3592df7 handle insufficient reads for bracken (cdiener, Nov 24, 2025)
0060d61 add Dockerfile (cdiener, Dec 2, 2025)
3d30a26 add image build (cdiener, Dec 2, 2025)
bc7c499 see if this works (cdiener, Dec 2, 2025)
a0884f4 see if this works (cdiener, Dec 2, 2025)
8fda5e0 see if this works (cdiener, Dec 2, 2025)
883e965 see if this works (cdiener, Dec 2, 2025)
a34fde0 see if this works (cdiener, Dec 2, 2025)
a5814aa see if this works (cdiener, Dec 2, 2025)
9805439 fix paths (cdiener, Dec 2, 2025)
7c6a011 add patch (cdiener, Dec 2, 2025)
2f9743d add patch (cdiener, Dec 2, 2025)
d9a6a62 add patch (cdiener, Dec 2, 2025)
954268d add patch (cdiener, Dec 2, 2025)
fb850f6 add patch (cdiener, Dec 2, 2025)
4a9cfeb add patch (cdiener, Dec 2, 2025)
c3ea9d6 fix copy (cdiener, Dec 2, 2025)
33ce339 change entrypoint (cdiener, Dec 3, 2025)
45d98e9 add install script (cdiener, Dec 3, 2025)
90b1a84 update quant as well (cdiener, Jan 29, 2026)
eabc1df Merge branch 'main' into feature/new_db (cdiener, Jan 29, 2026)
37 changes: 37 additions & 0 deletions .github/workflows/image.yml
@@ -0,0 +1,37 @@
name: Build and push the MEDI image

on:
  push:
    branches:
      - "**"
    tags:
      - '[0-9]+.[0-9]+.[0-9]+'
  pull_request:

env:
  IMAGE_NAME: cdiener/medi

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.IMAGE_NAME }}
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Build and push
        id: docker_build
        uses: docker/build-push-action@v5
        with:
          push: ${{ contains(github.ref, 'tag') }}
          tags: ${{ steps.meta.outputs.tags }}
          annotations: ${{ steps.meta.outputs.annotations }}
      - name: Image digest
        run: echo ${{ steps.docker_build.outputs.digest }}
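
Note on the push condition: because github.ref only contains the substring "tag" for tag refs (refs/tags/1.2.3), the image is built on every branch push and pull request but only pushed to Docker Hub for version tags, assuming no branch name itself contains "tag".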
18 changes: 18 additions & 0 deletions Dockerfile
@@ -0,0 +1,18 @@
FROM docker.io/condaforge/miniforge3:latest

RUN mkdir /tmp/medi /tmp/medi/bin

COPY medi.yml Makefile patches/*.patch /tmp/medi/

RUN mamba env create -n medi -f /tmp/medi/medi.yml && \
    . ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate medi && \
    cd /tmp/medi && make report && mv /tmp/medi/bin/kraken2-report /bin && \
    patch ${CONDA_DIR}/envs/medi/share/kraken2-2.1.3-4/libexec/build_kraken2_db.sh /tmp/medi/build.patch && \
    patch ${CONDA_DIR}/envs/medi/share/kraken2-2.1.3-4/libexec/download_genomic_library.sh /tmp/medi/download_genomic.patch && \
    conda clean --tarballs --index-cache --packages --yes && \
    find ${CONDA_DIR} -follow -type f -name '*.a' -delete && \
    find ${CONDA_DIR} -follow -type f -name '*.pyc' -delete && \
    conda clean --force-pkgs-dirs --all --yes && \
    rm -rf /tmp/medi

ENTRYPOINT ["mamba", "run", "-n", "medi"]
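
Because the entrypoint wraps its arguments in mamba run -n medi, any command passed to the container executes inside the bundled environment, e.g. docker run --rm cdiener/medi kraken2 --version (a hypothetical invocation; the tool only resolves if it is installed via medi.yml).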
113 changes: 75 additions & 38 deletions bin/download.R
@@ -7,11 +7,16 @@ library(futile.logger)
library(Biostrings)
library(R.utils)

MAX_SEQLENGTH = 2e9

args <- commandArgs(trailingOnly = TRUE)

matches <- fread(args[1])
threads <- as.numeric(args[2])
group <- args[2]
out_folder <- args[3]
target_id <- args[4]
rsync <- as.logical(args[5])


if (is.null(getOption("reutils.api.key"))) {
  rate <- 0.9
@@ -21,9 +26,14 @@ if (is.null(getOption("reutils.api.key"))) {

dir.create(out_folder, recursive = TRUE, showWarnings = FALSE)

ncbi_rsync <- function(url, out) {
  rsync_url <- gsub("https://", "rsync://", url)
  ret <- system2("rsync", c("--no-motd", rsync_url, out))
ncbi_download <- function(url, out, rsync = TRUE) {
  if (rsync) {
    rsync_url <- gsub("https://", "rsync://", url)
    ret <- system2("rsync", c("--no-motd", rsync_url, out))
  } else {
    https_url <- gsub("ftp://", "https://", url)
    ret <- system2("wget", c("-c", "-O", out, https_url))
  }
  Sys.chmod(out, "0755")
  return(ret)
}
@@ -36,7 +46,7 @@ download_genome <- function(hit, out_dir="sequences") {
  for (i in 0:7) {
    if (file.exists(hit$filename)) unlink(hit$filename)
    ret <- tryCatch(
      ncbi_rsync(hit$url, hit$filename),
      ncbi_download(hit$url, hit$filename, rsync),
      error = function(e) return(1),
      warning = function(e) return(1)
    )
@@ -47,6 +57,7 @@
    flog.error("Failed downloading %s :(", hit$url)
    stop()
  }
  flog.info("Downloaded genome for assembly %s...", id)
  fa <- readDNAStringSet(hit$filename)
  short_names <- tstrsplit(names(fa), "\\s+")[[1]]
  names(fa) <- paste0(short_names, "_", 1:length(short_names),
@@ -55,33 +66,49 @@ download_genome <- function(hit, out_dir="sequences") {
  writeXStringSet(fa, hit$filename, compress = "gzip")
  hit$num_records <- length(fa)
  hit$seqlength <- as.double(sum(width(fa)))
  flog.info("Downloaded genome for assembly %s...", id)

  return(hit)
}

download_sequences <- function(hits, taxid, out_dir="sequences") {
  hits <- copy(hits)
  filename <- file.path(out_dir, paste0(as.character(taxid), ".fna"))
  flog.info("Downloading sequences for taxon %s...", taxid)
  for (i in 0:7) {
    Sys.sleep(1/rate + 2^i)
    if (file.exists(filename)) unlink(filename)
    post <- epost(unique(hits$id), db = "nuccore")
    Sys.sleep(1/rate)
    fetch <- suppressMessages(
fragmented_efetch <- function(hits, taxid, filename) {
  hits[, "group" := floor(cumsum(seqlength) / MAX_SEQLENGTH)]
  if (file.exists(filename)) unlink(filename)
  flog.info("Downloading sequences for taxon %s [fragmented into %d groups]...",
            taxid, hits[, uniqueN(group)])

  for (g in unique(hits$group)) {
    for (i in 0:7) {
      Sys.sleep(1/rate + 2^i)
      post <- epost(hits[group == g, id], db = "nuccore")
      Sys.sleep(1/rate)
      fetch <- suppressMessages(
        efetch(post, db = "nuccore",
               rettype = "fasta", retmode = "text")
    )
    if (length(getError(fetch)) == 1) {
      write(content(fetch), filename)
      if (file.exists(filename) && grepl(">", content(fetch))) {
        break
      )
      if (length(getError(fetch)) == 1) {
        write(content(fetch), filename, append=TRUE)
        if (file.exists(filename) && grepl(">", content(fetch))) {
          done <- TRUE
          break
        }
      }
    }
    if (i == 7) {
      if (file.exists(filename)) unlink(filename)
      done <- FALSE
      break
    }
  }
  if (!file.exists(filename) || !grepl(">", content(fetch))) {
    flog.error("Failed downloading %s. UIDs=%s) :(", taxid, paste(unique(hits$id), collpase=", "))
    print(post)

  return(done)
}

download_sequences <- function(hits, taxid, out_dir="sequences") {
  hits <- copy(hits) %>% unique(by="id")
  filename <- file.path(out_dir, paste0(as.character(taxid), ".fna"))
  done <- fragmented_efetch(hits, taxid, filename)
  if (!done) {
    flog.error("Failed downloading %s. UIDs=%s :(", taxid, paste(unique(hits$id), collapse=", "))
    stop()
  }
  hit <- hits[1]
@@ -100,26 +127,36 @@ download_sequences <- function(hits, taxid, out_dir="sequences") {
}

# Download additional contigs
if (any(matches$db == "nucleotide")) {
if ((args[2] == "nucleotide") && (any(matches$db == "nucleotide"))) {
  contigs <- matches[
    db == "nucleotide",
    download_sequences(.SD, matched_taxid[1]),
    by = "matched_taxid"]
  flog.info("Downloaded contigs for %d additional taxa.", nrow(contigs))
  contigs[, "orig_taxid" := NULL]
  fwrite(contigs, "nucleotide.csv")
}

# Download full genomes
gb <- matches[db == "genbank"]
flog.info("Downloading %d genomes with %d threads.", gb[, uniqueN(id)], threads)
dls <- parallel::mclapply(
  gb[, unique(id)],
  function(i) download_genome(gb[id == i]),
  mc.cores=threads
)
genomes <- rbindlist(dls)
flog.info("Downloaded %d full genomes.", nrow(genomes))

manifest <- rbind(genomes, contigs)
manifest[, "orig_taxid" := NULL]
fwrite(manifest, "manifest.csv")
if (args[2] == "genbank") {
  target = args[4]
  gb <- matches[db == "genbank"]
  flog.info("Downloading genome %s.", target)
  report <- download_genome(gb[id == target])
  flog.info(
    "Found %d records summing to %.3g Mbp.",
    report$num_records,
    report$seqlength / 1e6
  )

  report[, "orig_taxid" := NULL]
  fwrite(report, paste0(target, ".csv"))
}

if (args[2] == "decoys") {
  flog.info("Downloading %d additional decoys.", nrow(matches))
  decoys <- matches[, download_genome(.SD, out_folder), by = "id"]
  flog.info("Downloaded genomes for %d decoys summing to %.2g Mbp.",
            nrow(decoys), decoys[, sum(seqlength) / 1e6])
  fwrite(decoys, "decoys.csv")
}
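
The refactored script now runs once per mode, selected by args[2]: args[1] is the match table, args[2] one of "nucleotide", "genbank", or "decoys", args[3] the output folder, args[4] the target assembly ID, and args[5] whether to download via rsync. A hypothetical genbank invocation (invented file names and accession) would be Rscript bin/download.R matches.csv genbank sequences GCF_000123456.1 TRUE.

The batching rule in fragmented_efetch bins records by cumulative sequence length so that each efetch request stays near the 2 Gbp cap. A minimal sketch of just that rule, assuming toy IDs and lengths:

library(data.table)

MAX_SEQLENGTH <- 2e9

# Five hypothetical records of 0.8 Gbp each.
hits <- data.table(id = c("A", "B", "C", "D", "E"), seqlength = rep(8e8, 5))

# Same rule as fragmented_efetch: bin by cumulative length.
hits[, group := floor(cumsum(seqlength) / MAX_SEQLENGTH)]

# cumsum / 2e9 = 0.4, 0.8, 1.2, 1.6, 2.0 -> groups 0, 0, 1, 1, 2,
# so the download loop issues three batches instead of one oversized request.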
8 changes: 4 additions & 4 deletions bin/food_mapping.R
@@ -59,17 +59,17 @@ energy <- function(ab) {
  )
}

food <- fread(file.path(args[1], "Food.csv"))
food <- fread(args[3])
content <- fread(file.path(args[1], "Content.csv"))
compounds <- fread(file.path(args[1], "Compound.csv"))
nutrients <- fread(file.path(args[1], "Nutrient.csv"))
matched <- unique(fread(args[2])[,
  .(orig_taxid, db, rank, matched_taxid, kingdom, phylum,
    class, order, family, genus, species)])

food <- food[!is.na(ncbi_taxonomy_id),
food <- food[!is.na(revised_taxonomy_id),
  .(food_id = id, wikipedia_id, food_group, food_subgroup,
    ncbi_taxonomy_id)]
    revised_taxonomy_id)]
content <- content[!is.na(standard_content), .(
  compound_id = source_id, food_id, orig_content, orig_min, orig_max,
  orig_unit, standard_content, preparation_type, source_type
@@ -105,7 +105,7 @@ nutrients <- rbind(
)
compounds <- rbind(compounds, nutrients, use.names = TRUE, fill = TRUE)
food_matches <- food[
  matched, on = c(ncbi_taxonomy_id = "orig_taxid"), nomatch = 0]
  matched, on = c(revised_taxonomy_id = "orig_taxid"), nomatch = 0]
fwrite(food_matches, "food_matches.csv")
food_matches <- content[food_matches, on = "food_id", nomatch = 0]
food_matches <- compounds[
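
For reference, the taxon join above uses data.table's X[i, on = ..., nomatch = 0] inner-join form. A minimal sketch with toy tables, assuming hypothetical IDs and values:

library(data.table)

# Foods keyed by the revised NCBI taxonomy ID.
food <- data.table(
  food_id = 1:3,
  food_group = c("Fruits", "Vegetables", "Animal foods"),
  revised_taxonomy_id = c(4081, 4113, 9913)
)

# Taxon matches produced by the upstream matching step.
matched <- data.table(orig_taxid = c(4081, 9913), matched_taxid = c(4081, 9913))

# on = c(revised_taxonomy_id = "orig_taxid") equates food$revised_taxonomy_id
# with matched$orig_taxid; nomatch = 0 keeps only foods with a taxon match.
# Here food_id 1 and 3 survive, with matched_taxid appended.
food_matches <- food[matched, on = c(revised_taxonomy_id = "orig_taxid"), nomatch = 0]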