From 41296111143f333d6b549ec0bfc58a7198ef5e20 Mon Sep 17 00:00:00 2001 From: Brian Fulton-Howard Date: Thu, 15 May 2025 11:10:03 -0400 Subject: [PATCH 1/6] Update for compatibility with Snakemake 8+ --- workflow/Snakefile | 52 ++++++++++++++++++++--------------------- workflow/rules/bgen.smk | 10 ++++---- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index b4fef4b..8942b89 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -172,7 +172,7 @@ if zipped: conda: 'envs/p7z.yaml' threads: 4 resources: - mem_mb = 4000, + mem_mb = 16000, time_min = 5 shell: r''' @@ -217,8 +217,8 @@ rule stats: conda: "envs/r_stats.yaml" threads: 22 resources: - mem_mb = 8000, - walltime = '8:00' + mem_mb = 176000, + runtime: "8h" script: "scripts/Post_imputation.Rmd" # Sample filtering rules @@ -244,7 +244,7 @@ rule fixheaders: threads: 1 resources: mem_mb = 2048, - walltime = '24:00' + runtime: "24h" conda: "envs/bcftools.yaml" shell: r""" @@ -270,7 +270,7 @@ if minimac_version == 'guess': threads: 1 resources: mem_mb = 2048, - walltime = '24:00' + runtime: "24h" script: 'scripts/rule_detect_minimac.py' else: rule detect_minimac: @@ -282,7 +282,7 @@ else: localrule: True resources: mem_mb = 2048, - walltime = '1:00' + runtime: "1h" shell: 'echo {params.ver} > {output}' @@ -299,8 +299,8 @@ if sampfilt and minimac_version in ['guess', '3']: sf = sampfilt threads: 8 resources: - mem_mb = 256, - walltime = '24:00' + mem_mb = 2048, + runtime: "24h" conda: "envs/bcftools.yaml" shell: r''' @@ -332,8 +332,8 @@ elif minimac_version in ['guess', '3']: filt = qualfilt threads: 8 resources: - mem_mb = 256, - walltime = '24:00' + mem_mb = 2048, + runtime: "24h" conda: "envs/bcftools.yaml" shell: r''' @@ -364,8 +364,8 @@ elif sampfilt: sf = sampfilt threads: 8 resources: - mem_mb = 256, - walltime = '24:00' + mem_mb = 2048, + runtime: "24h" conda: "envs/bcftools.yaml" shell: r''' @@ -385,8 +385,8 @@ else: filt = qualfilt threads: 8 resources: - mem_mb = 256, - walltime = '24:00' + mem_mb = 2048, + runtime: "24h" conda: "envs/bcftools.yaml" shell: r''' @@ -429,7 +429,7 @@ rule rename: conda: "envs/bcftools.yaml" threads: 2 resources: - mem_mb = 1024, + mem_mb = 2048, time_min = 60 shell: ''' @@ -460,7 +460,7 @@ rule renameAuto: conda: "envs/bcftools.yaml" threads: 2 resources: - mem_mb = 1024, + mem_mb = 2048, time_min = 60 shell: ''' @@ -473,8 +473,8 @@ rule concat_chroms_samp: output: "{impute_dir}/data/{cohort}_chrall_filtered.vcf.gz" threads: 8 resources: - mem_mb = 512, - walltime = '24:00' + mem_mb = 4096, + runtime: "24h" conda: "envs/bcftools.yaml" shell: "bcftools concat --threads 8 {input} | bcftools norm -d none -o {output} -Oz" @@ -495,8 +495,8 @@ rule merge_samples_chrom: output: "{impute_dir}/data/by_chrom/all_chr{chrom}_filtered.vcf.gz" threads: 8 resources: - mem_mb = 2000, - walltime = "36:00" + mem_mb = 16000, + runtime: "36h" conda: "envs/bcftools.yaml" shell: "bcftools merge -m none --threads 8 {input.vcf} | bcftools norm -d none -o {output} -Oz" @@ -505,8 +505,8 @@ rule concat_chroms_all: output: "{impute_dir}/data/all_chrall_filtered.vcf.gz" threads: 8 resources: - mem_mb = 256, - walltime = "24:00" + mem_mb = 2048, + runtime: "24h" conda: "envs/bcftools.yaml" shell: "bcftools concat -o {output} -Oz --threads 8 {input}" @@ -518,8 +518,8 @@ rule make_plink_all: ID = "--id-delim" if automap_tf else "--double-id" threads: 10 resources: - mem_mb = 3000, - walltime = "96:00" + mem_mb = 30000, + runtime: "96h" conda: "envs/plink.yaml" shell: "plink --keep-allele-order --vcf {input} {params.ID} --memory 20000 --threads 10 --make-bed " @@ -533,8 +533,8 @@ rule make_plink_samp: ID = "--id-delim" if automap_tf else "--double-id " threads: 10 resources: - mem_mb = 2000, - walltime = "2:00" + mem_mb = 20000, + runtime: "2h" conda: "envs/plink.yaml" shell: "plink --keep-allele-order --vcf {input} {params.ID} --memory 20000 --threads 10 --make-bed " diff --git a/workflow/rules/bgen.smk b/workflow/rules/bgen.smk index b32551c..5b2d80f 100644 --- a/workflow/rules/bgen.smk +++ b/workflow/rules/bgen.smk @@ -8,7 +8,7 @@ rule make_bgen: threads: 1 resources: mem_mb = 4000, - walltime = "24:00" + runtime: "24h" container: 'docker://befh/bgen:v1.1.7' shell: """ @@ -26,7 +26,7 @@ rule cat_bgen_samp: threads: 1 resources: mem_mb = 5000, - walltime = "120:00" + runtime: "120h" container: 'docker://befh/bgen:v1.1.7' shell: """ @@ -47,8 +47,8 @@ rule make_bgen_allsamp: args = " ".join(["-g {} -s {}".format(gen, samp) for gen, samp in zip(bga_gen, bga_samp)]) threads: 10 resources: - mem_mb = 4000, - walltime = "24:00" + mem_mb = 40000, + runtime: "24h" container: 'docker://befh/bgen:v1.1.7' shell: """ @@ -65,6 +65,6 @@ rule cat_bgen_allsamp: threads: 1 resources: mem_mb = 5000, - walltime = "120:00" + runtime: "120h" container: 'docker://befh/bgen:v1.1.7' shell: "cat-bgen -g {input.gen} -og {output.gen}; cp {input.samp} {output.samp}" From 362277452104ae4e03f035a483a4b8d89c6aba9e Mon Sep 17 00:00:00 2001 From: Brian Fulton-Howard Date: Tue, 3 Jun 2025 13:58:49 -0400 Subject: [PATCH 2/6] fixed issue with runtime resource and added ability to directly fix fams --- workflow/Snakefile | 65 +++++++++++++++++++++++++++++++---------- workflow/rules/bgen.smk | 8 ++--- 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 8942b89..d1ae365 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -136,14 +136,23 @@ if config["qc"]["rsq2"] and config["qc"]["rsq2"] != 'NA': qualfilt += " || (R2 >= {R2} && MAF < {MAF})".format( R2=config["qc"]["rsq2"], MAF=config["qc"]["maf"]) + +if "fix_fam" in config and config["fix_fam"]: + path_plink_bycohort = "{impute_dir}/data/{cohort}_chrall_filtered_fixed.{ext}" + path_plink_merged = "{impute_dir}/data/all_chrall_filtered_fixed.{ext}" +else: + path_plink_bycohort = "{impute_dir}/data/{cohort}_chrall_filtered.{ext}" + path_plink_merged = "{impute_dir}/data/all_chrall_filtered.{ext}" + + outs = dict( stat_report="{impute_dir}/stats/{cohort}_impStats.html", vcf_bycohort="{impute_dir}/data/{cohort}_chrall_filtered.vcf.gz", vcf_merged="{impute_dir}/data/all_chrall_filtered.vcf.gz", bgen_bycohort="{impute_dir}/data/{cohort}_chrall_filtered.bgen", bgen_merged="{impute_dir}/data/merged/merged_chrall_filtered.bgen", - plink_bycohort="{impute_dir}/data/{cohort}_chrall_filtered.{ext}", - plink_merged="{impute_dir}/data/all_chrall_filtered.{ext}") + plink_bycohort=path_plink_bycohort, + plink_merged=path_plink_merged) def expand_outs(out): @@ -218,7 +227,7 @@ rule stats: threads: 22 resources: mem_mb = 176000, - runtime: "8h" + runtime = "8h" script: "scripts/Post_imputation.Rmd" # Sample filtering rules @@ -244,7 +253,7 @@ rule fixheaders: threads: 1 resources: mem_mb = 2048, - runtime: "24h" + runtime = "24h" conda: "envs/bcftools.yaml" shell: r""" @@ -270,7 +279,7 @@ if minimac_version == 'guess': threads: 1 resources: mem_mb = 2048, - runtime: "24h" + runtime = "24h" script: 'scripts/rule_detect_minimac.py' else: rule detect_minimac: @@ -282,7 +291,7 @@ else: localrule: True resources: mem_mb = 2048, - runtime: "1h" + runtime = "1h" shell: 'echo {params.ver} > {output}' @@ -300,7 +309,7 @@ if sampfilt and minimac_version in ['guess', '3']: threads: 8 resources: mem_mb = 2048, - runtime: "24h" + runtime = "24h" conda: "envs/bcftools.yaml" shell: r''' @@ -333,7 +342,7 @@ elif minimac_version in ['guess', '3']: threads: 8 resources: mem_mb = 2048, - runtime: "24h" + runtime = "24h" conda: "envs/bcftools.yaml" shell: r''' @@ -365,7 +374,7 @@ elif sampfilt: threads: 8 resources: mem_mb = 2048, - runtime: "24h" + runtime = "24h" conda: "envs/bcftools.yaml" shell: r''' @@ -386,7 +395,7 @@ else: threads: 8 resources: mem_mb = 2048, - runtime: "24h" + runtime = "24h" conda: "envs/bcftools.yaml" shell: r''' @@ -474,7 +483,7 @@ rule concat_chroms_samp: threads: 8 resources: mem_mb = 4096, - runtime: "24h" + runtime = "24h" conda: "envs/bcftools.yaml" shell: "bcftools concat --threads 8 {input} | bcftools norm -d none -o {output} -Oz" @@ -496,7 +505,7 @@ rule merge_samples_chrom: threads: 8 resources: mem_mb = 16000, - runtime: "36h" + runtime = "36h" conda: "envs/bcftools.yaml" shell: "bcftools merge -m none --threads 8 {input.vcf} | bcftools norm -d none -o {output} -Oz" @@ -506,7 +515,7 @@ rule concat_chroms_all: threads: 8 resources: mem_mb = 2048, - runtime: "24h" + runtime = "24h" conda: "envs/bcftools.yaml" shell: "bcftools concat -o {output} -Oz --threads 8 {input}" @@ -519,7 +528,7 @@ rule make_plink_all: threads: 10 resources: mem_mb = 30000, - runtime: "96h" + runtime = "96h" conda: "envs/plink.yaml" shell: "plink --keep-allele-order --vcf {input} {params.ID} --memory 20000 --threads 10 --make-bed " @@ -534,7 +543,7 @@ rule make_plink_samp: threads: 10 resources: mem_mb = 20000, - runtime: "2h" + runtime = "2h" conda: "envs/plink.yaml" shell: "plink --keep-allele-order --vcf {input} {params.ID} --memory 20000 --threads 10 --make-bed " @@ -542,3 +551,29 @@ rule make_plink_samp: # If bgen outputs are requested include: "rules/bgen.smk" + +if "fix_fam" in config and config["fix_fam"]: + rule fix_fam: + input: + oldfam = config["fix_fam"], + newfam = '{impute_dir}/data/{cohorts}_chrall_filtered.fam' + output: "{impute_dir}/data/{cohorts}_chrall_filtered_fixed.fam" + threads: 1 + resources: + mem_mb = 1024, + time_min = 180 + conda: "envs/r.yaml" + script: 'scripts/fix_fam.R' + + rule link_fix_bedbim: + input: + bed = '{impute_dir}/data/{cohorts}_chrall_filtered.bed', + bim = '{impute_dir}/data/{cohorts}_chrall_filtered.bim' + output: + bed = '{impute_dir}/data/{cohorts}_chrall_filtered_fixed.bed', + bim = '{impute_dir}/data/{cohorts}_chrall_filtered_fixed.bim' + localrule: True + shell: ''' + ln -rs {input.bed} {output.bed} + ln -rs {input.bim} {output.bim} + ''' diff --git a/workflow/rules/bgen.smk b/workflow/rules/bgen.smk index 5b2d80f..6987dda 100644 --- a/workflow/rules/bgen.smk +++ b/workflow/rules/bgen.smk @@ -8,7 +8,7 @@ rule make_bgen: threads: 1 resources: mem_mb = 4000, - runtime: "24h" + runtime = "24h" container: 'docker://befh/bgen:v1.1.7' shell: """ @@ -26,7 +26,7 @@ rule cat_bgen_samp: threads: 1 resources: mem_mb = 5000, - runtime: "120h" + runtime = "120h" container: 'docker://befh/bgen:v1.1.7' shell: """ @@ -48,7 +48,7 @@ rule make_bgen_allsamp: threads: 10 resources: mem_mb = 40000, - runtime: "24h" + runtime = "24h" container: 'docker://befh/bgen:v1.1.7' shell: """ @@ -65,6 +65,6 @@ rule cat_bgen_allsamp: threads: 1 resources: mem_mb = 5000, - runtime: "120h" + runtime = "120h" container: 'docker://befh/bgen:v1.1.7' shell: "cat-bgen -g {input.gen} -og {output.gen}; cp {input.samp} {output.samp}" From a0088a9b4a0ae41e199ced868223598d60e19c4e Mon Sep 17 00:00:00 2001 From: Brian Fulton-Howard Date: Fri, 6 Jun 2025 17:17:03 -0400 Subject: [PATCH 3/6] full ability to fix fams --- workflow/Snakefile | 8 ++--- workflow/envs/detect_minimac.yaml | 3 +- workflow/scripts/fix_fam.R | 53 +++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 workflow/scripts/fix_fam.R diff --git a/workflow/Snakefile b/workflow/Snakefile index d1ae365..c06d494 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -308,7 +308,7 @@ if sampfilt and minimac_version in ['guess', '3']: sf = sampfilt threads: 8 resources: - mem_mb = 2048, + mem_mb = 4096, runtime = "24h" conda: "envs/bcftools.yaml" shell: @@ -341,7 +341,7 @@ elif minimac_version in ['guess', '3']: filt = qualfilt threads: 8 resources: - mem_mb = 2048, + mem_mb = 4096, runtime = "24h" conda: "envs/bcftools.yaml" shell: @@ -373,7 +373,7 @@ elif sampfilt: sf = sampfilt threads: 8 resources: - mem_mb = 2048, + mem_mb = 4096, runtime = "24h" conda: "envs/bcftools.yaml" shell: @@ -394,7 +394,7 @@ else: filt = qualfilt threads: 8 resources: - mem_mb = 2048, + mem_mb = 4096, runtime = "24h" conda: "envs/bcftools.yaml" shell: diff --git a/workflow/envs/detect_minimac.yaml b/workflow/envs/detect_minimac.yaml index ab49ed7..a570ee5 100644 --- a/workflow/envs/detect_minimac.yaml +++ b/workflow/envs/detect_minimac.yaml @@ -1,5 +1,6 @@ channels: - conda-forge + - bioconda dependencies: - python=3.11 - - pysam=0.22.0 + - pysam=0.23.1 diff --git a/workflow/scripts/fix_fam.R b/workflow/scripts/fix_fam.R new file mode 100644 index 0000000..d61a503 --- /dev/null +++ b/workflow/scripts/fix_fam.R @@ -0,0 +1,53 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages(library(dplyr)) +library(vroom) +library(tidyr) +library(purrr) + +if (!exists("snakemake")) { + setClass("snakemake_fake", representation( + params = "list", input = "list", output = "list", + log = "list", wildcards = "list")) + snakemake <- new("snakemake_fake", + input = list( + oldfam = c("temp/raw/GSA1-AD1_single-probe.fam", + "temp/raw/GSA1-AD2_single-probe.fam", + "temp/raw/GSA1-PD1_single-probe.fam", + "temp/raw/GSA1-PD2_single-probe.fam"), + newfam = "temp/raw/raw.fam" + ), + params = list(), + log = list(), + output = list("results/raw/raw.fam"), + wildcards = list() + ) +} + +col.n <- c("FID", "IID", "PID", "MID", "Sex", "Phe") +col.nn <- c("none", "FIDIID", "PID_new", "MID_new", "Sex_new", "Phe_new") +col.t <- "ccccii" + +new_fam <- snakemake@input[["newfam"]] %>% + vroom::vroom(col_names = col.nn, col_types = col.t) + +stopifnot(nrow(distinct(new_fam, FIDIID)) == nrow(new_fam)) + +old_fam <- snakemake@input[["oldfam"]] %>% + map_dfr(vroom::vroom, col_names = col.n, col_types = col.t) %>% + unite("FIDIID", FID, IID, sep = "_", remove = F) %>% + distinct(FIDIID, .keep_all = TRUE) + +stopifnot(nrow(distinct(old_fam, FIDIID)) == nrow(old_fam)) + +left_join(new_fam, old_fam, by = "FIDIID") %>% + mutate(FID = ifelse(is.na(FID), FIDIID, FID)) %>% + mutate(IID = ifelse(is.na(IID), FIDIID, IID)) %>% + mutate(PID = ifelse(is.na(PID), PID_new, PID)) %>% + mutate(MID = ifelse(is.na(MID), MID_new, MID)) %>% + mutate(Sex = ifelse(is.na(Sex), Sex_new, Sex)) %>% + mutate(Phe = ifelse(is.na(Phe), Phe_new, Phe)) %>% + select(!!col.n) %>% + mutate(IID = gsub("�", "", IID)) %>% + vroom::vroom_write(snakemake@output[[1]], col_names = F, delim = "\t", + quote = "none") From 1347bcee432e96e440838779cba69061e56823fd Mon Sep 17 00:00:00 2001 From: Brian Fulton-Howard Date: Sun, 22 Feb 2026 16:38:14 -0500 Subject: [PATCH 4/6] fix: more memory for indexing --- workflow/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index c06d494..c11556d 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -493,7 +493,7 @@ rule index_samples_chrom: conda: "envs/bcftools.yaml" threads: 1 resources: - mem_mb = 256, + mem_mb = 1024, time_min = 120 shell: "bcftools index -t {input}" From cd687198124b19c386331592ebf3c2cfcb317168 Mon Sep 17 00:00:00 2001 From: Brian Fulton-Howard Date: Mon, 23 Feb 2026 12:06:38 -0500 Subject: [PATCH 5/6] feat: allow merged VCF to be temporary if calling pipeline is making further changes when using as module --- workflow/Snakefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index c11556d..baf8ad6 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -509,9 +509,15 @@ rule merge_samples_chrom: conda: "envs/bcftools.yaml" shell: "bcftools merge -m none --threads 8 {input.vcf} | bcftools norm -d none -o {output} -Oz" +def tempif_cca(path): + if config.get("temp_ccs", False): + return temp(path) + else: + return path + rule concat_chroms_all: input: expand("{{impute_dir}}/data/by_chrom/all_chr{chrom}_filtered.vcf.gz", chrom=CHROM) - output: "{impute_dir}/data/all_chrall_filtered.vcf.gz" + output: tempif_cca("{impute_dir}/data/all_chrall_filtered.vcf.gz") threads: 8 resources: mem_mb = 2048, From eafcaf5ad40c1e658004c45248098b23dd8e7766 Mon Sep 17 00:00:00 2001 From: adad Date: Fri, 24 Apr 2026 13:48:45 -0400 Subject: [PATCH 6/6] fix: deal with single cohort and merge, and remove deprecated files --- cluster.yaml | 63 ---------------------------------------------- lsf.yaml | 2 -- workflow/Snakefile | 11 ++++++-- 3 files changed, 9 insertions(+), 67 deletions(-) delete mode 100755 cluster.yaml delete mode 100644 lsf.yaml diff --git a/cluster.yaml b/cluster.yaml deleted file mode 100755 index 30de026..0000000 --- a/cluster.yaml +++ /dev/null @@ -1,63 +0,0 @@ -__default__: - project: acc_LOAD - partition: 'bode' - queue: premium - cores: 8 - mem: 2000 - time: '2:00' -stats: - mem: 6000 - cores: 22 - time: '8:00' -filters: #preparation: #filters: - time: '24:00' - mem: 128 -fixheaders: - cores: 1 - time: "24:00" -vcf_merge: - time: "36:00" -rename: - mem: 128 - time: '45' - cores: 1 -renameAuto: - mem: 1024 - time: '45' - cores: 1 -index_samples_chrom: - mem: 256 - cores: 1 -concat_chroms_all: - time: '24:00' - mem: 256 -concat_chroms_samp: - time: '24:00' - mem: 256 -make_bgen: - time: "24:00" - cores: 1 -make_bgen_allsamp: - time: "24:00" - cores: 1 -make_gen: - time: "36:00" - mem: 1000 - cores: 10 -merge_samples_chrom: - time: "36:00" -cat_bgen_samp: - time: "120:00" - cores: 1 - mem: 5000 -cat_bgen_allsamp: - time: "120:00" - cores: 1 - mem: 5000 -make_plink_all: - mem: 3000 - cores: 10 - time: "6:00" -make_plink_samp: - mem: 1000 - cores: 10 diff --git a/lsf.yaml b/lsf.yaml deleted file mode 100644 index aebd1be..0000000 --- a/lsf.yaml +++ /dev/null @@ -1,2 +0,0 @@ -stats: - - "-R 'himem'" diff --git a/workflow/Snakefile b/workflow/Snakefile index baf8ad6..9e3ef64 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -226,7 +226,7 @@ rule stats: conda: "envs/r_stats.yaml" threads: 22 resources: - mem_mb = 176000, + mem = "256GB", runtime = "8h" script: "scripts/Post_imputation.Rmd" @@ -507,7 +507,14 @@ rule merge_samples_chrom: mem_mb = 16000, runtime = "36h" conda: "envs/bcftools.yaml" - shell: "bcftools merge -m none --threads 8 {input.vcf} | bcftools norm -d none -o {output} -Oz" + shell: + """ +if [ $(echo {input.vcf} | wc -w) -eq 1 ]; then + cp {input.vcf} {output} +else + bcftools merge -m none --threads 8 {input.vcf} | bcftools norm -d none -o {output} -Oz +fi +""" def tempif_cca(path): if config.get("temp_ccs", False):