From 51fe44ff0b217551bb5054804e9721c55139be9e Mon Sep 17 00:00:00 2001 From: "arianna.landini" Date: Mon, 6 Oct 2025 11:19:27 +0200 Subject: [PATCH 01/10] Using bioalpha for concatenating more than 100 anndatas --- bin/s08bis_concat_anndata_bioalpha.py | 40 +++++++++++++++++++++++++ modules/local/concat_anndata_ba/main.nf | 33 ++++++++++++++++++++ workflows/finemap/main.nf | 14 +++++++-- 3 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 bin/s08bis_concat_anndata_bioalpha.py create mode 100644 modules/local/concat_anndata_ba/main.nf diff --git a/bin/s08bis_concat_anndata_bioalpha.py b/bin/s08bis_concat_anndata_bioalpha.py new file mode 100644 index 0000000..274bb76 --- /dev/null +++ b/bin/s08bis_concat_anndata_bioalpha.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +import bioalpha as bsc +import argparse + +def main(): + parser = argparse.ArgumentParser( + description="Concatenate multiple .h5ad files listed in a text file into one AnnData object." + ) + parser.add_argument( + "-i", "--input", + required=True, + help="Path to a text file listing .h5ad files to concatenate (one per line)." + ) + parser.add_argument( + "-o", "--output_file", + required=True, + help="Path and file name for the output merged .h5ad file." + ) + args = parser.parse_args() + + # Read list of files + with open(args.input_txt, "r") as f: + h5ad_files = [line.strip() for line in f if line.strip()] + + if not h5ad_files: + raise ValueError("No valid file paths found in the input text file.") + + print(f"Loading {len(h5ad_files)} .h5ad files...") + adata_list = [bsc.sc.read_h5ad(file) for file in h5ad_files] + + print("Concatenating...") + adata = bsc.sc.concat(adata_list, join='outer', fill_value=0) + + print(f"Writing output to: {args.output_h5ad}") + adata.write_h5ad(args.output_h5ad) + + print("✅ Merge complete!") + +if __name__ == "__main__": + main() diff --git a/modules/local/concat_anndata_ba/main.nf b/modules/local/concat_anndata_ba/main.nf new file mode 100644 index 0000000..d0d074b --- /dev/null +++ b/modules/local/concat_anndata_ba/main.nf @@ -0,0 +1,33 @@ +process CONCAT_ANNDATA_BA { + tag "concat_anndata" + label "process_medium" +// conda '/software/cardinal_analysis/ht/conda_envs/ht_scrna_scalpha' + + publishDir "${params.outdir}/results/anndata/", mode: params.publish_dir_mode, pattern:"*.h5ad" + + input: + path(all_h5ad, stageAs: 'input_file???/*') + val(output_name) + + output: + path "${output_name}", emit: anndata + + script: + def args = task.ext.args ?: '' + + """ + export RETICULATE_PYTHON=\$(which python) + + ls input_file*/*.h5ad > all_h5ad_input_list.txt + + s08bis_concat_anndata_bioalpha.py \ + ${args} \ + --input all_h5ad_input_list.txt \ + --output_file ${output_name} + """ + + stub: + """ + touch ${output_name} + """ +} \ No newline at end of file diff --git a/workflows/finemap/main.nf b/workflows/finemap/main.nf index 86f6840..8ab0aaa 100644 --- a/workflows/finemap/main.nf +++ b/workflows/finemap/main.nf @@ -60,9 +60,19 @@ workflow RUN_FINEMAPPING { all_h5ad = SUSIE_FINEMAPPING.out.susie_results_h5ad .collect() - CONCAT_ANNDATA(all_h5ad, "${params.finemap_id}_anndata.h5ad") + // Branch all_h5ad based on length (how many annDatas to concatenate) - only one branch will have data + all_h5ad_branched = all_h5ad.branch { items -> + small: items.size() < 100 + return items + large: items.size() >= 100 + return items + } + + // Run annData concatenation - use R for fewer files, bioalpha for more + all_h5ad_small = CONCAT_ANNDATA(all_h5ad_branched.small, 
"${params.finemap_id}_anndata.h5ad") + all_h5ad_large = CONCAT_ANNDATA_BA(all_h5ad_branched.large, "${params.finemap_id}_anndata.h5ad") emit: - finemap_anndata = CONCAT_ANNDATA.out.anndata + finemap_anndata = all_h5ad_small.mix(all_h5ad_large) // Mix the concatenated anndata - only one channel will actually contain data coloc_master = APPEND_TO_MASTER_COLOC.out.coloc_master } \ No newline at end of file From 7af8f311f9006088300dc0db60a0d0d22ba0b893 Mon Sep 17 00:00:00 2001 From: "arianna.landini" Date: Mon, 6 Oct 2025 11:53:29 +0200 Subject: [PATCH 02/10] Fix typo --- bin/s08bis_concat_anndata_bioalpha.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/s08bis_concat_anndata_bioalpha.py b/bin/s08bis_concat_anndata_bioalpha.py index 274bb76..e520978 100644 --- a/bin/s08bis_concat_anndata_bioalpha.py +++ b/bin/s08bis_concat_anndata_bioalpha.py @@ -19,7 +19,7 @@ def main(): args = parser.parse_args() # Read list of files - with open(args.input_txt, "r") as f: + with open(args.input, "r") as f: h5ad_files = [line.strip() for line in f if line.strip()] if not h5ad_files: @@ -32,7 +32,7 @@ def main(): adata = bsc.sc.concat(adata_list, join='outer', fill_value=0) print(f"Writing output to: {args.output_h5ad}") - adata.write_h5ad(args.output_h5ad) + adata.write_h5ad(args.output_file) print("✅ Merge complete!") From bd064b56258e861de8902be343c97578ed63d7b8 Mon Sep 17 00:00:00 2001 From: "arianna.landini" Date: Mon, 6 Oct 2025 15:25:08 +0200 Subject: [PATCH 03/10] Using gpu for bioalpha --- conf/base.config | 5 +++++ modules/local/concat_anndata_ba/main.nf | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 13171f4..b244e87 100644 --- a/conf/base.config +++ b/conf/base.config @@ -38,4 +38,9 @@ process { withLabel:process_high_memory { memory = { 64.GB * task.attempt } } + withLabel:process_gpu { + accelerator = { 4 * task.attempt } + memory = { 64.GB * task.attempt } + time = { 6.h * task.attempt } + } } diff --git a/modules/local/concat_anndata_ba/main.nf b/modules/local/concat_anndata_ba/main.nf index d0d074b..7ed7d5a 100644 --- a/modules/local/concat_anndata_ba/main.nf +++ b/modules/local/concat_anndata_ba/main.nf @@ -1,6 +1,6 @@ process CONCAT_ANNDATA_BA { tag "concat_anndata" - label "process_medium" + label "process_gpu" // conda '/software/cardinal_analysis/ht/conda_envs/ht_scrna_scalpha' publishDir "${params.outdir}/results/anndata/", mode: params.publish_dir_mode, pattern:"*.h5ad" From 9eb70ff4c9a73a62912f69c1962f0552dad3c943 Mon Sep 17 00:00:00 2001 From: Arianna Landini Date: Wed, 8 Oct 2025 15:13:22 +0100 Subject: [PATCH 04/10] Forgot to load newly added module CONCAT_ANNDATA_BA --- workflows/finemap/main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/finemap/main.nf b/workflows/finemap/main.nf index 8ab0aaa..fb62850 100644 --- a/workflows/finemap/main.nf +++ b/workflows/finemap/main.nf @@ -2,6 +2,7 @@ include { SUSIE_FINEMAPPING } from "../../modules/local/susie_finemapping include { APPEND_TO_MASTER_COLOC } from "../../modules/local/append_to_master_coloc" //include { RDS_TO_ANNDATA } from "../../modules/local/rds_to_anndata" include { CONCAT_ANNDATA } from "../../modules/local/concat_anndata" +include { CONCAT_ANNDATA_BA } from "../../modules/local/concat_anndata_ba" workflow RUN_FINEMAPPING { take: @@ -75,4 +76,4 @@ workflow RUN_FINEMAPPING { emit: finemap_anndata = all_h5ad_small.mix(all_h5ad_large) // Mix the concatenated anndata - only one channel will actually 
contain data coloc_master = APPEND_TO_MASTER_COLOC.out.coloc_master -} \ No newline at end of file +} From 3b7d2381a9a079840523ddeaf7e6c4f998a2cd12 Mon Sep 17 00:00:00 2001 From: Arianna Landini <43240476+ariannalandini@users.noreply.github.com> Date: Thu, 16 Oct 2025 17:04:20 +0200 Subject: [PATCH 05/10] Forgot about coverage in anndata obs --- bin/funs_locus_breaker_cojo_finemap_all_at_once.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/funs_locus_breaker_cojo_finemap_all_at_once.R b/bin/funs_locus_breaker_cojo_finemap_all_at_once.R index 53968b8..2be5ccf 100755 --- a/bin/funs_locus_breaker_cojo_finemap_all_at_once.R +++ b/bin/funs_locus_breaker_cojo_finemap_all_at_once.R @@ -716,7 +716,7 @@ from_susie_to_anndata <- function(finemap_list=NULL, cs_indices=NULL, analysis_i lABFs_list <- c(lABFs_list, lABFs) min_res_labf_vec <- c(min_res_labf_vec, min_res_labf) top_pvalue_vec <- c(top_pvalue_vec, top_pvalue) - purity_df <- rbind(purity_df, finemap$sets$purity |> dplyr::mutate(logsum.logABF=logsum.logABF)) + purity_df <- rbind(purity_df, finemap$sets$purity |> dplyr::mutate(logsum.logABF=logsum.logABF, coverage = finemap$sets$requested_coverage)) comment_section <- c(comment_section, rep(finemap$comment_section, length(finemap$sets$cs_index))) comment_section[is.na(comment_section)] <- "NaN" metadata_df <- rbind(metadata_df, finemap$metadata) From b168322ba7c64853fea38c08df2cc6cf1051df91 Mon Sep 17 00:00:00 2001 From: Arianna Landini <43240476+ariannalandini@users.noreply.github.com> Date: Thu, 16 Oct 2025 19:40:14 +0200 Subject: [PATCH 06/10] Making sure to remove actual values for SNPs outside of the extended cs --- bin/funs_locus_breaker_cojo_finemap_all_at_once.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bin/funs_locus_breaker_cojo_finemap_all_at_once.R b/bin/funs_locus_breaker_cojo_finemap_all_at_once.R index 2be5ccf..f954171 100755 --- a/bin/funs_locus_breaker_cojo_finemap_all_at_once.R +++ b/bin/funs_locus_breaker_cojo_finemap_all_at_once.R @@ -732,6 +732,18 @@ from_susie_to_anndata <- function(finemap_list=NULL, cs_indices=NULL, analysis_i sep = "::" ) + +# Set lABF, beta and se to 0 for SNPs outside of cs! +for (cs in names(lABFs_list)) { + lABFs_list[[cs]] <- lABFs_list[[cs]] %>% + dplyr::mutate( + lABF = ifelse(is_cs, lABF, 0), + bC = ifelse(is_cs, bC, 0), + bC_se= ifelse(is_cs, bC_se, 0) + ) +} + + # Prepare `obs_df` metadata obs_df <- metadata_df |> dplyr::select(-L_index) |> dplyr::rename(type=TYPE) obs_df$chr <- paste0("chr", obs_df$chr) From 6a48a84c4475adc5942549b4703edeaf5ecb7cb1 Mon Sep 17 00:00:00 2001 From: Arianna Landini Date: Thu, 16 Oct 2025 18:45:56 +0100 Subject: [PATCH 07/10] Add also function to fix var after concatenation --- bin/s08bis_concat_anndata_bioalpha.py | 43 +++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 6 deletions(-) mode change 100644 => 100755 bin/s08bis_concat_anndata_bioalpha.py diff --git a/bin/s08bis_concat_anndata_bioalpha.py b/bin/s08bis_concat_anndata_bioalpha.py old mode 100644 new mode 100755 index e520978..574b65a --- a/bin/s08bis_concat_anndata_bioalpha.py +++ b/bin/s08bis_concat_anndata_bioalpha.py @@ -1,20 +1,47 @@ #!/usr/bin/env python3 import bioalpha as bsc import argparse +import pandas as pd +import re + +def fix_ad_var(adata): + """ + Standardize variant metadata in an AnnData object. 
+ + Ensures: + - 'snp' column exists in adata.var + - Extracts chromosome ('chr') and position ('pos') from SNP identifiers + in format 'chr{num}:{pos}:EA:RA' + """ + var = adata.var.copy() + + # Add 'snp' column if missing + if "snp" not in var.columns: + var["snp"] = var.index.astype(str) + + # Extract chr and pos using regex + var["chr"] = var["snp"].apply(lambda x: re.sub(r":.*", "", x)) + var["pos"] = var["snp"].apply(lambda x: int(re.sub(r".*:(\d+):.*", r"\1", x))) + + adata.var = var + return adata + def main(): parser = argparse.ArgumentParser( description="Concatenate multiple .h5ad files listed in a text file into one AnnData object." ) parser.add_argument( - "-i", "--input", + "-i", + "--input", required=True, - help="Path to a text file listing .h5ad files to concatenate (one per line)." + help="Path to a text file listing .h5ad files to concatenate (one per line).", ) parser.add_argument( - "-o", "--output_file", + "-o", + "--output_file", required=True, - help="Path and file name for the output merged .h5ad file." + help="Path and file name for the output merged .h5ad file.", ) args = parser.parse_args() @@ -29,12 +56,16 @@ def main(): adata_list = [bsc.sc.read_h5ad(file) for file in h5ad_files] print("Concatenating...") - adata = bsc.sc.concat(adata_list, join='outer', fill_value=0) + adata = bsc.sc.concat(adata_list, join="outer", fill_value=0) - print(f"Writing output to: {args.output_h5ad}") + print("Fixing variant metadata...") + adata = fix_ad_var(adata) + + print(f"Writing output to: {args.output_file}") adata.write_h5ad(args.output_file) print("✅ Merge complete!") + if __name__ == "__main__": main() From cbc1df83906194d2e7c0598b5c43213d1ab4d9bf Mon Sep 17 00:00:00 2001 From: Arianna Landini <43240476+ariannalandini@users.noreply.github.com> Date: Thu, 16 Oct 2025 20:15:10 +0200 Subject: [PATCH 08/10] Instead of setting to 0, actually filter out SNPs outside of cs - avoid having actual 0 values rather than sparse --- ...uns_locus_breaker_cojo_finemap_all_at_once.R | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/bin/funs_locus_breaker_cojo_finemap_all_at_once.R b/bin/funs_locus_breaker_cojo_finemap_all_at_once.R index f954171..1a676bf 100755 --- a/bin/funs_locus_breaker_cojo_finemap_all_at_once.R +++ b/bin/funs_locus_breaker_cojo_finemap_all_at_once.R @@ -731,18 +731,6 @@ from_susie_to_anndata <- function(finemap_list=NULL, cs_indices=NULL, analysis_i metadata_df$L_index, sep = "::" ) - - -# Set lABF, beta and se to 0 for SNPs outside of cs! 
-for (cs in names(lABFs_list)) { - lABFs_list[[cs]] <- lABFs_list[[cs]] %>% - dplyr::mutate( - lABF = ifelse(is_cs, lABF, 0), - bC = ifelse(is_cs, bC, 0), - bC_se= ifelse(is_cs, bC_se, 0) - ) -} - # Prepare `obs_df` metadata obs_df <- metadata_df |> dplyr::select(-L_index) |> dplyr::rename(type=TYPE) @@ -779,6 +767,11 @@ for (cs in names(lABFs_list)) { ) } + # Filter out values for SNPs out of the credible set right before matrices creation + for (cs in seq_along(lABFs_list)) { + lABFs_list[[cs]] <- lABFs_list[[cs]] |> dplyr::filter(is_cs) + } + lABF_matrix_sparse <- create_sparse_matrix("lABF") beta_matrix_sparse <- create_sparse_matrix("bC") se_matrix_sparse <- create_sparse_matrix("bC_se") From 38b1ad94d17072f35aa117636e15fd3cbe312901 Mon Sep 17 00:00:00 2001 From: Arianna Landini <43240476+ariannalandini@users.noreply.github.com> Date: Thu, 4 Dec 2025 11:15:13 +0100 Subject: [PATCH 09/10] Tidy up branches - adding OT to annData part into most updated TileDB using branch (#111) * Adding modification to run on scQTLs on Sanger * Add OT script (#109) * Add Credible Set Expansion feature to Fine-Mapping (#99) * Removing susie.cs.ht() because it should be taken directly from the flanders R package * Have customizable post susie QC parameters * Nevermind - function copied and pasted from gitlab, temporary solution until merge with github flanders r repo * Back to sourcing function rather than calling it from flanders R package (temporary solution) and fixing parameters * Forgot to remove susie_qc_cs_lbf_thr parameter from here * Added susie QC parameters to the nextflow schema * Rearraning post susie QC, so that loci disappearing becuase of QC are re-finemmaped with L=1. And overall L=1 loci do not go thorugh QC * Adding locusbreaker * Adding locus size parameter * Add report collecting loci that were re-finemapped with L=1 after being wiped out by post susie QC * Wrapped all code to go from susie output to rds list of dataframes object in a function * Adding credible set expansion and using function from susie output to rds format * Remove some parameters from list of those mandatory - if not specified, default is fine * Do not hardcode L=10, but rather use the assigned variable * Ok nevermind, reverting previous commit and adding also post susie QC parameters to required list * Removing hardcoding of L=10 for easier maintenance * Forgot to close parenthesis * Temporarely copy and paste functions from gitlab R package version * Computing also cs logsum - and adding it to the anndata obs * Fixed unmatching parenthesis * tile_lb_input parameters defined but not used. Assigning correct filename to tiledb_bfile parameter in test * Removing tuple since it's only one element * Revert "Merge branch 'tiledb_locusbreaker' into cs_expansion" This reverts commit fc15e3b7b4465bfa14e8bae5ddc3a82790de98e1, reversing changes made to 20596980310bb4a561008842dcafb2ef1fcf9519. 
--------- Co-authored-by: arianna.landini Co-authored-by: bruno-ariano * Fixing KL for smaller loci (N SNPs < L) and identifying HLA region for GRCh 37 (#106) * Having HLA coordinates also in GRCh 37 * Checking for KL length in loci with less than L SNPs * When expanding credible sets, variance of lABFs not only must be different from 0, but also not NA (this happens with a single SNP in the locus) * Correct parameter (coloc_h5ad_input) for coloc input in example * adding script for creating anndata from OT --------- Co-authored-by: Arianna Landini <43240476+ariannalandini@users.noreply.github.com> Co-authored-by: arianna.landini --------- Co-authored-by: bruno.ariano Co-authored-by: arianna.landini --- README.md | 11 +- assets/OT_mapping_credset_table.txt | 6 + bin/parse_credible_sets_refined.py | 424 +++++++++++++++++++++++++ bin/s02_sumstat_munging_and_aligning.R | 15 +- 4 files changed, 449 insertions(+), 7 deletions(-) create mode 100644 assets/OT_mapping_credset_table.txt create mode 100755 bin/parse_credible_sets_refined.py diff --git a/README.md b/README.md index 42fd142..1b7895b 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ nextflow run Biostatistics-Unit-HT/Flanders -r 1.0 -profile [docker|singulari ### Example: Run Only Colocalization (with existing `.h5ad`) ```bash -nextflow run Biostatistics-Unit-HT/Flanders -r 1.0 -profile [docker|singularity|conda] --coloc_input /path/to/finemapping_output.h5ad --run_colocalization true --coloc_id my_coloc_run -w ./work -resume +nextflow run Biostatistics-Unit-HT/Flanders -r 1.0 -profile [docker|singularity|conda] --coloc_h5ad_input /path/to/finemapping_output.h5ad --run_colocalization true --coloc_id my_coloc_run -w ./work -resume ``` ### Quick run with example dataset @@ -84,6 +84,14 @@ Flanders separates the fine-mapping and colocalization process into two distinct

+
+#### 4.1 Getting external credible sets from Open Targets
+  - It is possible to create an AnnData object from credible sets already generated by Open Targets and use it directly for the colocalization step. To do so, follow these steps:
+  - Download the Open Targets [credible-set](https://platform.opentargets.org/downloads/credible_set/access) and [study](https://platform.opentargets.org/downloads/study/access) datasets.
+  - Create a conda environment with the following packages: anndata, polars, pyarrow, scipy (math and argparse are part of the Python standard library).
+  - Run the script `bin/parse_credible_sets_refined.py`, following the usage described by its arguments, as sketched below.
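+
+The `--ref_panel` argument expects a CSV with columns `chr`, `pos`, `a0` and `a1` (allele order does not matter), used to fill in SNPs that are missing from a credible set. A minimal sketch of such a panel with made-up variants (a real panel should cover the reference cohort's genotyping backbone):
+
+```python
+# Hypothetical three-variant reference panel; only the column layout matters here.
+import polars as pl
+
+ref = pl.DataFrame({
+    "chr": ["chr1", "chr1", "chr2"],   # "chr"-prefixed, matching how the script builds variant IDs
+    "pos": [10546866, 10547000, 27730940],
+    "a0":  ["A", "C", "G"],
+    "a1":  ["G", "T", "A"],
+})
+ref.write_csv("ref_panel.csv")
+```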
+
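+
+With the panel in place, a typical invocation (all file names are placeholders; see `assets/OT_mapping_credset_table.txt` for project / fine-mapping method / data combinations) would be `python bin/parse_credible_sets_refined.py --project FINNGEN_R12 --type_sumstat gwas --type_finemap SuSie --study study.parquet --credset credible_set.parquet --ref_panel ref_panel.csv --out finngen_gwas`. The resulting `finngen_gwas.h5ad` can then be sanity-checked before colocalization:
+
+```python
+import anndata as ad
+
+adata = ad.read_h5ad("finngen_gwas.h5ad")   # file name derives from --out
+print(adata)                                # X: per-SNP logBF (sparse); layers: "beta", "se"
+print(adata.obs[["cs_name", "topsnp", "start", "end", "coverage"]].head())
+print(adata.var[["snp", "chr", "pos"]].head())
+```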
+ ### Step 2: Colocalization analysis #### Inputs @@ -125,7 +133,6 @@ iCOLOC approach allows to:

- ## 👩‍🔬 Credits Developed by the Biostatistics and Genome Analysis Units at [Human Technopole](https://humantechnopole.it/en/)
- [Arianna Landini](mailto:arianna.landini@fht.org)
diff --git a/assets/OT_mapping_credset_table.txt b/assets/OT_mapping_credset_table.txt new file mode 100644 index 0000000..71e055f --- /dev/null +++ b/assets/OT_mapping_credset_table.txt @@ -0,0 +1,6 @@ +project,finemapping,data,description +GCST,SuSiE-inf,gwas,GWAS catalog data +GTEx,SuSie-inf,eqtl,eQTLs from GTEx +UKB_PPP_EUR,SuSiE-inf,pqtl,Data from UKB-PPP considering only Europeans +FINNGEN_R12,SuSie,gwas,Data from Finngen consortium + diff --git a/bin/parse_credible_sets_refined.py b/bin/parse_credible_sets_refined.py new file mode 100755 index 0000000..83219f5 --- /dev/null +++ b/bin/parse_credible_sets_refined.py @@ -0,0 +1,424 @@ +# This script create an anndata starting from study and credisble_set parquet files from Open Target. +# This anndata can later be used by the Flanders pipeline for colocalization + +import polars as pl +import pandas as pd +from scipy.sparse import coo_matrix +from scipy.sparse import csr_matrix +import pyarrow as pa +import math +import anndata as ad +import argparse + +#Although I give all the option here, some project have only certain type of data. For example FINNGEN_R12 has only gwas and SuSie +parser = argparse.ArgumentParser(description="Program to generate credible sets from Open Target in Anndata format for Flanders colocalization") +parser.add_argument("--project", default=None, type=str, help="Name of the project to get data from in OTG (example: FINNGEN_R12, GTEx, GCST)") +parser.add_argument("--type_sumstat", default='gwas', type=str, help="Either a gwas, eqtl or pqtl") +parser.add_argument("--type_finemap", default='SuSie', type=str, help="Either SuSie or SuSiE-inf") +parser.add_argument("--study", default=None, type=str, help="Parquet file of the study from OT") +parser.add_argument("--credset", default=None, type=str, help="Parquet file of the credible sets from OT") +parser.add_argument("--ref_panel", default=None, type=str, help="A csv panel to use to impute missing SNPs from the credible set. 
Mandatory columns are (chr,pos,a0,a1) wuth a0 and a1 in any particula order") +parser.add_argument("--out", default=None, type=str, help="Name of the output anndata file") +args = parser.parse_args() + + +class OT_create_anndata: + def __init__( + self, study:str, + project:str, + type_study:str, + credset:str, + ref_panel:str, + type_finemap:str): + + self.type_study = type_study + self.study = study + self.credset = credset + self.project = project + self.imputation_panel = ref_panel + self.type_finemap = type_finemap + self.log10 = math.log(10) + self.log_prior_odds = math.log(0.01 / 0.99) + + def _read_study(self): + self.study_pl = pl.read_parquet(self.study) + self.study_pl = self.study_pl.filter((pl.col("hasSumstats") == True) & (pl.col("projectId") == self.project) & (pl.col('studyType') == self.type_study)) + self.study_pl = self.study_pl.with_columns( + pl.col("ldPopulationStructure") + .map_elements( + lambda lst: ",".join(item["ldPopulation"] for item in lst) if lst is not None else None, + return_dtype=pl.String # avoid the warning + ) + .alias("ldPopulationStructure") + ) + if self.type_study != "eqtl" or self.type_study != "pqtl": + self.study_pl = self.study_pl.with_columns( + pl.col("traitFromSource").alias("biosampleId") + ) + if self.project=='GTEx': + self.study_pl = self.study_pl.filter( + pl.col("studyId").str.contains("^gtex_ge_", literal=False) + ) + if self.project=='UKB_PPP_EUR': + self.study_pl = self.study_pl.filter(pl.col("ldPopulationStructure")=="nfe") + return self.study_pl + + def _read_credset(self): + print(self.type_study) + credset_pl = pl.read_parquet(self.credset) + if self.type_study == "eqtl" or self.type_study == "pqtl": + self.credset_query_pl = credset_pl.filter((pl.col("isTransQtl") == False) & (pl.col('finemappingMethod') == self.type_finemap) & (pl.col("studyId").is_in(self.study_pl["studyId"].to_list()))) + else: + self.credset_query_pl = credset_pl.filter( (pl.col('finemappingMethod') == self.type_finemap) & (pl.col("studyId").is_in(self.study_pl["studyId"].to_list()))) + print('ok') + self.credset_query_pl = self.credset_query_pl.filter(pl.col("locus").list.eval(pl.element().struct.field("is99CredibleSet")).list.any()) + if len(self.credset_query_pl)==0: + print('Empty credible sets, please check if the parameter are right (type_sumstat, project, type_finemap)') + exit() + + + def _reshape_credset(self): + study_credset = self.credset_query_pl.join(self.study_pl, on = "studyId", how = "inner") + if "biosampleId" not in study_credset.columns: + study_credset = study_credset.with_columns(pl.lit(None).alias("biosampleId")) + var = study_credset.explode("locus").with_columns(pl.col("locus").struct.unnest()) + var = (var + .group_by("studyLocusId") + .agg([ + pl.col("logBF").sum().alias("sumlogABF") + ]) + ) + study_credset = var.join(study_credset, on = "studyLocusId") + study_credset = study_credset.with_columns( + pl.when((pl.lit(self.type_study != "eqtl")) | (pl.lit(self.type_study != "pqtl"))).then( + pl.col("studyId").alias("phenotype_id") + ).otherwise( + pl.concat_str( + [ + pl.col("biosampleId"), + pl.lit(":"), + pl.col("geneId") + ] + ).alias("phenotype_id") + )) + study_credset = study_credset.with_columns( + pl.concat_str([ + pl.col("projectId"), + pl.lit(":"), + pl.col("studyType") + ]).alias("projectId") + ) + study_credset = study_credset.with_columns([pl.col('variantId').alias('snp'), pl.col('chromosome').alias('chr')]) + study_credset = study_credset.with_columns( + pl.col("snp").str.split_exact("_", 4) + 
.struct.rename_fields(["chr_snp", "pos_snp", "a0", "a1"]) + .alias("fields") + ).unnest("fields").drop("snp").with_columns([ + pl.min_horizontal("a0", "a1").alias("a_allele1"), + pl.max_horizontal("a0", "a1").alias("a_allele2") + ]).with_columns( + pl.concat_str([ + pl.lit('chr'), + pl.col("chr_snp").cast(pl.Utf8).str.replace("X", "23"), + pl.lit(":"), + pl.col("pos_snp"), + pl.lit(":"), + pl.col("a_allele1"), + pl.lit(":"), + pl.col("a_allele2"), + ]).alias("snp") + ).drop(["chr_snp","pos_snp","a_allele2","a_allele1","a0","a1"]) + study_credset = study_credset.with_columns(pl.col("chr").cast(str).replace("X", "23").alias("chr")) + self.study_credset = study_credset.with_columns( + pl.concat_str([ + pl.lit('chr'), + pl.col("chr"), + pl.lit("::"), + pl.col("projectId"), + pl.lit("::"), + pl.col("phenotype_id"), + pl.lit("::"), + pl.col("snp") + ]).alias("csname"), + pl.when((pl.lit(self.type_study == "eqtl")) | (pl.lit(self.type_study == "pqtl"))).then( + (pl.col("region").str.extract(r":(-?\d+)", 1).cast(pl.Int64) - 500000).clip(lower_bound = 0) + ).otherwise( + pl.col("region").str.extract(r":(-?\d+)", 1).cast(pl.Int64) + ).alias("start"), + pl.when(pl.lit(self.type_study == "eqtl") | (pl.lit(self.type_study == "pqtl"))).then( + pl.col("region").str.extract(r"-(-?\d+)$", 1).cast(pl.Int64) + 500000 + ).otherwise( + pl.col("region").str.extract(r"-(-?\d+)$", 1).cast(pl.Int64) + ).alias("end"), + (pl.col("sumlogABF") * (self.log10 + self.log_prior_odds - (pl.col("locus").list.len().cast(pl.Float64).log()))).alias("min_res_labf"), + pl.col("sumlogABF").alias("cs_log10bf"), + pl.col('snp').alias("topsnp") + ).drop("chr") + + def _explode_locus_snps(self): + if "traitFromSource" not in self.study_credset.columns: + self.study_credset = self.study_credset.with_columns(pl.lit(None).alias("traitFromSource")) + var = self.study_credset.explode("locus").with_columns(pl.col("locus").struct.unnest()) + var= var.with_columns( + pl.col("variantId").str.split_exact("_", 4) + .struct.rename_fields(["chr", "pos", "a0", "a1"]) + .alias("fields") + ).unnest("fields").drop("variantId").with_columns([ + pl.min_horizontal("a0", "a1").alias("a_allele1"), + pl.max_horizontal("a0", "a1").alias("a_allele2") + ]).with_columns( + pl.concat_str([ + pl.lit("chr"), + pl.col("chr").cast(pl.Utf8).str.replace("X", "23"), # Note: you have "chr" twice - is this intentional? 
+ pl.lit(":"), + pl.col("pos"), + pl.lit(":"), + pl.col("a_allele1"), + pl.lit(":"), + pl.col("a_allele2") + ]).alias("snp"), + pl.concat_str([ + pl.lit("chr"), + pl.col("chr") + ]).alias("chrname"), + pl.col('standardError').alias('se') + ).drop("chr","standardError") + var = var.with_columns(pl.col("chrname").cast(str).replace("chrX", "chr23").alias("chr")) + var = var.with_columns(pl.col("pos").cast(pl.Int64)).drop("chrname") + self.var = var.with_columns([ + pl.when( + (pl.col("a0") == pl.col("a_allele2")) & (pl.col("a1") == pl.col("a_allele1")) + ).then( + -pl.col("beta") + ).otherwise( + pl.col("beta") + ).alias("beta")]).select( + "csname", + 'projectId', + 'start', + 'end', + 'phenotype_id', + 'topsnp', + 'min_res_labf', + 'cs_log10bf', + 'logBF', + 'beta', + 'se', + 'snp', + 'chr', + 'pos', + 'a_allele1', + 'a_allele2', + 'nSamples' + ) + + def _merge_with_imputation_refined(self): + imputation_panel = pl.read_csv(self.imputation_panel) + ref = imputation_panel.with_columns([ + pl.min_horizontal("a0", "a1").alias("a_allele1"), + pl.max_horizontal("a0", "a1").alias("a_allele2") + ]).with_columns( + pl.col("chr").cast(pl.Utf8).str.replace("chrX", "chr23"), + pl.concat_str([ + pl.col("chr").cast(pl.Utf8).str.replace("chrX", "chr23"), + pl.lit(":"), + pl.col("pos"), + pl.lit(":"), + pl.col("a_allele1"), + pl.lit(":"), + pl.col("a_allele2") + ]).alias("snp") + ) + + # Perform full outer join + ot_merge = ref.join( + self.var, + on=['chr', 'pos', 'a_allele1', 'a_allele2'], + how='full' + ) + + # Coalesce the columns + ot_merge = ot_merge.with_columns([ + pl.coalesce(["chr", "chr_right"]).alias("chr"), + pl.coalesce(["pos", "pos_right"]).alias("pos"), + pl.coalesce(["a_allele1", "a_allele1_right"]).alias("a_allele1"), + pl.coalesce(["a_allele2", "a_allele2_right"]).alias("a_allele2"), + pl.coalesce(["snp", "snp_right"]).alias("snp") + ]).drop(["chr_right", "pos_right", "a_allele1_right", "a_allele2_right", "snp_right"]) + + # Sort by chromosome and position + sorted_df = ot_merge.sort(["chr", "pos"]) + + # Create a helper column to identify credible set boundaries + # We'll mark the start of each credible set region + result_with_filled_nulls = sorted_df.with_columns([ + # Create boundary markers for credible sets + pl.when( + pl.col("csname").is_not_null() & + pl.col("start").is_not_null() & + pl.col("end").is_not_null() + ).then(1).otherwise(0).alias("is_cs_boundary"), + + # Forward fill the credible set information within chromosomal regions + pl.col("csname").forward_fill().over("chr"), + pl.col("start").forward_fill().over("chr"), + pl.col("end").forward_fill().over("chr"), + pl.col("projectId").forward_fill().over("chr"), + pl.col("phenotype_id").forward_fill().over("chr"), + pl.col("topsnp").forward_fill().over("chr"), + pl.col("min_res_labf").forward_fill().over("chr"), + pl.col("cs_log10bf").forward_fill().over("chr"), + ]).with_columns([ + # Only keep the filled values if the SNP falls within the credible set region + pl.when( + (pl.col("pos") >= pl.col("start")) & + (pl.col("pos") <= pl.col("end")) + ).then(pl.col("csname")).otherwise(None).alias("csname"), + + pl.when( + (pl.col("pos") >= pl.col("start")) & + (pl.col("pos") <= pl.col("end")) + ).then(pl.col("projectId")).otherwise(None).alias("projectId"), + + pl.when( + (pl.col("pos") >= pl.col("start")) & + (pl.col("pos") <= pl.col("end")) + ).then(pl.col("phenotype_id")).otherwise(None).alias("phenotype_id"), + + pl.when( + (pl.col("pos") >= pl.col("start")) & + (pl.col("pos") <= pl.col("end")) + 
).then(pl.col("topsnp")).otherwise(None).alias("topsnp"), + + pl.when( + (pl.col("pos") >= pl.col("start")) & + (pl.col("pos") <= pl.col("end")) + ).then(pl.col("min_res_labf")).otherwise(None).alias("min_res_labf"), + + pl.when( + (pl.col("pos") >= pl.col("start")) & + (pl.col("pos") <= pl.col("end")) + ).then(pl.col("cs_log10bf")).otherwise(None).alias("cs_log10bf"), + + pl.when( + (pl.col("pos") >= pl.col("start")) & + (pl.col("pos") <= pl.col("end")) + ).then(pl.col("start")).otherwise(None).alias("start"), + + pl.when( + (pl.col("pos") >= pl.col("start")) & + (pl.col("pos") <= pl.col("end")) + ).then(pl.col("end")).otherwise(None).alias("end"), + ]).with_columns([ + # Fill summary statistics with 0 + pl.col("logBF").fill_null(0), + pl.col("beta").fill_null(0), + pl.col("se").fill_null(0), + + # Ensure SNP format is consistent + pl.col("snp").fill_null( + pl.concat_str([ + pl.col("chr"), + pl.lit(":"), + pl.col("pos"), + pl.lit(":"), + pl.col("a_allele1"), + pl.lit(":"), + pl.col("a_allele2"), + ]) + ) + ]).select([ + "chr", + pl.col("pos").cast(pl.Utf8), + "csname", + "snp", + 'start', + 'end', + 'projectId', + 'phenotype_id', + 'topsnp', + 'min_res_labf', + 'cs_log10bf', + 'logBF', + 'beta', + 'se' + ]) + + self.result_with_filled_nulls = result_with_filled_nulls + + + def _create_anndata_refined(self): + susie_credset_pd = self.result_with_filled_nulls.to_pandas() + + # Only use rows with valid credible sets + valid_data = susie_credset_pd[susie_credset_pd["csname"].notna()].copy() + valid_data["csname"] = valid_data["csname"].astype(str) + + # Factorize using only valid data to ensure consistent lengths + row_idx, row_labels = valid_data["csname"].factorize() + col_idx, col_labels = valid_data["snp"].factorize() + + # Get values from the same valid_data to ensure consistent lengths + logbf = valid_data["logBF"].fillna(0).to_numpy() + beta = valid_data["beta"].fillna(0).to_numpy() + se = valid_data["se"].fillna(0).to_numpy() + + # Create sparse matrices + sparse_matrix_logbf = coo_matrix((logbf, (row_idx, col_idx))) + sparse_matrix_beta = coo_matrix((beta, (row_idx, col_idx))) + sparse_matrix_se = coo_matrix((se, (row_idx, col_idx))) + + adata = ad.AnnData(sparse_matrix_logbf) + adata.obs["cs_name"] = row_labels + adata.var["snp"] = col_labels + adata.X = csr_matrix(adata.X) + adata.layers["beta"] = csr_matrix(sparse_matrix_beta) + adata.layers["se"] = csr_matrix(sparse_matrix_se) + + # Create mappings for obs metadata + start_mapping = dict(zip(valid_data['csname'], valid_data['start'])) + end_mapping = dict(zip(valid_data['csname'], valid_data['end'])) + topsnp_mapping = dict(zip(valid_data['csname'], valid_data['topsnp'])) + credset_abf = dict(zip(valid_data['csname'], valid_data['cs_log10bf'])) + project_mapping = dict(zip(valid_data['csname'], valid_data['projectId'])) + min_credset_abf = dict(zip(valid_data['csname'], valid_data['min_res_labf'])) + study_mapping = dict(zip(valid_data['csname'], valid_data['phenotype_id'])) + + # Assign obs metadata + start_labels = [start_mapping.get(label) for label in row_labels] + end_labels = [end_mapping.get(label) for label in row_labels] + topsnp_labels = [topsnp_mapping.get(label) for label in row_labels] + credset_abf_labels = [credset_abf.get(label) for label in row_labels] + phenotype_labels = [study_mapping.get(label) for label in row_labels] + min_credset_abf_labels = [min_credset_abf.get(label) for label in row_labels] + project_labels = [project_mapping.get(label) for label in row_labels] + + adata.obs["start"] = start_labels + 
adata.obs["end"] = end_labels + adata.obs["topsnp"] = topsnp_labels + adata.obs["study_id"] = project_labels + adata.obs["min_res_labf"] = min_credset_abf_labels + adata.obs["logsum(logABF)"] = credset_abf_labels + adata.obs['start'] = adata.obs['start'].apply(lambda x: 0 if x < 0 else x) + adata.obs["phenotype_id"] = phenotype_labels + adata.obs[['chr']] = adata.obs['topsnp'].str.split(':', expand=True)[[0]] + adata.obs.loc[:, "coverage"] = 0.99 + + # Assign var metadata + adata.var[['chr', 'pos']] = adata.var['snp'].str.split(':', expand=True)[[0, 1]] + + # Set indices + adata.var.index = adata.var['snp'] + adata.obs.index = adata.obs['cs_name'] + return adata + +if __name__=='__main__': + ot_obj = OT_create_anndata(study = args.study, credset = args.credset, ref_panel = args.ref_panel, type_finemap=args.type_finemap, project=args.project, type_study = args.type_sumstat) + ot_obj._read_study() + ot_obj._read_credset() + ot_obj._reshape_credset() + ot_obj._explode_locus_snps() + ot_obj._merge_with_imputation_refined() + adata = ot_obj._create_anndata() + adata.write(f'{args.out}.h5ad', compression="gzip") + diff --git a/bin/s02_sumstat_munging_and_aligning.R b/bin/s02_sumstat_munging_and_aligning.R index 5ee1b56..56dac6f 100755 --- a/bin/s02_sumstat_munging_and_aligning.R +++ b/bin/s02_sumstat_munging_and_aligning.R @@ -667,16 +667,21 @@ if(nrow(loci_list) > 0){ ### Add study ID to the loci table. Save loci_list[, study_id := opt$study_id] - ### Check if locus spans the HLA locus chr6:28,510,120-33,480,577 + ### Check if locus spans the HLA locus chr6:28,510,120-33,480,577 (GRCh38) / chr6:28,477,797-33,448,354 (GRCh37) ### https://www.ncbi.nlm.nih.gov/grc/human/regions/MHC?asm=GRCh38 + ### https://www.ncbi.nlm.nih.gov/grc/human/regions/MHC?asm=GRCh37 # This code checks for four conditions to determine if a locus partially or completely spans the HLA region: # 1) Locus starts before (or at the exact beginning of) HLA and ends after (or at the exact end of) HLA. # 2) Locus starts and ends within the HLA region. # 3) Locus starts before the end of HLA and ends after the end of HLA. # 4) Locus starts before the beginning of HLA and ends after the beginning of HLA. 
- hla_start=28510120 - hla_end=33480577 - hla_coord <- seq(hla_start,hla_end) + if(as.numeric(opt$grch)==37 && isFALSE(opt$run_liftover)){ + hla_start=28477797 + hla_end=33448354 + } else { + hla_start=28510120 + hla_end=33480577 + } loci_list[, is_in_hla := chr == 6 & ((start <= hla_end & end >= hla_start) | @@ -728,4 +733,4 @@ idx <- Rsamtools::indexTabix(zipped, end=which(colnames(dataset_munged)=="BP") ) -message("-- COMPLETED SUCCESSFULLY --") \ No newline at end of file +message("-- COMPLETED SUCCESSFULLY --") From 1540b685be05785a980560ea647677d94575166b Mon Sep 17 00:00:00 2001 From: Bruno Ariano Date: Tue, 20 Jan 2026 14:04:20 +0100 Subject: [PATCH 10/10] Adding latest improvment to the TileDB version using bioalpha for anndata concatenation * attach tdbsumstat to flanders * Remove patch to replace the chr string in SNP * Exclude from cs expansion loci of only one SNP - no variance --------- Co-authored-by: Arianna Landini --- ...s_locus_breaker_cojo_finemap_all_at_once.R | 2 +- bin/s04_susie_finemapping.R | 4 +-- conf/base.config | 3 +- conf/test.config | 31 ++++++++++++++----- example_data/test_finemap_saige.csv | 4 +++ main.nf | 8 ++--- modules/local/locus_breaker_tiledb/main.nf | 16 +++++----- nextflow.config | 5 +-- 8 files changed, 48 insertions(+), 25 deletions(-) create mode 100644 example_data/test_finemap_saige.csv diff --git a/bin/funs_locus_breaker_cojo_finemap_all_at_once.R b/bin/funs_locus_breaker_cojo_finemap_all_at_once.R index 1a676bf..7d809b5 100755 --- a/bin/funs_locus_breaker_cojo_finemap_all_at_once.R +++ b/bin/funs_locus_breaker_cojo_finemap_all_at_once.R @@ -583,7 +583,7 @@ expand_cs <- function(fitted) { #' cs_idx <- which(lbf_vec > 2) #' find_threshold(lbf_vec, original_idx = cs_idx) find_threshold <- function(vector, original_idx = NULL) { - if (var(vector) != 0) { + if ( var(vector) != 0 && !is.na(var(vector)) ) { thres <- optim( fn = function(th) thresholder(th, vector), p = 0, diff --git a/bin/s04_susie_finemapping.R b/bin/s04_susie_finemapping.R index 0be0e8e..fa9697d 100755 --- a/bin/s04_susie_finemapping.R +++ b/bin/s04_susie_finemapping.R @@ -72,13 +72,13 @@ if (is.null(opt$batch)) { "SNP"="SNPID", "BP" = "SNP_POS" ) - loci_df <- loci_df %>% rename( "chr" = "CHR", "start" = "START", "end" = "END" ) + if(loci_df[1,'TYPE'] == "gwas"){ dataset_aligned <- dataset_aligned %>% rename( @@ -146,7 +146,7 @@ for (i in 1:nrow(loci_df)) { dentist.bin="DENTIST" ) } - + print(dataset_aligned_sub) # Compute LD matrix susie_ld <- prep_susie_ld( D=dataset_aligned_sub, diff --git a/conf/base.config b/conf/base.config index b244e87..3e5237a 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,5 +1,6 @@ process { - conda = "/ssu/gassu/conda_envs/flanders_3.2" + //conda = "/ssu/gassu/conda_envs/flanders_3.2" + conda = "/software/cardinal_analysis/ht/conda_envs/flanders_3.2" // conda = "${projectDir}/pipeline_environment.yml" container = "htgenomeanalysisunit/flanders:3.0.1" diff --git a/conf/test.config b/conf/test.config index daed9bd..e76e191 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,13 +26,30 @@ profiles { tiledb_lb_maf = 0.01 tiledb_lb_typesumstat = "gwas" tiledb_lb_hole = 250000 - tiledb_metadata_file = "${projectDir}/example_data/example_data_tiledb_metadata.csv" - tiledb_bfile = "${projectDir}/example_data/ld/1KG_phase3_hg37_1:10546866-11576788_alpha_sorted_alleles" + tiledb_lb_file = "${projectDir}/example_data/example_data_tiledb_metadata.csv" + tiledb_pfile = 
"${projectDir}/example_data/ld/1KG_phase3_hg37_1:10546866-11576788_alpha_sorted_alleles" tiledb_cs_thresh = 0.99 - is_test_profile = true - tiledb_large_locus_size = 1000000 - workers = 4 - run_colocalization = true + is_test_profile = true + tiledb_large_locus_size = 1000000 + workers = 4 + run_colocalization = true + } + } + test_tiledb_sc { + params { + tiledb = true + tiledb_uri = "/lustre/scratch124/humgen/projects_v2/cardinal_analysis/analysed_datasets/eqtls/F3_QTLs_saige/UKBB/celltype_2/tiledb/results/TileDB/TileDB_F3_saige_UKBB_celltype_2" + tiledb_batch_size = 2 + path_tiledb_lb_out = "out" + tiledb_lb_maf = 0.01 + tiledb_lb_typesumstat = "qtl" + tiledb_lb_hole = 250000 + tiledb_lb_file = "${projectDir}/test_finemap_saige.csv" + tiledb_pfile = "/lustre/scratch124/humgen/projects_v2/cardinal_analysis/analysis/core_dataset/freeze3/eQTL_input/genotypes_autosomes_symlink__maf0.001/GRCh38_ukb_processed_autosomes" + tiledb_cs_thresh = 0.99 + is_test_profile = true + tiledb_large_locus_size = 1000000 + run_colocalization = true } } -} \ No newline at end of file +} diff --git a/example_data/test_finemap_saige.csv b/example_data/test_finemap_saige.csv new file mode 100644 index 0000000..fb3bd4d --- /dev/null +++ b/example_data/test_finemap_saige.csv @@ -0,0 +1,4 @@ +CHR,TRAIT,SIG,LIM +4,B_memory_IGHMhigh:ENSG00000174125,0.001334232,0.001334232 +4,B_memory_IGHMhigh:ENSG00000164111,2.604472e-12,1e-5 +4,B_memory_IGHMhigh:ENSG00000070190,5.721449e-10,1e-5 diff --git a/main.nf b/main.nf index 69ce7b2..0171722 100644 --- a/main.nf +++ b/main.nf @@ -44,7 +44,7 @@ workflow { if (params.tiledb){ // Split input file in a set of loci to run in parallel - Channel.fromPath(params.tiledb_metadata_file, checkIfExists:true) + Channel.fromPath(params.tiledb_lb_file, checkIfExists:true) .splitText(by: params.tiledb_batch_size, keepHeader: true, file: true) .map { batch_file -> def batch_index = (batch_file.name =~ /\.(\d+)\.csv$/)[0][1] @@ -54,7 +54,7 @@ workflow { // inspect values at runtime LOCUS_BREAKER_TILEDB(tiledb_metadata_batches) - bfiles = Channel.fromPath("${params.tiledb_bfile}.{pgen,pvar,psam}").collect() + bfiles = Channel.fromPath("${params.tiledb_pfile}.{pgen,pvar,psam}").collect() // Use a single meta-study map so combine(by:0) can match restructured_segments = LOCUS_BREAKER_TILEDB.out.locus_breaker_tdb_segments @@ -95,7 +95,7 @@ workflow { [batch_num, meta_loci, interval_file] } - // Build finemapping_config by mapping over the tiledb_bfile channel so + // Build finemapping_config by mapping over the tiledb_pfile channel so // the bfile becomes a concrete path value inside the tuple. 
finemap_params = [ "skip_dentist": params.skip_dentist, @@ -284,4 +284,4 @@ workflow { // At the end store log status completionSummary() } -} \ No newline at end of file +} diff --git a/modules/local/locus_breaker_tiledb/main.nf b/modules/local/locus_breaker_tiledb/main.nf index 050bc5a..0436889 100644 --- a/modules/local/locus_breaker_tiledb/main.nf +++ b/modules/local/locus_breaker_tiledb/main.nf @@ -2,8 +2,8 @@ process LOCUS_BREAKER_TILEDB { label "process_multi" - conda '/ssu/gassu/conda_envs/scqtl' - //conda '/software/cardinal_analysis/ht/conda_envs/scqtl' + //conda '/ssu/gassu/conda_envs/tdbsumstat' + conda '/software/cardinal_analysis/ht/conda_envs/tdbsumstat' publishDir "${params.outdir}/results/gwas_and_loci_tables/", mode: params.publish_dir_mode @@ -20,15 +20,15 @@ process LOCUS_BREAKER_TILEDB { // Define the shell script to execute script: """ - scqtl --workers ${params.workers} \ + tdbsumstat \ export \ - --table ${traits_list_table} \ + --table-lb ${traits_list_table} \ --uri-path ${params.tiledb_uri} \ - --out_lb ${params.path_tiledb_lb_out} \ - --maf ${params.tiledb_lb_maf} \ + --out ${params.path_tiledb_lb_out} \ + --maf-lb ${params.tiledb_lb_maf} \ --type-sumstat ${params.tiledb_lb_typesumstat} \ - --hole ${params.tiledb_lb_hole} \ - --locus-max-size ${params.tiledb_large_locus_size} \ + --hole-lb ${params.tiledb_lb_hole} \ + --locus-max-size-lb ${params.tiledb_large_locus_size} \ --locusbreaker \ --batch-name ${batch_index} diff --git a/nextflow.config b/nextflow.config index dd93648..6796fe8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -33,7 +33,7 @@ params { tiledb_cs_thresh = null tiledb_lb_file = null tiledb_metadata = null - tiledb_bfile = null + tiledb_pfile = null tiledb_large_locus_size = null tiledb_batch_size = null workers = null @@ -163,6 +163,7 @@ profiles { test { includeConfig 'conf/test.config' } test_long { includeConfig 'conf/test.config' } test_tiledb { includeConfig 'conf/test.config' } + test_tiledb_sc { includeConfig 'conf/test.config' } } // Set filename and location for timeline, report, trace and dag @@ -199,4 +200,4 @@ manifest { nextflowVersion = '!>=24.04' version = '1.0.0' doi = '' -} \ No newline at end of file +}