diff --git a/code/pecotmr_integration/twas_ctwas.ipynb b/code/pecotmr_integration/twas_ctwas.ipynb index 8b79b227..b5cf137a 100644 --- a/code/pecotmr_integration/twas_ctwas.ipynb +++ b/code/pecotmr_integration/twas_ctwas.ipynb @@ -96,7 +96,7 @@ "- **Essential columns**: `#chr`, `start`, `end`, `TSS`, `region_id`, `original_data`\n", "- **Structure of the weight database**: \n", " - RDS format.\n", - " - Organized hierarchically: region \u2192 context \u2192 weight matrix.\n", + " - Organized hierarchically: region → context → weight matrix.\n", " - Each column represents a different method.\n", "\n", "eg: `xqtl_meta.tsv`\n", @@ -151,8 +151,8 @@ "- `is_imputable`: status for wether this gene-context pair has cross-validated performance with r-square > 0.01 and pvalue < 0.05 in at least one method for expression level prediction.\n", "- `rsq_cv`: cross validation test of r-square for a method.\n", "- `pval_cv`: cross-validation test of predictive performance p-value for a method.\n", - "- `is_selected_method`: status of being best performing method (for each gene\u2013context pair), we pick the best model with highest cross validation r-square with cross validation pvalue < 0.05\n", - "- `block`: The LD region where the gene\u2019s transcription start site (TSS) is located, based on [predefined LD blocks](https://github.com/cumc/xqtl-data/blob/main/descriptor/reference_data/ld_reference.md). \n", + "- `is_selected_method`: status of being best performing method (for each gene–context pair), we pick the best model with highest cross validation r-square with cross validation pvalue < 0.05\n", + "- `block`: The LD region where the gene’s transcription start site (TSS) is located, based on [predefined LD blocks](https://github.com/cumc/xqtl-data/blob/main/descriptor/reference_data/ld_reference.md). \n", "\n", "If `twas_z` is `NA` it means the context is not imputable for the method of choice\n", "\n", @@ -773,7 +773,7 @@ " }\n", " )\n", " \n", - " # res is NULL \u2192 skip safely\n", + " # res is NULL → skip safely\n", " if (is.null(res) || is.null(res$twas_result)) {\n", " message(paste(\"Batch\", batch, \"returned no results\"))\n", " twas_results_db[[batch]] <- NULL\n", @@ -926,6 +926,7 @@ " \n", " # load genome-wide data across all regions\n", " gwas_meta_data <- fread(\"${gwas_meta_data}\",data.table=FALSE)\n", + " if(!'sample_size' %in% colnames(gwas_meta_data)) stop('Please add column `sample_size` information for each GWAS study in --gwas_meta_data. ')\n", " gwas_studies <- if (length(${gwas_study})!=0) ${gwas_study} else unique(gwas_meta_data$study_id)\n", " gwas_files <- gwas_meta_data[gwas_meta_data$chrom == readr::parse_number(\"${chrom}\") & gwas_meta_data$study_id %in% gwas_studies,, drop=FALSE]\n", " gwas_files <- setNames(gwas_files$file_path, gwas_files$study_id)\n", @@ -977,7 +978,7 @@ " max_pos <- max(max_pos$end[max_pos[,1]==\"${chrom}\"])\n", " region_of_interest <- data.frame(chrom = \"${chrom}\", start = 0, end = max_pos+1)\n", " snp_map <- get_ref_variant_info(\"${ld_meta_data}\", region_of_interest)\n", - " ld_variants <- paste0(\"chr\", snp_map$id)\n", + " ld_variants <- ifelse(grepl(\"^chr\", snp_map$id), snp_map$id, paste0(\"chr\", snp_map$id))\n", " ld_variants <- ld_variants[!duplicated(ld_variants)]\n", "\n", " regions_data <- get_ctwas_meta_data(\"${ld_meta_data}\")\n", @@ -1017,6 +1018,10 @@ " z_snp[[study]] <- harmonize_gwas(gwas_files[study], query_region=paste0(readr::parse_number(\"${chrom}\"), \":0-\", max_pos+1), ld_variants, c(\"beta\", \"z\"), match_min_prop = 0)\n", " z_snp[[study]] <- z_snp[[study]][, colnames(z_snp[[study]])[colnames(z_snp[[study]]) %in% c(\"chrom\", \"pos\", \"variant_id\", \"A1\", \"A2\", \"z\", \"beta\", \"n_sample\",\"n_case\", \"n_control\", \"effect_allele_frequency\")]]\n", " z_snp[[study]] <- z_snp[[study]][, c(\"chrom\", \"pos\", colnames(z_snp[[study]])[!colnames(z_snp[[study]]) %in% c(\"chrom\", \"pos\")])]\n", + " z_snp[['sample_size']][[study]]<- as.integer(unique(na.omit(gwas_meta_data$sample_size[gwas_meta_data$study_id==study])))\n", + " if(length(z_snp[['sample_size']][[study]]!=1) | z_snp[['sample_size']][[study]] <=0 ) {\n", + " stop(\"Please check sample size provided for \", study, \" at --gwas_meta_data. \")\n", + " }\n", " }\n", " # if weight variants are trimmed, load LD and re-compute twas z score \n", " if (${twas_weight_cutoff}!=0 | ${cs_min_cor}!=0 | ${min_pip_cutoff}!=0 | ${max_num_variants}!=Inf ) {\n", @@ -1085,7 +1090,8 @@ " saveRDS(boundary_genes, boundary_gene_file, compress='xz') \n", " }\n", " saveRDS(weight_list[[study]], file.path(outputdir, paste0(name,\".ctwas_weights.\", study, \".${chrom}.rds\")), compress='xz')\n", - " saveRDS(list(z_snp=z_snp[[study]], z_gene=z_gene[[study]]), file.path(outputdir, paste0(name, \".z_gene_snp.\", study, \".${chrom}.rds\")), compress='xz')\n", + " saveRDS(list(z_snp=z_snp[[study]], z_gene=z_gene[[study]], gwas_n=z_snp[['sample_size']][[study]]), \n", + " file.path(outputdir, paste0(name, \".z_gene_snp.\", study, \".${chrom}.rds\")), compress='xz')\n", " weight_list[[study]] <- NULL\n", " z_gene[[study]] <- NULL\n", " z_snp[[study]] <- NULL\n", @@ -1194,6 +1200,8 @@ "parameter: multi_group = True\n", "parameter: merge_regions=False\n", "parameter: L=5\n", + "# p_diff_thresh: p-value cutoff for identifying problematic SNPs with significant difference between observed z-scores and estimated values.\n", + "parameter: p_diff_thresh=5e-8\n", "# sum of gene PIPs in the region should be larger than this threshold to get selected for finemapping\n", "parameter: min_nonSNP_PIP=0.5\n", "# additional name specification for fine-mapping result file names \n", @@ -1340,8 +1348,8 @@ " ld_diag <- diagnose_LD_mismatch_susie(region_ids = gsub(\"chr\", \"\", \"${region_name}\"), \n", " z_snp = z_snp,\n", " LD_map = LD_map, \n", - " gwas_n = nrow(z_snp),\n", - " p_diff_thresh = 5e-8,\n", + " gwas_n = z_snp_list$gwas_n, # gwas sample size \n", + " p_diff_thresh = ${p_diff_thresh},\n", " ncore = ${numThreads},\n", " LD_format = \"custom\",\n", " LD_loader_fun = ctwas_ld_loader,\n", @@ -1613,4 +1621,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}