Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 17 additions & 9 deletions code/pecotmr_integration/twas_ctwas.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
"- **Essential columns**: `#chr`, `start`, `end`, `TSS`, `region_id`, `original_data`\n",
"- **Structure of the weight database**: \n",
" - RDS format.\n",
" - Organized hierarchically: region \u2192 context \u2192 weight matrix.\n",
" - Organized hierarchically: region context weight matrix.\n",
" - Each column represents a different method.\n",
"\n",
"eg: `xqtl_meta.tsv`\n",
Expand Down Expand Up @@ -151,8 +151,8 @@
"- `is_imputable`: status for wether this gene-context pair has cross-validated performance with r-square > 0.01 and pvalue < 0.05 in at least one method for expression level prediction.\n",
"- `rsq_cv`: cross validation test of r-square for a method.\n",
"- `pval_cv`: cross-validation test of predictive performance p-value for a method.\n",
"- `is_selected_method`: status of being best performing method (for each gene\u2013context pair), we pick the best model with highest cross validation r-square with cross validation pvalue < 0.05\n",
"- `block`: The LD region where the gene\u2019s transcription start site (TSS) is located, based on [predefined LD blocks](https://github.com/cumc/xqtl-data/blob/main/descriptor/reference_data/ld_reference.md). \n",
"- `is_selected_method`: status of being best performing method (for each gene–context pair), we pick the best model with highest cross validation r-square with cross validation pvalue < 0.05\n",
"- `block`: The LD region where the gene’s transcription start site (TSS) is located, based on [predefined LD blocks](https://github.com/cumc/xqtl-data/blob/main/descriptor/reference_data/ld_reference.md). \n",
"\n",
"If `twas_z` is `NA` it means the context is not imputable for the method of choice\n",
"\n",
Expand Down Expand Up @@ -773,7 +773,7 @@
" }\n",
" )\n",
" \n",
" # res is NULL \u2192 skip safely\n",
" # res is NULL skip safely\n",
" if (is.null(res) || is.null(res$twas_result)) {\n",
" message(paste(\"Batch\", batch, \"returned no results\"))\n",
" twas_results_db[[batch]] <- NULL\n",
Expand Down Expand Up @@ -926,6 +926,7 @@
" \n",
" # load genome-wide data across all regions\n",
" gwas_meta_data <- fread(\"${gwas_meta_data}\",data.table=FALSE)\n",
" if(!'sample_size' %in% colnames(gwas_meta_data)) stop('Please add column `sample_size` information for each GWAS study in --gwas_meta_data. ')\n",
" gwas_studies <- if (length(${gwas_study})!=0) ${gwas_study} else unique(gwas_meta_data$study_id)\n",
" gwas_files <- gwas_meta_data[gwas_meta_data$chrom == readr::parse_number(\"${chrom}\") & gwas_meta_data$study_id %in% gwas_studies,, drop=FALSE]\n",
" gwas_files <- setNames(gwas_files$file_path, gwas_files$study_id)\n",
Expand Down Expand Up @@ -977,7 +978,7 @@
" max_pos <- max(max_pos$end[max_pos[,1]==\"${chrom}\"])\n",
" region_of_interest <- data.frame(chrom = \"${chrom}\", start = 0, end = max_pos+1)\n",
" snp_map <- get_ref_variant_info(\"${ld_meta_data}\", region_of_interest)\n",
" ld_variants <- paste0(\"chr\", snp_map$id)\n",
" ld_variants <- ifelse(grepl(\"^chr\", snp_map$id), snp_map$id, paste0(\"chr\", snp_map$id))\n",
" ld_variants <- ld_variants[!duplicated(ld_variants)]\n",
"\n",
" regions_data <- get_ctwas_meta_data(\"${ld_meta_data}\")\n",
Expand Down Expand Up @@ -1017,6 +1018,10 @@
" z_snp[[study]] <- harmonize_gwas(gwas_files[study], query_region=paste0(readr::parse_number(\"${chrom}\"), \":0-\", max_pos+1), ld_variants, c(\"beta\", \"z\"), match_min_prop = 0)\n",
" z_snp[[study]] <- z_snp[[study]][, colnames(z_snp[[study]])[colnames(z_snp[[study]]) %in% c(\"chrom\", \"pos\", \"variant_id\", \"A1\", \"A2\", \"z\", \"beta\", \"n_sample\",\"n_case\", \"n_control\", \"effect_allele_frequency\")]]\n",
" z_snp[[study]] <- z_snp[[study]][, c(\"chrom\", \"pos\", colnames(z_snp[[study]])[!colnames(z_snp[[study]]) %in% c(\"chrom\", \"pos\")])]\n",
" z_snp[['sample_size']][[study]]<- as.integer(unique(na.omit(gwas_meta_data$sample_size[gwas_meta_data$study_id==study])))\n",
" if(length(z_snp[['sample_size']][[study]]!=1) | z_snp[['sample_size']][[study]] <=0 ) {\n",
" stop(\"Please check sample size provided for \", study, \" at --gwas_meta_data. \")\n",
" }\n",
" }\n",
" # if weight variants are trimmed, load LD and re-compute twas z score \n",
" if (${twas_weight_cutoff}!=0 | ${cs_min_cor}!=0 | ${min_pip_cutoff}!=0 | ${max_num_variants}!=Inf ) {\n",
Expand Down Expand Up @@ -1085,7 +1090,8 @@
" saveRDS(boundary_genes, boundary_gene_file, compress='xz') \n",
" }\n",
" saveRDS(weight_list[[study]], file.path(outputdir, paste0(name,\".ctwas_weights.\", study, \".${chrom}.rds\")), compress='xz')\n",
" saveRDS(list(z_snp=z_snp[[study]], z_gene=z_gene[[study]]), file.path(outputdir, paste0(name, \".z_gene_snp.\", study, \".${chrom}.rds\")), compress='xz')\n",
" saveRDS(list(z_snp=z_snp[[study]], z_gene=z_gene[[study]], gwas_n=z_snp[['sample_size']][[study]]), \n",
" file.path(outputdir, paste0(name, \".z_gene_snp.\", study, \".${chrom}.rds\")), compress='xz')\n",
" weight_list[[study]] <- NULL\n",
" z_gene[[study]] <- NULL\n",
" z_snp[[study]] <- NULL\n",
Expand Down Expand Up @@ -1194,6 +1200,8 @@
"parameter: multi_group = True\n",
"parameter: merge_regions=False\n",
"parameter: L=5\n",
"# p_diff_thresh: p-value cutoff for identifying problematic SNPs with significant difference between observed z-scores and estimated values.\n",
"parameter: p_diff_thresh=5e-8\n",
"# sum of gene PIPs in the region should be larger than this threshold to get selected for finemapping\n",
"parameter: min_nonSNP_PIP=0.5\n",
"# additional name specification for fine-mapping result file names \n",
Expand Down Expand Up @@ -1340,8 +1348,8 @@
" ld_diag <- diagnose_LD_mismatch_susie(region_ids = gsub(\"chr\", \"\", \"${region_name}\"), \n",
" z_snp = z_snp,\n",
" LD_map = LD_map, \n",
" gwas_n = nrow(z_snp),\n",
" p_diff_thresh = 5e-8,\n",
" gwas_n = z_snp_list$gwas_n, # gwas sample size \n",
" p_diff_thresh = ${p_diff_thresh},\n",
" ncore = ${numThreads},\n",
" LD_format = \"custom\",\n",
" LD_loader_fun = ctwas_ld_loader,\n",
Expand Down Expand Up @@ -1613,4 +1621,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}