StatFunGen · gaow · Apr 17, 2026 · Apr 14, 2026
diff --git a/code/pecotmr_integration/twas_ctwas.ipynb b/code/pecotmr_integration/twas_ctwas.ipynb
@@ -96,7 +96,7 @@
     "- **Essential columns**: `#chr`, `start`, `end`, `TSS`, `region_id`, `original_data`\n",
     "- **Structure of the weight database**: \n",
     "  - RDS format.\n",
-    "  - Organized hierarchically: region \u2192 context \u2192 weight matrix.\n",
+    "  - Organized hierarchically: region → context → weight matrix.\n",
     "  - Each column represents a different method.\n",
     "\n",
     "eg: `xqtl_meta.tsv`\n",
@@ -151,8 +151,8 @@
     "- `is_imputable`: status for wether this gene-context pair has  cross-validated performance with r-square > 0.01 and pvalue < 0.05 in at least one method for expression level prediction.\n",
     "- `rsq_cv`: cross validation test of r-square for a method.\n",
     "- `pval_cv`: cross-validation test of predictive performance p-value for a method.\n",
-    "- `is_selected_method`: status of being best performing method (for each gene\u2013context pair), we pick the best model with highest cross validation r-square with cross validation pvalue < 0.05\n",
-    "- `block`: The LD region where the gene\u2019s transcription start site (TSS) is located, based on [predefined LD blocks](https://github.com/cumc/xqtl-data/blob/main/descriptor/reference_data/ld_reference.md). \n",
+    "- `is_selected_method`: status of being best performing method (for each gene–context pair), we pick the best model with highest cross validation r-square with cross validation pvalue < 0.05\n",
+    "- `block`: The LD region where the gene’s transcription start site (TSS) is located, based on [predefined LD blocks](https://github.com/cumc/xqtl-data/blob/main/descriptor/reference_data/ld_reference.md). \n",
     "\n",
     "If `twas_z` is `NA` it means the context is not imputable for the method of choice\n",
     "\n",
@@ -773,7 +773,7 @@
     "            }\n",
     "        )\n",
     "    \n",
-    "        # res is NULL \u2192 skip safely\n",
+    "        # res is NULL → skip safely\n",
     "        if (is.null(res) || is.null(res$twas_result)) {\n",
     "            message(paste(\"Batch\", batch, \"returned no results\"))\n",
     "            twas_results_db[[batch]] <- NULL\n",
@@ -926,6 +926,7 @@
     "    \n",
     "    # load genome-wide data across all regions\n",
     "    gwas_meta_data <- fread(\"${gwas_meta_data}\",data.table=FALSE)\n",
+    "    if(!'sample_size' %in% colnames(gwas_meta_data)) stop('Please add column `sample_size` information for each GWAS study in --gwas_meta_data. ')\n",
     "    gwas_studies <- if (length(${gwas_study})!=0) ${gwas_study} else unique(gwas_meta_data$study_id)\n",
     "    gwas_files <- gwas_meta_data[gwas_meta_data$chrom == readr::parse_number(\"${chrom}\") & gwas_meta_data$study_id %in% gwas_studies,, drop=FALSE]\n",
     "    gwas_files <- setNames(gwas_files$file_path, gwas_files$study_id)\n",
@@ -977,7 +978,7 @@
     "    max_pos <- max(max_pos$end[max_pos[,1]==\"${chrom}\"])\n",
     "    region_of_interest <- data.frame(chrom = \"${chrom}\", start = 0, end = max_pos+1)\n",
     "    snp_map <- get_ref_variant_info(\"${ld_meta_data}\", region_of_interest)\n",
-    "    ld_variants <- paste0(\"chr\", snp_map$id)\n",
+    "    ld_variants <- ifelse(grepl(\"^chr\", snp_map$id), snp_map$id, paste0(\"chr\", snp_map$id))\n",
     "    ld_variants <- ld_variants[!duplicated(ld_variants)]\n",
     "\n",
     "    regions_data <- get_ctwas_meta_data(\"${ld_meta_data}\")\n",
@@ -1017,6 +1018,10 @@
     "        z_snp[[study]] <- harmonize_gwas(gwas_files[study], query_region=paste0(readr::parse_number(\"${chrom}\"), \":0-\", max_pos+1), ld_variants, c(\"beta\", \"z\"), match_min_prop = 0)\n",
     "        z_snp[[study]] <- z_snp[[study]][, colnames(z_snp[[study]])[colnames(z_snp[[study]]) %in% c(\"chrom\", \"pos\", \"variant_id\", \"A1\", \"A2\", \"z\", \"beta\", \"n_sample\",\"n_case\", \"n_control\", \"effect_allele_frequency\")]]\n",
     "        z_snp[[study]] <- z_snp[[study]][, c(\"chrom\", \"pos\", colnames(z_snp[[study]])[!colnames(z_snp[[study]]) %in% c(\"chrom\", \"pos\")])]\n",
+    "        z_snp[['sample_size']][[study]]<- as.integer(unique(na.omit(gwas_meta_data$sample_size[gwas_meta_data$study_id==study])))\n",
+    "        if(length(z_snp[['sample_size']][[study]]!=1) | z_snp[['sample_size']][[study]] <=0 ) {\n",
+    "            stop(\"Please check sample size provided for \", study, \" at --gwas_meta_data. \")\n",
+    "        }\n",
     "    }\n",
     "    # if weight variants are trimmed, load LD and re-compute twas z score \n",
     "    if (${twas_weight_cutoff}!=0 | ${cs_min_cor}!=0 | ${min_pip_cutoff}!=0 | ${max_num_variants}!=Inf ) {\n",
@@ -1085,7 +1090,8 @@
     "          saveRDS(boundary_genes, boundary_gene_file, compress='xz')   \n",
     "      }\n",
     "      saveRDS(weight_list[[study]], file.path(outputdir, paste0(name,\".ctwas_weights.\", study, \".${chrom}.rds\")), compress='xz')\n",
-    "      saveRDS(list(z_snp=z_snp[[study]], z_gene=z_gene[[study]]), file.path(outputdir,  paste0(name, \".z_gene_snp.\", study, \".${chrom}.rds\")), compress='xz')\n",
+    "      saveRDS(list(z_snp=z_snp[[study]], z_gene=z_gene[[study]], gwas_n=z_snp[['sample_size']][[study]]), \n",
+    "              file.path(outputdir,  paste0(name, \".z_gene_snp.\", study, \".${chrom}.rds\")), compress='xz')\n",
     "      weight_list[[study]] <- NULL\n",
     "      z_gene[[study]] <- NULL\n",
     "      z_snp[[study]] <- NULL\n",
@@ -1194,6 +1200,8 @@
     "parameter: multi_group = True\n",
     "parameter: merge_regions=False\n",
     "parameter: L=5\n",
+    "# p_diff_thresh: p-value cutoff for identifying problematic SNPs with significant difference between observed z-scores and estimated values.\n",
+    "parameter: p_diff_thresh=5e-8\n",
     "# sum of gene PIPs in the region should be larger than this threshold to get selected for finemapping\n",
     "parameter: min_nonSNP_PIP=0.5\n",
     "# additional name specification for fine-mapping result file names \n",
@@ -1340,8 +1348,8 @@
     "        ld_diag <- diagnose_LD_mismatch_susie(region_ids = gsub(\"chr\", \"\", \"${region_name}\"), \n",
     "                                  z_snp = z_snp,\n",
     "                                  LD_map = LD_map, \n",
-    "                                  gwas_n = nrow(z_snp),\n",
-    "                                  p_diff_thresh = 5e-8,\n",
+    "                                  gwas_n = z_snp_list$gwas_n, # gwas sample size \n",
+    "                                  p_diff_thresh = ${p_diff_thresh},\n",
     "                                  ncore = ${numThreads},\n",
     "                                  LD_format = \"custom\",\n",
     "                                  LD_loader_fun = ctwas_ld_loader,\n",
@@ -1613,4 +1621,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}