diff --git a/Scripts/pgs_methods/dbslmm.R b/Scripts/pgs_methods/dbslmm.R index 2df25cc0..7943d4e7 100644 --- a/Scripts/pgs_methods/dbslmm.R +++ b/Scripts/pgs_methods/dbslmm.R @@ -167,10 +167,10 @@ if(any(ldsc_h2*opt$h2f > 1)){ # Save in plink1 format for DBSLMM if(!is.null(opt$ref_keep)){ log_add(log_file = log_file, message = 'ref_keep used to subset reference genotype data.') - plink_subset(pfile = opt$ref_plink_chr, make_bed = T, out = paste0(tmp_dir,'/ref_subset_chr'), plink2 = opt$plink2, chr = CHROMS, keep = opt$ref_keep, memory = opt$memory) + plink_subset(pfile = opt$ref_plink_chr, make_bed = T, out = paste0(tmp_dir,'/ref_subset_chr'), extract = gwas$SNP, plink2 = opt$plink2, chr = CHROMS, keep = opt$ref_keep, memory = opt$memory) opt$ref_plink_chr_subset<-paste0(tmp_dir,'/ref_subset_chr') } else { - plink_subset(pfile = opt$ref_plink_chr, make_bed = T, out = paste0(tmp_dir,'/ref_subset_chr'), plink2 = opt$plink2, chr = CHROMS, memory = opt$memory) + plink_subset(pfile = opt$ref_plink_chr, make_bed = T, out = paste0(tmp_dir,'/ref_subset_chr'), extract = gwas$SNP, plink2 = opt$plink2, chr = CHROMS, memory = opt$memory) opt$ref_plink_chr_subset<-paste0(tmp_dir,'/ref_subset_chr') } diff --git a/Scripts/pgs_methods/lassosum.R b/Scripts/pgs_methods/lassosum.R index 99062bc7..dfcc7c51 100644 --- a/Scripts/pgs_methods/lassosum.R +++ b/Scripts/pgs_methods/lassosum.R @@ -18,8 +18,10 @@ option_list = list( help="Path PLINK v2 software binary [optional]"), make_option("--output", action="store", default=NULL, type='character', help="Path for output files [required]"), - make_option("--n_cores", action="store", default=1, type='numeric', - help="Number of cores to use [optional]"), + make_option("--n_cores", action="store", default=1, type='numeric', + help="Number of cores to use [optional]"), + make_option("--pseudo_only", action="store", default=F, type='logical', + help="Logical indicating whether only pseudovalidated model should be output [optional]"), make_option("--test", action="store", default=NA, type='character', help="Specify number of SNPs to include [optional]"), make_option("--sumstats", action="store", default=NULL, type='character', @@ -143,7 +145,7 @@ out <- lassosum.pipeline( # Change working directory back to the original setwd(orig_wd) -# Write out a score file +# Format score file score_file <- data.table(SNP = gwas$SNP[out$sumstats$order], out$sumstats[c('A1', 'A2')]) for(i in 1:length(out$s)){ @@ -154,21 +156,6 @@ for(i in 1:length(out$s)){ } } -# Flip effects to match reference alleles -ref <- read_pvar(opt$ref_plink_chr, chr = CHROMS)[, c('SNP','A1','A2'), with=F] -score_new <- map_score(ref = ref, score = score_file) - -# Reduce number of significant figures to save space -score_new[, (4:ncol(score_new)) := lapply(.SD, signif, digits = 7), .SDcols = 4:ncol(score_new)] - -fwrite(score_new, paste0(opt$output,'.score'), col.names=T, sep=' ', quote=F) - -if(file.exists(paste0(opt$output,'.score.gz'))){ - system(paste0('rm ',opt$output,'.score.gz')) -} - -system(paste0('gzip ',opt$output,'.score')) - ##### # Perform pseudovalidation ##### @@ -190,7 +177,26 @@ log_add(log_file = log_file, message = c( paste0('s = ', out2$s), paste0('lambda = ', out2$lambda), paste0('value = ', v$validation.table$value[v$validation.table$lambda == v$best.lambda & v$validation.table$s == v$best.s]) - )) +)) + +if(opt$pseudo_only){ + score_file <- score_file[, c('SNP','A1','A2',paste0('SCORE_s', out2$s, '_lambda', out2$lambda)), with=F] +} + +# Flip effects to match reference alleles +ref <- read_pvar(opt$ref_plink_chr, chr = CHROMS)[, c('SNP','A1','A2'), with=F] +score_new <- map_score(ref = ref, score = score_file) + +# Reduce number of significant figures to save space +score_new[, (4:ncol(score_new)) := lapply(.SD, signif, digits = 7), .SDcols = 4:ncol(score_new)] + +fwrite(score_new, paste0(opt$output,'.score'), col.names=T, sep=' ', quote=F) + +if(file.exists(paste0(opt$output,'.score.gz'))){ + system(paste0('rm ',opt$output,'.score.gz')) +} + +system(paste0('gzip ',opt$output,'.score')) # Record end time of test if(!is.na(opt$test)){ diff --git a/Scripts/pgs_methods/lassosum2.R b/Scripts/pgs_methods/lassosum2.R new file mode 100644 index 00000000..2c036b44 --- /dev/null +++ b/Scripts/pgs_methods/lassosum2.R @@ -0,0 +1,249 @@ +#!/usr/bin/Rscript +# This script was written by Oliver Pain whilst at King's College London University. +start.time <- Sys.time() +library("optparse") + +option_list = list( + make_option("--ref_plink_chr", action="store", default=NULL, type='character', + help="Path to per chromosome reference PLINK files [required]"), + make_option("--ref_pcs", action="store", default=NULL, type='character', + help="Reference PCs for continuous ancestry correction [optional]"), + make_option("--ldpred2_ref_dir", action="store", default=NULL, type='character', + help="Path to directory containing LDpred2 reference data [required]"), + make_option("--pop_data", action="store", default=NULL, type='character', + help="File containing the population code and location of the keep file [required]"), + make_option("--plink2", action="store", default='plink2', type='character', + help="Path PLINK v2 software binary [optional]"), + make_option("--output", action="store", default=NULL, type='character', + help="Path for output files [required]"), + make_option("--n_cores", action="store", default=1, type='numeric', + help="Number of cores for parallel computing [optional]"), + make_option("--sample_prev", action="store", default=NULL, type='numeric', + help="Sampling ratio in GWAS [optional]"), + make_option("--test", action="store", default=NA, type='character', + help="Specify number of SNPs to include [optional]"), + make_option("--binary", action="store", default=F, type='logical', + help="Specify T if GWAS phenotyp is binary [optional]"), + make_option("--seed", action="store", default=1, type='numeric', + help="Set seed to ensure reproducibility [optional]"), + make_option("--sumstats", action="store", default=NULL, type='character', + help="GWAS summary statistics [required]") +) + +opt = parse_args(OptionParser(option_list = option_list)) + +# Load dependencies +library(GenoUtils) +library(data.table) +source('../functions/misc.R') +source_all('../functions') +library(bigsnpr) +library(ggplot2) + +# Check required inputs +if(is.null(opt$ref_plink_chr)){ + stop('--ref_plink_chr must be specified.\n') +} +if(is.null(opt$ldpred2_ref_dir)){ + stop('--ldpred2_ref_dir must be specified.\n') +} +if(is.null(opt$sumstats)){ + stop('--sumstats must be specified.\n') +} +if(is.null(opt$pop_data)){ + stop('--pop_data must be specified.\n') +} +if(is.null(opt$output)){ + stop('--output must be specified.\n') +} + +# Create output directory +opt$output_dir <- paste0(dirname(opt$output),'/') +system(paste0('mkdir -p ',opt$output_dir)) + +# Create temp directory +tmp_dir<-tempdir() + +# Initiate log file +log_file <- paste0(opt$output,'.log') +log_header(log_file = log_file, opt = opt, script = 'lassosum2.R', start.time = start.time) + +# If testing, change CHROMS to chr value +if(!is.na(opt$test) && opt$test == 'NA'){ + opt$test<-NA +} +if(!is.na(opt$test)){ + CHROMS <- as.numeric(gsub('chr','',opt$test)) +} + +# Format the binary parameter +if(!is.logical(opt$binary)){ + opt$binary <- ifelse(opt$binary == 'T', T, F) +} + +if(opt$binary & is.null(opt$sample_prev)){ + stop('--sample_prev must be specified when --binary T.\n') +} + +##### +# Read in sumstats +##### + +log_add(log_file = log_file, message = 'Reading in GWAS.') + +# Read in, check and format GWAS summary statistics +sumstats <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('CHR','SNP','BP','A1','A2','BETA','SE','N','P')) + +# Update header for bigsnpr +names(sumstats)<-c('chr','rsid','pos','a1','a0','beta','beta_se','n_eff','p') + +# In binary, update N to be effective N based on opt$sample_prev +if(opt$binary){ + ncas<-sumstats$n_eff*opt$sample_prev + ncon<-sumstats$n_eff*(1-opt$sample_prev) + sumstats$n_eff<-4 / (1/ncas + 1/ncon) + log_add(log_file = log_file, message = paste0('Median effective N = ', median(sumstats$n_eff))) +} + +# Record start time for test +if(!is.na(opt$test)){ + test_start.time <- test_start(log_file = log_file) +} + +# Harmonise with the LDpred2 reference +map<-readRDS(paste0(opt$ldpred2_ref_dir, '/map.rds')) +names(map)[names(map) == 'af_UKBB']<-'af' +map<-map[, c('chr', 'pos', 'a0', 'a1', 'af', 'ld')] +info_snp <- snp_match(sumstats, map) + +##### +# Perform additional suggested QC for LDpred2 +##### + +# Remove SDss < 0.5 * SDval or SDss > 0.1 + SDval or SDss < 0.1 or SDval < 0.05 +sd_val <- with(info_snp, sqrt(2 * af * (1 - af))) + +if(opt$binary == F){ + sd_y_est = median(sd_val * info_snp$beta_se * sqrt(info_snp$n_eff)) + sd_ss = with(info_snp, sd_y_est / sqrt(n_eff * beta_se^2)) +} else { + sd_ss <- with(info_snp, 2 / sqrt(n_eff * beta_se^2)) +} + +is_bad <-sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 + +png(paste0(opt$output_dir,'/LDpred2_sd_qc.png'), res=300, unit='px',height=2000, width=2000) + plot_obj <- qplot(sd_val, sd_ss, color = is_bad) + + theme_bigstatsr() + + coord_equal() + + scale_color_viridis_d(direction = -1) + + geom_abline(linetype = 2, color = "red") + + labs(x = "Standard deviations in the validation set", + y = "Standard deviations derived from the summary statistics", + color = "Removed?") + print(plot_obj) +dev.off() + +log_add(log_file = log_file, message = paste0('Sumstats contains ', nrow(info_snp[!is_bad, ]),' after additional genotype SD check.')) + +sumstats<-info_snp[!is_bad, ] + +# If more than half the variants have the wrong SD then the N is probably inaccurate +# Recompute N based on BETA and SE +if(sum(is_bad) > (length(is_bad)*0.5)){ + log_add(log_file = log_file, message = paste0('>50% of variants had a discordant SD.')) + stop('>50% of variants had a discordant SD. Check the sample size in the sumstats.') +} + +##### +# Prepare LD reference data +##### + +log_add(log_file = log_file, message = 'Creating genome-wide sparse matrix.') + +# Create genome-wide sparse LD matrix +for (chr in CHROMS) { + ## indices in 'sumstats' + ind.chr <- which(sumstats$chr == chr) + ## indices in 'map' + ind.chr2 <- sumstats$`_NUM_ID_`[ind.chr] + ## indices in 'corr_chr' + ind.chr3 <- match(ind.chr2, which(map$chr == chr)) + + corr0 <- readRDS(paste0(opt$ldpred2_ref_dir, '/LD_with_blocks_chr', chr, '.rds'))[ind.chr3, ind.chr3] + + if (chr == CHROMS[1]) { + corr <- as_SFBM(corr0, paste0(tmp_dir, '/LD_GW_sparse'), compact = TRUE) + } else { + corr$add_columns(corr0, nrow(corr)) + } +} + +##### +# Run lassosum2 +##### + +log_add(log_file = log_file, message = 'Running lassosum2.') + +# Set seed to ensure reproducibility +set.seed(opt$seed) + +# lassosum2 +beta_df <- snp_lassosum2( + corr, + sumstats, + ncores = opt$n_cores +) + +#### +# Create score file +#### + +# Convert matrix to data.table with column names +beta_dt <- as.data.table(beta_df) +grid <- attr(beta_df, "grid_param") +new_names <- paste0("s", grid$delta, "_lambda", grid$lambda) +setnames(beta_dt, new_names) + +betas <- data.table(SNP=sumstats$rsid, A1=sumstats$a1, A2=sumstats$a0, beta_dt) + +rem<-NULL +for(i in 4:length(names(betas))){ + if(is.infinite(sum(betas[[names(betas)[i]]])) | is.na(sum(betas[[names(betas)[i]]]))){ + log_add(log_file = log_file, message = paste0('Skipping ',names(betas)[i],' due to presence of non-finite values.')) + rem<-c(rem,i) + } +} + +if(is.null(rem) == F){ + betas<-betas[, -rem, with=F] +} + +names(betas)[-1:-3] <- paste0('SCORE_', names(betas)[-1:-3]) + +# Flip effects to match reference alleles +ref <- read_pvar(opt$ref_plink_chr, chr = CHROMS)[, c('SNP','A1','A2'), with=F] +score_new <- map_score(ref = ref, score = betas) + +# Reduce number of significant figures to save space +score_new[, (4:ncol(score_new)) := lapply(.SD, signif, digits = 7), .SDcols = 4:ncol(score_new)] + +fwrite(score_new, paste0(opt$output,'.score'), col.names=T, sep=' ', quote=F) + +if(file.exists(paste0(opt$output,'.score.gz'))){ + system(paste0('rm ',opt$output,'.score.gz')) +} + +system(paste0('gzip ',opt$output,'.score')) + +# Record end time of test +if(!is.na(opt$test)){ + test_finish(log_file = log_file, test_start.time = test_start.time) +} + +end.time <- Sys.time() +time.taken <- end.time - start.time +sink(file = log_file, append = T) +cat('Analysis finished at', as.character(end.time),'\n') +cat('Analysis duration was', as.character(round(time.taken,2)), attr(time.taken, 'units'), '\n') +sink() diff --git a/Scripts/pgs_methods/ldpred2.R b/Scripts/pgs_methods/ldpred2.R index e57ef353..e168078d 100644 --- a/Scripts/pgs_methods/ldpred2.R +++ b/Scripts/pgs_methods/ldpred2.R @@ -105,6 +105,7 @@ log_add(log_file = log_file, message = 'Reading in GWAS.') # Read in, check and format GWAS summary statistics sumstats <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('CHR','SNP','BP','A1','A2','BETA','SE','N','P')) +GWAS_CHROMS<-unique(sumstats$CHR) # Update header for bigsnpr names(sumstats)<-c('chr','rsid','pos','a1','a0','beta','beta_se','n_eff','p') @@ -183,7 +184,7 @@ if(ldsc[["h2"]] < 0.05){ log_add(log_file = log_file, message = 'Creating genome-wide sparse matrix.') # Create genome-wide sparse LD matrix -for (chr in CHROMS) { +for (chr in GWAS_CHROMS) { ## indices in 'sumstats' ind.chr <- which(sumstats$chr == chr) ## indices in 'map' diff --git a/Scripts/pgs_methods/megaprs.R b/Scripts/pgs_methods/megaprs.R index 2cb84590..7539e6a6 100644 --- a/Scripts/pgs_methods/megaprs.R +++ b/Scripts/pgs_methods/megaprs.R @@ -30,6 +30,8 @@ option_list = list( help="Path to ldak tagging data [required]"), make_option("--ldak_highld", action="store", default=NULL, type='character', help="Path to ldak highld data [required]"), + make_option("--pseudo_only", action="store", default=F, type='logical', + help="Logical indicating whether only pseudovalidated model should be output [optional]"), make_option("--n_cores", action="store", default=1, type='numeric', help="Number of cores for parallel computing [optional]"), make_option("--prs_model", action="store", default='mega', type='character', @@ -99,9 +101,10 @@ log_add(log_file = log_file, message = 'Reading in GWAS.') # Read in, check and format GWAS summary statistics gwas <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('CHR','BP','SNP','A1','A2','BETA','SE','N','FREQ','REF.FREQ')) +GWAS_CHROMS<-unique(gwas$CHR) # Check allele frequency difference -ref_psam<-fread(paste0(opt$ref_plink_chr, CHROMS[1],'.psam')) +ref_psam<-fread(paste0(opt$ref_plink_chr, GWAS_CHROMS[1],'.psam')) names(ref_psam)<-gsub('\\#', '', names(ref_psam)) if(!is.null(opt$ref_keep)){ @@ -131,7 +134,7 @@ fwrite(gwas, paste0(tmp_dir,'/GWAS_sumstats_temp.txt'), sep=' ') log_add(log_file = log_file, message = 'Merging per chromosome reference data.') # Save in plink1 format for MegaPRS -plink_merge(pfile = opt$ref_plink_chr, chr = CHROMS, plink2 = opt$plink2, keep = opt$ref_keep, extract = snplist, make_bed =T, out = paste0(tmp_dir, '/ref_merge')) +plink_merge(pfile = opt$ref_plink_chr, chr = GWAS_CHROMS, plink2 = opt$plink2, keep = opt$ref_keep, extract = snplist, make_bed =T, out = paste0(tmp_dir, '/ref_merge')) # Record start time for test if(!is.na(opt$test)){ @@ -168,10 +171,10 @@ system(paste0('cp ', opt$ldak_tag, '/* ', tmp_dir, '/bld/')) system(paste0('mv ', tmp_dir, '/sections/weights.short ', tmp_dir,'/bld/bld65')) # Calculate taggings -if(length(CHROMS) != 1){ +if(length(GWAS_CHROMS) != 1){ system(paste0(opt$ldak, ' --calc-tagging ', tmp_dir, '/bld.ldak --bfile ', tmp_dir, '/ref_merge --ignore-weights YES --power -.25 --annotation-number 65 --annotation-prefix ', tmp_dir, '/bld/bld --window-cm 1 --save-matrix YES --max-threads ', opt$n_cores)) } else { - system(paste0(opt$ldak, ' --calc-tagging ', tmp_dir, '/bld.ldak --bfile ', tmp_dir, '/ref_merge --ignore-weights YES --power -.25 --annotation-number 65 --annotation-prefix ', tmp_dir, '/bld/bld --window-cm 1 --chr ', CHROMS, ' --save-matrix YES --max-threads ', opt$n_cores)) + system(paste0(opt$ldak, ' --calc-tagging ', tmp_dir, '/bld.ldak --bfile ', tmp_dir, '/ref_merge --ignore-weights YES --power -.25 --annotation-number 65 --annotation-prefix ', tmp_dir, '/bld/bld --window-cm 1 --chr ', GWAS_CHROMS, ' --save-matrix YES --max-threads ', opt$n_cores)) } # Calculate Per-Predictor Heritabilities. @@ -192,7 +195,7 @@ log_add(log_file = log_file, message = 'Running using full reference.') # Calculate predictor-predictor correlations log_add(log_file = log_file, message = 'Calculating predictor-predictor correlations.') -full_cors <- ldak_pred_cor(bfile = paste0(tmp_dir, '/ref_merge'), ldak = opt$ldak, n_cores = opt$n_cores, chr = CHROMS) +full_cors <- ldak_pred_cor(bfile = paste0(tmp_dir, '/ref_merge'), ldak = opt$ldak, n_cores = opt$n_cores, chr = GWAS_CHROMS) # Run MegaPRS log_add(log_file = log_file, message = paste0('Running MegaPRS: ',opt$prs_model,' model.')) @@ -217,7 +220,7 @@ system(paste0(opt$ldak, ' --pseudo-summaries ', tmp_dir, '/GWAS_sumstats_temp.ps # Calculate predictor-predictor correlations log_add(log_file = log_file, message = 'Calculating predictor-predictor correlations.') -subset_cors <- ldak_pred_cor(bfile = paste0(tmp_dir, '/ref_merge'), keep = paste0(tmp_dir, '/keepb'), ldak = opt$ldak, n_cores = opt$n_cores, chr = CHROMS) +subset_cors <- ldak_pred_cor(bfile = paste0(tmp_dir, '/ref_merge'), keep = paste0(tmp_dir, '/keepb'), ldak = opt$ldak, n_cores = opt$n_cores, chr = GWAS_CHROMS) # Run megaPRS log_add(log_file = log_file, message = paste0('Running MegaPRS: ',opt$prs_model,' model.')) @@ -255,6 +258,13 @@ ref_pvar <- read_pvar(dat = opt$ref_plink_chr, chr = CHROMS) ref_pvar$Predictor<-paste0(ref_pvar$CHR,':',ref_pvar$BP) score<-merge(score, ref_pvar[,c('Predictor','SNP'), with=F], by='Predictor') score<-score[, c('SNP', 'A1', 'A2', names(score)[grepl('Model', names(score))]), with=F] + +print(head(score)) + +if(opt$pseudo_only){ + score <- score[,c('SNP','A1','A2', paste0('Model', gsub('Score_','',best_score$V1[1]))), with = F] +} + names(score)[grepl('Model', names(score))]<-paste0('SCORE_ldak_',names(score)[grepl('Model', names(score))]) # Flip effects to match reference alleles diff --git a/Scripts/pgs_methods/prscs.R b/Scripts/pgs_methods/prscs.R index c77bccdf..62c3f44b 100644 --- a/Scripts/pgs_methods/prscs.R +++ b/Scripts/pgs_methods/prscs.R @@ -97,14 +97,27 @@ if(!is.na(opt$test)){ log_add(log_file = log_file, message = 'Reading in GWAS.') # Read in, check and format GWAS summary statistics -gwas <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('SNP','A1','A2','BETA','SE','N')) +gwas <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('CHR','SNP','A1','A2','BETA','SE','N')) + +# Subset CHROMS object to chr present +gwas_CHROMS <- unique(gwas$CHR) +gwas$CHR <- NULL # Store average sample size gwas_N <- round(mean(gwas$N), 0) fwrite(gwas, paste0(tmp_dir, '/GWAS_sumstats_temp.txt'), sep=' ') +# Create a temporary reference bim files for PRS-CS to match to +pvar <- read_pvar(opt$ref_plink_chr, chr = CHROMS) +pvar <- pvar[pvar$SNP %in% gwas$SNP,] +pvar$POS<-0 +for(i in gwas_CHROMS){ + write.table(pvar[pvar$CHR == i, c('CHR','SNP','POS','BP','A1','A2'), with=F], paste0(tmp_dir,'/ref.chr',i,'.bim'), col.names=F, row.names=F, quote=F) +} + rm(gwas) +rm(pvar) gc() # Record start time for test @@ -116,19 +129,9 @@ if(!is.na(opt$test)){ # Process sumstats using PRSsc ##### -# Create a temporary reference bim files for PRS-CS to match to -pvar <- read_pvar(opt$ref_plink_chr, chr = CHROMS) -pvar$POS<-0 -for(i in CHROMS){ - write.table(pvar[pvar$CHR == i, c('CHR','SNP','POS','BP','A1','A2'), with=F], paste0(tmp_dir,'/ref.chr',i,'.bim'), col.names=F, row.names=F, quote=F) -} - -rm(pvar) -gc() - # Make a data.frame listing chromosome and phi combinations jobs<-NULL -for(i in CHROMS){ +for(i in gwas_CHROMS){ jobs<-rbind(jobs, data.frame(CHR=i, phi=phi_param)) } @@ -165,7 +168,7 @@ log <- foreach(i = 1:nrow(jobs), .combine = c, .options.multicore = list(presche score_all<-NULL for(phi_i in phi_param){ score_phi<-NULL - for(i in CHROMS){ + for(i in gwas_CHROMS){ score_phi_i<-fread(paste0(tmp_dir,'/_pst_eff_a1_b0.5_phi',phi_i,'_chr',i,'.txt')) score_phi<-rbind(score_phi, score_phi_i) } diff --git a/Scripts/pgs_methods/sbayesr.R b/Scripts/pgs_methods/sbayesr.R index a360775e..89432e2a 100644 --- a/Scripts/pgs_methods/sbayesr.R +++ b/Scripts/pgs_methods/sbayesr.R @@ -91,8 +91,9 @@ if(!is.na(opt$test)){ log_add(log_file = log_file, message = 'Reading in GWAS.') # Read in, check and format GWAS summary statistics -gwas <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('SNP','A1','A2','FREQ','BETA','SE','P','N')) - +gwas <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('CHR','SNP','A1','A2','FREQ','BETA','SE','P','N')) +GWAS_CHROMS<-unique(gwas$CHR) +gwas$CHR<-NULL ### # Change to COJO format ### @@ -144,7 +145,7 @@ if(per_var_N == F & opt$impute_N == T){ sbayesr_opt <- paste0(sbayesr_opt, '--impute-n ') } -error<-foreach(i = CHROMS, .combine = rbind, .options.multicore = list(preschedule = FALSE)) %dopar% { +error<-foreach(i = GWAS_CHROMS, .combine = rbind, .options.multicore = list(preschedule = FALSE)) %dopar% { log <- system(paste0(opt$gctb, ' --sbayes R --ldm ', opt$ld_matrix_chr, i, '.ldm.sparse --pi 0.95,0.02,0.02,0.01 --gamma 0.0,0.01,0.1,1 --gwas-summary ', tmp_dir, '/GWAS_sumstats_COJO.txt --chain-length 10000 ', sbayesr_opt, '--exclude-mhc --burn-in 2000 --out-freq 1000 --out ', tmp_dir, '/GWAS_sumstats_SBayesR.chr', i), intern = T) # Check whether the analysis converged @@ -173,7 +174,7 @@ if(sum(grepl('Error', error$Log) == T) > 1){ # Combine per chromosome snpRes files snpRes<-NULL -for(i in CHROMS){ +for(i in GWAS_CHROMS){ snpRes <- rbind(snpRes, fread(paste0(tmp_dir, '/GWAS_sumstats_SBayesR.chr', i, '.snpRes'))) } @@ -199,14 +200,14 @@ if(!is.na(opt$test)){ # Combine per chromosome parRes files parRes_mcmc <- list() -for(i in CHROMS){ +for(i in GWAS_CHROMS){ parRes_mcmc[[i]] <- fread(paste0(tmp_dir, '/GWAS_sumstats_SBayesR.chr', i, '.mcmcsamples.Par')) } parRes <- NULL for(par in names(parRes_mcmc[[i]])){ parRes_mcmc_par <- NULL - for(i in CHROMS){ + for(i in GWAS_CHROMS){ parRes_mcmc_par <- cbind(parRes_mcmc_par, parRes_mcmc[[i]][[par]]) } diff --git a/Scripts/pipeline_reports/indiv_report_creator.Rmd b/Scripts/pipeline_reports/indiv_report_creator.Rmd index 921e640a..3ca744d2 100644 --- a/Scripts/pipeline_reports/indiv_report_creator.Rmd +++ b/Scripts/pipeline_reports/indiv_report_creator.Rmd @@ -1,10 +1,10 @@ --- title: "GenoPred Report" params: - name: "" - id: "" - config: "" - cwd: "" + name: + id: + config: + cwd: output: html_document: toc: true @@ -63,7 +63,11 @@ gwas_groups <- read_param(config = params$config, param = 'gwas_groups') score_list <- read_param(config = params$config, param = 'score_list') # Identify PGS methods to be included -pgs_methods_list <- read_param(config = params$config, param = 'pgs_methods', return_obj = F) +if(!is.null(gwas_list)){ + pgs_methods_list <- read_param(config = params$config, param = 'pgs_methods', return_obj = F) +} else { + pgs_methods_list <- NULL +} # If testing, change CHROMS to chr value testing <- read_param(config = params$config, param = 'testing', return_obj = F) @@ -265,10 +269,6 @@ cat0("- ", ifelse(is.null(gwas_list), 0, nrow(gwas_list)), " GWAS summary statis cat0("- ", ifelse(is.null(gwas_groups), 0, nrow(gwas_groups)), " GWAS groups were specified.\n") cat0("- ", length(pgs_methods_list), " PGS methods were applied, including ", paste0(pgs_method_labels$label[pgs_method_labels$method %in% pgs_methods_list], collapse = ', '), ".\n") -if(any(gwas_list$population != 'EUR') & any(c('ldpred2','sbayesr') %in% pgs_methods_list)){ - cat0(" - **Note.** `ldpred2` and `sbayesr` are currently only implemented for GWAS of EUR populations.\n\n") -} - if(is.null(score_list)){ cat0("- No external score files were provided in score_list.\n\n") } else { @@ -489,7 +489,7 @@ cat0("## Target Polygenic Profile {.tabset .tabset-fade} \n\n") # Read in PGS # Exclude PGS from multi-source methods as no estimate of R is available single_source_methods <- pgs_methods_list[!(pgs_methods_list %in% pgs_group_methods) & !(grepl('_multi|tlprs_', pgs_methods_list))] -pgs <- read_pgs(config = params$config, name = params$name, pop = 'TRANS', pseudo_only=T, pgs_method = single_source_methods)[[1]] +pgs <- read_pgs(config = params$config, name = params$name, pop = 'TRANS', pseudo_only=T, pgs_methods = single_source_methods)[[1]] # Structure PGS for target individual pgs_dat <- NULL diff --git a/Scripts/pipeline_reports/samp_report_creator.Rmd b/Scripts/pipeline_reports/samp_report_creator.Rmd index 62450640..a9be7111 100644 --- a/Scripts/pipeline_reports/samp_report_creator.Rmd +++ b/Scripts/pipeline_reports/samp_report_creator.Rmd @@ -181,10 +181,6 @@ cat0("- ", ifelse(is.null(gwas_list), 0, nrow(gwas_list)), " GWAS summary statis cat0("- ", ifelse(is.null(gwas_groups), 0, nrow(gwas_groups)), " GWAS groups were specified.\n") cat0("- ", length(pgs_methods_list), " PGS methods were applied, including ", paste0(pgs_methods_list, collapse = ', '), ".\n") -if(any(gwas_list$population != 'EUR') & any(c('ldpred2','sbayesr') %in% pgs_methods_list)){ - cat0(" - **Note.** `ldpred2` and `sbayesr` are currently only implemented for GWAS of EUR populations.\n\n") -} - if(is.null(score_list)){ cat0("- No external score files were provided in score_list.\n\n") } else { diff --git a/Scripts/target_scoring/target_scoring_pipeline.R b/Scripts/target_scoring/target_scoring_pipeline.R index fd22343d..63876005 100644 --- a/Scripts/target_scoring/target_scoring_pipeline.R +++ b/Scripts/target_scoring/target_scoring_pipeline.R @@ -27,6 +27,7 @@ opt = parse_args(OptionParser(option_list=option_list)) # Load dependencies library(GenoUtils) library(data.table) +library(bigstatsr) source('../functions/misc.R') source_all('../functions') library(foreach) @@ -180,30 +181,40 @@ for(chr_i in CHROMS){ score = paste0(tmp_dir,'/all_score.txt'), keep = opt$target_keep, frq = opt$ref_freq_chr, - threads = opt$n_cores + threads = opt$n_cores, + fbm = T ) # Sum scores across chromosomes if(chr_i == CHROMS[1]){ - scores_ids <- scores_i[, 1:2, with = F] - current_scores <- as.matrix(scores_i[, -1:-2, with = FALSE]) - scores <- current_scores - } else { - current_scores <- as.matrix(scores_i[, -1:-2, with = FALSE]) - scores <- scores + current_scores + scores_ids <- scores_i$ids + cols <- scores_i$cols + + # Initialize a FBM (backed on disk) for running PGS sum + file.remove(paste0(tmp_dir, '/PGS_fbm.bk')) + scores <- FBM( + nrow = nrow(scores_ids), + ncol = length(cols), + backingfile = paste0(tmp_dir, '/PGS_fbm'), + init = 0 + ) } - - system(paste0('rm ', tmp_dir, '/all_score.txt')) - system(paste0('rm ', tmp_dir, '/row_index.txt')) - system(paste0('rm ', tmp_dir, '/map.txt')) + + # In-place addition: for each score column + for (j in cols) { + scores[, which(cols == j)] <- scores[, which(cols == j)] + scores_i$scores[,which(scores_i$cols == j)] + } + + file.remove(scores_i$scores$backingfile, + scores_i$scores$rds) rm(scores_i) - rm(current_scores) gc() } # Combine score with IDs -scores<-data.table(scores_ids, - scores) +scores <- as.data.table(matrix(scores[,], ncol = length(cols))) +setnames(scores, cols) +scores <- cbind(scores_ids, scores) ### # Scale the polygenic scores based on the reference diff --git a/docs/CrossPop.Rmd b/docs/CrossPop.Rmd index 6d4195ef..1e1a1d52 100644 --- a/docs/CrossPop.Rmd +++ b/docs/CrossPop.Rmd @@ -7065,6 +7065,13 @@ cp ~/oliverpainfel/Analyses/crosspop/plots_three_pop/average_r.png /scratch/prj/ Here we will use GWAS sumtats that were used in the original GenoPred paper. These GWAS are from a range of sources, often large meta-analyses, which can lead to greater mispecification in the sumstats, which can impact the performance of some PGS methods. This is to provide more confidence in the performance of SBayesRC and QuickPRS relative to other methods. +
+Note: I am using this opportunity to evaluate +lassosum2, which is not included in other analyses in this project.
+
+
Conda is a software environment management system that simplifies -installing and managing dependencies. We recommend using Miniforge — a -minimal conda installer that comes with Mamba, a fast drop-in -replacement for conda.
-If you don’t already have Miniforge installed, you can install it -using the commands below:
-# Download and install Miniforge (for Linux)
-wget https://github.com/conda-forge/miniforge/releases/download/24.11.3-0/Miniforge3-24.11.3-0-Linux-x86_64.sh
-bash Miniforge3-24.11.3-0-Linux-x86_64.sh
-Accept the default installation options. Once installed, you may need -to run source ~/.bashrc or restart your terminal. You should see (base) -appear at the beginning of your terminal prompt.
-Now, create the GenoPred environment using Mamba:
-mamba env create -f GenoPred/pipeline/envs/pipeline.yaml
-Activate the new genopred environment:
mamba activate genopred
-Note: If you are working on an HPC, check whether -mamba is an available module. Using the centrally installed version of -mamba may avoid issues down the road.
-Conda is a software environment management system which is great way for easily downloading and storing software. We will use conda to create an environment that the GenoPred pipeline will run in.
If you don’t already have conda installed, we will install it using miniconda.
-<<<<<<< Updated upstream -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
-sh Miniconda3-latest-Linux-x86_64.sh
-=======
# Download and install Miniforge (for Linux)
wget https://github.com/conda-forge/miniforge/releases/download/24.11.3-0/Miniforge3-24.11.3-0-Linux-x86_64.sh
bash Miniforge3-24.11.3-0-Linux-x86_64.sh
->>>>>>> Stashed changes
I would say yes to the default options. You may then
need to refresh your workspace to initiate conda, by running
source ~/.bashrc. You should see (base)
@@ -618,16 +589,9 @@
GenoPred/pipeline/envs/pipeline.yaml file. This will create
an environment called genopred with some essential packages
installed.
-<<<<<<< Updated upstream
-conda env create -f GenoPred/pipeline/envs/pipeline.yaml
-Now activate the new genopred environment.
conda activate genopred
-=======
mamba env create -f GenoPred/pipeline/envs/pipeline.yaml
Now activate the new genopred environment.
mamba activate genopred
->>>>>>> Stashed changes
->>>>>>> 999a324 (Updated docs)
configfile['ptclump','dbslmm'] |
Options are: ptclump, dbslmm,
prscs, sbayesr, lassosum,
-ldpred2, megaprs. Note.
-sbayesr and ldpred2 are only implemented for
-GWAS of EUR ancestry. |
+
testing |
@@ -1132,7 +1097,7 @@
The pipeline also implements a range of multi-source polygenic
-scoring methods, that can combine GWAS summary statistics from multiple
-populations. To use these methods an additional gwas_groups
-file must be provided, indicating which GWAS are to be jointly analysed.
-The gwas_groups file must be specified in the
-configfile using the gwas_groups parameter. An
-example of a gwas_groups file can be found here.
| Column | -Example | -Description | -
|---|---|---|
| name | -height |
-ID for the group of GWAS. Cannot contain spaces (’ ‘) -or hyphens (’-’) | -
| gwas | -yengo_eur,yengo_eas |
-Comma-seperated list of GWAS names, corresponding to
-the gwas_list. |
-
| label | -"Height (EUR+EAS)" |
-A human readable name for the group of GWAS Wrap in -double quotes if multiple words. | -
sbayesr_ldref: for SBayesRsbayesrc_ldref: for SBayesRCldpred2_ldref: for LDpred2ldpred2_ldref: for LDpred2 and
+lassosum2quickprs_ref: for QuickPRSquickprs_multi_ldref: for
LEOPARD+QuickPRS