diff --git a/Scripts/pgs_methods/dbslmm.R b/Scripts/pgs_methods/dbslmm.R index 2df25cc0..7943d4e7 100644 --- a/Scripts/pgs_methods/dbslmm.R +++ b/Scripts/pgs_methods/dbslmm.R @@ -167,10 +167,10 @@ if(any(ldsc_h2*opt$h2f > 1)){ # Save in plink1 format for DBSLMM if(!is.null(opt$ref_keep)){ log_add(log_file = log_file, message = 'ref_keep used to subset reference genotype data.') - plink_subset(pfile = opt$ref_plink_chr, make_bed = T, out = paste0(tmp_dir,'/ref_subset_chr'), plink2 = opt$plink2, chr = CHROMS, keep = opt$ref_keep, memory = opt$memory) + plink_subset(pfile = opt$ref_plink_chr, make_bed = T, out = paste0(tmp_dir,'/ref_subset_chr'), extract = gwas$SNP, plink2 = opt$plink2, chr = CHROMS, keep = opt$ref_keep, memory = opt$memory) opt$ref_plink_chr_subset<-paste0(tmp_dir,'/ref_subset_chr') } else { - plink_subset(pfile = opt$ref_plink_chr, make_bed = T, out = paste0(tmp_dir,'/ref_subset_chr'), plink2 = opt$plink2, chr = CHROMS, memory = opt$memory) + plink_subset(pfile = opt$ref_plink_chr, make_bed = T, out = paste0(tmp_dir,'/ref_subset_chr'), extract = gwas$SNP, plink2 = opt$plink2, chr = CHROMS, memory = opt$memory) opt$ref_plink_chr_subset<-paste0(tmp_dir,'/ref_subset_chr') } diff --git a/Scripts/pgs_methods/lassosum.R b/Scripts/pgs_methods/lassosum.R index 99062bc7..dfcc7c51 100644 --- a/Scripts/pgs_methods/lassosum.R +++ b/Scripts/pgs_methods/lassosum.R @@ -18,8 +18,10 @@ option_list = list( help="Path PLINK v2 software binary [optional]"), make_option("--output", action="store", default=NULL, type='character', help="Path for output files [required]"), - make_option("--n_cores", action="store", default=1, type='numeric', - help="Number of cores to use [optional]"), + make_option("--n_cores", action="store", default=1, type='numeric', + help="Number of cores to use [optional]"), + make_option("--pseudo_only", action="store", default=F, type='logical', + help="Logical indicating whether only pseudovalidated model should be output [optional]"), make_option("--test", action="store", default=NA, type='character', help="Specify number of SNPs to include [optional]"), make_option("--sumstats", action="store", default=NULL, type='character', @@ -143,7 +145,7 @@ out <- lassosum.pipeline( # Change working directory back to the original setwd(orig_wd) -# Write out a score file +# Format score file score_file <- data.table(SNP = gwas$SNP[out$sumstats$order], out$sumstats[c('A1', 'A2')]) for(i in 1:length(out$s)){ @@ -154,21 +156,6 @@ for(i in 1:length(out$s)){ } } -# Flip effects to match reference alleles -ref <- read_pvar(opt$ref_plink_chr, chr = CHROMS)[, c('SNP','A1','A2'), with=F] -score_new <- map_score(ref = ref, score = score_file) - -# Reduce number of significant figures to save space -score_new[, (4:ncol(score_new)) := lapply(.SD, signif, digits = 7), .SDcols = 4:ncol(score_new)] - -fwrite(score_new, paste0(opt$output,'.score'), col.names=T, sep=' ', quote=F) - -if(file.exists(paste0(opt$output,'.score.gz'))){ - system(paste0('rm ',opt$output,'.score.gz')) -} - -system(paste0('gzip ',opt$output,'.score')) - ##### # Perform pseudovalidation ##### @@ -190,7 +177,26 @@ log_add(log_file = log_file, message = c( paste0('s = ', out2$s), paste0('lambda = ', out2$lambda), paste0('value = ', v$validation.table$value[v$validation.table$lambda == v$best.lambda & v$validation.table$s == v$best.s]) - )) +)) + +if(opt$pseudo_only){ + score_file <- score_file[, c('SNP','A1','A2',paste0('SCORE_s', out2$s, '_lambda', out2$lambda)), with=F] +} + +# Flip effects to match reference alleles +ref <- read_pvar(opt$ref_plink_chr, chr = CHROMS)[, c('SNP','A1','A2'), with=F] +score_new <- map_score(ref = ref, score = score_file) + +# Reduce number of significant figures to save space +score_new[, (4:ncol(score_new)) := lapply(.SD, signif, digits = 7), .SDcols = 4:ncol(score_new)] + +fwrite(score_new, paste0(opt$output,'.score'), col.names=T, sep=' ', quote=F) + +if(file.exists(paste0(opt$output,'.score.gz'))){ + system(paste0('rm ',opt$output,'.score.gz')) +} + +system(paste0('gzip ',opt$output,'.score')) # Record end time of test if(!is.na(opt$test)){ diff --git a/Scripts/pgs_methods/lassosum2.R b/Scripts/pgs_methods/lassosum2.R new file mode 100644 index 00000000..2c036b44 --- /dev/null +++ b/Scripts/pgs_methods/lassosum2.R @@ -0,0 +1,249 @@ +#!/usr/bin/Rscript +# This script was written by Oliver Pain whilst at King's College London University. +start.time <- Sys.time() +library("optparse") + +option_list = list( + make_option("--ref_plink_chr", action="store", default=NULL, type='character', + help="Path to per chromosome reference PLINK files [required]"), + make_option("--ref_pcs", action="store", default=NULL, type='character', + help="Reference PCs for continuous ancestry correction [optional]"), + make_option("--ldpred2_ref_dir", action="store", default=NULL, type='character', + help="Path to directory containing LDpred2 reference data [required]"), + make_option("--pop_data", action="store", default=NULL, type='character', + help="File containing the population code and location of the keep file [required]"), + make_option("--plink2", action="store", default='plink2', type='character', + help="Path PLINK v2 software binary [optional]"), + make_option("--output", action="store", default=NULL, type='character', + help="Path for output files [required]"), + make_option("--n_cores", action="store", default=1, type='numeric', + help="Number of cores for parallel computing [optional]"), + make_option("--sample_prev", action="store", default=NULL, type='numeric', + help="Sampling ratio in GWAS [optional]"), + make_option("--test", action="store", default=NA, type='character', + help="Specify number of SNPs to include [optional]"), + make_option("--binary", action="store", default=F, type='logical', + help="Specify T if GWAS phenotyp is binary [optional]"), + make_option("--seed", action="store", default=1, type='numeric', + help="Set seed to ensure reproducibility [optional]"), + make_option("--sumstats", action="store", default=NULL, type='character', + help="GWAS summary statistics [required]") +) + +opt = parse_args(OptionParser(option_list = option_list)) + +# Load dependencies +library(GenoUtils) +library(data.table) +source('../functions/misc.R') +source_all('../functions') +library(bigsnpr) +library(ggplot2) + +# Check required inputs +if(is.null(opt$ref_plink_chr)){ + stop('--ref_plink_chr must be specified.\n') +} +if(is.null(opt$ldpred2_ref_dir)){ + stop('--ldpred2_ref_dir must be specified.\n') +} +if(is.null(opt$sumstats)){ + stop('--sumstats must be specified.\n') +} +if(is.null(opt$pop_data)){ + stop('--pop_data must be specified.\n') +} +if(is.null(opt$output)){ + stop('--output must be specified.\n') +} + +# Create output directory +opt$output_dir <- paste0(dirname(opt$output),'/') +system(paste0('mkdir -p ',opt$output_dir)) + +# Create temp directory +tmp_dir<-tempdir() + +# Initiate log file +log_file <- paste0(opt$output,'.log') +log_header(log_file = log_file, opt = opt, script = 'lassosum2.R', start.time = start.time) + +# If testing, change CHROMS to chr value +if(!is.na(opt$test) && opt$test == 'NA'){ + opt$test<-NA +} +if(!is.na(opt$test)){ + CHROMS <- as.numeric(gsub('chr','',opt$test)) +} + +# Format the binary parameter +if(!is.logical(opt$binary)){ + opt$binary <- ifelse(opt$binary == 'T', T, F) +} + +if(opt$binary & is.null(opt$sample_prev)){ + stop('--sample_prev must be specified when --binary T.\n') +} + +##### +# Read in sumstats +##### + +log_add(log_file = log_file, message = 'Reading in GWAS.') + +# Read in, check and format GWAS summary statistics +sumstats <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('CHR','SNP','BP','A1','A2','BETA','SE','N','P')) + +# Update header for bigsnpr +names(sumstats)<-c('chr','rsid','pos','a1','a0','beta','beta_se','n_eff','p') + +# In binary, update N to be effective N based on opt$sample_prev +if(opt$binary){ + ncas<-sumstats$n_eff*opt$sample_prev + ncon<-sumstats$n_eff*(1-opt$sample_prev) + sumstats$n_eff<-4 / (1/ncas + 1/ncon) + log_add(log_file = log_file, message = paste0('Median effective N = ', median(sumstats$n_eff))) +} + +# Record start time for test +if(!is.na(opt$test)){ + test_start.time <- test_start(log_file = log_file) +} + +# Harmonise with the LDpred2 reference +map<-readRDS(paste0(opt$ldpred2_ref_dir, '/map.rds')) +names(map)[names(map) == 'af_UKBB']<-'af' +map<-map[, c('chr', 'pos', 'a0', 'a1', 'af', 'ld')] +info_snp <- snp_match(sumstats, map) + +##### +# Perform additional suggested QC for LDpred2 +##### + +# Remove SDss < 0.5 * SDval or SDss > 0.1 + SDval or SDss < 0.1 or SDval < 0.05 +sd_val <- with(info_snp, sqrt(2 * af * (1 - af))) + +if(opt$binary == F){ + sd_y_est = median(sd_val * info_snp$beta_se * sqrt(info_snp$n_eff)) + sd_ss = with(info_snp, sd_y_est / sqrt(n_eff * beta_se^2)) +} else { + sd_ss <- with(info_snp, 2 / sqrt(n_eff * beta_se^2)) +} + +is_bad <-sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05 + +png(paste0(opt$output_dir,'/LDpred2_sd_qc.png'), res=300, unit='px',height=2000, width=2000) + plot_obj <- qplot(sd_val, sd_ss, color = is_bad) + + theme_bigstatsr() + + coord_equal() + + scale_color_viridis_d(direction = -1) + + geom_abline(linetype = 2, color = "red") + + labs(x = "Standard deviations in the validation set", + y = "Standard deviations derived from the summary statistics", + color = "Removed?") + print(plot_obj) +dev.off() + +log_add(log_file = log_file, message = paste0('Sumstats contains ', nrow(info_snp[!is_bad, ]),' after additional genotype SD check.')) + +sumstats<-info_snp[!is_bad, ] + +# If more than half the variants have the wrong SD then the N is probably inaccurate +# Recompute N based on BETA and SE +if(sum(is_bad) > (length(is_bad)*0.5)){ + log_add(log_file = log_file, message = paste0('>50% of variants had a discordant SD.')) + stop('>50% of variants had a discordant SD. Check the sample size in the sumstats.') +} + +##### +# Prepare LD reference data +##### + +log_add(log_file = log_file, message = 'Creating genome-wide sparse matrix.') + +# Create genome-wide sparse LD matrix +for (chr in CHROMS) { + ## indices in 'sumstats' + ind.chr <- which(sumstats$chr == chr) + ## indices in 'map' + ind.chr2 <- sumstats$`_NUM_ID_`[ind.chr] + ## indices in 'corr_chr' + ind.chr3 <- match(ind.chr2, which(map$chr == chr)) + + corr0 <- readRDS(paste0(opt$ldpred2_ref_dir, '/LD_with_blocks_chr', chr, '.rds'))[ind.chr3, ind.chr3] + + if (chr == CHROMS[1]) { + corr <- as_SFBM(corr0, paste0(tmp_dir, '/LD_GW_sparse'), compact = TRUE) + } else { + corr$add_columns(corr0, nrow(corr)) + } +} + +##### +# Run lassosum2 +##### + +log_add(log_file = log_file, message = 'Running lassosum2.') + +# Set seed to ensure reproducibility +set.seed(opt$seed) + +# lassosum2 +beta_df <- snp_lassosum2( + corr, + sumstats, + ncores = opt$n_cores +) + +#### +# Create score file +#### + +# Convert matrix to data.table with column names +beta_dt <- as.data.table(beta_df) +grid <- attr(beta_df, "grid_param") +new_names <- paste0("s", grid$delta, "_lambda", grid$lambda) +setnames(beta_dt, new_names) + +betas <- data.table(SNP=sumstats$rsid, A1=sumstats$a1, A2=sumstats$a0, beta_dt) + +rem<-NULL +for(i in 4:length(names(betas))){ + if(is.infinite(sum(betas[[names(betas)[i]]])) | is.na(sum(betas[[names(betas)[i]]]))){ + log_add(log_file = log_file, message = paste0('Skipping ',names(betas)[i],' due to presence of non-finite values.')) + rem<-c(rem,i) + } +} + +if(is.null(rem) == F){ + betas<-betas[, -rem, with=F] +} + +names(betas)[-1:-3] <- paste0('SCORE_', names(betas)[-1:-3]) + +# Flip effects to match reference alleles +ref <- read_pvar(opt$ref_plink_chr, chr = CHROMS)[, c('SNP','A1','A2'), with=F] +score_new <- map_score(ref = ref, score = betas) + +# Reduce number of significant figures to save space +score_new[, (4:ncol(score_new)) := lapply(.SD, signif, digits = 7), .SDcols = 4:ncol(score_new)] + +fwrite(score_new, paste0(opt$output,'.score'), col.names=T, sep=' ', quote=F) + +if(file.exists(paste0(opt$output,'.score.gz'))){ + system(paste0('rm ',opt$output,'.score.gz')) +} + +system(paste0('gzip ',opt$output,'.score')) + +# Record end time of test +if(!is.na(opt$test)){ + test_finish(log_file = log_file, test_start.time = test_start.time) +} + +end.time <- Sys.time() +time.taken <- end.time - start.time +sink(file = log_file, append = T) +cat('Analysis finished at', as.character(end.time),'\n') +cat('Analysis duration was', as.character(round(time.taken,2)), attr(time.taken, 'units'), '\n') +sink() diff --git a/Scripts/pgs_methods/ldpred2.R b/Scripts/pgs_methods/ldpred2.R index e57ef353..e168078d 100644 --- a/Scripts/pgs_methods/ldpred2.R +++ b/Scripts/pgs_methods/ldpred2.R @@ -105,6 +105,7 @@ log_add(log_file = log_file, message = 'Reading in GWAS.') # Read in, check and format GWAS summary statistics sumstats <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('CHR','SNP','BP','A1','A2','BETA','SE','N','P')) +GWAS_CHROMS<-unique(sumstats$CHR) # Update header for bigsnpr names(sumstats)<-c('chr','rsid','pos','a1','a0','beta','beta_se','n_eff','p') @@ -183,7 +184,7 @@ if(ldsc[["h2"]] < 0.05){ log_add(log_file = log_file, message = 'Creating genome-wide sparse matrix.') # Create genome-wide sparse LD matrix -for (chr in CHROMS) { +for (chr in GWAS_CHROMS) { ## indices in 'sumstats' ind.chr <- which(sumstats$chr == chr) ## indices in 'map' diff --git a/Scripts/pgs_methods/megaprs.R b/Scripts/pgs_methods/megaprs.R index 2cb84590..7539e6a6 100644 --- a/Scripts/pgs_methods/megaprs.R +++ b/Scripts/pgs_methods/megaprs.R @@ -30,6 +30,8 @@ option_list = list( help="Path to ldak tagging data [required]"), make_option("--ldak_highld", action="store", default=NULL, type='character', help="Path to ldak highld data [required]"), + make_option("--pseudo_only", action="store", default=F, type='logical', + help="Logical indicating whether only pseudovalidated model should be output [optional]"), make_option("--n_cores", action="store", default=1, type='numeric', help="Number of cores for parallel computing [optional]"), make_option("--prs_model", action="store", default='mega', type='character', @@ -99,9 +101,10 @@ log_add(log_file = log_file, message = 'Reading in GWAS.') # Read in, check and format GWAS summary statistics gwas <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('CHR','BP','SNP','A1','A2','BETA','SE','N','FREQ','REF.FREQ')) +GWAS_CHROMS<-unique(gwas$CHR) # Check allele frequency difference -ref_psam<-fread(paste0(opt$ref_plink_chr, CHROMS[1],'.psam')) +ref_psam<-fread(paste0(opt$ref_plink_chr, GWAS_CHROMS[1],'.psam')) names(ref_psam)<-gsub('\\#', '', names(ref_psam)) if(!is.null(opt$ref_keep)){ @@ -131,7 +134,7 @@ fwrite(gwas, paste0(tmp_dir,'/GWAS_sumstats_temp.txt'), sep=' ') log_add(log_file = log_file, message = 'Merging per chromosome reference data.') # Save in plink1 format for MegaPRS -plink_merge(pfile = opt$ref_plink_chr, chr = CHROMS, plink2 = opt$plink2, keep = opt$ref_keep, extract = snplist, make_bed =T, out = paste0(tmp_dir, '/ref_merge')) +plink_merge(pfile = opt$ref_plink_chr, chr = GWAS_CHROMS, plink2 = opt$plink2, keep = opt$ref_keep, extract = snplist, make_bed =T, out = paste0(tmp_dir, '/ref_merge')) # Record start time for test if(!is.na(opt$test)){ @@ -168,10 +171,10 @@ system(paste0('cp ', opt$ldak_tag, '/* ', tmp_dir, '/bld/')) system(paste0('mv ', tmp_dir, '/sections/weights.short ', tmp_dir,'/bld/bld65')) # Calculate taggings -if(length(CHROMS) != 1){ +if(length(GWAS_CHROMS) != 1){ system(paste0(opt$ldak, ' --calc-tagging ', tmp_dir, '/bld.ldak --bfile ', tmp_dir, '/ref_merge --ignore-weights YES --power -.25 --annotation-number 65 --annotation-prefix ', tmp_dir, '/bld/bld --window-cm 1 --save-matrix YES --max-threads ', opt$n_cores)) } else { - system(paste0(opt$ldak, ' --calc-tagging ', tmp_dir, '/bld.ldak --bfile ', tmp_dir, '/ref_merge --ignore-weights YES --power -.25 --annotation-number 65 --annotation-prefix ', tmp_dir, '/bld/bld --window-cm 1 --chr ', CHROMS, ' --save-matrix YES --max-threads ', opt$n_cores)) + system(paste0(opt$ldak, ' --calc-tagging ', tmp_dir, '/bld.ldak --bfile ', tmp_dir, '/ref_merge --ignore-weights YES --power -.25 --annotation-number 65 --annotation-prefix ', tmp_dir, '/bld/bld --window-cm 1 --chr ', GWAS_CHROMS, ' --save-matrix YES --max-threads ', opt$n_cores)) } # Calculate Per-Predictor Heritabilities. @@ -192,7 +195,7 @@ log_add(log_file = log_file, message = 'Running using full reference.') # Calculate predictor-predictor correlations log_add(log_file = log_file, message = 'Calculating predictor-predictor correlations.') -full_cors <- ldak_pred_cor(bfile = paste0(tmp_dir, '/ref_merge'), ldak = opt$ldak, n_cores = opt$n_cores, chr = CHROMS) +full_cors <- ldak_pred_cor(bfile = paste0(tmp_dir, '/ref_merge'), ldak = opt$ldak, n_cores = opt$n_cores, chr = GWAS_CHROMS) # Run MegaPRS log_add(log_file = log_file, message = paste0('Running MegaPRS: ',opt$prs_model,' model.')) @@ -217,7 +220,7 @@ system(paste0(opt$ldak, ' --pseudo-summaries ', tmp_dir, '/GWAS_sumstats_temp.ps # Calculate predictor-predictor correlations log_add(log_file = log_file, message = 'Calculating predictor-predictor correlations.') -subset_cors <- ldak_pred_cor(bfile = paste0(tmp_dir, '/ref_merge'), keep = paste0(tmp_dir, '/keepb'), ldak = opt$ldak, n_cores = opt$n_cores, chr = CHROMS) +subset_cors <- ldak_pred_cor(bfile = paste0(tmp_dir, '/ref_merge'), keep = paste0(tmp_dir, '/keepb'), ldak = opt$ldak, n_cores = opt$n_cores, chr = GWAS_CHROMS) # Run megaPRS log_add(log_file = log_file, message = paste0('Running MegaPRS: ',opt$prs_model,' model.')) @@ -255,6 +258,13 @@ ref_pvar <- read_pvar(dat = opt$ref_plink_chr, chr = CHROMS) ref_pvar$Predictor<-paste0(ref_pvar$CHR,':',ref_pvar$BP) score<-merge(score, ref_pvar[,c('Predictor','SNP'), with=F], by='Predictor') score<-score[, c('SNP', 'A1', 'A2', names(score)[grepl('Model', names(score))]), with=F] + +print(head(score)) + +if(opt$pseudo_only){ + score <- score[,c('SNP','A1','A2', paste0('Model', gsub('Score_','',best_score$V1[1]))), with = F] +} + names(score)[grepl('Model', names(score))]<-paste0('SCORE_ldak_',names(score)[grepl('Model', names(score))]) # Flip effects to match reference alleles diff --git a/Scripts/pgs_methods/prscs.R b/Scripts/pgs_methods/prscs.R index c77bccdf..62c3f44b 100644 --- a/Scripts/pgs_methods/prscs.R +++ b/Scripts/pgs_methods/prscs.R @@ -97,14 +97,27 @@ if(!is.na(opt$test)){ log_add(log_file = log_file, message = 'Reading in GWAS.') # Read in, check and format GWAS summary statistics -gwas <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('SNP','A1','A2','BETA','SE','N')) +gwas <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('CHR','SNP','A1','A2','BETA','SE','N')) + +# Subset CHROMS object to chr present +gwas_CHROMS <- unique(gwas$CHR) +gwas$CHR <- NULL # Store average sample size gwas_N <- round(mean(gwas$N), 0) fwrite(gwas, paste0(tmp_dir, '/GWAS_sumstats_temp.txt'), sep=' ') +# Create a temporary reference bim files for PRS-CS to match to +pvar <- read_pvar(opt$ref_plink_chr, chr = CHROMS) +pvar <- pvar[pvar$SNP %in% gwas$SNP,] +pvar$POS<-0 +for(i in gwas_CHROMS){ + write.table(pvar[pvar$CHR == i, c('CHR','SNP','POS','BP','A1','A2'), with=F], paste0(tmp_dir,'/ref.chr',i,'.bim'), col.names=F, row.names=F, quote=F) +} + rm(gwas) +rm(pvar) gc() # Record start time for test @@ -116,19 +129,9 @@ if(!is.na(opt$test)){ # Process sumstats using PRSsc ##### -# Create a temporary reference bim files for PRS-CS to match to -pvar <- read_pvar(opt$ref_plink_chr, chr = CHROMS) -pvar$POS<-0 -for(i in CHROMS){ - write.table(pvar[pvar$CHR == i, c('CHR','SNP','POS','BP','A1','A2'), with=F], paste0(tmp_dir,'/ref.chr',i,'.bim'), col.names=F, row.names=F, quote=F) -} - -rm(pvar) -gc() - # Make a data.frame listing chromosome and phi combinations jobs<-NULL -for(i in CHROMS){ +for(i in gwas_CHROMS){ jobs<-rbind(jobs, data.frame(CHR=i, phi=phi_param)) } @@ -165,7 +168,7 @@ log <- foreach(i = 1:nrow(jobs), .combine = c, .options.multicore = list(presche score_all<-NULL for(phi_i in phi_param){ score_phi<-NULL - for(i in CHROMS){ + for(i in gwas_CHROMS){ score_phi_i<-fread(paste0(tmp_dir,'/_pst_eff_a1_b0.5_phi',phi_i,'_chr',i,'.txt')) score_phi<-rbind(score_phi, score_phi_i) } diff --git a/Scripts/pgs_methods/sbayesr.R b/Scripts/pgs_methods/sbayesr.R index a360775e..89432e2a 100644 --- a/Scripts/pgs_methods/sbayesr.R +++ b/Scripts/pgs_methods/sbayesr.R @@ -91,8 +91,9 @@ if(!is.na(opt$test)){ log_add(log_file = log_file, message = 'Reading in GWAS.') # Read in, check and format GWAS summary statistics -gwas <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('SNP','A1','A2','FREQ','BETA','SE','P','N')) - +gwas <- read_sumstats(sumstats = opt$sumstats, chr = CHROMS, log_file = log_file, req_cols = c('CHR','SNP','A1','A2','FREQ','BETA','SE','P','N')) +GWAS_CHROMS<-unique(gwas$CHR) +gwas$CHR<-NULL ### # Change to COJO format ### @@ -144,7 +145,7 @@ if(per_var_N == F & opt$impute_N == T){ sbayesr_opt <- paste0(sbayesr_opt, '--impute-n ') } -error<-foreach(i = CHROMS, .combine = rbind, .options.multicore = list(preschedule = FALSE)) %dopar% { +error<-foreach(i = GWAS_CHROMS, .combine = rbind, .options.multicore = list(preschedule = FALSE)) %dopar% { log <- system(paste0(opt$gctb, ' --sbayes R --ldm ', opt$ld_matrix_chr, i, '.ldm.sparse --pi 0.95,0.02,0.02,0.01 --gamma 0.0,0.01,0.1,1 --gwas-summary ', tmp_dir, '/GWAS_sumstats_COJO.txt --chain-length 10000 ', sbayesr_opt, '--exclude-mhc --burn-in 2000 --out-freq 1000 --out ', tmp_dir, '/GWAS_sumstats_SBayesR.chr', i), intern = T) # Check whether the analysis converged @@ -173,7 +174,7 @@ if(sum(grepl('Error', error$Log) == T) > 1){ # Combine per chromosome snpRes files snpRes<-NULL -for(i in CHROMS){ +for(i in GWAS_CHROMS){ snpRes <- rbind(snpRes, fread(paste0(tmp_dir, '/GWAS_sumstats_SBayesR.chr', i, '.snpRes'))) } @@ -199,14 +200,14 @@ if(!is.na(opt$test)){ # Combine per chromosome parRes files parRes_mcmc <- list() -for(i in CHROMS){ +for(i in GWAS_CHROMS){ parRes_mcmc[[i]] <- fread(paste0(tmp_dir, '/GWAS_sumstats_SBayesR.chr', i, '.mcmcsamples.Par')) } parRes <- NULL for(par in names(parRes_mcmc[[i]])){ parRes_mcmc_par <- NULL - for(i in CHROMS){ + for(i in GWAS_CHROMS){ parRes_mcmc_par <- cbind(parRes_mcmc_par, parRes_mcmc[[i]][[par]]) } diff --git a/Scripts/pipeline_reports/indiv_report_creator.Rmd b/Scripts/pipeline_reports/indiv_report_creator.Rmd index 921e640a..3ca744d2 100644 --- a/Scripts/pipeline_reports/indiv_report_creator.Rmd +++ b/Scripts/pipeline_reports/indiv_report_creator.Rmd @@ -1,10 +1,10 @@ --- title: "GenoPred Report" params: - name: "" - id: "" - config: "" - cwd: "" + name: + id: + config: + cwd: output: html_document: toc: true @@ -63,7 +63,11 @@ gwas_groups <- read_param(config = params$config, param = 'gwas_groups') score_list <- read_param(config = params$config, param = 'score_list') # Identify PGS methods to be included -pgs_methods_list <- read_param(config = params$config, param = 'pgs_methods', return_obj = F) +if(!is.null(gwas_list)){ + pgs_methods_list <- read_param(config = params$config, param = 'pgs_methods', return_obj = F) +} else { + pgs_methods_list <- NULL +} # If testing, change CHROMS to chr value testing <- read_param(config = params$config, param = 'testing', return_obj = F) @@ -265,10 +269,6 @@ cat0("- ", ifelse(is.null(gwas_list), 0, nrow(gwas_list)), " GWAS summary statis cat0("- ", ifelse(is.null(gwas_groups), 0, nrow(gwas_groups)), " GWAS groups were specified.\n") cat0("- ", length(pgs_methods_list), " PGS methods were applied, including ", paste0(pgs_method_labels$label[pgs_method_labels$method %in% pgs_methods_list], collapse = ', '), ".\n") -if(any(gwas_list$population != 'EUR') & any(c('ldpred2','sbayesr') %in% pgs_methods_list)){ - cat0(" - **Note.** `ldpred2` and `sbayesr` are currently only implemented for GWAS of EUR populations.\n\n") -} - if(is.null(score_list)){ cat0("- No external score files were provided in score_list.\n\n") } else { @@ -489,7 +489,7 @@ cat0("## Target Polygenic Profile {.tabset .tabset-fade} \n\n") # Read in PGS # Exclude PGS from multi-source methods as no estimate of R is available single_source_methods <- pgs_methods_list[!(pgs_methods_list %in% pgs_group_methods) & !(grepl('_multi|tlprs_', pgs_methods_list))] -pgs <- read_pgs(config = params$config, name = params$name, pop = 'TRANS', pseudo_only=T, pgs_method = single_source_methods)[[1]] +pgs <- read_pgs(config = params$config, name = params$name, pop = 'TRANS', pseudo_only=T, pgs_methods = single_source_methods)[[1]] # Structure PGS for target individual pgs_dat <- NULL diff --git a/Scripts/pipeline_reports/samp_report_creator.Rmd b/Scripts/pipeline_reports/samp_report_creator.Rmd index 62450640..a9be7111 100644 --- a/Scripts/pipeline_reports/samp_report_creator.Rmd +++ b/Scripts/pipeline_reports/samp_report_creator.Rmd @@ -181,10 +181,6 @@ cat0("- ", ifelse(is.null(gwas_list), 0, nrow(gwas_list)), " GWAS summary statis cat0("- ", ifelse(is.null(gwas_groups), 0, nrow(gwas_groups)), " GWAS groups were specified.\n") cat0("- ", length(pgs_methods_list), " PGS methods were applied, including ", paste0(pgs_methods_list, collapse = ', '), ".\n") -if(any(gwas_list$population != 'EUR') & any(c('ldpred2','sbayesr') %in% pgs_methods_list)){ - cat0(" - **Note.** `ldpred2` and `sbayesr` are currently only implemented for GWAS of EUR populations.\n\n") -} - if(is.null(score_list)){ cat0("- No external score files were provided in score_list.\n\n") } else { diff --git a/Scripts/target_scoring/target_scoring_pipeline.R b/Scripts/target_scoring/target_scoring_pipeline.R index fd22343d..63876005 100644 --- a/Scripts/target_scoring/target_scoring_pipeline.R +++ b/Scripts/target_scoring/target_scoring_pipeline.R @@ -27,6 +27,7 @@ opt = parse_args(OptionParser(option_list=option_list)) # Load dependencies library(GenoUtils) library(data.table) +library(bigstatsr) source('../functions/misc.R') source_all('../functions') library(foreach) @@ -180,30 +181,40 @@ for(chr_i in CHROMS){ score = paste0(tmp_dir,'/all_score.txt'), keep = opt$target_keep, frq = opt$ref_freq_chr, - threads = opt$n_cores + threads = opt$n_cores, + fbm = T ) # Sum scores across chromosomes if(chr_i == CHROMS[1]){ - scores_ids <- scores_i[, 1:2, with = F] - current_scores <- as.matrix(scores_i[, -1:-2, with = FALSE]) - scores <- current_scores - } else { - current_scores <- as.matrix(scores_i[, -1:-2, with = FALSE]) - scores <- scores + current_scores + scores_ids <- scores_i$ids + cols <- scores_i$cols + + # Initialize a FBM (backed on disk) for running PGS sum + file.remove(paste0(tmp_dir, '/PGS_fbm.bk')) + scores <- FBM( + nrow = nrow(scores_ids), + ncol = length(cols), + backingfile = paste0(tmp_dir, '/PGS_fbm'), + init = 0 + ) } - - system(paste0('rm ', tmp_dir, '/all_score.txt')) - system(paste0('rm ', tmp_dir, '/row_index.txt')) - system(paste0('rm ', tmp_dir, '/map.txt')) + + # In-place addition: for each score column + for (j in cols) { + scores[, which(cols == j)] <- scores[, which(cols == j)] + scores_i$scores[,which(scores_i$cols == j)] + } + + file.remove(scores_i$scores$backingfile, + scores_i$scores$rds) rm(scores_i) - rm(current_scores) gc() } # Combine score with IDs -scores<-data.table(scores_ids, - scores) +scores <- as.data.table(matrix(scores[,], ncol = length(cols))) +setnames(scores, cols) +scores <- cbind(scores_ids, scores) ### # Scale the polygenic scores based on the reference diff --git a/docs/CrossPop.Rmd b/docs/CrossPop.Rmd index 6d4195ef..1e1a1d52 100644 --- a/docs/CrossPop.Rmd +++ b/docs/CrossPop.Rmd @@ -7065,6 +7065,13 @@ cp ~/oliverpainfel/Analyses/crosspop/plots_three_pop/average_r.png /scratch/prj/ Here we will use GWAS sumtats that were used in the original GenoPred paper. These GWAS are from a range of sources, often large meta-analyses, which can lead to greater mispecification in the sumstats, which can impact the performance of some PGS methods. This is to provide more confidence in the performance of SBayesRC and QuickPRS relative to other methods. +
+ +**Note**: I am using this opportunity to evaluate lassosum2, which is not included in other analyses in this project. + +
+ + *** ### PGS calculation @@ -7104,7 +7111,7 @@ config<-c( "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_meta.yaml", "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_meta.txt", "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt", - "pgs_methods: ['quickprs','sbayesrc','ldpred2','ptclump','dbslmm']", + "pgs_methods: ['quickprs','sbayesrc','ldpred2','ptclump','dbslmm','lassosum2']", # "pgs_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc']", "cores_prep_pgs: 10", "cores_target_pgs: 50", @@ -7382,7 +7389,7 @@ for(pheno_i in phenos){ # Remove pseudo model for methods that don't really have one eval_i <- eval_i[ - !(eval_i$Method %in% c('ptclump') & + !(eval_i$Method %in% c('ptclump','lassosum2') & eval_i$Model %in% c('SumStatTune')),] res_eval[[pheno_i]]<-eval_i @@ -7390,7 +7397,7 @@ for(pheno_i in phenos){ } # Create vector defining or of methods in plots -model_order <- c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC") +model_order <- c("DBSLMM", "lassosum",'lassosum2', "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC") res_eval_simp <- NULL for(pheno_i in phenos){ @@ -7407,6 +7414,7 @@ for(pheno_i in phenos){ # Plot results for each phenotype separately dir.create('~/oliverpainfel/Analyses/crosspop/plots_meta') +png(paste0('~/oliverpainfel/Analyses/crosspop/plots_meta/trait_r.png'), res=100, width = 1200, height = 900, units = 'px') ggplot(res_eval_simp, aes(x=label, y=R , fill = Model)) + geom_errorbar(aes(ymin = R - SE, ymax = R + SE), width = 0, @@ -7422,6 +7430,7 @@ ggplot(res_eval_simp, aes(x=label, y=R , fill = Model)) + legend.position = "top", legend.key.spacing.x = unit(1, "cm"), legend.justification = "center") +dev.off() #### # Average results across phenotypes @@ -7508,9 +7517,11 @@ dev.off() cp ~/oliverpainfel/Analyses/crosspop/plots_meta/average_r.png /scratch/prj/oliverpainfel/Software/MyGit/GenoPred/docs/Images/CrossPop_2025/average_r_meta.png +cp ~/oliverpainfel/Analyses/crosspop/plots_meta/trait_r.png /scratch/prj/oliverpainfel/Software/MyGit/GenoPred/docs/Images/CrossPop_2025/trait_r_meta.png + ``` -
Show results +
Show average results
@@ -7520,6 +7531,16 @@ cp ~/oliverpainfel/Analyses/crosspop/plots_meta/average_r.png /scratch/prj/olive
+
Show trait-specific results + +
+
+ +
+
+ +
+ *** ## Using downsampled GWAS diff --git a/docs/CrossPop.html b/docs/CrossPop.html index 12989093..f433f96b 100644 --- a/docs/CrossPop.html +++ b/docs/CrossPop.html @@ -439,6 +439,9 @@
  • Translating Polygenic Scores onto the Absolute Scale
  • +
  • + Cross-Ancestry Polygenic Prediction +
  • @@ -11383,6 +11386,10 @@

    Using external GWAS sumstats

    sumstats, which can impact the performance of some PGS methods. This is to provide more confidence in the performance of SBayesRC and QuickPRS relative to other methods.

    +
    +

    Note: I am using this opportunity to evaluate +lassosum2, which is not included in other analyses in this project.

    +

    PGS calculation

    @@ -11423,7 +11430,7 @@

    "config_file: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/config_meta.yaml", "gwas_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/crosspop/gwas_list_meta.txt", "target_list: /users/k1806347/oliverpainfel/Data/ukb/GenoPred/configs/basic/target_list.txt", - "pgs_methods: ['quickprs','sbayesrc','ldpred2','ptclump','dbslmm']", + "pgs_methods: ['quickprs','sbayesrc','ldpred2','ptclump','dbslmm','lassosum2']", # "pgs_methods: ['ptclump','quickprs','dbslmm','lassosum','megaprs','prscs','ldpred2','sbayesrc']", "cores_prep_pgs: 10", "cores_target_pgs: 50", @@ -11689,7 +11696,7 @@

    # Remove pseudo model for methods that don't really have one eval_i <- eval_i[ - !(eval_i$Method %in% c('ptclump') & + !(eval_i$Method %in% c('ptclump','lassosum2') & eval_i$Model %in% c('SumStatTune')),] res_eval[[pheno_i]]<-eval_i @@ -11697,7 +11704,7 @@

    } # Create vector defining or of methods in plots -model_order <- c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC") +model_order <- c("DBSLMM", "lassosum",'lassosum2', "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "QuickPRS", "SBayesRC") res_eval_simp <- NULL for(pheno_i in phenos){ @@ -11714,6 +11721,7 @@

    # Plot results for each phenotype separately dir.create('~/oliverpainfel/Analyses/crosspop/plots_meta') +png(paste0('~/oliverpainfel/Analyses/crosspop/plots_meta/trait_r.png'), res=100, width = 1200, height = 900, units = 'px') ggplot(res_eval_simp, aes(x=label, y=R , fill = Model)) + geom_errorbar(aes(ymin = R - SE, ymax = R + SE), width = 0, @@ -11729,6 +11737,7 @@

    legend.position = "top", legend.key.spacing.x = unit(1, "cm"), legend.justification = "center") +dev.off() #### # Average results across phenotypes @@ -11810,7 +11819,7 @@

  • -Show results +Show average results
    @@ -11818,6 +11827,16 @@

    +
    + +Show trait-specific results + +
    +
    +

    +
    +
    +

    diff --git a/docs/Images/CrossPop_2025/average_r_meta.png b/docs/Images/CrossPop_2025/average_r_meta.png index 965ca324..9e8dbc0f 100644 Binary files a/docs/Images/CrossPop_2025/average_r_meta.png and b/docs/Images/CrossPop_2025/average_r_meta.png differ diff --git a/docs/Images/CrossPop_2025/trait_r_meta.png b/docs/Images/CrossPop_2025/trait_r_meta.png new file mode 100644 index 00000000..00bc9423 Binary files /dev/null and b/docs/Images/CrossPop_2025/trait_r_meta.png differ diff --git a/docs/Images/pipeline_readme/pipeline_schematic_groups_lowdef.png b/docs/Images/pipeline_readme/pipeline_schematic_groups_lowdef.png index 5d56ae02..4b08e619 100644 Binary files a/docs/Images/pipeline_readme/pipeline_schematic_groups_lowdef.png and b/docs/Images/pipeline_readme/pipeline_schematic_groups_lowdef.png differ diff --git a/docs/more_index.Rmd b/docs/more_index.Rmd index c47a1fe9..4bfdbc37 100644 --- a/docs/more_index.Rmd +++ b/docs/more_index.Rmd @@ -23,6 +23,7 @@ output: - Computational time/memory benchmark - Link - Benchmark in OpenSNP dataset - Link - Benchmark in UK Biobank dataset - Link +- Demonstration of multi-source methods to height GWAS and tested in OpenSNP target sample - Link *** @@ -42,5 +43,4 @@ output: - Cross-Ancestry Polygenic Prediction - Summary - Link - Code - Link - - Application of multi-source methods to height GWAS and tested in OpenSNP target sample - Link diff --git a/docs/more_index.html b/docs/more_index.html index 0be2de73..9a5d162a 100644 --- a/docs/more_index.html +++ b/docs/more_index.html @@ -411,6 +411,9 @@

    Pipeline

    Link
  • Benchmark in UK Biobank dataset - Link
  • +
  • Demonstration of multi-source methods to height GWAS and tested in +OpenSNP target sample - +Link

  • @@ -445,9 +448,6 @@

    Research

  • Summary - Link
  • Code - Link
  • -
  • Application of multi-source methods to height GWAS and tested in -OpenSNP target sample - -Link
  • diff --git a/docs/opensnp_benchmark_crosspop.Rmd b/docs/opensnp_benchmark_crosspop.Rmd index b9b1810a..7273027f 100644 --- a/docs/opensnp_benchmark_crosspop.Rmd +++ b/docs/opensnp_benchmark_crosspop.Rmd @@ -125,7 +125,7 @@ config <- readLines('misc/opensnp/config.yaml') config[grepl('^config_file:', config)]<- 'config_file: misc/opensnp/config_cross_pop.yaml' config <- config[!grepl('^score_list:', config)] config[grepl('^outdir:', config)]<- 'outdir: /users/k1806347/oliverpainfel/Data/OpenSNP/GenoPred/test_cross_pop_2' -config[grepl('^pgs_methods:', config)]<- "pgs_methods: ['quickprs']" +config[grepl('^pgs_methods:', config)]<- "pgs_methods: ['quickprs','sbayesrc']" config[grepl('^gwas_list:', config)]<- "gwas_list: misc/opensnp/gwas_list_cross_pop.txt" config<-c(config, 'gwas_groups: misc/opensnp/gwas_groups.txt') config<-c(config, "leopard_methods: ['quickprs']") diff --git a/docs/pipeline_readme.Rmd b/docs/pipeline_readme.Rmd index 45885cf5..a46ad6e5 100644 --- a/docs/pipeline_readme.Rmd +++ b/docs/pipeline_readme.Rmd @@ -271,7 +271,7 @@ config <- list( pgs_methods = list( description = 'List of polygenic scoring methods to run', example = "`['ptclump','dbslmm']`", - note = "Options are: `ptclump`, `dbslmm`, `prscs`, `sbayesr`, `lassosum`, `ldpred2`, `megaprs`. **Note.** `sbayesr` and `ldpred2` are only implemented for GWAS of EUR ancestry." + note = "Options are: `ptclump`, `dbslmm`, `prscs`, `sbayesr`, `lassosum`, `lassosum2`, `ldpred2`, `megaprs`. **Note.** By default, `sbayesr`, `lassosum2`, and `ldpred2` are only implemented for GWAS of EUR ancestry." ), testing = list( description = 'Controls testing mode', @@ -555,7 +555,7 @@ target_list: example_input/target_list.txt # Specify location of score_list file score_list: example_input/score_list.txt -# Specify pgs_methods ('ptclump','dbslmm','prscs','sbayesr','lassosum','ldpred2','megaprs') +# Specify pgs_methods pgs_methods: ['ptclump','dbslmm'] # Specify if you want test mode. Set to NA if you don't want test mode @@ -818,7 +818,7 @@ The GenoPred pipeline has many potential outputs. Here is a detailed schematic d
    - +
    @@ -1334,7 +1334,7 @@ By default, the pipeline allocates 5 cores when running the outlier_detection ru ## Multi-source PGS methods -The GenoPred pipeline also implements a range of multi-source polygenic scoring methods that can combine GWAS summary statistics from multiple populations. To use these methods, an additional `gwas_groups` file must be provided, indicating which GWAS are to be jointly analyzed. Specify the location of the `gwas_groups` file in the `configfile` using the `gwas_groups` parameter. An example of a gwas_groups file can be found [here](../pipeline/example_input/gwas_groups.txt). +The GenoPred pipeline also implements a range of multi-source polygenic scoring methods that can combine GWAS summary statistics from multiple populations. To use these methods, an additional `gwas_groups` file must be provided, indicating which GWAS are to be jointly analysed. Specify the location of the `gwas_groups` file in the `configfile` using the `gwas_groups` parameter. An example of a gwas_groups file can be found [here](../pipeline/example_input/gwas_groups.multisource.txt).
    View gwas_groups format @@ -1376,52 +1376,6 @@ for (column in names(gwas_groups$Column)) { kable(gwas_groups_df, 'markdown') ``` -The pipeline also implements a range of multi-source polygenic scoring methods, that can combine GWAS summary statistics from multiple populations. To use these methods an additional `gwas_groups` file must be provided, indicating which GWAS are to be jointly analysed. The `gwas_groups` file must be specified in the `configfile` using the `gwas_groups` parameter. An example of a gwas_groups file can be found [here](../pipeline/example_input/gwas_groups.txt). - -
    - -View gwas_groups format - -```{r, eval = T, echo = F, results = 'asis'} - -gwas_groups <- list( - Column = list( - name = list( - example = '`height`', - description = "ID for the group of GWAS. Cannot contain spaces (' ') or hyphens ('-')" - ), - gwas = list( - example = '`yengo_eur,yengo_eas`', - description = "Comma-seperated list of GWAS names, corresponding to the `gwas_list`." - ), - label = list( - example = '`\"Height (EUR+EAS)\"`', - description = "A human readable name for the group of GWAS Wrap in double quotes if multiple words." - ) - ) -) - -gwas_groups_df <- NULL -for (column in names(gwas_groups$Column)) { - description <- gwas_groups$Column[[column]]$description - example <- gwas_groups$Column[[column]]$example - - # Append each parameter's details to the data frame - gwas_groups_df <- rbind( - gwas_groups_df, - data.frame( - Column = column, - Example = example, - Description = description, - stringsAsFactors = FALSE - ) - ) -} - -kable(gwas_groups_df, 'markdown') - -``` -
    @@ -1460,7 +1414,7 @@ gwas_groups: example_input/gwas_groups.multisource.txt # Specify location of target_list file target_list: example_input/target_list.txt -# Specify pgs_methods ('ptclump','dbslmm','prscs','sbayesr','lassosum','ldpred2','megaprs') +# Specify pgs_methods pgs_methods: ['lassosum'] # Specify methods for which PGS should be combined using LEOPARD+QuickPRS @@ -1575,7 +1529,7 @@ The relevant `configfile` parameters are: - `sbayesr_ldref`: for **SBayesR** - `sbayesrc_ldref`: for **SBayesRC** -- `ldpred2_ldref`: for **LDpred2** +- `ldpred2_ldref`: for **LDpred2** and **lassosum2** - `quickprs_ref`: for **QuickPRS** - `quickprs_multi_ldref`: for **LEOPARD+QuickPRS** diff --git a/docs/pipeline_readme.html b/docs/pipeline_readme.html index 13202138..2eaa9aef 100644 --- a/docs/pipeline_readme.html +++ b/docs/pipeline_readme.html @@ -570,44 +570,15 @@

    Step 1: Download GenoPred repository

    -<<<<<<< HEAD -

    Step 2: Create software environment for pipeline

    -

    Conda is a software environment management system that simplifies -installing and managing dependencies. We recommend using Miniforge — a -minimal conda installer that comes with Mamba, a fast drop-in -replacement for conda.

    -

    If you don’t already have Miniforge installed, you can install it -using the commands below:

    -
    # Download and install Miniforge (for Linux)
    -wget https://github.com/conda-forge/miniforge/releases/download/24.11.3-0/Miniforge3-24.11.3-0-Linux-x86_64.sh
    -bash Miniforge3-24.11.3-0-Linux-x86_64.sh
    -

    Accept the default installation options. Once installed, you may need -to run source ~/.bashrc or restart your terminal. You should see (base) -appear at the beginning of your terminal prompt.

    -

    Now, create the GenoPred environment using Mamba:

    -
    mamba env create -f GenoPred/pipeline/envs/pipeline.yaml
    -

    Activate the new genopred environment:

    -
    mamba activate genopred
    -
    -

    Note: If you are working on an HPC, check whether -mamba is an available module. Using the centrally installed version of -mamba may avoid issues down the road.

    -
    -=======

    Step 2: Create conda environment for pipeline

    Conda is a software environment management system which is great way for easily downloading and storing software. We will use conda to create an environment that the GenoPred pipeline will run in.

    If you don’t already have conda installed, we will install it using miniconda.

    -<<<<<<< Updated upstream -
    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
    -sh Miniconda3-latest-Linux-x86_64.sh
    -=======
    # Download and install Miniforge (for Linux)
     wget https://github.com/conda-forge/miniforge/releases/download/24.11.3-0/Miniforge3-24.11.3-0-Linux-x86_64.sh
     bash Miniforge3-24.11.3-0-Linux-x86_64.sh
    ->>>>>>> Stashed changes

    I would say yes to the default options. You may then need to refresh your workspace to initiate conda, by running source ~/.bashrc. You should see (base) @@ -618,16 +589,9 @@

    Step 2: Create conda environment for pipeline

    GenoPred/pipeline/envs/pipeline.yaml file. This will create an environment called genopred with some essential packages installed.

    -<<<<<<< Updated upstream -
    conda env create -f GenoPred/pipeline/envs/pipeline.yaml
    -

    Now activate the new genopred environment.

    -
    conda activate genopred
    -=======
    mamba env create -f GenoPred/pipeline/envs/pipeline.yaml

    Now activate the new genopred environment.

    mamba activate genopred
    ->>>>>>> Stashed changes ->>>>>>> 999a324 (Updated docs)
    @@ -743,9 +707,9 @@

    configfile

    ---+++ @@ -808,9 +772,10 @@

    configfile

    +lassosum2, ldpred2, megaprs. +Note. By default, sbayesr, +lassosum2, and ldpred2 are only implemented +for GWAS of EUR ancestry. @@ -1132,7 +1097,7 @@

    Step 2: Run the pipeline

    # Specify location of score_list file score_list: example_input/score_list.txt -# Specify pgs_methods ('ptclump','dbslmm','prscs','sbayesr','lassosum','ldpred2','megaprs') +# Specify pgs_methods pgs_methods: ['ptclump','dbslmm'] # Specify if you want test mode. Set to NA if you don't want test mode @@ -1576,7 +1541,7 @@

    Requesting outputs

    GenoPred pipeline.

    -

    +

    @@ -2185,11 +2150,11 @@

    Multi-source PGS methods

    polygenic scoring methods that can combine GWAS summary statistics from multiple populations. To use these methods, an additional gwas_groups file must be provided, indicating which GWAS -are to be jointly analyzed. Specify the location of the +are to be jointly analysed. Specify the location of the gwas_groups file in the configfile using the gwas_groups parameter. An example of a gwas_groups file can be found here.

    +href="../pipeline/example_input/gwas_groups.multisource.txt">here.

    View gwas_groups format @@ -2228,52 +2193,6 @@

    Multi-source PGS methods

    ['ptclump','dbslmm'] Options are: ptclump, dbslmm, prscs, sbayesr, lassosum, -ldpred2, megaprs. Note. -sbayesr and ldpred2 are only implemented for -GWAS of EUR ancestry.
    testing
    -

    The pipeline also implements a range of multi-source polygenic -scoring methods, that can combine GWAS summary statistics from multiple -populations. To use these methods an additional gwas_groups -file must be provided, indicating which GWAS are to be jointly analysed. -The gwas_groups file must be specified in the -configfile using the gwas_groups parameter. An -example of a gwas_groups file can be found here.

    -
    - -View gwas_groups format - - ----- - - - - - - - - - - - - - - - - - - - - - - - - -
    ColumnExampleDescription
    nameheightID for the group of GWAS. Cannot contain spaces (’ ‘) -or hyphens (’-’)
    gwasyengo_eur,yengo_easComma-seperated list of GWAS names, corresponding to -the gwas_list.
    label"Height (EUR+EAS)"A human readable name for the group of GWAS Wrap in -double quotes if multiple words.
    @@ -2329,7 +2248,7 @@

    Multi-source PGS methods

    # Specify location of target_list file target_list: example_input/target_list.txt -# Specify pgs_methods ('ptclump','dbslmm','prscs','sbayesr','lassosum','ldpred2','megaprs') +# Specify pgs_methods pgs_methods: ['lassosum'] # Specify methods for which PGS should be combined using LEOPARD+QuickPRS @@ -2523,7 +2442,8 @@

    Specifying alternative reference data for PGS methods

    • sbayesr_ldref: for SBayesR
    • sbayesrc_ldref: for SBayesRC
    • -
    • ldpred2_ldref: for LDpred2
    • +
    • ldpred2_ldref: for LDpred2 and +lassosum2
    • quickprs_ref: for QuickPRS
    • quickprs_multi_ldref: for LEOPARD+QuickPRS
    • diff --git a/docs/pipeline_technical.Rmd b/docs/pipeline_technical.Rmd index 07cd143e..32336190 100644 --- a/docs/pipeline_technical.Rmd +++ b/docs/pipeline_technical.Rmd @@ -24,15 +24,19 @@ library(data.table) This document provides technical details of the GenoPred pipeline. The GenoPred pipeline automates the process of calculating polygenic scores. The pipeline aims to implement the current practises for polygenic scoring. See [here](pipeline_overview.html) more general information regarding the GenoPred pipeline. -Please cite our preprint when using the pipeline: +Please cite our publication when using the pipeline: -- "Pain, O. et al. "The GenoPred Pipeline: A Comprehensive and Scalable Pipeline for Polygenic Scoring." MedRxiv 2024. https://doi.org/10.1101/2024.06.12.24308843 +- "Pain, O. et al. "The GenoPred Pipeline: A Comprehensive and Scalable Pipeline for Polygenic Scoring." Bioinformatics (2024). https://doi.org/10.1093/bioinformatics/btae551 + +If using multi-source PGS methods, please also cite our study describing their evaluation and implementation within GenoPred: + +- "Pain, O. "Leveraging Global Genetics Resources to Enhance Polygenic Prediction Across Ancestrally Diverse Populations." MedRxiv (2026). https://doi.org/10.1101/2025.03.27.25324773 If relevant, please also cite our paper comparing polygenic scoring methods and describing the reference-standardised approach: -- Pain, O. et al. "Evaluation of polygenic prediction methodology within a reference-standardized framework." PLoS genetics. https://doi.org/10.1371/journal.pgen.1009021 +- Pain, O. et al. "Evaluation of polygenic prediction methodology within a reference-standardized framework." PLoS genetics (2024). https://doi.org/10.1371/journal.pgen.1009021 -Please also cite the relevant studies for the tools and data used by the GenoPred pipeline. +Please also remember to cite the relevant studies for the tools and data used by the GenoPred pipeline. *** @@ -51,7 +55,7 @@ See [here](pipeline_readme.html#pipeline-configuration) for more information on ***
      -![Figure 1. GenoPred pipeline schematic. Shows input files, processes, outputs, and rules.](Images/pipeline_readme/pipeline_schematic_lowdef.png) +![Figure 1. GenoPred pipeline schematic. Shows input files, processes, outputs, and rules.](Images/pipeline_readme/pipeline_schematic_groups_lowdef.png)
      @@ -123,21 +127,22 @@ The GenoPred pipeline currently only implements PGS methods intended for GWAS ba ```{r, eval = T, echo = F} methods_table <- data.frame( - Method = c("DBSLMM", "lassosum", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "SBayesR"), - Software = c("DBSLMM", "lassosum R package", "bigsnpr R package", "LDAK", "PRS-CS", "PLINK", "GCTB"), - PubMedID = c(32330416, "28480976", "33326037", "34234142", "30992449", "25722852", "31704910"), - PseudoValidationOption = c("Yes (only option)", "Yes", "Yes", "Yes", "Yes", "No", "Yes (only option)"), + Method = c("DBSLMM", "lassosum", "lassosum2", "LDpred2", "MegaPRS", "PRS-CS", "pT+clump", "SBayesR"), + Software = c("DBSLMM", "lassosum R package", "bigsnpr R package", "bigsnpr R package", "LDAK", "PRS-CS", "PLINK", "GCTB"), + PubMedID = c("32330416", "28480976", "36105883", "33326037", "34234142", "30992449", "25722852", "31704910"), + PseudoValidationOption = c("Yes (only option)", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes (only option)"), Parameters = c("SNP-heritability estimated using LD Score Regression (on liability scale for binary outcomes)", "s = 0.2, 0.5, 0.9, 1; lambda = exp(seq(log(0.001), log(0.1), length.out=20))", + "delta = 0.001, 0.01, 0.1, 1; nlambda = 30; lambda.min.ratio = 0.01", "Grid includes 126 combinations of heritability and non-zero effect fractions (p).", "Fits lasso, ridge, bolt and BayesR models, with a total of 148 sets of hyperparameters", "phi = 1e-6, 1e-4, 1e-2, 1, auto", "p-value thresholds: 1e-8, 1e-6, 1e-4, 1e-2, 0.1, 0.2, 0.3, 0.4, 0.5, 1; Clumping: r2 = 0.1; window = 250kb", "NA"), - MHCRegion = c("Not excluded", "Not excluded", "Not excluded", "Not excluded", "Not excluded", "Only top variant retained", "Excluded (as recommended)"), - LDReference = c("Population-matched 1KG+HGDP", "Population-matched 1KG+HGDP", "EUR UKB (LDpred2-provided)", "Population-matched 1KG+HGDP", "Population-matched UKB (PRS-CS provided)", "Population-matched 1KG+HGDP", "EUR UKB (GCTB-provided)"), - CPUUsage = c("40 seconds", "10 seconds", "3 minutes", "5 minutes", "35 minutes", "5 seconds", "3 minutes"), - MemoryUsage = c("450 Mb", "400 Mb", "500 Mb", "450Mb", "350 Mb", "100 Mb", "500 Mb") + MHCRegion = c("Not excluded", "Not excluded", "Not excluded", "Not excluded", "Not excluded", "Not excluded", "Only top variant retained", "Excluded (as recommended)"), + LDReference = c("Population-matched 1KG+HGDP", "Population-matched 1KG+HGDP", "EUR UKB (LDpred2-provided)", "EUR UKB (LDpred2-provided)", "Population-matched 1KG+HGDP", "Population-matched UKB (PRS-CS provided)", "Population-matched 1KG+HGDP", "EUR UKB (GCTB-provided)"), + CPUUsage = c("40 seconds", "10 seconds", "NA but it is quick", "3 minutes", "5 minutes", "35 minutes", "5 seconds", "3 minutes"), + MemoryUsage = c("450 Mb", "400 Mb", "NA but ~500Mb", "500 Mb", "450Mb", "350 Mb", "100 Mb", "500 Mb") ) names(methods_table) <- c('Method','Software','PubMed ID', 'Pseudovalidation Option','Parameters','MHC Region','LD Reference','CPU usage*','Memory Usage*') @@ -165,6 +170,12 @@ lassosum, an R package, applies the LASSO method for constructing PGS, is implem *** +#### lassosum2 + +lassosum2 is an updated version of the original lassosum method and is implemented in the bigsnpr R package. It performs penalised regression on GWAS summary statistics using an elastic net–like approach, balancing L1 and L2 penalties through a grid of hyperparameters: the shrinkage parameter lambda and the sparsity parameter delta. Lassosum2 can be run efficiently on sparse LD matrices derived from reference data, and is particularly suited for traits where variable selection may be beneficial. In GenoPred, lassosum2 is run using the script [pgs_methods/lassosum2.R](https://github.com/opain/GenoPred/blob/master/Scripts/pgs_methods/lassosum2.R). GenoPred uses bigsnpr v1.12.2 with the default grid of lambda and delta values, and performs GWAS quality control as recommended by the authors. LD matrices used by lassosum2 are precomputed from European individuals in the UK Biobank and applied only to GWAS based on a EUR population. Alternative LD reference panels can be specified for other populations ([see here](pipeline_readme.html#specifying-alternative-reference-data-for-pgs-methods)). + +*** + #### LDpred2 LDpred2, part of the bigsnpr R package, operates in 'inf', 'grid', and 'auto' modes. The 'inf' mode is better suited for highly polygenic traits, whereas 'grid' and 'auto' adjust effect sizes using various hyperparameters, including SNP heritability. In GenoPred, LDpred2 is run using the script [pgs_methods/ldpred2](https://github.com/opain/GenoPred/blob/master/Scripts/pgs_methods/ldpred2.R).R. GenoPred uses bigsnpr v1.12.2, with the default LDpred2 grid search, and recommended GWAS quality control checks. GenoPred employs LDpred2's precomputed LD matrices based on the European individuals from the UK Biobank, and it is applied only to GWAS based on a EUR population. The user can specify alternative LD reference data to include additional populations ([see here](pipeline_readme.html#specifying-alternative-reference-data-for-pgs-methods)). If the SNP-h2 estimated using LDSC is <0.05, the SNP-heritability used by LDpred2 is set to 0.05. diff --git a/docs/pipeline_technical.html b/docs/pipeline_technical.html index 49ef0bb9..f678bc41 100644 --- a/docs/pipeline_technical.html +++ b/docs/pipeline_technical.html @@ -80,6 +80,41 @@ gtag('config', 'G-YR18ZB3PR3'); + + + +