-
Codes for installation, step1, step2, drawing plots are provided in the SAIGE directory.
-
First, install SAIGE with devtools library in R. You may choose either a or b method.
- SAIGE is now available in conda environment
conda create -n saige -c conda-forge -c bioconda "r-base>=4.0" r-saige
conda activate saige
a) Install dependencies on R with devtools library
R code
install.packages('devtools')
install.packages('SKAT')
library(devtools)
devtools::install_github("leeshawn/MetaSKAT")
library(MetaSKAT); library(SKAT)
devtools::install_github("weizhouUMICH/SAIGE")
#try pip cget or cmake3 if not working
library(SAIGE)
b) Installation by conda virtual environment on linux shell
Required yml file can be found in https://github.com/weizhouUMICH/SAIGE or in /BiO/kogo/data
code on bash shell
conda env create -f environment-RSAIGE.yml
conda activate RSAIGE
FLAGPATH=`which python | sed 's|/bin/python$||'`
export LDFLAGS="-L${FLAGPATH}/lib"
export CPPFLAGS="-I${FLAGPATH}/include"
pip install savvy
src_branch=master
repo_src_url=https://github.com/weizhouUMICH/SAIGE
git clone --depth 1 -b $src_branch $repo_src_url
R CMD INSTALL --library=path_to_final_SAIGE_library SAIGE
path is ./SAIGE When calling library in R, use 'library(SAIGE, lib.loc=path_to_final_SAIGE_library)'
-
Please copy all the example files in Example files in /BiO/kogo/data directory and the plink2 program in /BiO/kogo/apps/ to your directory. (cf . cp -r /BiO/kogo/data/sglee/ ./ , cp /BiO/kogo/apps/plink2 ./)
-
vcf.gz file and tbi files are needed in order to make single variant association test. If you want to make association tests on all of the variants in the plink files, use the following codes
./plink2 --bfile saige_example --recode vcf id-paste=iid --out practice
bgzip practice.vcf
tabix -p vcf practice.vcf.gz # create tbi index file
tabix -C practice.vcf.gz # create csi index file for step2 in SAIGE-GENE
or if you want to convert vcf files to plink binary files,
./plink2 --vcf saige_example.vcf.gz --make-bed --out result
Installation of bgzip and tabix can be done with command 'conda install -c bioconda tabix' (conda environment) or 'apt-get install tabix' or follow the instructions in http://www.htslib.org/download/ (Download htslib)
- Running SAIGE - fitting null GLMM nohup command makes the process run in the background so that you can work in other processes.
nohup Rscript step1_fitNULLGLMM.R
--plinkFile=saige_example
--phenoFile=saige_pheno.txt
--phenoCol=y_binary
--covarColList=x1,x2
--sampleIDColinphenoFile=IID
--traitType=binary
--outputPrefix=./step1_result --nThreads=4
--LOCO=FALSE --IsOverwriteVarianceRatioFile=TRUE &
- Running SAIGE- Step2 Single variant association test
nohup Rscript step2_SPAtests.R
--vcfFile=practice.vcf.gz
--vcfFileIndex=practice.vcf.gz.tbi
--vcfField=GT
--chrom=1 --minMAF=0.0001 --minMAC=1
--sampleFile=sampleIDindosage.txt
--GMMATmodelFile=step1_result.rda
--varianceRatioFile=step1_result.varianceRatio.txt
--SAIGEOutputFile=finalresult.txt --numLinesOutput=2
--IsOutputAFinCaseCtrl=TRUE --LOCO=FALSE &
- To make Manhattan plot and QQ plot, please use the plots.R code. Rscript plots.R More detailed information in https://github.com/weizhouUMICH/SAIGE/wiki/Genetic-association-tests-using-SAIGE
For the binary phenotype, use SKATBinary.SSD.All function
library(SKAT)
File.Bed<-'./Example1.bed'
File.Bim<-'./Example1.bim'
File.Fam<-'./Example1.fam'
File.SetID<-'./Example1.SetID'
#Calling covariate file (Order of the sample ID must be same with the fam file)
#When there is no covariate file, use FAM<-Read_Plink_FAM(File.Fam,Is.binary = F)
#and object obj<-SKAT_Null_Model(y~X1+X2, out_type = 'C')
File.Cov<-'./Example1.Cov'
FAM_Cov<-Read_Plink_FAM_Cov(File.Fam,File.Cov,Is.binary = F)
#Object file for Null model
X1<-FAM_Cov$X1
X2<-FAM_Cov$X2
y<-FAM_Cov$Phenotype
obj<-SKAT_Null_Model(y~X1+X2, out_type = 'C')
# If the phenotype is binary one, out_type='D'
#When there is no covariate file, use FAM<-Read_Plink_FAM(File.Fam,Is.binary = F)
#and object obj<-SKAT_Null_Model(y~1, out_type = 'C')
#Please set file location and name for SSD file and SSD.info file
File.SSD<-'./Example1.SSD'
File.Info<-'./Example1.SSD.info'
#Generate and open SSD file for analysis
Generate_SSD_SetID(File.Bed,File.Bim,File.Fam,File.SetID,File.SSD,File.Info )
SSD.INFO<-Open_SSD(File.SSD,File.Info)
SSD.INFO$nSample
SSD.INFO$nSets
#Analysis
out<-SKAT.SSD.All(SSD.INFO,obj,method='SKATO')
#close SSD file
Close_SSD()
Step 0 : Creating Sparse GRM (Genomic Relationship Matrix)
createSparseGRM.R --plinkFile=saige_gene_example \
--nThreads=4 \
--outputPrefix=step0_result \
--numRandomMarkerforSparseKin=2000 \
--relatednessCutoff=0.125
Step 1 : Fitting Null model
step1_fitNULLGLMM.R --plinkFile=saige_gene_example \
--phenoFile=saige_gene_pheno.txt \
--phenoCol=y_quantitative \
--covarColList=x1,x2 \
--sampleIDColinphenoFile=IID \
--traitType=quantitative \
--invNormalize=TRUE \
--outputPrefix=step1_result \
--outputPrefix_varRatio=step1_result_ratio \
--sparseGRMFile=step0_result_relatednessCutoff_0.125_2000_randomMarkersUsed.sparseGRM.mtx \
--sparseGRMSampleIDFile=step0_result_relatednessCutoff_0.125_2000_randomMarkersUsed.sparseGRM.mtx.sampleIDs.txt \
--nThreads=4 \
--LOCO=FALSE \
--skipModelFitting=FALSE \
--IsSparseKin=TRUE \
--isCateVarianceRatio=TRUE
Step 2 : Gene based test
step2_SPAtests.R --vcfFile=genotype_10markers.vcf.gz \
--vcfFileIndex=genotype_10markers.vcf.gz.csi \
--chrom=1 \
--vcfField=GT \
--minMAF=0 \
--minMAC=0.5 \
--maxMAFforGroupTest=0.01 \
--sampleFile=samplelist.txt \
--GMMATmodelFile=step1_result.rda \
--varianceRatioFile=step1_result_ratio.varianceRatio.txt \
--SAIGEOutputFile=step2_result_gene.txt \
--numLinesOutput=1 \
--groupFile=groupFile_geneBasedtest_simulation.txt \
--sparseSigmaFile=step1_result_ratio.varianceRatio.txt_relatednessCutoff_0.125_2000_randomMarkersUsed.sparseSigma.mtx \
--IsSingleVarinGroupTest=TRUE \
--method_to_CollapseUltraRare =absence_or_presence \
--MACCutoff_to_CollapseUltraRare =10
--LOCO=FALSE