module load bcftools
cd /u/project/gandalm/mmkim121/PsychENCODE/PsychENCODE-genotype/
# filter for unique European individuals with frontal cortex RNA-seq data (N = 860; 5,312,508 SNPs)
bcftools view -Oz -S samples-European.unique.frontal.txt /u/project/gandalm/shared/GenomicDatasets/PsychENCODE/Genotypes/Capstone4.all_dbSNP_1864.vcf.gz > Capstone4.HRC.European.unique.frontal.vcf.gz
# remove 'chr' from chromosome names consistent with hg19 conventions
bcftools annotate --rename-chrs chromosome_names.txt Capstone4.HRC.European.unique.frontal.vcf.gz > Capstone4.HRC.European.unique.frontal.nochr.vcf
bgzip Capstone4.HRC.European.unique.frontal.nochr.vcf
bcftools query -f '%CHROM ' Capstone4.HRC.European.unique.frontal.nochr.vcf.gz # sanity check
# extract HapMap3 SNPs (130,057 SNPs)
bcftools filter -i 'ID=@Capstone4.hm3.snplist' Capstone4.HRC.European.unique.frontal.nochr.vcf.gz > Capstone4.hm3.European.unique.frontal.nochr.vcf
bgzip Capstone4.hm3.European.unique.frontal.nochr.vcf
# overlap with 1000G SNPs (5,277,467 SNPs)
awk 'NR==FNR{A[$1,$4]; next} ($1,$2) in A' /u/project/gandalm/shared/refGenomes/1000genomes/chrs/kgp.bim Capstone4.HRC.moreinfo.snplist > Capstone4.1kg.snplist
# filter for unique, unrelated (GRM < 0.05) European individuals with frontal cortex RNA-seq data (N = 855)
bcftools view -Oz -S samples-European.unique.frontal.unrelated.txt Capstone4.HRC.European.unique.frontal.nochr.vcf.gz >
Capstone4.HRC.European.unique.frontal.nochr.unrelated.vcf.gz
bcftools view -Oz -S samples-European.unique.frontal.unrelated.txt Capstone4.hm3.European.unique.frontal.nochr.vcf.gz >
Capstone4.hm3.European.unique.frontal.nochr.unrelated.vcf.gz
# convert to plink format
module load plink
plink --vcf /u/project/gandalm/shared/GenomicDatasets/PsychENCODE/Genotypes/Capstone4.all_dbSNP_1864.vcf.gz --make-bed --recode --out plink/Capstone4.all_dbSNP_1864 --const-fid
# --mac is not supported by 1.9
# /u/project/gandalm/shared/apps/plink2 --vcf ${input} --mac 1 --make-bed --out plink/Capstone4.all_dbSNP_1864 --const-fid
plink --bfile plink/Capstone4.all_dbSNP_1864 --maf 0.01 --geno 0.05 --mind 0.05 --hwe 1e-6 --make-bed --out plink/Capstone4.all_dbSNP_1864.filter
# check for multi-allelic variants
awk '!seen[$1,$4]++' plink/Capstone4.all_dbSNP_1864.filter.bim | wc -l