Skip to content

PsychENCODE .vcf file processing #1

@mmkim1210

Description

@mmkim1210
module load bcftools
cd /u/project/gandalm/mmkim121/PsychENCODE/PsychENCODE-genotype/

# filter for unique European individuals with frontal cortex RNA-seq data (N = 860; 5,312,508 SNPs)

bcftools view -Oz -S samples-European.unique.frontal.txt /u/project/gandalm/shared/GenomicDatasets/PsychENCODE/Genotypes/Capstone4.all_dbSNP_1864.vcf.gz > Capstone4.HRC.European.unique.frontal.vcf.gz

# remove 'chr' from chromosome names consistent with hg19 conventions

bcftools annotate --rename-chrs chromosome_names.txt Capstone4.HRC.European.unique.frontal.vcf.gz > Capstone4.HRC.European.unique.frontal.nochr.vcf

bgzip Capstone4.HRC.European.unique.frontal.nochr.vcf

bcftools query -f '%CHROM ' Capstone4.HRC.European.unique.frontal.nochr.vcf.gz  # sanity check

# extract HapMap3 SNPs (130,057 SNPs)

bcftools filter -i 'ID=@Capstone4.hm3.snplist' Capstone4.HRC.European.unique.frontal.nochr.vcf.gz > Capstone4.hm3.European.unique.frontal.nochr.vcf

bgzip Capstone4.hm3.European.unique.frontal.nochr.vcf

# overlap with 1000G SNPs (5,277,467 SNPs)

awk 'NR==FNR{A[$1,$4]; next} ($1,$2) in A' /u/project/gandalm/shared/refGenomes/1000genomes/chrs/kgp.bim Capstone4.HRC.moreinfo.snplist  > Capstone4.1kg.snplist

# filter for unique, unrelated (GRM < 0.05) European individuals with frontal cortex RNA-seq data (N = 855)

bcftools view -Oz -S samples-European.unique.frontal.unrelated.txt  Capstone4.HRC.European.unique.frontal.nochr.vcf.gz > 
Capstone4.HRC.European.unique.frontal.nochr.unrelated.vcf.gz

bcftools view -Oz -S samples-European.unique.frontal.unrelated.txt  Capstone4.hm3.European.unique.frontal.nochr.vcf.gz > 
Capstone4.hm3.European.unique.frontal.nochr.unrelated.vcf.gz

# convert to plink format

module load plink

plink --vcf /u/project/gandalm/shared/GenomicDatasets/PsychENCODE/Genotypes/Capstone4.all_dbSNP_1864.vcf.gz --make-bed --recode --out plink/Capstone4.all_dbSNP_1864 --const-fid 

# --mac is not supported by 1.9
# /u/project/gandalm/shared/apps/plink2 --vcf ${input} --mac 1 --make-bed --out plink/Capstone4.all_dbSNP_1864 --const-fid

plink  --bfile plink/Capstone4.all_dbSNP_1864 --maf 0.01 --geno 0.05 --mind 0.05 --hwe 1e-6 --make-bed --out plink/Capstone4.all_dbSNP_1864.filter

# check for multi-allelic variants
awk '!seen[$1,$4]++' plink/Capstone4.all_dbSNP_1864.filter.bim | wc -l

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions