seqExtract/seq_extract.py at master · sym24/seqExtract · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python

import os
import sys
from pybedtools import BedTool
import pysam
import subprocess
import argparse


def get_gene_name(target_file):
    """Generate set contains gene name without duplication."""
    with open(target_file, 'r') as fi:
        gene_set = set(line.rstrip() for line in fi)
    print 'Total number of genes: ', len(gene_set)
    return (gene_set, len(gene_set))

def get_bounds(gene_set, buffer, gtf_file):
    """Go through gtf file extract the longest sequence for genes in gene_set"""
    print '...Extract gene boundaries...'
    bounds = {}
    for feature in BedTool(gtf_file):
        if feature[2] == 'exon' and feature.attrs.has_key('gene_id'):
             gene = feature.attrs['gene_id']
             if gene in gene_set:
                 startp = int(feature.start) + 1 - buffer
                 endp = int(feature.stop) + buffer
                 try:
                     # update the starting and end point
                     if startp < bounds[gene][1]:
                         bounds[gene][1] = startp
                     if endp > bounds[gene][2]:
                         bounds[gene][2] = endp
                 except KeyError:
                     # add new key
                     bounds[gene] = [feature[0], startp, endp]

    find_num = 0
    miss_genes = []
    for gene in gene_set:
        if gene in bounds:
            find_num += 1
        else:
            miss_genes.append(gene)
    print 'Number of genes successfully captured: %d'%(find_num)
    if miss_genes:
        print 'Number of genes failed to captured: %d'%len(miss_genes)
        print 'Missing genes: %s'%', '.join(miss_genes)
    return bounds

def extract_seq(target_fa, ref_file, bounds):
    """Extract genes from reference genome and write it to output file."""
    print '...Extract gene sequence...'
    with open(target_fa, 'a') as fi:
        ref_fasta = pysam.Fastafile(ref_file)
        for gene, coord in bounds.iteritems():
            seq = ref_fasta.fetch(coord[0], coord[1], coord[2])
            fi.write("%s%s %s:%d-%d\n%s\n" % ('>',gene,coord[0],coord[1],coord[2],seq))

def makeBloomFilter(input_file, outdir, prefix):
    """Make bloom filter out of targeted genes"""
    target_bf = os.path.join(outdir, prefix+'.bf')
    try:
        print '>>>Run samtools'
        subprocess.check_output(['samtools', 'faidx', input_file])
        cmd_pass = ['biobloommaker', '-p', prefix, '-o', outdir, input_file]
        print '>>>Run biobloommaker'
        subprocess.check_output(cmd_pass)
    except subprocess.CalledProcessError as e:
        sys.exit(e)
    return target_bf

def main():
    parser = argparse.ArgumentParser(description='Extract target genes and '
                                     'generate bloom filter')
    # Required arguments'
    parser.add_argument('-g', '--genes', dest='genes', metavar='file',
                        type=argparse.FileType('r'), required=True,
                        help='Absolute path of text file containing '
                        'target gene names. [Required]')
    parser.add_argument('-t', '--gtf', dest='gtf', metavar='gtf_file',
                        type=file, required=True,
                        help='Absolute path of gtf file.[Required]')
    parser.add_argument('-r', '--ref', dest='ref', metavar='ref_genome',
                        type=file, required=True,
                        help='Absolute path of the reference sequence file. [Requried]')

    # Optional variables
    parser.add_argument('-o', '--outdir', dest='outdir', metavar='STR', type=str,
                        help='Output file path. Default: [%(default)s]',
                        default=os.getcwd())
    parser.add_argument('-p', '--prefix', dest='prefix', metavar='STR', type=str,
                        help='Output file prefix. Default: [%(default)s]',
                        default='target_genes')
    parser.add_argument('-b', '--buffer', dest='buffer', metavar='INT', type=int,
                        help='Size of extended buffer on both end of '
                        'extracted sequence. Defaule: [%(default)s]', default='0')
    parser.add_argument('-f', '--force', dest='overwrite', action='store_true',
                        help='Overwrite existing output files')

    args = parser.parse_args()

    # Make output file name
    target_fa = os.path.join(args.outdir, args.prefix+'.fa')

    # Make output directory if not exist
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    elif os.path.isfile(target_fa):
        if args.overwrite:
            print 'File {} exists, will overwrite'.format(target_fa)
        else:
            sys.exit('File {} exist\nPlease use "-f" to overwrite or '
                     'change outdir or prefix'.format(target_fa))
        # end if
    # end if
    print 'Genome file: ', os.path.abspath(args.ref.name)
    print 'Gene list: ', os.path.abspath(args.genes.name)
    print 'GTF file: ', os.path.abspath(args.gtf.name)
    print 'Output FASTA file: ', target_fa
    # Extract gene name from input file
    gene_set, gene_num = get_gene_name(args.genes.name)
    # Get gene coordinates from GTF file
    bounds = get_bounds(gene_set, args.buffer, args.gtf.name)
    # Get gene sequences from reference sequence file
    extract_seq(target_fa, args.ref.name, bounds)
    # Generate bloom filter
    target_bf = makeBloomFilter(target_fa, args.outdir, args.prefix)

if __name__ == '__main__':
    main()