-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathseqStats.py
More file actions
48 lines (40 loc) · 1.37 KB
/
seqStats.py
File metadata and controls
48 lines (40 loc) · 1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#! /usr/bin/env python
# read fasta sequences and return statistiques
import sys
f = open(sys.argv[1])
def read_fasta(fp):
name, seq = None, []
for line in fp:
line = line.rstrip()
if line.startswith(">"):
if name: yield (name, ''.join(seq))
name, seq = line, []
else:
seq.append(line)
if name: yield (name, ''.join(seq))
nameFile = str(sys.argv[1])
print("File\tname\tsize\tGC\tgap_data\tN_sites\ttot_missing")
GC=0
gap_data=0
N_sites=0
tot_missing=0
with f as fp:
for name, seq in read_fasta(fp):
seq = seq.upper()
nseq = seq.replace("-","")
mseq = seq.replace("N","")
mseq = mseq.replace("?","")
nomissing_seq = nseq.replace("N", "")
nomissing_seq = nomissing_seq.replace("?", "")
if len(nomissing_seq) > 0:
GC = (nseq.count('G') + nseq.count('C')) / float(len(nomissing_seq))
gap_data = 1.0- ( float(len(nseq)) /float(len(seq)))
N_sites = 1.0- ( float(len(mseq)) /float(len(seq)))
tot_missing = gap_data+N_sites
else:
GC = "NA"
gap_data = 1.0
N_sites = 1.0
tot_missing = 1.0
name=name.lstrip(">")
print(nameFile+"\t"+name+"\t"+ str(len(seq)) + "\t" + str(GC) + "\t" + str(gap_data) + "\t" + str(N_sites) + "\t" + str(tot_missing))