-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathExtractSampleCountData.py
More file actions
executable file
·94 lines (64 loc) · 2.33 KB
/
ExtractSampleCountData.py
File metadata and controls
executable file
·94 lines (64 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python
# This tool will go through an OBITools fasta file and extract the
# number of sequences for each sample and output these in a table.
# Usage: ExtractSampleCountData.py [input file] [sample list] > [output table]
# Contact: youri.lammers@gmail.com
# version: 1.1.4
# load a bunch of modules
import sys, json, os, itertools
from collections import defaultdict
def read_samplefile():
# Create an empty dictionary for sample sequence info
sampleDict = defaultdict(int)
# parse through the sample file
for line in open(sys.argv[2]):
# split the line and skip if it is the header
line = line.strip().split('\t')
if line[0][0] == "#": continue
# set the repeat / sample to 0
sampleDict[line[1]] = 0
# return the dictionary
return sampleDict
def read_fasta(sampleDict):
# Parse through a fasta file and extract the sample
# and sequence count information and store these in
# a dictionay
# open the sequence library
seq_file = open(sys.argv[1])
# parse through the fasta file and obtain the sequences
seq_groups = (x[1] for x in itertools.groupby(seq_file,
key=lambda line: line[0] == '>'))
# for each header in the sequence data
for header in seq_groups:
# get the fasta header and parse out the
# sample name and count information
header = next(header).strip()
descrip = header.split("merged_sample=")[1]
descrip = descrip.split(";")[0]
samples = json.loads(descrip.replace("\'","\""))
# get the fasta sequence
sequence = ''.join(seq_line.strip() for
seq_line in next(seq_groups))
# try to add the sample information for the
# sequence to the sequence dictionary
# if it fails, add a new entry
for sample in samples:
sample = str(sample)
sampleDict[sample] += samples[sample]
# close the sequence file
seq_file.close()
# return the sequence dictionary
return sampleDict
def output_sample_data(sampleDict):
# This function will format and output the sample data
# print the table header
print("Sample name\tRead count")
# get a list of the dictionary keys and sort it
samples = list(sampleDict.keys())
samples.sort()
# Parse through the dictionary in an alphabetic order and
# output the sample and read counts
for sample in samples:
print("{0}\t{1}".format(sample, sampleDict[sample]))
sampleDict = read_fasta(read_samplefile())
output_sample_data(sampleDict)