MergeAndFilter/ExtractSampleCountData.py at master · Y-Lammers/MergeAndFilter · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python

# This tool will go through an OBITools fasta file and extract the
# number of sequences for each sample and output these in a table.

# Usage: ExtractSampleCountData.py [input file] [sample list] > [output table]

# Contact: youri.lammers@gmail.com
# version: 1.1.4

# load a bunch of modules
import sys, json, os, itertools
from collections import defaultdict

def read_samplefile():

	# Create an empty dictionary for sample sequence info
	sampleDict = defaultdict(int)

	# parse through the sample file
	for line in open(sys.argv[2]):

		# split the line and skip if it is the header
		line = line.strip().split('\t')
		if line[0][0] == "#": continue

		# set the repeat / sample to 0
		sampleDict[line[1]] = 0

	# return the dictionary
	return sampleDict


def read_fasta(sampleDict):

	# Parse through a fasta file and extract the sample
	# and sequence count information and store these in
	# a dictionay

	# open the sequence library
	seq_file = open(sys.argv[1])

	# parse through the fasta file and obtain the sequences
	seq_groups = (x[1] for x in itertools.groupby(seq_file,
			key=lambda line: line[0] == '>'))

	# for each header in the sequence data
	for header in seq_groups:

		# get the fasta header and parse out the
		# sample name and count information
		header = next(header).strip()
		descrip = header.split("merged_sample=")[1]
		descrip = descrip.split(";")[0]
		samples = json.loads(descrip.replace("\'","\""))

		# get the fasta sequence
		sequence = ''.join(seq_line.strip() for
			seq_line in next(seq_groups))

		# try to add the sample information for the
		# sequence to the sequence dictionary
		# if it fails, add a new entry
		for sample in samples:

			sample = str(sample)
			sampleDict[sample] += samples[sample]

	# close the sequence file
	seq_file.close()

	# return the sequence dictionary
	return sampleDict


def output_sample_data(sampleDict):

	# This function will format and output the sample data

	# print the table header
	print("Sample name\tRead count")

	# get a list of the dictionary keys and sort it
	samples = list(sampleDict.keys())
	samples.sort()

	# Parse through the dictionary in an alphabetic order and
	# output the sample and read counts
	for sample in samples:
		print("{0}\t{1}".format(sample, sampleDict[sample]))


sampleDict = read_fasta(read_samplefile())
output_sample_data(sampleDict)