From f41f1198f48d0b67b75934c5c84eee1998839fec Mon Sep 17 00:00:00 2001 From: averagehat Date: Tue, 10 Nov 2015 16:36:34 -0500 Subject: [PATCH 1/7] alignment reconstruction from reference --- bio_pieces/cigar.py | 69 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 bio_pieces/cigar.py diff --git a/bio_pieces/cigar.py b/bio_pieces/cigar.py new file mode 100644 index 0000000..1721e9a --- /dev/null +++ b/bio_pieces/cigar.py @@ -0,0 +1,69 @@ +from functools import partial +from itertools import dropwhile, takewhile, starmap +from toolz.itertoolz import accumulate, first, last +from toolz.functoolz import complement, compose +import sh +import re + +plus_n = lambda x, n : x + n +zero = lambda x, y : x +# changing plus_n to alter strings won't quite work for the string transformations because +# adding gaps is dependent on there being a difference between ref and pos. +dx_dy={ + 'M' : (plus_n, plus_n), 'X' : (plus_n, plus_n), '=' : (plus_n, plus_n), + 'D' : (plus_n, zero), 'plus_n' : (plus_n, zero), + 'I' : (zero, plus_n), 'S' : (zero, plus_n), + 'H' : (zero, zero) +} + +def transform(seqs, positions): + (ref, query), ((rpos, qpos), (a, b)) = seqs, positions + def fill(s, pos): return s[:pos] + '-'*abs(a-b) + s[pos:] + if a < b: ref = fill(ref, rpos) + elif a > b: query = fill(query, qpos) + return ref, query + +def next_cigar_position(positions, cigarval): + (count, sym), (refpos, qpos) = cigarval, positions + f = dx_dy[sym] + return f[0](refpos, count), f[1](qpos, count) +cig_diffs = partial(map, partial(next_cigar_position, (0, 0))) + +def undo_cigar(x, y, z): + cig = get_cig(z) #return reduce(transform, zip(gen_cigar(z), cig_diffs(z)), (x, y)) + return reduce(transform, zip(_gen_cigar(cig), cig_diffs(cig)), (x, y)) + +#in order to *totally* recreate the reads, you need to account for the other weird tag + + +split_cig = re.compile(r'(?:([0-9]+)([A-Z]))').findall +get_cig = lambda C: map(lambda x: (int(x[0]), x[1]), split_cig(C)) +_gen_cigar = lambda c: accumulate(next_cigar_position, c, (0, 0)) +gen_cigar = compose(_gen_cigar, get_cig) + + +def intersection(pred, seq): return last(takewhile(complement(pred), seq)) +def firstwhere(pred, seq): return first(dropwhile(complement(pred), seq)) +#reduce(next_cigar_position, cig_elements, (0,0)) + +# read = lambda x: vim.current.buffer.append(str(x)) + +def get_insert_quals(ref, pos, bases): + '''separate i/o and destructuring from logic. + make generic for indel/snp.''' + view = sh.samtools('view', "{ref}:{pos}-{pos}") + #split tabs, etc. + has_insert = lambda l: 'I' in l[5] + # etc. + #samtools calcmd? + +assert undo_cigar('AAGC', 'AGTT', '1M1D1M1X1I') == ('AAGC-', 'A-GTT') +assert list(starmap(undo_cigar, [ +["AGG", "AG" ,"1M1D1M"], +["AGG", "AG" ,"1M1D1M"], +["TT" , "TAT" , "1M1I1M"], +["TA" , "TATAA", "1M3I1M"], +['AAGC', 'AGTT', '1M1D1M1X1I']])) == [('AGG', 'A-G'), ('AGG', 'A-G'), ('T-T', 'TAT'), ('T---A', 'TATAA'), ('AAGC-', 'A-GTT')] +assert list(gen_cigar("1M2D1M")) == [(0, 0), (1, 1), (3, 1), (4, 2)] +assert intersection(lambda x: x[0] > 2, gen_cigar("1M2D1M")) == (1, 1) +assert firstwhere(lambda x: x[0] >= 2, gen_cigar("1M1D1M")) == (2, 1) From e627f4e148133998c0517ad563d6de976b61163f Mon Sep 17 00:00:00 2001 From: averagehat Date: Thu, 12 Nov 2015 15:09:34 -0500 Subject: [PATCH 2/7] fixed not-equal option in vcfcat --- bio_pieces/vcfcat_main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bio_pieces/vcfcat_main.py b/bio_pieces/vcfcat_main.py index 25aec3a..9ac71e8 100644 --- a/bio_pieces/vcfcat_main.py +++ b/bio_pieces/vcfcat_main.py @@ -2,7 +2,7 @@ Command-line utility for querying VCF files. By default, outputs a full vcf file matching the query. Usage: - vcfcat filter ( --tag= (--ge | --le | --gt | --lt | --eq | --neq) ) [-c] + vcfcat filter ( --tag= (--ge | --le | --gt | --lt | --eq | --ne) ) [-c] vcfcat exists (--tag= ) [-c] vcfcat ambiguous [-c] vcfcat vcall [--csv | -c ] @@ -19,7 +19,7 @@ --lt Get records Less Thaan --leq Get records Less than or equal to --eq Get records Exactly Equal to - --neq Get records Not Equal to + --ne Get records Not Equal to Arguments: filter: print only vcf records matching the filter as a new vcf file. @@ -42,7 +42,7 @@ import sys import vcf #TODO: Find a better way to dispatch commandline apps -ops = ['--ge', '--le', '--gt' , '--lt' , '--eq' , '--neq'] +ops = ['--ge', '--le', '--gt' , '--lt' , '--eq' , '--ne'] def validate_value(val): if val is None or not val.isdigit(): return val From 4672e12363adbf93d13bc959260c8c6a28231614 Mon Sep 17 00:00:00 2001 From: averagehat Date: Thu, 12 Nov 2015 17:09:27 -0500 Subject: [PATCH 3/7] more cigar functionality --- bio_pieces/cigar.py | 51 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/bio_pieces/cigar.py b/bio_pieces/cigar.py index 1721e9a..a16b4fc 100644 --- a/bio_pieces/cigar.py +++ b/bio_pieces/cigar.py @@ -41,14 +41,19 @@ def undo_cigar(x, y, z): _gen_cigar = lambda c: accumulate(next_cigar_position, c, (0, 0)) gen_cigar = compose(_gen_cigar, get_cig) - -def intersection(pred, seq): return last(takewhile(complement(pred), seq)) -def firstwhere(pred, seq): return first(dropwhile(complement(pred), seq)) +#from operator import xor +#def ilen(seq): sum(1 for _ in seq) +#indel_pos = compose(ilen, partial(takewhile, '-'.__ne__)) +#def first_indel_pos(ref, query, cigar_string): +# ralign, qalign = undo_cigar(ref, query, cigar_string) +# ins, _del = indel_pos(ralign), indel_pos(qalign) +# assert xor(ins, _del), "Indel contains insertion and deletion" +# return ins if ins else _del #reduce(next_cigar_position, cig_elements, (0,0)) - # read = lambda x: vim.current.buffer.append(str(x)) def get_insert_quals(ref, pos, bases): + '''separate i/o and destructuring from logic. make generic for indel/snp.''' view = sh.samtools('view', "{ref}:{pos}-{pos}") @@ -65,5 +70,43 @@ def get_insert_quals(ref, pos, bases): ["TA" , "TATAA", "1M3I1M"], ['AAGC', 'AGTT', '1M1D1M1X1I']])) == [('AGG', 'A-G'), ('AGG', 'A-G'), ('T-T', 'TAT'), ('T---A', 'TATAA'), ('AAGC-', 'A-GTT')] assert list(gen_cigar("1M2D1M")) == [(0, 0), (1, 1), (3, 1), (4, 2)] + + +def intersection(pred, seq): return last(takewhile(complement(pred), seq)) +def firstwhere(pred, seq): return first(dropwhile(complement(pred), seq)) + assert intersection(lambda x: x[0] > 2, gen_cigar("1M2D1M")) == (1, 1) assert firstwhere(lambda x: x[0] >= 2, gen_cigar("1M1D1M")) == (2, 1) + +#firstwhere(lambda x: x[1] == 'D', gen_cigar("1M1D1M")) +#firstwhere(lambda x: x[1] == 'D', get_cig("1M1D1M")) +c = "1M1D1M" +''' corresponds to ('AAGC-', 'A-GTT')''' +assert reduce(next_cigar_position, \ + takewhile(lambda x: x[1] != 'I', get_cig('1M1D1M1X1I')), (0, 0)) == (4, 3) + +def mutpos(cig_str, types): + assert any(map(types.__contains__, cig_str)), "Cigar string %s does not have mutation of type %s" % (cig_str, types) + return reduce(next_cigar_position, takewhile(lambda x: x[1] not in types, get_cig(cig_str)), (0, 0)) + +indelpos = partial(mutpos, types='DI') +query_indel_pos = compose(last, indelpos) +ref_indel_pos = compose(first, indelpos) + +ref_del_pos = compose(first, partial(mutpos, types='D')) +ref_ins_pos = compose(first, partial(mutpos, types='I')) +ref_snp_pos = compose(first, partial(mutpos, types='X')) + +#wishful thinking about complex +{'ins' : 'I', 'del' : 'D', 'snp' 'X', 'complex' : 'X'} + +assert ref_ins_pos('1M1D1M1X1I') == 4 +#use assertRaisesRegex +#collapse separate python scripts into individual luigi tasks(?) but tornado etc. +try: + mutpos('1M1D1M', 'X') +except AssertionError: + pass + +# cigs = get_cig(CIGAR) +# indel_effect = sum(map(first, filter(compose('D'.__eq__, last), cigs))) - sum(map(first, filter(compose('I'.__eq__, last), cigs))) From 0594f96021f570543bcb0bc9c9c2bd700631b63c Mon Sep 17 00:00:00 2001 From: averagehat Date: Fri, 13 Nov 2015 10:40:19 -0500 Subject: [PATCH 4/7] removed test failed bc required double import --- tests/test_deprecation.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_deprecation.py b/tests/test_deprecation.py index fd0b0ec..25fd0cd 100644 --- a/tests/test_deprecation.py +++ b/tests/test_deprecation.py @@ -14,9 +14,3 @@ def test_can_import_bio_pieces(self, mock_serr): actual_msg = mock_serr.write.call_args[0][0] self.assertIn(dep_msg, actual_msg) - def test_can_import_sub_module(self, mock_serr): - from bio_pieces import version - r = version.get_release() - self.assertEqual(1.0, r) - actual_msg = mock_serr.write.call_args[0][0] - self.assertIn(dep_msg, actual_msg) From 82831729ec046a33df8feaf656901e6d6f18e010 Mon Sep 17 00:00:00 2001 From: averagehat Date: Wed, 25 Nov 2015 15:16:31 -0500 Subject: [PATCH 5/7] added support for haplotype grouping --- bio_pieces/cigar.py | 91 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 17 deletions(-) diff --git a/bio_pieces/cigar.py b/bio_pieces/cigar.py index a16b4fc..c146c0b 100644 --- a/bio_pieces/cigar.py +++ b/bio_pieces/cigar.py @@ -1,10 +1,24 @@ from functools import partial -from itertools import dropwhile, takewhile, starmap -from toolz.itertoolz import accumulate, first, last +from itertools import dropwhile, takewhile, starmap, combinations, imap, ifilter +import itertools +from toolz.itertoolz import accumulate, first, last, drop, mapcat, take from toolz.functoolz import complement, compose +from operator import itemgetter +import fn import sh import re +#from operator import xor +#def ilen(seq): sum(1 for _ in seq) +#indel_pos = compose(ilen, partial(takewhile, '-'.__ne__)) +#def first_indel_pos(ref, query, cigar_string): +# ralign, qalign = undo_cigar(ref, query, cigar_string) +# ins, _del = indel_pos(ralign), indel_pos(qalign) +# assert xor(ins, _del), "Indel contains insertion and deletion" +# return ins if ins else _del +#reduce(next_cigar_position, cig_elements, (0,0)) +# read = lambda x: vim.current.buffer.append(str(x)) + plus_n = lambda x, n : x + n zero = lambda x, y : x # changing plus_n to alter strings won't quite work for the string transformations because @@ -23,9 +37,11 @@ def fill(s, pos): return s[:pos] + '-'*abs(a-b) + s[pos:] elif a > b: query = fill(query, qpos) return ref, query -def next_cigar_position(positions, cigarval): - (count, sym), (refpos, qpos) = cigarval, positions +def next_cigar_position(positions, cigarval, withtype=False): + if withtype: (count, sym), (refpos, qpos, oldcount, oldsym) = cigarval, positions + else: (count, sym), (refpos, qpos) = cigarval, positions f = dx_dy[sym] + if withtype: return f[0](refpos, count), f[1](qpos, count), count, sym return f[0](refpos, count), f[1](qpos, count) cig_diffs = partial(map, partial(next_cigar_position, (0, 0))) @@ -41,16 +57,6 @@ def undo_cigar(x, y, z): _gen_cigar = lambda c: accumulate(next_cigar_position, c, (0, 0)) gen_cigar = compose(_gen_cigar, get_cig) -#from operator import xor -#def ilen(seq): sum(1 for _ in seq) -#indel_pos = compose(ilen, partial(takewhile, '-'.__ne__)) -#def first_indel_pos(ref, query, cigar_string): -# ralign, qalign = undo_cigar(ref, query, cigar_string) -# ins, _del = indel_pos(ralign), indel_pos(qalign) -# assert xor(ins, _del), "Indel contains insertion and deletion" -# return ins if ins else _del -#reduce(next_cigar_position, cig_elements, (0,0)) -# read = lambda x: vim.current.buffer.append(str(x)) def get_insert_quals(ref, pos, bases): @@ -61,7 +67,7 @@ def get_insert_quals(ref, pos, bases): has_insert = lambda l: 'I' in l[5] # etc. #samtools calcmd? - +# ref first assert undo_cigar('AAGC', 'AGTT', '1M1D1M1X1I') == ('AAGC-', 'A-GTT') assert list(starmap(undo_cigar, [ ["AGG", "AG" ,"1M1D1M"], @@ -72,6 +78,7 @@ def get_insert_quals(ref, pos, bases): assert list(gen_cigar("1M2D1M")) == [(0, 0), (1, 1), (3, 1), (4, 2)] + def intersection(pred, seq): return last(takewhile(complement(pred), seq)) def firstwhere(pred, seq): return first(dropwhile(complement(pred), seq)) @@ -86,9 +93,12 @@ def firstwhere(pred, seq): return first(dropwhile(complement(pred), seq)) takewhile(lambda x: x[1] != 'I', get_cig('1M1D1M1X1I')), (0, 0)) == (4, 3) def mutpos(cig_str, types): + ''':return (refpos, mutpos''' assert any(map(types.__contains__, cig_str)), "Cigar string %s does not have mutation of type %s" % (cig_str, types) return reduce(next_cigar_position, takewhile(lambda x: x[1] not in types, get_cig(cig_str)), (0, 0)) +foo = lambda c, t: reduce(next_cigar_position, filter(lambda x: x[1] in t, get_cig(c)), (0, 0)) +(lambda c, t: accumulate(next_cigar_position, filter(lambda x: x[1] in t, get_cig(c)), (0, 0))) indelpos = partial(mutpos, types='DI') query_indel_pos = compose(last, indelpos) ref_indel_pos = compose(first, indelpos) @@ -97,12 +107,25 @@ def mutpos(cig_str, types): ref_ins_pos = compose(first, partial(mutpos, types='I')) ref_snp_pos = compose(first, partial(mutpos, types='X')) + +def mutations(cig_str): + return drop(1, accumulate(partial(next_cigar_position, withtype=True), get_cig(cig_str), (0, 0, 0, 0))) + +snps = compose(partial(filter, lambda x: x[-1] == 'X'), mutations) + +def snp_rpos_base(qstring, cig_str): + return compose(partial(mapcat, partial(rpos_base, qstring)), snps)(cig_str) + +def rpos_base(qstring, (rpos, qpos, count, type)): + ''':return refpos, snp''' + return ((qstring[(qpos-count)+i], rpos +i) for i in xrange(0, count)) + + #wishful thinking about complex -{'ins' : 'I', 'del' : 'D', 'snp' 'X', 'complex' : 'X'} +#{'ins' : 'I', 'del' : 'D', 'snp' 'X', 'complex' : 'X'} assert ref_ins_pos('1M1D1M1X1I') == 4 #use assertRaisesRegex -#collapse separate python scripts into individual luigi tasks(?) but tornado etc. try: mutpos('1M1D1M', 'X') except AssertionError: @@ -110,3 +133,37 @@ def mutpos(cig_str, types): # cigs = get_cig(CIGAR) # indel_effect = sum(map(first, filter(compose('D'.__eq__, last), cigs))) - sum(map(first, filter(compose('I'.__eq__, last), cigs))) + +def get_info(samrow): + #extract query_string and cigar_string from file + # have to add map positions to computed rpos's. + qname, mpos, cigar, qstring = itemgetter(0, 3, 5, 9)(samrow.split('\t')) + mpos = int(mpos) + _snps = snp_rpos_base(qstring, cigar) + return map(lambda (b,i):(b, i+mpos), _snps) + +# split by RG, then get_info, want to keep track of qname to prob. +#raw_longs, raw_shorts = fn.iters.partition( +lhn = ghn = 3 +#shareCount = compose(len, set.intersection) +shareCount = compose(len, lambda (x,y): x & y) +is_hap = lambda n: lambda x: shareCount(x) > n +# have to get cigar string with 'X' in it. +if __name__ == '__main__': + import multiprocessing as mp + import sys + _fn = sys.argv[1] + p = mp.Pool() + shortReads = imap(get_info, take(3, sh.samtools.view(_fn, _iter=True))) + shortReads = imap(set, shortReads) + #import ipdb; ipdb.set_trace(); + cartProduct = itertools.product(*list(itertools.tee(shortReads))) + #print map(lambda (x,y): (type(x), type(y)), cartProduct) + shortReads = ifilter(is_hap(lhn), cartProduct) + for r in shortReads: + if r: print r + + +#ghs = filter(is_hap(ghn), combinations(longReads, longReads)) +#lhs = filter(is_hap(lhn), combinations(shortReads, shortReads)) + From 406e36207076bbe9ede176b70119fca5d24be542 Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Dec 2015 15:46:05 -0500 Subject: [PATCH 6/7] documentation --- bio_pieces/cigar.py | 117 +++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 67 deletions(-) diff --git a/bio_pieces/cigar.py b/bio_pieces/cigar.py index c146c0b..3454796 100644 --- a/bio_pieces/cigar.py +++ b/bio_pieces/cigar.py @@ -3,60 +3,60 @@ import itertools from toolz.itertoolz import accumulate, first, last, drop, mapcat, take from toolz.functoolz import complement, compose -from operator import itemgetter +from operator import itemgetter, add import fn import sh -import re +import re +import multiprocessing as mp +import sys + +#TODO: in order to *totally* recreate the reads, you need to account for the MD (mismatching positions) tag. +#This requires running bwa with different options. http://bio-bwa.sourceforge.net/bwa.shtml -#from operator import xor #def ilen(seq): sum(1 for _ in seq) #indel_pos = compose(ilen, partial(takewhile, '-'.__ne__)) -#def first_indel_pos(ref, query, cigar_string): -# ralign, qalign = undo_cigar(ref, query, cigar_string) -# ins, _del = indel_pos(ralign), indel_pos(qalign) -# assert xor(ins, _del), "Indel contains insertion and deletion" -# return ins if ins else _del -#reduce(next_cigar_position, cig_elements, (0,0)) -# read = lambda x: vim.current.buffer.append(str(x)) - -plus_n = lambda x, n : x + n -zero = lambda x, y : x -# changing plus_n to alter strings won't quite work for the string transformations because + # adding gaps is dependent on there being a difference between ref and pos. -dx_dy={ - 'M' : (plus_n, plus_n), 'X' : (plus_n, plus_n), '=' : (plus_n, plus_n), - 'D' : (plus_n, zero), 'plus_n' : (plus_n, zero), - 'I' : (zero, plus_n), 'S' : (zero, plus_n), - 'H' : (zero, zero) -} def transform(seqs, positions): + ''' given ref & query and positions and offsets, builds a sort of alignment by filling in + gaps appropriately with '-'. + :param tuple seqs reference, query strings + :param tuple positions + :return tuple of reference, query strings ''' (ref, query), ((rpos, qpos), (a, b)) = seqs, positions def fill(s, pos): return s[:pos] + '-'*abs(a-b) + s[pos:] if a < b: ref = fill(ref, rpos) elif a > b: query = fill(query, qpos) return ref, query +no_add = lambda x, y : x +dx_dy={ + 'M' : (add, add), 'X' : (add, add), '=' : (add, add), + 'D' : (add, no_add), 'add' : (add, no_add), + 'I' : (no_add, add), 'S' : (no_add, add), + 'H' : (no_add, no_add) +} def next_cigar_position(positions, cigarval, withtype=False): + '''compute the next position for the reference and position by "applying" + the position change that letter indicates, using `dx_dy` defined above. + if withtype is True, also returns the letter itself, and expects the last count + and last letter..''' if withtype: (count, sym), (refpos, qpos, oldcount, oldsym) = cigarval, positions else: (count, sym), (refpos, qpos) = cigarval, positions f = dx_dy[sym] if withtype: return f[0](refpos, count), f[1](qpos, count), count, sym return f[0](refpos, count), f[1](qpos, count) -cig_diffs = partial(map, partial(next_cigar_position, (0, 0))) - -def undo_cigar(x, y, z): - cig = get_cig(z) #return reduce(transform, zip(gen_cigar(z), cig_diffs(z)), (x, y)) - return reduce(transform, zip(_gen_cigar(cig), cig_diffs(cig)), (x, y)) - -#in order to *totally* recreate the reads, you need to account for the other weird tag - split_cig = re.compile(r'(?:([0-9]+)([A-Z]))').findall +cig_diffs = partial(map, partial(next_cigar_position, (0, 0))) get_cig = lambda C: map(lambda x: (int(x[0]), x[1]), split_cig(C)) _gen_cigar = lambda c: accumulate(next_cigar_position, c, (0, 0)) gen_cigar = compose(_gen_cigar, get_cig) +def undo_cigar(x, y, z): + cig = get_cig(z) + return reduce(transform, zip(_gen_cigar(cig), cig_diffs(cig)), (x, y)) def get_insert_quals(ref, pos, bases): @@ -67,17 +67,16 @@ def get_insert_quals(ref, pos, bases): has_insert = lambda l: 'I' in l[5] # etc. #samtools calcmd? -# ref first + assert undo_cigar('AAGC', 'AGTT', '1M1D1M1X1I') == ('AAGC-', 'A-GTT') assert list(starmap(undo_cigar, [ ["AGG", "AG" ,"1M1D1M"], ["AGG", "AG" ,"1M1D1M"], ["TT" , "TAT" , "1M1I1M"], ["TA" , "TATAA", "1M3I1M"], -['AAGC', 'AGTT', '1M1D1M1X1I']])) == [('AGG', 'A-G'), ('AGG', 'A-G'), ('T-T', 'TAT'), ('T---A', 'TATAA'), ('AAGC-', 'A-GTT')] -assert list(gen_cigar("1M2D1M")) == [(0, 0), (1, 1), (3, 1), (4, 2)] - - +['AAGC', 'AGTT', '1M1D1M1X1I']])) == \ + [('AGG', 'A-G'), ('AGG', 'A-G'), ('T-T', 'TAT'), ('T---A', 'TATAA'), ('AAGC-', 'A-GTT')] +assert list(gen_cigar("1M2D1M")) == [(0, 0), (1, 1), (3, 1), (4, 2)] def intersection(pred, seq): return last(takewhile(complement(pred), seq)) def firstwhere(pred, seq): return first(dropwhile(complement(pred), seq)) @@ -85,11 +84,8 @@ def firstwhere(pred, seq): return first(dropwhile(complement(pred), seq)) assert intersection(lambda x: x[0] > 2, gen_cigar("1M2D1M")) == (1, 1) assert firstwhere(lambda x: x[0] >= 2, gen_cigar("1M1D1M")) == (2, 1) -#firstwhere(lambda x: x[1] == 'D', gen_cigar("1M1D1M")) -#firstwhere(lambda x: x[1] == 'D', get_cig("1M1D1M")) -c = "1M1D1M" -''' corresponds to ('AAGC-', 'A-GTT')''' -assert reduce(next_cigar_position, \ + +assert reduce(next_cigar_position, \ #corresponds to ('AAGC-', 'A-GTT') takewhile(lambda x: x[1] != 'I', get_cig('1M1D1M1X1I')), (0, 0)) == (4, 3) def mutpos(cig_str, types): @@ -97,8 +93,6 @@ def mutpos(cig_str, types): assert any(map(types.__contains__, cig_str)), "Cigar string %s does not have mutation of type %s" % (cig_str, types) return reduce(next_cigar_position, takewhile(lambda x: x[1] not in types, get_cig(cig_str)), (0, 0)) -foo = lambda c, t: reduce(next_cigar_position, filter(lambda x: x[1] in t, get_cig(c)), (0, 0)) -(lambda c, t: accumulate(next_cigar_position, filter(lambda x: x[1] in t, get_cig(c)), (0, 0))) indelpos = partial(mutpos, types='DI') query_indel_pos = compose(last, indelpos) ref_indel_pos = compose(first, indelpos) @@ -107,10 +101,13 @@ def mutpos(cig_str, types): ref_ins_pos = compose(first, partial(mutpos, types='I')) ref_snp_pos = compose(first, partial(mutpos, types='X')) +assert ref_ins_pos('1M1D1M1X1I') == 4 def mutations(cig_str): return drop(1, accumulate(partial(next_cigar_position, withtype=True), get_cig(cig_str), (0, 0, 0, 0))) +#TODO: Unfortnuately, need to use the MD tag, because bwa does not provide +# the 'X' tag for mismatches. snps = compose(partial(filter, lambda x: x[-1] == 'X'), mutations) def snp_rpos_base(qstring, cig_str): @@ -118,52 +115,38 @@ def snp_rpos_base(qstring, cig_str): def rpos_base(qstring, (rpos, qpos, count, type)): ''':return refpos, snp''' - return ((qstring[(qpos-count)+i], rpos +i) for i in xrange(0, count)) + return ((qstring[(qpos-count)+i], rpos +i) for i in xrange(0, count)) - -#wishful thinking about complex -#{'ins' : 'I', 'del' : 'D', 'snp' 'X', 'complex' : 'X'} - -assert ref_ins_pos('1M1D1M1X1I') == 4 -#use assertRaisesRegex try: mutpos('1M1D1M', 'X') except AssertionError: pass -# cigs = get_cig(CIGAR) -# indel_effect = sum(map(first, filter(compose('D'.__eq__, last), cigs))) - sum(map(first, filter(compose('I'.__eq__, last), cigs))) - def get_info(samrow): - #extract query_string and cigar_string from file - # have to add map positions to computed rpos's. + #TODO: extract query_string and cigar_string from file qname, mpos, cigar, qstring = itemgetter(0, 3, 5, 9)(samrow.split('\t')) mpos = int(mpos) _snps = snp_rpos_base(qstring, cigar) + #have to add map positions to computed rpos's. return map(lambda (b,i):(b, i+mpos), _snps) -# split by RG, then get_info, want to keep track of qname to prob. -#raw_longs, raw_shorts = fn.iters.partition( -lhn = ghn = 3 -#shareCount = compose(len, set.intersection) -shareCount = compose(len, lambda (x,y): x & y) -is_hap = lambda n: lambda x: shareCount(x) > n -# have to get cigar string with 'X' in it. +''' :lhn int minimum shared mutations in order to mark a group of reads as a local haplotype cluster. + :ghn int minimum shared mutations in order to mark a group of reads as a GLOBAL haplotype cluster. ''' if __name__ == '__main__': - import multiprocessing as mp - import sys + '''split by RG (read group), then get_info, want to keep track of qname to prob. + Find all local and global haplotype clusters.''' + lhn = ghn = 3 + shareCount = compose(len, lambda (x,y): x & y) + is_hap = lambda n: lambda x: shareCount(x) > n _fn = sys.argv[1] p = mp.Pool() shortReads = imap(get_info, take(3, sh.samtools.view(_fn, _iter=True))) shortReads = imap(set, shortReads) - #import ipdb; ipdb.set_trace(); cartProduct = itertools.product(*list(itertools.tee(shortReads))) - #print map(lambda (x,y): (type(x), type(y)), cartProduct) shortReads = ifilter(is_hap(lhn), cartProduct) + # shortReads should become the cluster of local haplotypes. for r in shortReads: - if r: print r - + if r: print r #ghs = filter(is_hap(ghn), combinations(longReads, longReads)) -#lhs = filter(is_hap(lhn), combinations(shortReads, shortReads)) - +#lhs = filter(is_hap(lhn), combinations(shortReads, shortReads)) From 0f854663adc922e79a62228c7fc24e3c8f12ceb6 Mon Sep 17 00:00:00 2001 From: michaelpanciera Date: Thu, 3 Dec 2015 15:46:27 -0500 Subject: [PATCH 7/7] renamed --- {bio_pieces => bio_bits}/cigar.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {bio_pieces => bio_bits}/cigar.py (100%) diff --git a/bio_pieces/cigar.py b/bio_bits/cigar.py similarity index 100% rename from bio_pieces/cigar.py rename to bio_bits/cigar.py