-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathgetregions.py
More file actions
137 lines (121 loc) · 4.46 KB
/
getregions.py
File metadata and controls
137 lines (121 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python3
"""Extract columns from chrom-start sorted tsv file and produce
a BED3 or BED6 file. The file must have a numerical column that
can be used to filter the lines that will be output.
"""
def bed_from_line(line, cols, regionchrom, regionstart, regionstop):
    """Build a BED3 or BED6 line for a region from one TSV line.

    Args:
        line: One TAB-separated input line (a trailing newline is allowed).
        cols: Comma-delimited, 1-based column numbers in the form
            'chrom,start,end[,name,score,strand]'.
        regionchrom: Accepted for interface compatibility but not used; the
            chromosome is always read from ``line``.
        regionstart: Start coordinate written as the second BED column.
        regionstop: End coordinate written as the third BED column.

    Returns:
        A TAB-joined string without trailing newline: BED3 when only three
        columns are given in ``cols``, BED6 (name, score and strand copied
        from ``line``) when six are given.

    Raises:
        ValueError: If ``cols`` has fewer than three entries, has a number of
            optional entries other than three, or contains a non-integer.
        IndexError: If ``line`` has fewer fields than a referenced column.
    """
    # Convert the 1-based column numbers to 0-based list indices.
    chrom, _start, _end, *extra = [int(c) - 1 for c in cols.split(",")]
    if extra:
        name, score, strand = extra
    else:
        name = score = strand = -1

    # Only remove the line terminator: str.strip() would also eat leading or
    # trailing TABs and thereby shift/drop empty first or last columns.
    fields = line.rstrip("\r\n").split("\t")

    bed = [fields[chrom], str(regionstart), str(regionstop)]
    if name >= 0:
        bed += [fields[name], fields[score], fields[strand]]
    return "\t".join(bed)
def get_regions_2_bed(infile: str, columns, col: int = 0, threshold: int = 0) -> list:
    """Extract BED lines from a TSV file, optionally merging by threshold.

    Two modes are supported:

    * Pass-through (``col == 0`` or ``threshold == 0``): every non-blank input
      line is converted to a BED3/BED6 line using the columns in ``columns``.
    * Region mode (otherwise): consecutive lines on the same chromosome whose
      value in column ``col`` is strictly greater than ``threshold`` are
      merged into one BED3 region, running from the start of the first such
      line to the end of the last one.

    NOTE(review): a threshold of 0 always selects pass-through mode, even when
    ``col`` is given, so "value > 0" regions cannot be requested — confirm
    whether that is intended.

    Args:
        infile: Path of the TAB-separated input file.
        columns: Comma-delimited, 1-based column numbers in the form
            'chrom,start,end[,name,score,strand]'.
        col: 1-based number of the integer column to compare against
            ``threshold``; 0 disables filtering.
        threshold: Lines with a value above this are considered in-region.

    Returns:
        List of BED-formatted strings (no trailing newline). Region mode
        always yields BED3; pass-through yields BED6 when six columns are
        given in ``columns``.

    Raises:
        ValueError: If ``columns`` is malformed or the filter column of a line
            is not an integer.
        IndexError: If a line has fewer fields than a referenced column.
    """
    # Convert the 1-based column numbers to 0-based list indices.
    chrom, start, end, *extra = [int(c) - 1 for c in columns.split(",")]
    if extra:
        name, score, strand = extra
    else:
        name = score = strand = -1

    passthrough = col == 0 or threshold == 0
    value_col = col - 1

    out_cols = [chrom, start, end]
    if name >= 0:
        out_cols += [name, score, strand]

    result = []
    region_chrom = None
    region_start = region_stop = None
    in_region = False

    with open(infile) as handle:
        for line in handle:
            # Only remove the line terminator: str.strip() would also eat
            # leading/trailing TABs and shift or drop empty columns.
            row = line.rstrip("\r\n")
            if not row.strip():
                continue  # blank lines carry no record
            fields = row.split("\t")

            if passthrough:
                # Emit the field *values*, not the column indices.
                result.append("\t".join(fields[i] for i in out_cols))
                continue

            if fields[chrom] != region_chrom:
                # A region never spans chromosomes: close any open one.
                if in_region:
                    result.append(
                        "\t".join([region_chrom, str(region_start), str(region_stop)])
                    )
                region_chrom = fields[chrom]
                in_region = False

            above = int(fields[value_col]) > threshold
            if above and not in_region:
                region_start = fields[start]
                in_region = True
            elif not above and in_region:
                result.append(
                    "\t".join([region_chrom, str(region_start), str(region_stop)])
                )
                in_region = False

            if in_region:
                # Extend the open region to the end of the current line.
                region_stop = fields[end]

    # Flush a region that is still open at end of file.
    if in_region:
        result.append("\t".join([region_chrom, str(region_start), str(region_stop)]))
    return result
if __name__ == "__main__":
    # Command-line entry point: parse options, extract the regions and write
    # them one per line to the output file.
    import argparse

    parser = argparse.ArgumentParser(
        description="Parse tsv file and extract contiguous non zero regions in BED format"
    )
    # -i, -p and -o are mandatory: without them the script would fail later
    # with an unhelpful traceback instead of a usage message.
    parser.add_argument(
        "-i",
        required=True,
        help="TAB separated file with at least a Chrom, start and end column",
    )
    # Default to 0 (no filtering); without a default the value would be None
    # and the column arithmetic in get_regions_2_bed would raise TypeError.
    parser.add_argument("-c", type=int, default=0, help="Column number to filter")
    parser.add_argument("-t", type=int, default=0, help="Threshold to consider region")
    parser.add_argument(
        "-p",
        required=True,
        help="Column numbers describing position in the form 'chrom,start,end[,name,score,strand]' no spaces and comma delimited",
    )
    parser.add_argument("-o", required=True, help="Output in bed format")
    options = parser.parse_args()

    regions = get_regions_2_bed(options.i, options.p, options.c, options.t)
    with open(options.o, "w") as fo:
        fo.writelines(region + "\n" for region in regions)