forked from Maggi-Chen/DeBreak
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebreak
More file actions
executable file
·340 lines (285 loc) · 14.4 KB
/
debreak
File metadata and controls
executable file
·340 lines (285 loc) · 14.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#!/usr/bin/env python3
import debreak_detect
import debreak_merge_contig as debreak_merge
import debreak_merge_cluster
import debreak_writevcf
import os
import argparse
import multiprocessing
import debreak_rescuedupfromins
import debreak_resdup_selfalignment
#import debreak_genotype_ml as debreak_genotype
import debreak_genotype
import debreak_allpoa
import debreak_rescuelargeins
import pysam
import sys
parser=argparse.ArgumentParser(description='SV caller for long-read sequencing data', usage='debreak.py [-h] --bam <sort.bam>')
parser.add_argument('-v','--version', action='version', version='DeBreak_v1.0.2')
parser.add_argument('--bam',type=str,default=False,help='input sorted bam. index required')
parser.add_argument('--samlist',type=str,default=False,help='a list of SAM files of same sample')
parser.add_argument('-o','--outpath',type=str,default='./debreak-out/',help='output directory')
parser.add_argument('--min_size',type=int,default=48,help='minimal size of detected SV')
parser.add_argument('--max_size',type=int,default=400000000,help='maxminal size of detected SV')
parser.add_argument('-d','--depth',type=float,default=False,help='sequencing depth of this dataset')
parser.add_argument('-m','--min_support',type=int,default=False,help='minimal number of supporting reads for one event')
parser.add_argument('--min_quality',type=int,default=None,help='minimal mapping quality of reads')
parser.add_argument('--aligner',type=str,default='minimap2',help='aligner used to generate BAM/SAM')
parser.add_argument('-t','--thread',type=int,default=8,help='number of threads')
parser.add_argument('--rescue_dup',action='store_true',default=False,help='rescue DUP from INS calls. minimap2,ref required')
parser.add_argument('--rescue_large_ins',action='store_true',default=False,help='rescue large INS. wtdbg2,minimap2,ref required')
parser.add_argument('--poa',action='store_true',default=False,help='POA for accurate breakpoint. wtdbg2,minimap2,ref required.')
parser.add_argument('--no_genotype',action='store_true',default=False,help='disable genotyping')
parser.add_argument('-r','--ref',type=str,default=False,help='reference genome. Should be same with SAM/BAM')
parser.add_argument('--maxcov',type=int,default=False,help='Maximal coverage for a SV. Suggested maxcov as 2 times mean depth.')
parser.add_argument('--skip_detect',action='store_true',default=False,help='Skip SV raw signal detection.')
parser.add_argument('--tumor',action='store_true',default=False,help='Allow clustered SV breakpoints during raw SV signal detection')
debreak_args=parser.parse_args()
# check required parameters and set optional modules
if not debreak_args.samlist and not debreak_args.bam:
print ('Error: No input file given!\nFor Debreak usage, use -h or --help')
sys.exit(1)
if debreak_args.outpath=='./debreak-out/':
os.system("mkdir debreak-out")
if debreak_args.outpath[-1]!='/':
debreak_args.outpath+='/'
if not os.path.exists(debreak_args.outpath):
os.makedirs(debreak_args.outpath)
writepath=debreak_args.outpath
if debreak_args.tumor:
if debreak_args.min_quality ==None:
debreak_args.min_quality=0
logfile=open(writepath+'log.txt','a')
logfile.write('Start calling SV...\n')
logfile.close()
record_clip=False
if debreak_args.rescue_large_ins:
if debreak_args.bam:
if debreak_args.ref:
os.system("mkdir "+writepath+'debreak_ins_workspace/')
record_clip=True
else:
logfile=open(writepath+'log.txt','a')
logfile.write('Warning: No reference provided. Rescue_large_INS module aborted.\n')
logfile.close()
else:
logfile=open(writepath+'log.txt','a')
logfile.write('Warning: Rescue_large_INS module requires sorted BAM input. Aborted.\n')
logfile.close()
vcflist=[]
os.system("mkdir "+writepath+'map_depth/')
os.system("mkdir "+writepath+'sv_raw_calls/')
if not debreak_args.bam:
# Detect SVs from all SAM files
samlistpath=debreak_args.samlist
f=open(samlistpath,'r')
samlist=f.read().split('\n')[:-1]
f.close()
readpath=debreak_args.samlist[:-len(debreak_args.samlist.split('/')[-1])]
chrominfofile=pysam.AlignmentFile(readpath+samlist[0],'r')
chromosomes=chrominfofile.references
chromolength={}
for chrom in chromosomes:
chromolength[chrom]=chrominfofile.get_reference_length(chrom)
chromosomes=[c for c in chromosomes]
debreak_det=multiprocessing.Pool(debreak_args.thread)
for i in range(len(samlist)):
vcflist+=[samlist[i][:-4]+'.debreak.temp']
if not debreak_args.skip_detect:
debreak_det.apply_async(debreak_detect.detect_sam,args=(samlist[i],readpath,writepath,chromosomes,debreak_args.min_size,debreak_args.max_size,True,True))
debreak_det.close()
debreak_det.join()
else:
# Detect SVs from sorted bam file
readpath=debreak_args.bam
chrominfofile=pysam.AlignmentFile(readpath,'rb')
chromosomes=chrominfofile.references
chromolength={}
for chrom in chromosomes:
chromolength[chrom]=chrominfofile.get_reference_length(chrom)
debreak_det=multiprocessing.Pool(debreak_args.thread)
for i in range(len(chromosomes)):
vcflist+=[readpath.split('/')[-1]+'-'+chromosomes[i]+'.debreak.temp']
if not debreak_args.skip_detect:
debreak_det.apply_async(debreak_detect.detect_sortbam,args=(readpath,writepath,debreak_args.min_size,debreak_args.max_size,chromosomes[i],chromosomes,record_clip,debreak_args.rescue_dup,debreak_args.tumor))
debreak_det.close()
debreak_det.join()
# Merge deletions & insertions from temp files
logfile=open(writepath+'log.txt','a')
logfile.write('Done detecting raw SV signals.\n')
logfile.close()
rawinscall={}; rawdelcall={}; rawdupcall={}; rawinvcall={}; rawtracall={}
for chrom in chromosomes:
rawinscall[chrom]=[]; rawdelcall[chrom]=[]; rawdupcall[chrom]=[]; rawinvcall[chrom]=[]; rawtracall[chrom]=[]
for filename in vcflist:
try:
allvcf=open(writepath+'sv_raw_calls/'+filename,'r').read().split('\n')[:-1]
event_chrom=allvcf[0].split('\t')[0]
except:
continue
for event in allvcf:
event_chrom=event.split('\t')[0]
if 'I-' in event:
event=event.split('\t')
rawinscall[event_chrom]+=[event[0]+'\t'+event[1]+'\t'+event[2]+'\t'+event[3]+'\t'+event[4]+'\t'+event[5]+'\t'+event[6]]
if 'D-' in event:
rawdelcall[event_chrom]+=[event]
if 'DUP-' in event:
rawdupcall[event_chrom]+=[event]
if 'INV-' in event:
rawinvcall[event_chrom]+=[event]
if 'TRA-' in event:
rawtracall[event_chrom]+=[event.split('\t')[0]+'\t'+event.split('\t')[1]+'\t'+event.split('\t')[2]+'\t'+event.split('\t')[3]+'\t'+event.split('\t')[7]+'\t'+event.split('\t')[5]]
# Estimate Depth of input dataset
logfile=open(writepath+'log.txt','a')
if not debreak_args.min_support:
if debreak_args.depth:
depth=debreak_args.depth
minsupp=round(debreak_args.depth//10.0)+2
else:
os.system('cat '+writepath+'map_depth/maplength_* > '+writepath+'map_depth/maplength_allchrom_debreak')
allmaplength=open(writepath+'map_depth/maplength_allchrom_debreak','r').read().split('\n')[:-1]
allmaplength=[int(c) for c in allmaplength]
allmaplength=sum(allmaplength)
if debreak_args.bam:
reflength=sum(pysam.AlignmentFile(debreak_args.bam,'rb').lengths)
else:
reflength=sum(pysam.AlignmentFile(samlist[0],'r').lengths)
depth=float(allmaplength)/reflength
minsupp=round(depth/10.0)+2
logfile.write("Estimated Sequencing Depth: "+str(depth)+'\n')
logfile.write("Suggested min_supp:"+str(minsupp)+'\n')
else:
minsupp=debreak_args.min_support
if debreak_args.depth:
depth=debreak_args.depth
else:
os.system('cat '+writepath+'map_depth/maplength_* > '+writepath+'map_depth/maplength_allchrom_debreak')
allmaplength=open(writepath+'map_depth/maplength_allchrom_debreak','r').read().split('\n')[:-1]
allmaplength=[int(c) for c in allmaplength]
allmaplength=sum(allmaplength)
if debreak_args.bam:
reflength=sum(pysam.AlignmentFile(debreak_args.bam,'rb').lengths)
else:
reflength=sum(pysam.AlignmentFile(samlist[0],'r').lengths)
depth=float(allmaplength)/reflength
logfile.write("Estimated Sequencing Depth: "+str(depth)+'\n')
logfile.write("Using input min_supp:"+str(minsupp)+'\n')
if debreak_args.maxcov:
highcov=debreak_args.maxcov
logfile.write('Using maximal depth:'+str(highcov)+'\n')
else:
highcov=max(50,round(depth*2+20))
logfile.write('Suggested maximal depth:'+str(highcov)+'\n')
logfile.write('Start clustering raw signals...\n')
logfile.close()
# Merge all SVs
debreak_meg=multiprocessing.Pool(debreak_args.thread//2)
for i in range(len(chromosomes)):
debreak_meg.apply_async(debreak_merge_cluster.cluster,args=(writepath,rawdelcall[chromosomes[i]],chromosomes[i],chromolength[chromosomes[i]],minsupp,highcov,'del',debreak_args.min_quality,))
debreak_meg.apply_async(debreak_merge_cluster.cluster_ins,args=(writepath,rawinscall[chromosomes[i]],chromosomes[i],chromolength[chromosomes[i]],minsupp,highcov,debreak_args.min_quality,))
debreak_meg.apply_async(debreak_merge_cluster.cluster,args=(writepath,rawdupcall[chromosomes[i]],chromosomes[i],chromolength[chromosomes[i]],minsupp,highcov,'dup',debreak_args.min_quality,))
debreak_meg.apply_async(debreak_merge_cluster.cluster,args=(writepath,rawinvcall[chromosomes[i]],chromosomes[i],chromolength[chromosomes[i]],minsupp,highcov,'inv',debreak_args.min_quality,))
debreak_meg.apply_async(debreak_merge.merge_translocation,args=(minsupp,writepath,rawtracall[chromosomes[i]],chromosomes[i],True,debreak_args.min_quality,))
debreak_meg.close()
debreak_meg.join()
os.system("cat "+writepath+"del-merged-cluster-* > "+writepath+"deletion-merged")
os.system("cat "+writepath+"ins-merged-cluster-* > "+writepath+"insertion-merged")
os.system("cat "+writepath+"dup-merged-cluster-* > "+writepath+"duplication-merged")
os.system("cat "+writepath+"inv-merged-cluster-* > "+writepath+"inversion-merged")
os.system("cat "+writepath+"tra-merged-* > "+writepath+"translocation-merged")
os.system("rm "+writepath+"*-merged-*")
logfile=open(writepath+'log.txt','a')
logfile.write('Done clustering raw signals.\n')
logfile.close()
debreak_resdup_selfalignment.clean_ins_dup(writepath)
if debreak_args.rescue_dup:
if debreak_args.ref:
logfile=open(writepath+'log.txt','a')
logfile.write('Start identifying DUP from INS calls...\n')
logfile.close()
debreak_resdup_selfalignment.identify_duplication(vcflist,writepath,debreak_args.ref)
logfile=open(writepath+'log.txt','a')
logfile.write('Done DUP rescue.\n')
logfile.close()
else:
logfile=open(writepath+'log.txt','a')
logfile.write('Warning: Reference genome is required for duplication rescue. Skip this step.\n')
logfile.close()
# POA for all SVs
if debreak_args.poa:
logfile=open(writepath+'log.txt','a')
if not debreak_args.ref:
logfile.write('Warning: No reference genome provided. Abort breakpoint refinement.\n')
logfile.close()
else:
logfile.write('Start breakpoint refinement for all SV candidates...\n')
logfile.close()
if debreak_args.bam:
debreak_allpoa.poa_bam(debreak_args.bam,writepath,chromosomes,debreak_args.thread,debreak_args.ref,debreak_args.min_size,debreak_args.max_size)
else:
debreak_allpoa.poa_sam(readpath,samlist,writepath,debreak_args.thread,debreak_args.ref,chromosomes,debreak_args.min_size,debreak_args.max_size)
logfile=open(writepath+'log.txt','a')
logfile.write('Done SV breakpoint refinement.\n')
logfile.close()
# Genotyping
if debreak_args.no_genotype:
if debreak_args.bam:
logfile=open(writepath+'log.txt','a')
logfile.write('Start SV filtering based on depth...\n')
logfile.close()
debreak_gt=multiprocessing.Pool(debreak_args.thread)
for i in range(len(chromosomes)):
debreak_gt.apply_async(debreak_genotype.genotype_filter_del,args=(debreak_args.bam,writepath+'deletion-merged',chromosomes[i],highcov))
debreak_gt.apply_async(debreak_genotype.genotype_filter_ins,args=(debreak_args.bam,writepath+'insertion-merged',chromosomes[i],highcov))
debreak_gt.apply_async(debreak_genotype.genotype_filter_del,args=(debreak_args.bam,writepath+'duplication-merged',chromosomes[i],highcov))
debreak_gt.apply_async(debreak_genotype.genotype_filter_del,args=(debreak_args.bam,writepath+'inversion-merged',chromosomes[i],highcov))
debreak_gt.apply_async(debreak_genotype.genotype_filter_tra,args=(debreak_args.bam,writepath+'translocation-merged',chromosomes[i],highcov))
debreak_gt.close()
debreak_gt.join()
os.system("cat "+writepath+"*merged-gt* > "+writepath+"debreak-allsv-merged-final")
os.system("rm "+writepath+"*merged-gt*")
logfile=open(writepath+'log.txt','a')
logfile.write('Done SV filtering.\n')
logfile.close()
else:
os.system("cat "+writepath+"*merged > "+writepath+"debreak-allsv-merged-final")
else:
logfile=open(writepath+'log.txt','a')
if debreak_args.bam:
logfile.write('Start genotyping for all SV candidates...\n')
logfile.close()
debreak_gt=multiprocessing.Pool(debreak_args.thread)
for i in range(len(chromosomes)):
debreak_gt.apply_async(debreak_genotype.genotype_ins,args=(debreak_args.bam,writepath+'insertion-merged',chromosomes[i],highcov))
debreak_gt.apply_async(debreak_genotype.genotype_del,args=(debreak_args.bam,writepath+'deletion-merged',chromosomes[i],highcov))
debreak_gt.apply_async(debreak_genotype.genotype_del,args=(debreak_args.bam,writepath+'duplication-merged',chromosomes[i],highcov))
debreak_gt.apply_async(debreak_genotype.genotype_del,args=(debreak_args.bam,writepath+'inversion-merged',chromosomes[i],highcov))
debreak_gt.apply_async(debreak_genotype.genotype_tra,args=(debreak_args.bam,writepath+'translocation-merged',chromosomes[i],highcov))
debreak_gt.close()
debreak_gt.join()
os.system("cat "+writepath+"*merged-gt* > "+writepath+"debreak-allsv-merged-final")
os.system("rm "+writepath+"*merged-gt*")
logfile=open(writepath+'log.txt','a')
logfile.write('Done genotyping.\n')
logfile.close()
else:
os.system("cat "+writepath+"*merged > "+writepath+"debreak-allsv-merged-final")
logfile.write('Warning: No sorted bam file provided. Abort genotyping.\n')
logfile.close()
if record_clip:
logfile=open(writepath+'log.txt','a')
logfile.write('Start Rescue-Lagre-Ins module...\n')
logfile.close()
debreak_rescuelargeins.rescue_ins_bam(debreak_args.bam,chromosomes,writepath,debreak_args.thread,debreak_args.ref,minsupp,debreak_args.min_size,debreak_args.max_size)
logfile=open(writepath+'log.txt','a')
logfile.write('Done large INS rescue.\n')
logfile.close()
# Write into vcf file
logfile=open(writepath+'log.txt','a')
logfile.write('Start writing VCF file...\n')
logfile.close()
debreak_writevcf.writevcf_pysam(writepath,readpath,debreak_args.poa,debreak_args.ref)
logfile=open(writepath+'log.txt','a')
logfile.write('VCF file written. Bye.\n')
logfile.close()