Skip to content

Commit d41676e

Browse files
authored
🔀 Merge pull request #74 from cnr-ibba/issue-73
⚡ Optimize `samtools/depth` step
2 parents 055c09d + 0b84168 commit d41676e

File tree

10 files changed

+110
-53
lines changed

10 files changed

+110
-53
lines changed

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
"coverage",
1111
"cpus",
1212
"demultiplexing",
13+
"depthfile",
1314
"downsampling",
1415
"fasta",
1516
"fastq",

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,15 @@
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
44
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
55

6+
## 0.6.1 - [2024-04-11]
7+
8+
- Calculate `samtools/depth` on each chromosomes ([#73](https://github.com/cnr-ibba/nf-resequencing-mem/issues/73))
9+
10+
### `Fixed`
11+
12+
- Passing args to `modules/local/freebayes_splitcram`
13+
- Calculate `samtools/depth` without _0 coverage_ regions
14+
615
## 0.6.0 - [2024-04-04]
716

817
- Replace `*.bam` file format with `*.cram` ([#9](https://github.com/cnr-ibba/nf-resequencing-mem/issues/9))

assets/multiqc_config.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
report_comment: >
2-
This report has been generated by the <a href="https://github.com/cnr-ibba/nf-resequencing-mem/releases/tag/v0.6.0" target="_blank">nf-resequencing-mem</a>
2+
This report has been generated by the <a href="https://github.com/cnr-ibba/nf-resequencing-mem/releases/tag/v0.6.1" target="_blank">nf-resequencing-mem</a>
33
analysis pipeline. For information about how to interpret these results, please see the
4-
<a href="https://github.com/cnr-ibba/nf-resequencing-mem/blob/v0.6.0/README.md" target="_blank">documentation</a>.
4+
<a href="https://github.com/cnr-ibba/nf-resequencing-mem/blob/v0.6.1/README.md" target="_blank">documentation</a>.
55
report_section_order:
66
"nf-resequencing-mem-methods-description":
77
order: -1000

bin/split_ref_by_depth.py

Lines changed: 49 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,19 @@
22
# -*- coding: utf-8 -*-
33

44
"""
5+
56
This script will write a list of region from a reference file relying on
6-
coverage depth
7+
coverage depth. Is supposed to work on a single chromosome at a time, it
8+
requires chromosome length in order to work on sample depth with no 0 regions
79
810
Author: Paolo Cozzi
9-
Date: 2024/03/21
11+
Date: 2024/04/11
1012
1113
Usage:
12-
python split_ref_by_depth.py --depth_file <depth_file>
14+
python split_ref_by_depth.py --depth_file <depth_file> \
15+
--chromosome <chromosome> --chromosome_length <chromosome_length> \
16+
[--min_length <min_length>] [--max_coverage <max_coverage>] \
17+
[--overlap_size <overlap_size>] [--verbose <verbose>] [--quiet <quiet>]
1318
1419
Arguments:
1520
depth_file: The path of samtools depth output file
@@ -126,49 +131,40 @@ def append_or_extend_region(
126131

127132

128133
def split_ref_by_coverage(
129-
depthfile: str, max_coverage: int, min_length: int, overlap_size: int):
134+
depthfile: str, chromosome: str, chromosome_length: int,
135+
max_coverage: int, min_length: int, overlap_size: int):
130136
with gzip.open(depthfile, "rt") as handle:
131137
reader = csv.reader(handle, delimiter="\t", lineterminator="\n")
132138
header = next(reader)
133139

134140
# header: ["#CHROM", "POS", "Sample_1.bam", "Sample_2.bam", ...]
135141
logging.debug(f"Got header: {header}")
136142

137-
# Inizialize some variables
138-
line = next(reader)
139-
start_chrom = line[0]
140-
start_pos = int(line[1])
141-
old_pos = start_pos
142-
cumulative_coverage = 0
143+
# test if I have reads aligned in this file
144+
try:
145+
line = next(reader)
146+
147+
except StopIteration:
148+
logging.error(
149+
f"File '{depthfile}' has no coverage data"
150+
)
151+
return [[chromosome, 0, chromosome_length]]
143152

153+
# Initialize some variables
144154
regions = []
155+
start_chrom = chromosome
156+
start_pos = 1
157+
old_pos = start_pos
158+
cumulative_coverage = 0
145159

146160
logging.debug(f"Starting from chrom: '{start_chrom}'")
147161

148162
for i, line in enumerate(reader):
149163
chrom, pos = line[0], int(line[1])
150164

151165
if chrom != start_chrom:
152-
logging.debug(f"{i}: Got a new chromosome '{chrom}'")
153-
logging.debug(
154-
f"{i}: Test for a new region with: "
155-
f"{[start_chrom, start_pos, old_pos]}"
156-
f" ({old_pos-start_pos+1} bp; "
157-
f"{cumulative_coverage:.2e} cumulative coverage)"
158-
)
159-
160-
# add and open a new region
161-
regions = append_or_extend_region(
162-
regions,
163-
[start_chrom, start_pos, old_pos],
164-
min_length,
165-
overlap_size
166-
)
167-
168-
# reset variables
169-
start_chrom = chrom
170-
start_pos = pos
171-
cumulative_coverage = 0
166+
logging.critical(f"{i}: Got a new chromosome '{chrom}'")
167+
raise Exception("This script works on a single chromosome at a time")
172168

173169
# determine region size
174170
length = pos - start_pos
@@ -219,6 +215,14 @@ def split_ref_by_coverage(
219215
overlap_size
220216
)
221217

218+
# check for last region end
219+
last_region = regions[-1]
220+
if last_region[2] < chromosome_length:
221+
logging.debug(
222+
f"Extending the last region: {last_region} to {chromosome_length}"
223+
)
224+
last_region[2] = chromosome_length
225+
222226
return regions
223227

224228

@@ -232,6 +236,14 @@ def split_ref_by_coverage(
232236
'-d', '--depth_file', required=True,
233237
help="The output of samtools depth file for all samples"
234238
)
239+
parser.add_argument(
240+
'-c', '--chromosome', required=True, type=str,
241+
help="The chromosome name"
242+
)
243+
parser.add_argument(
244+
'-l', '--chromosome_length', required=True, type=int,
245+
help="The length of the chromosome"
246+
)
235247
parser.add_argument(
236248
"--min_length", default=100_000, type=int,
237249
help="minimum fragment length in bp"
@@ -263,7 +275,13 @@ def split_ref_by_coverage(
263275

264276
# split reference by coverage depth
265277
regions = split_ref_by_coverage(
266-
args.depth_file, args.max_coverage, args.min_length, args.overlap_size)
278+
args.depth_file,
279+
args.chromosome,
280+
args.chromosome_length,
281+
args.max_coverage,
282+
args.min_length,
283+
args.overlap_size
284+
)
267285

268286
logging.info(f"Number of regions: {len(regions)}")
269287

conf/base.config

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,17 @@ process {
2626
withLabel:process_single {
2727
cpus = { check_max( 1 , 'cpus' ) }
2828
memory = { check_max( 4.GB * task.attempt, 'memory' ) }
29-
time = { check_max( 4.h * task.attempt, 'time' ) }
29+
time = { check_max( 6.h * task.attempt, 'time' ) }
3030
}
3131
withLabel:process_low {
3232
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
3333
memory = { check_max( 4.GB * task.attempt, 'memory' ) }
34-
time = { check_max( 4.h * task.attempt, 'time' ) }
34+
time = { check_max( 6.h * task.attempt, 'time' ) }
3535
}
3636
withLabel:process_medium {
3737
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
3838
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
39-
time = { check_max( 8.h * task.attempt, 'time' ) }
39+
time = { check_max( 12.h * task.attempt, 'time' ) }
4040
}
4141
withLabel:process_high {
4242
cpus = { check_max( 12 * task.attempt, 'cpus' ) }

modules/local/freebayes_splitcram.nf

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,11 @@ process FREEBAYES_SPLITCRAM {
2323
def prefix = task.ext.prefix ?: "${meta.id}"
2424
"""
2525
split_ref_by_depth.py \\
26-
--depth_file ${depth} > ${prefix}.regions.txt
26+
${args} \\
27+
--depth_file ${depth} \\
28+
--chromosome ${meta.id} \\
29+
--chromosome_length ${meta.length} \\
30+
> ${prefix}.regions.txt
2731
2832
cat <<-END_VERSIONS > versions.yml
2933
"${task.process}":

modules/nf-core/samtools/depth/main.nf

Lines changed: 6 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

modules/nf-core/samtools/depth/samtools-depth.diff

Lines changed: 15 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

nextflow.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ manifest {
162162
description = 'Nextflow Resequencing with BWA MEM'
163163
mainScript = 'main.nf'
164164
nextflowVersion = '!>=23.04.0'
165-
version = '0.6.0'
165+
version = '0.6.1'
166166
}
167167

168168
// Load modules.config for DSL2 module specific options

subworkflows/local/cram_freebayes_parallel/main.nf

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,16 @@ workflow CRAM_FREEBAYES_PARALLEL {
1818
main:
1919
ch_versions = Channel.empty()
2020

21-
// calculate total coverage depth for all samples
22-
SAMTOOLS_DEPTH( bam, [[], []] )
21+
// read chromosome list from fasta index
22+
chromosome_ch = fai.map{ it -> it[1] }
23+
.splitCsv(header: false, sep: '\t')
24+
.map{ row -> [[id: row[0], length: row[1]], row[0]] }
25+
// .view()
26+
27+
// calculate total coverage depth for all samples by chromosome
28+
// bam is a value channel; chromosome_ch is a queue channel
29+
// for every chromosome
30+
SAMTOOLS_DEPTH( bam, bai, chromosome_ch )
2331
ch_versions = ch_versions.mix(SAMTOOLS_DEPTH.out.versions)
2432

2533
// split fasta in chunks relying BAM size
@@ -33,7 +41,15 @@ workflow CRAM_FREEBAYES_PARALLEL {
3341
.map{ it -> [[id: it.trim()], it.trim()]}
3442

3543
// call freebayes on each region
36-
FREEBAYES_CHUNK ( regions_ch, bam, bai, SAMTOOLS_DEPTH.out.bam_list, fasta, fai )
44+
FREEBAYES_CHUNK (
45+
regions_ch,
46+
bam,
47+
bai,
48+
// converting to a value channel
49+
SAMTOOLS_DEPTH.out.bam_list.first(),
50+
fasta,
51+
fai
52+
)
3753
ch_versions = ch_versions.mix(FREEBAYES_CHUNK.out.versions)
3854

3955
// merge freebayes chunks

0 commit comments

Comments
 (0)