Skip to content

Commit 24fd490

Browse files
committed
- Better infer the inbyte size when `file` does not report the actual original file size.
1 parent ae85e50 commit 24fd490

File tree

3 files changed

+16
-7
lines changed

3 files changed

+16
-7
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
- Feature: multiple primer trimming.
66
- Feature: UMI trimming.
77

8+
## v4.1.4
9+
10+
- Change: compressed files: better infer the inbyte size when `file` does not report the actual original file size.
11+
812
## v4.1.3
913

1014
- Fix: when the paired-end files are compressed, the read chunks were not resized, which led to excessive copying, and the amount copied could accumulate round by round.

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Atria"
22
uuid = "226cbef3-b485-431c-85c2-d8bd8da14025"
33
authors = ["Jiacheng Chuan <jiacheng_chuan@outlook.com>"]
4-
version = "4.1.3"
4+
version = "4.1.4"
55

66
[deps]
77
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"

src/FqRecords/thread_input.jl

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ Return chunk sizes of file 1 and 2, uncompressed sizes of file 1 and 2. If no co
422422
@inline function chunk_sizes(file1::String, file2::String, max_chunk_size::Int)
423423
filesize1, isgz1 = check_filesize(file1)
424424
filesize2, isgz2 = check_filesize(file2)
425-
if isgz1 != isgz2
425+
if isgz1 != isgz2 || filesize1 / filesize2 > 1.5 || filesize2 / filesize1 > 1.5
426426
# cannot determine gzip size using `file` for one sample.
427427
# just use size.
428428
filesize1 = filesize(file1)
@@ -465,7 +465,7 @@ Return (chunk_size1, chunk_size2):Tuple{Int,Int}
465465
avg_length_r2 = avg_length_r1 * default_chunk_size2 / default_chunk_size1
466466
end
467467

468-
if 0.975 < n_r1 / n_r2 < 1.025
468+
if 0.995 < n_r1 / n_r2 < 1.005
469469
# do not care about n_r2 - n_r1
470470
if avg_length_r1 > avg_length_r2
471471
chunk_size2 = round(Int, (max_chunk_size/avg_length_r1) * avg_length_r2)
@@ -485,13 +485,18 @@ Return (chunk_size1, chunk_size2):Tuple{Int,Int}
485485
if chunk_size1 <= max_chunk_size
486486
# resize!(in1bytes, chunk_size1)
487487
# resize!(in2bytes, max_chunk_size)
488+
if chunk_size1 < 0
489+
chunk_size1 = round(Int, 0.1 * max_chunk_size)
490+
end
488491
return chunk_size1, max_chunk_size
489492
end
490493

491494
chunk_size2 = (n_r1 - n_r2 + max_chunk_size/avg_length_r1) * avg_length_r2
492495
chunk_size2 = round(Int, chunk_size2)
493496
if chunk_size2 > max_chunk_size
494497
@warn "Unexpected situation in get_ideal_inbyte_sizes!: chunk_size2 > max_chunk_size"
498+
elseif chunk_size2 < 0
499+
chunk_size2 = round(Int, 0.1 * max_chunk_size)
495500
end
496501
# resize!(in1bytes, max_chunk_size)
497502
# resize!(in2bytes, chunk_size2)
@@ -655,7 +660,7 @@ function read_chunks!(io::IO, inbytes::Vector{UInt8}, nremain::Integer, nthread:
655660
nbytes = length_inbytes
656661
else
657662
# move the unprocessed part of inbytes (which is in the end) to the front.
658-
copyto!(inbytes, 1, inbytes, length_inbytes - nremain + 1, nremain)
663+
nremain > 0 && copyto!(inbytes, 1, inbytes, length_inbytes - nremain + 1, nremain)
659664

660665
if resize_before_read != length_inbytes
661666
if resize_before_read < nremain
@@ -813,13 +818,13 @@ end
813818

814819

815820
"""
816-
seeklastfq(inbytes::Vector{UInt8}, nbytes::UInt)
821+
seeklastfq(inbytes::Vector{UInt8}, nbytes::Integer)
817822
818823
Seek the index of the last fastq read.
819824
820825
Caution: This function does not check whether the last fastq read is truncated or complete!
821826
"""
822-
@inline function seeklastfq(inbytes::Vector{UInt8}, nbytes::UInt)::UInt
827+
@inline function seeklastfq(inbytes::Vector{UInt8}, nbytes::Integer)::UInt
823828
# idx_last_fq = nbytes
824829
i = nbytes
825830
while i > 0
@@ -853,7 +858,7 @@ Caution: This function does not check whether the last fastq read is truncated o
853858
i = i_last_but_1
854859
end
855860
end
856-
i
861+
UInt(i)
857862
end
858863

859864
"""

0 commit comments

Comments
 (0)