From 6d4d5d669669f7cf8cf7335755017d1205980811 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 22 Sep 2023 14:55:09 +0200 Subject: [PATCH 1/6] Make documentation more approachable Mention the most important things first in the documentation: How to read and write files. --- docs/src/files.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/src/files.md b/docs/src/files.md index d80651c..3a0401d 100644 --- a/docs/src/files.md +++ b/docs/src/files.md @@ -116,3 +116,27 @@ UInt8[] julia> validate_fastq(IOBuffer("@header\nTTT\n+\njkm")) === nothing true ``` + +However, this latter syntax does not easily extend to different types of IO, such as gzip compressed streams. + +### Validate files +The functions `validate_fasta` and `validate_fastq` can be used to check if an `IO` +contains data that can be read as FASTX. +They return `nothing` if the IO is correctly formatted, and another value if not. + +They are significantly faster than parsing the whole file into records, +and are memory efficient. +Be aware that the validators mutate the IO by reading it, so make sure to reset the IO before using it to parse FASTX files. + +```jldoctest +julia> io = IOBuffer(">header\r\nAGG\nKK"); + +julia> validate_fasta(io) === nothing +true + +julia> read(io) # NB: IO is now exhausted +UInt8[] + +julia> validate_fastq(IOBuffer("@header\nTTT\n+\njkm")) === nothing +true +``` From 17e57487e59f9aa05cfd73635cbb2d736109a8e2 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 22 Sep 2023 13:58:39 +0200 Subject: [PATCH 2/6] Bump to BioGenerics 0.2 and use new BioGenerics macros The new BioGenerics `@rdr_str` and `@wtr_str` macros, as well as the `defer` function, have been a longstanding FASTX todo. This change adds this new functionality. 
--- Project.toml | 13 +++++++------ src/FASTX.jl | 13 ++++++++++++- test/maintests.jl | 4 +++- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/Project.toml b/Project.toml index 216ee17..f72c8dc 100644 --- a/Project.toml +++ b/Project.toml @@ -3,9 +3,6 @@ uuid = "c2308a5c-f048-11e8-3e8a-31650f418d12" authors = ["Sabrina J. Ward ", "Jakob N. Nissen "] version = "2.1.3" -[weakdeps] -BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" - [deps] Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea" @@ -14,12 +11,15 @@ PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" StringViews = "354b36f9-a18e-4713-926e-db85100087ba" TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +[weakdeps] +BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" + [extensions] BioSequencesExt = "BioSequences" [compat] Automa = "1" -BioGenerics = "0.1.2" +BioGenerics = "0.1.3" BioSequences = "3" PrecompileTools = "1" StringViews = "1" @@ -27,10 +27,11 @@ TranscodingStreams = "0.9.5" julia = "1.6" [extras] -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["BioSequences", "Random", "Test", "FormatSpecimens"] +test = ["BioSequences", "CodecZlib", "Random", "Test", "FormatSpecimens"] diff --git a/src/FASTX.jl b/src/FASTX.jl index f539830..6055ef4 100644 --- a/src/FASTX.jl +++ b/src/FASTX.jl @@ -2,6 +2,7 @@ module FASTX using StringViews: StringView using Automa: Automa +using BioGenerics: BioGenerics, defer, @rdr_str, @wtr_str """ identifier(record::Record)::AbstractString @@ -220,6 +221,11 @@ const FASTQReader = FASTQ.Reader const FASTAWriter = FASTA.Writer const FASTQWriter = FASTQ.Writer +const FASTA_EXTENSIONS = Union{Val{:fa}, Val{:fasta}, 
Val{:faa}, Val{:fna}} + +BioGenerics.readertype(::FASTA_EXTENSIONS, arg) = FASTAReader +BioGenerics.writertype(::FASTA_EXTENSIONS, arg) = FASTAWriter + if !isdefined(Base, :get_extension) include("../ext/BioSequencesExt.jl") end @@ -249,6 +255,11 @@ export faidx, index!, extract, - seekrecord + seekrecord, + + # Re-export from BioGenerics + defer, + @rdr_str, + @wtr_str end # module diff --git a/test/maintests.jl b/test/maintests.jl index d1e7429..2eba5d5 100644 --- a/test/maintests.jl +++ b/test/maintests.jl @@ -1,3 +1,5 @@ +using CodecZlib + # Common tests @testset "FASTX" begin @testset "Copying to LongSequence" begin @@ -133,4 +135,4 @@ @test sequence(fq) == "TAGJKKm" @test quality(fq) == "jjkkmmo" end -end \ No newline at end of file +end From c2ba935d56641fade53d5473680e1cd1ace6ff9f Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 22 Sep 2023 16:15:09 +0200 Subject: [PATCH 3/6] Add docs --- docs/Project.toml | 1 + docs/src/files.md | 24 +++++++++++++++++++++++- docs/src/index.md | 14 ++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/docs/Project.toml b/docs/Project.toml index a760fb8..30dbcbf 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,4 +1,5 @@ [deps] +BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" diff --git a/docs/src/files.md b/docs/src/files.md index 3a0401d..b483d01 100644 --- a/docs/src/files.md +++ b/docs/src/files.md @@ -7,7 +7,7 @@ end # FASTX formatted files -### Readers and writers +### Readers and writers - basics A `Reader` and a `Writer` are structs that wrap an IO, and allows efficient reading/writing of FASTX `Record`s. For FASTA, use `FASTAReader` and `FASTAWriter`, and for FASTQ - well I'm sure you've guessed it. 
@@ -75,6 +75,8 @@ UInt8[] ``` To use it correctly, either call `flush`, or close the writer first (which also closes the underlying stream). + +### Readers and writers with do-syntax It is recommended to use readers and writers to `do` syntax in the form: ```jldoctest julia> FASTAWriter(open(tempname(), "w")) do writer @@ -119,6 +121,26 @@ true However, this latter syntax does not easily extend to different types of IO, such as gzip compressed streams. +### `rdr` and `wtr` macros +The `rdr` and `wtr` macros use the passed file name to determine the FASTX reader or writer to use - including any compression file extensions. +Since this both uses heuristics, and the macro is a little opaque to users, it is recommended to use these macros for ephemeral REPL work, and not in packages where the more explicit forms are preferred. + +The macro call `rdr"seqs.fna.gz"` expands to +```julia +FASTAReader(GzipDecompressorStream(open("seqs.fna.gz"; lock=false))) +``` + +To use rdr `rdr` and `wtr` macros with `do`-syntax, use the `defer` function. +The only purpose of the defer function is to enable `do`-syntax: + +```julia +record = FASTARecord("my_header", "TAGAG") + +defer(wtr"seqs.fna.gz") do writer + write(writer, record) +end +``` + ### Validate files The functions `validate_fasta` and `validate_fastq` can be used to check if an `IO` contains data that can be read as FASTX. diff --git a/docs/src/index.md b/docs/src/index.md index 719ae56..e6c0020 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -75,6 +75,20 @@ julia> FASTQWriter(GzipCompressorStream(open(tempname(), "w"))) do writer 28 ``` +For added convenience, you can also use the reader and writer macros `rdr""` and `wtr""`. +These macros use the file extensions to determine the biological sequence reader or writer type, and any file compresion. +To use these macros with the `do`-syntax, you can use the `defer` function. 
Hence, the above code block can also be written in the following equivalent way: + +```julia +using CodecZlib + +defer(rdr"seqs.fna.gz") do reader + for record in reader + println(identifier(record)) + end +end +``` + ### Construct FASTA or FASTQ records from raw parts ```jldoctest julia> fasta_record = FASTARecord("some header", dna"TAGAAGA"); From c4034390602ef76dd12a43440868927ca28901c7 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 24 Sep 2023 21:40:39 +0200 Subject: [PATCH 4/6] Update docs --- docs/src/files.md | 16 +++++++++++----- docs/src/index.md | 16 +++++++++------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/docs/src/files.md b/docs/src/files.md index b483d01..a10b197 100644 --- a/docs/src/files.md +++ b/docs/src/files.md @@ -133,12 +133,18 @@ FASTAReader(GzipDecompressorStream(open("seqs.fna.gz"; lock=false))) To use rdr `rdr` and `wtr` macros with `do`-syntax, use the `defer` function. The only purpose of the defer function is to enable `do`-syntax: -```julia -record = FASTARecord("my_header", "TAGAG") +```jldoctest +julia> using CodecZlib # for gzip files -defer(wtr"seqs.fna.gz") do writer - write(writer, record) -end +julia> defer(rdr"../test/data/test.fasta") do reader + println(identifier(first(reader))) + end +abc + +julia> defer(wtr"seqs.fna.gz") do writer + write(writer, FASTARecord("my_header", "TAGAG")) + end +17 ``` ### Validate files diff --git a/docs/src/index.md b/docs/src/index.md index e6c0020..07389df 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -79,14 +79,16 @@ For added convenience, you can also use the reader and writer macros `rdr""` and These macros use the file extensions to determine the biological sequence reader or writer type, and any file compresion. To use these macros with the `do`-syntax, you can use the `defer` function. 
Hence, the above code block can also be written in the following equivalent way: -```julia -using CodecZlib +```jldoctest +julia> using CodecZlib -defer(rdr"seqs.fna.gz") do reader - for record in reader - println(identifier(record)) - end -end +julia> defer(rdr"../test/data/seqs.fna.gz") do reader + for record in reader + println(identifier(record)) + end + end +seqa +seqb ``` ### Construct FASTA or FASTQ records from raw parts From 66c70073990bc3b3f7cf735028751b1727e0bc51 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Wed, 27 Sep 2023 19:29:06 +0200 Subject: [PATCH 5/6] Update --- docs/src/files.md | 8 ++++---- docs/src/index.md | 4 ++-- src/FASTX.jl | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/src/files.md b/docs/src/files.md index a10b197..691068a 100644 --- a/docs/src/files.md +++ b/docs/src/files.md @@ -130,18 +130,18 @@ The macro call `rdr"seqs.fna.gz"` expands to FASTAReader(GzipDecompressorStream(open("seqs.fna.gz"; lock=false))) ``` -To use rdr `rdr` and `wtr` macros with `do`-syntax, use the `defer` function. -The only purpose of the defer function is to enable `do`-syntax: +Even though the reader (or writer) is already opened, you can still use the ordinary `open(x) do f` +pattern to automatically close the reader when done: ```jldoctest julia> using CodecZlib # for gzip files -julia> defer(rdr"../test/data/test.fasta") do reader +julia> open(rdr"../test/data/test.fasta") do reader println(identifier(first(reader))) end abc -julia> defer(wtr"seqs.fna.gz") do writer +julia> open(wtr"seqs.fna.gz") do writer write(writer, FASTARecord("my_header", "TAGAG")) end 17 diff --git a/docs/src/index.md b/docs/src/index.md index 07389df..33c0614 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -77,12 +77,12 @@ julia> FASTQWriter(GzipCompressorStream(open(tempname(), "w"))) do writer For added convenience, you can also use the reader and writer macros `rdr""` and `wtr""`. 
These macros use the file extensions to determine the biological sequence reader or writer type, and any file compresion. -To use these macros with the `do`-syntax, you can use the `defer` function. Hence, the above code block can also be written in the following equivalent way: +To use these macros with the `do`-syntax, you can use `open` as normal. Hence, the above code block can also be written in the following equivalent way: ```jldoctest julia> using CodecZlib -julia> defer(rdr"../test/data/seqs.fna.gz") do reader +julia> open(rdr"../test/data/seqs.fna.gz") do reader for record in reader println(identifier(record)) end diff --git a/src/FASTX.jl b/src/FASTX.jl index 6055ef4..d084083 100644 --- a/src/FASTX.jl +++ b/src/FASTX.jl @@ -2,7 +2,7 @@ module FASTX using StringViews: StringView using Automa: Automa -using BioGenerics: BioGenerics, defer, @rdr_str, @wtr_str +using BioGenerics: BioGenerics, @rdr_str, @wtr_str """ identifier(record::Record)::AbstractString @@ -258,7 +258,6 @@ export seekrecord, # Re-export from BioGenerics - defer, @rdr_str, @wtr_str From 7eaea5b7a30954ed57000c94953fa957c404b7f5 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Wed, 27 Sep 2023 21:14:38 +0200 Subject: [PATCH 6/6] Remove duplicate doc section --- docs/src/files.md | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/docs/src/files.md b/docs/src/files.md index 691068a..5b4f3cd 100644 --- a/docs/src/files.md +++ b/docs/src/files.md @@ -7,7 +7,7 @@ end # FASTX formatted files -### Readers and writers - basics +### Readers and writers A `Reader` and a `Writer` are structs that wrap an IO, and allows efficient reading/writing of FASTX `Record`s. For FASTA, use `FASTAReader` and `FASTAWriter`, and for FASTQ - well I'm sure you've guessed it. @@ -97,28 +97,6 @@ julia> open(FASTAWriter, tempname()) do writer However, this latter syntax does not easily extend to different types of IO, such as gzip compressed streams. 
-### Validate files -The functions `validate_fasta` and `validate_fastq` can be used to check if an `IO` -contains data that can be read as FASTX. -They return `nothing` if the IO is correctly formatted, and another value if not. - -They are significantly faster than parsing the whole file into records, -and are memory efficient. -Be aware that the validators mutate the IO by reading it, so make sure to reset the IO before using it to parse FASTX files. - -```jldoctest -julia> io = IOBuffer(">header\r\nAGG\nKK"); - -julia> validate_fasta(io) === nothing -true - -julia> read(io) # NB: IO is now exhausted -UInt8[] - -julia> validate_fastq(IOBuffer("@header\nTTT\n+\njkm")) === nothing -true -``` - However, this latter syntax does not easily extend to different types of IO, such as gzip compressed streams. ### `rdr` and `wtr` macros