diff --git a/.gitignore b/.gitignore
index e845b0fd..a92ef0bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .DS_Store
 docs/build
 Manifest.toml
+.vscode/
diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 53c1470d..c28f1589 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -23,7 +23,7 @@ module TextAnalysis
     export Corpus, DirectoryCorpus
     export stemmer_types, Stemmer
     export DocumentTermMatrix
-    export text, tokens, ngrams
+    export text, tokens, ngrams, ordered_vocab
     export text!, tokens!, ngrams!
     export documents
     export language, title, author, timestamp
@@ -112,4 +112,4 @@ module TextAnalysis
     function __init__()
     end
 
-end
+end
\ No newline at end of file
diff --git a/src/coom.jl b/src/coom.jl
index e76cb151..7d320a73 100644
--- a/src/coom.jl
+++ b/src/coom.jl
@@ -22,26 +22,31 @@ of not the counts by the distance between word positions. The `mode` keyword can
 julia> using TextAnalysis, DataStructures
        doc = StringDocument("This is a text about an apple. There are many texts about apples.")
        docv = TextAnalysis.tokenize(language(doc), text(doc))
-       vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
+       vocab = ordered_vocab(doc)
        TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true)
-3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
-  [2, 1]  =  2.0
-  [1, 2]  =  2.0
-  [3, 2]  =  0.3999
-  [2, 3]  =  0.3999
+13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries:
+  ⋅    2.0   1.0   0.6665  0.5     0.4      ⋅     ⋅      ⋅       ⋅     ⋅       ⋅     ⋅
+ 2.0    ⋅    2.0   1.0     0.6665  0.5     0.4    ⋅      ⋅       ⋅     ⋅       ⋅     ⋅
+ 1.0   2.0    ⋅    2.0     1.0     0.6665  0.5   0.4     ⋅       ⋅     ⋅       ⋅     ⋅
+  ⋮                 ⋮                                     ⋮
+  ⋅     ⋅     ⋅     ⋅      2.0      ⋅      0.4   1.166  0.6665  1.0   2.0      ⋅    1.0
+  ⋅     ⋅     ⋅     ⋅      2.0      ⋅       ⋅    2.0    0.4     0.5   0.6665  1.0    ⋅
 
 julia> using TextAnalysis, DataStructures
        doc = StringDocument("This is a text about an apple. There are many texts about apples.")
        docv = TextAnalysis.tokenize(language(doc), text(doc))
-       vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
+       vocab = ordered_vocab(doc)
        TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional)
-3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
-  [2, 1]  =  1.0
-  [1, 2]  =  1.0
-  [3, 2]  =  0.1999
-  [2, 3]  =  0.1999
+13×13 SparseArrays.SparseMatrixCSC{Float16, Int64} with 106 stored entries:
+  ⋅    1.0   0.5   0.3333  0.25    0.2      ⋅     ⋅      ⋅       ⋅     ⋅       ⋅     ⋅
+ 1.0    ⋅    1.0   0.5     0.3333  0.25    0.2    ⋅      ⋅       ⋅     ⋅       ⋅     ⋅
+ 0.5   1.0    ⋅    1.0     0.5     0.3333  0.25  0.2     ⋅       ⋅     ⋅       ⋅     ⋅
+  ⋮                 ⋮                                     ⋮
+  ⋅     ⋅     ⋅     ⋅      1.0      ⋅      0.2   0.583  0.3333  0.5   1.0      ⋅    0.5
+  ⋅     ⋅     ⋅     ⋅      1.0      ⋅       ⋅    1.0    0.2     0.25  0.3333  0.5    ⋅
 ```
 """
 function coo_matrix(::Type{T},
diff --git a/src/document.jl b/src/document.jl
index 0d05c19a..e432ce9a 100644
--- a/src/document.jl
+++ b/src/document.jl
@@ -46,7 +46,7 @@ end
 #
 ##############################################################################
 
-abstract type AbstractDocument; end
+abstract type AbstractDocument end
 
 
 mutable struct FileDocument <: AbstractDocument
@@ -142,7 +142,7 @@ A TokenDocument{String}
 function TokenDocument(txt::AbstractString, dm::DocumentMetadata)
     TokenDocument(tokenize(dm.language, String(txt)), dm)
 end
-function TokenDocument(tkns::Vector{T}) where T <: AbstractString
+function TokenDocument(tkns::Vector{T}) where {T<:AbstractString}
     TokenDocument(tkns, DocumentMetadata())
 end
 TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata())
@@ -189,7 +189,7 @@ end
 function NGramDocument(txt::AbstractString, n::Integer...=1)
     NGramDocument(txt, DocumentMetadata(), n...)
 end
-function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString
+function NGramDocument(ng::Dict{T,Int}, n::Integer...=1) where {T<:AbstractString}
     NGramDocument(merge(Dict{AbstractString,Int}(), ng),
         (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata())
 end
@@ -270,17 +270,82 @@ julia> tokens(sd)
  "."
 ```
 """
-tokens(d::(Union{FileDocument, StringDocument})) = tokenize(language(d), text(d))
+tokens(d::(Union{FileDocument,StringDocument})) = tokenize(language(d), text(d))
 tokens(d::TokenDocument) = d.tokens
 function tokens(d::NGramDocument)
     error("The tokens of an NGramDocument cannot be reconstructed")
 end
 
-tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T <: AbstractString} = (d.tokens = new_tokens)
-function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where T <: AbstractString
+tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T<:AbstractString} = (d.tokens = new_tokens)
+function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where {T<:AbstractString}
     error("The tokens of a $(typeof(d)) cannot be directly edited")
 end
 
+
+##############################################################################
+#
+# ordered_vocab(): Access to a document's unique tokens as an ordered vocabulary
+#
+# to_string_vector(): Helper function for creating a vocabulary from a
+# StringDocument or a Vector{String}
+#
+##############################################################################
+
+# Converts a StringDocument to a Vector{String} of its tokens
+to_string_vector(doc::StringDocument) = tokens(doc)
+# Identity function for Vector{String}
+to_string_vector(vec::Vector{String}) = vec
+
+"""
+    ordered_vocab(input::Union{StringDocument, Vector{String}}) -> OrderedDict{String, Int}
+
+Create an ordered dictionary from a `StringDocument` or a `Vector` of strings,
+useful for building co-occurrence matrices with `coo_matrix()` (cf. the examples below).
+The dictionary maps each unique string to an integer index.
+
+# Arguments
+- `input::Union{StringDocument, Vector{String}}`: Input can be either a `StringDocument` or a `Vector{String}`.
+  For a `StringDocument`, its tokens are extracted and used; a `Vector{String}` is used as-is.
+
+# Returns
+- `OrderedDict{String, Int}`: An ordered dictionary where each key is a unique string from the input,
+  and each value is that string's position among the input's unique strings, in order of first appearance.
+
+# Examples
+```julia-repl
+julia> doc = StringDocument("This is a sample sentence of a sample document.");
+       ordered_vocab(doc)
+
+OrderedDict{String, Int64} with 8 entries:
+  "This"     => 1
+  "is"       => 2
+  "a"        => 3
+  "sample"   => 4
+  "sentence" => 5
+  ⋮          => ⋮
+
+julia> str_vec = ["This", "is", "a", "sample", "sentence", "of", "a", "sample", "document"];
+       ordered_vocab(str_vec)
+
+OrderedDict{String, Int64} with 7 entries:
+  "This"     => 1
+  "is"       => 2
+  "a"        => 3
+  "sample"   => 4
+  "sentence" => 5
+  ⋮          => ⋮
+```
+"""
+function ordered_vocab(input::Union{StringDocument,Vector{String}})
+    string_vector = to_string_vector(input) |> unique
+
+    # preallocate the ordered dictionary for the number of unique strings
+    ordered_dict = OrderedDict{String,Int}()
+    sizehint!(ordered_dict, length(string_vector))
+
+    # populate the ordered dictionary
+    for (index, key) in enumerate(string_vector)
+        ordered_dict[key] = index
+    end
+    return ordered_dict
+end
+
+
 ##############################################################################
 #
 # ngrams() / ngrams!(): Access to document text as n-gram counts
@@ -322,7 +387,7 @@ ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n...)
 ngrams(d::NGramDocument) = d.ngrams
 ngrams(d::AbstractDocument) = ngrams(d, 1)
 
-ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString, Int}) = (d.ngrams = new_ngrams)
+ngrams!(d::NGramDocument, new_ngrams::Dict{AbstractString,Int}) = (d.ngrams = new_ngrams)
 function ngrams!(d::AbstractDocument, new_ngrams::Dict)
     error("The n-grams of $(typeof(d)) cannot be directly edited")
 end
@@ -371,8 +436,8 @@ const GenericDocument = Union{
 ##############################################################################
 
 Document(str::AbstractString) = isfile(str) ? FileDocument(str) : StringDocument(str)
-Document(tkns::Vector{T}) where {T <: AbstractString} = TokenDocument(tkns)
-Document(ng::Dict{String, Int}) = NGramDocument(ng)
+Document(tkns::Vector{T}) where {T<:AbstractString} = TokenDocument(tkns)
+Document(ng::Dict{String,Int}) = NGramDocument(ng)
 
 ##############################################################################
 #
@@ -383,11 +448,11 @@ Document(ng::Dict{String, Int}) = NGramDocument(ng)
 function Base.convert(::Type{StringDocument}, d::FileDocument)
     StringDocument(text(d), d.metadata)
 end
-function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument, StringDocument}))
+function Base.convert(::Type{TokenDocument}, d::(Union{FileDocument,StringDocument}))
     TokenDocument(tokens(d), d.metadata)
 end
 function Base.convert(::Type{NGramDocument},
-    d::(Union{FileDocument, StringDocument, TokenDocument}))
+    d::(Union{FileDocument,StringDocument,TokenDocument}))
     NGramDocument(ngrams(d), 1, d.metadata)
 end
 Base.convert(::Type{TokenDocument}, d::TokenDocument) = d
diff --git a/test/document.jl b/test/document.jl
index e080f841..7eca0df5 100644
--- a/test/document.jl
+++ b/test/document.jl
@@ -1,13 +1,14 @@
+using DataStructures: OrderedDict
 
 @testset "Document" begin
-    dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1=>"v1", :k2=>"v2"))
-    @test (dmeta.language == Languages.English()) &&
-        (dmeta.title == "test title") &&
-        (dmeta.author == "test author") &&
-        (dmeta.timestamp == "test time") &&
-        (get(dmeta.custom, :k1, "") == "v1") &&
-        (get(dmeta.custom, :k2, "") == "v2")
+    dmeta = TextAnalysis.DocumentMetadata(Languages.English(), "test title", "test author", "test time", Dict(:k1 => "v1", :k2 => "v2"))
+    @test (dmeta.language == Languages.English()) &&
+          (dmeta.title == "test title") &&
+          (dmeta.author == "test author") &&
+          (dmeta.timestamp == "test time") &&
+          (get(dmeta.custom, :k1, "") == "v1") &&
+          (get(dmeta.custom, :k2, "") == "v2")
 
     # mutability
     dmeta.custom = nothing
@@ -34,6 +35,9 @@
     @test "a" in keys(ngrams(sd, 1))
     @test "string" in keys(ngrams(sd, 1))
 
+    @test ordered_vocab(sd) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4)
+    @test ordered_vocab(["This", "is", "a", "string"]) == OrderedDict("This" => 1, "is" => 2, "a" => 3, "string" => 4)
+
     @test length(sd) == 16
 
     hamlet_text = "To be or not to be..."
@@ -79,8 +83,8 @@
     @test isequal(length(Document("this is text")), 12)
 
     # NGramDocument creation with multiple ngram complexity
-    let N=((), (2,), (Int32(2),), (1,2), (Int32(1), Int16(2))), C=(1, 2, 2, [1,2], [1,2]), L=(4, 3, 3, 7, 7)
-        for (n,c,l) in zip(N,C,L)
+    let N = ((), (2,), (Int32(2),), (1, 2), (Int32(1), Int16(2))), C = (1, 2, 2, [1, 2], [1, 2]), L = (4, 3, 3, 7, 7)
+        for (n, c, l) in zip(N, C, L)
             ngd = NGramDocument(sample_text1, n...)
             @test ngram_complexity(ngd) == c
             @test length(ngd.ngrams) == l
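
Note on the changes above: `ordered_vocab` numbers each unique token in order of first appearance, which is why the `coo_matrix` docstring examples now produce a 13×13 matrix (one row/column per unique token of the example sentence) instead of the old hand-built 3×3 vocabulary. Below is a minimal usage sketch of how the new export composes with the existing `coo_matrix`, mirroring the updated docstring; the final `size` check is illustrative only and is not part of this diff.

```julia
using TextAnalysis, DataStructures

doc  = StringDocument("This is a text about an apple. There are many texts about apples.")
docv = TextAnalysis.tokenize(language(doc), text(doc))

# Build the vocabulary from the document itself rather than writing an
# OrderedDict by hand; indices follow first-appearance order of unique tokens.
vocab = ordered_vocab(doc)

# Co-occurrence counts over a window of 5, weighted by distance between
# word positions; the matrix has one row and one column per vocabulary entry.
C = TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true)

size(C) == (length(vocab), length(vocab))  # expected: true
```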