Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ deps/build.log
*.jl.mem
Manifest.toml
.vscode
test/write_tests/
11 changes: 6 additions & 5 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@ uuid = "d71aba96-b539-5138-91ee-935c3ee1374c"
version = "1.1.2-DEV"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DataValues = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
ReadStat_jll = "a4dc8951-f1cc-5499-9034-9ec1c3e64557"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This package is meant to be a low-level wrapper around the C library that does not itself take a dependency on Tables.jl, TableTraits.jl, or anything similar. It is also typically not meant to be used by end users directly: in the realm of Queryverse, the end-user package is https://github.com/queryverse/StatFiles.jl, and that is the package that, for example, brings along the integration with TableTraits.jl.

So ideally this package would continue to depend on neither Tables nor TableTraits, and would instead just expose relatively low-level functions for writing files, staying a package with as few dependencies as possible. We can then add user-facing APIs to StatFiles.jl, or to any other package if someone wants to provide a more Tables.jl-centric experience, and those user-facing packages can share the implementation in this package.


[compat]
julia = "1.3"
DataValues = "0.4.13"
ReadStat_jll = "1.1.1"
julia = "1.3"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
63 changes: 62 additions & 1 deletion src/C_interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ function readstat_get_var_count(metadata::Ptr{Nothing})
end

# Ask libreadstat whether `value` counts as missing for `variable`.
# The C function reports the answer as an `int`, which is converted to `Bool`.
function readstat_value_is_missing(value::ReadStatValue, variable::Ptr{Nothing})
    # Only one return statement is kept: a second, identical `return` directly
    # after the first could never execute.
    return Bool(ccall((:readstat_value_is_missing, libreadstat), Cint, (ReadStatValue, Ptr{Nothing}), value, variable))
end

function readstat_variable_get_index(variable::Ptr{Nothing})
Expand Down Expand Up @@ -78,3 +78,64 @@ end
# Query libreadstat for the number of missing-value ranges attached to `variable`.
# The count is returned exactly as the C call produces it (a `Cint`).
function readstat_variable_get_missing_ranges_count(variable::Ptr{Nothing})
    range_count = ccall(
        (:readstat_variable_get_missing_ranges_count, libreadstat),
        Cint,
        (Ptr{Nothing},),
        variable,
    )
    return range_count
end


# Tell libreadstat that a new row is about to be written through `writer`.
# Returns the library's status code as an `Int`.
function readstat_begin_row(writer)
    # The C function returns `readstat_error_t`, an int-sized enum, so the ccall
    # must declare `Cint`; declaring `Int` reads 64 bits on 64-bit platforms.
    # The result is widened afterwards to keep this wrapper's return type.
    status = ccall((:readstat_begin_row, libreadstat), Cint, (Ptr{Nothing},), writer)
    return Int(status)
end

# Tell libreadstat that the row currently being written through `writer` is complete.
# Returns the library's status code as an `Int`.
function readstat_end_row(writer)
    # The C function returns `readstat_error_t`, an int-sized enum, so the ccall
    # must declare `Cint`; declaring `Int` reads 64 bits on 64-bit platforms.
    # The result is widened afterwards to keep this wrapper's return type.
    status = ccall((:readstat_end_row, libreadstat), Cint, (Ptr{Nothing},), writer)
    return Int(status)
end

# Start writing a Stata `.dta` file through `writer`.
#
# `io` is handed to the C library as an opaque context pointer (it comes back as
# the last argument of the data-writer callback), so it must be a mutable object
# that the caller keeps alive until writing has finished.
# `row_count` is the number of rows that will be written.
# Returns the library's status code as an `Int`.
function readstat_begin_writing(writer, filetype::Val{:dta}, io, row_count)
    # C signature: readstat_error_t f(readstat_writer_t *, void *, long).
    # The row count is therefore `Clong` (not `Cint`) and the return value is an
    # int-sized enum (not a 64-bit `Int`).
    status = ccall(
        (:readstat_begin_writing_dta, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Clong),
        writer, pointer_from_objref(io), row_count,
    )
    return Int(status)
end

# Start writing an SPSS `.sav` file through `writer`.
#
# `io` is handed to the C library as an opaque context pointer (it comes back as
# the last argument of the data-writer callback), so it must be a mutable object
# that the caller keeps alive until writing has finished.
# `row_count` is the number of rows that will be written.
# Returns the library's status code as an `Int`.
function readstat_begin_writing(writer, filetype::Val{:sav}, io, row_count)
    # C signature: readstat_error_t f(readstat_writer_t *, void *, long).
    # The row count is therefore `Clong` (not `Cint`) and the return value is an
    # int-sized enum (not a 64-bit `Int`).
    status = ccall(
        (:readstat_begin_writing_sav, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Clong),
        writer, pointer_from_objref(io), row_count,
    )
    return Int(status)
end

# Start writing an SPSS portable `.por` file through `writer`.
#
# `io` is handed to the C library as an opaque context pointer (it comes back as
# the last argument of the data-writer callback), so it must be a mutable object
# that the caller keeps alive until writing has finished.
# `row_count` is the number of rows that will be written.
# Returns the library's status code as an `Int`.
function readstat_begin_writing(writer, filetype::Val{:por}, io, row_count)
    # C signature: readstat_error_t f(readstat_writer_t *, void *, long).
    # The row count is therefore `Clong` (not `Cint`) and the return value is an
    # int-sized enum (not a 64-bit `Int`).
    status = ccall(
        (:readstat_begin_writing_por, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Clong),
        writer, pointer_from_objref(io), row_count,
    )
    return Int(status)
end

# Start writing a SAS `.sas7bdat` file through `writer`.
#
# `io` is handed to the C library as an opaque context pointer (it comes back as
# the last argument of the data-writer callback), so it must be a mutable object
# that the caller keeps alive until writing has finished.
# `row_count` is the number of rows that will be written.
# Returns the library's status code as an `Int`.
function readstat_begin_writing(writer, filetype::Val{:sas7bdat}, io, row_count)
    # C signature: readstat_error_t f(readstat_writer_t *, void *, long).
    # The row count is therefore `Clong` (not `Cint`) and the return value is an
    # int-sized enum (not a 64-bit `Int`).
    status = ccall(
        (:readstat_begin_writing_sas7bdat, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Clong),
        writer, pointer_from_objref(io), row_count,
    )
    return Int(status)
end

# Start writing a SAS transport (`xport`) file through `writer`.
#
# `io` is handed to the C library as an opaque context pointer (it comes back as
# the last argument of the data-writer callback), so it must be a mutable object
# that the caller keeps alive until writing has finished.
# `row_count` is the number of rows that will be written.
# Returns the library's status code as an `Int`.
function readstat_begin_writing(writer, filetype::Val{:xport}, io, row_count)
    # C signature: readstat_error_t f(readstat_writer_t *, void *, long).
    # The row count is therefore `Clong` (not `Cint`) and the return value is an
    # int-sized enum (not a 64-bit `Int`).
    status = ccall(
        (:readstat_begin_writing_xport, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Clong),
        writer, pointer_from_objref(io), row_count,
    )
    return Int(status)
end

# Insert `value` as a C `double` into `variable` of the row currently being written.
# Returns the library's status code as an `Int`.
function readstat_insert_double_value(writer, variable, value)
    # `readstat_error_t` is an int-sized enum: declare `Cint`, then widen so the
    # wrapper's return type stays `Int`.
    status = ccall(
        (:readstat_insert_double_value, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Cdouble),
        writer, variable, value,
    )
    return Int(status)
end

# Insert `value` as a C `float` into `variable` of the row currently being written.
# Returns the library's status code as an `Int`.
function readstat_insert_float_value(writer, variable, value)
    # `readstat_error_t` is an int-sized enum: declare `Cint`, then widen so the
    # wrapper's return type stays `Int`.
    status = ccall(
        (:readstat_insert_float_value, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Cfloat),
        writer, variable, value,
    )
    return Int(status)
end

# Insert `value` as a 32-bit integer into `variable` of the row currently being written.
# Returns the library's status code as an `Int`.
function readstat_insert_int32_value(writer, variable, value)
    # `readstat_error_t` is an int-sized enum: declare `Cint`, then widen so the
    # wrapper's return type stays `Int`.
    status = ccall(
        (:readstat_insert_int32_value, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Cint),
        writer, variable, value,
    )
    return Int(status)
end

# Insert `value` as a 16-bit integer into `variable` of the row currently being written.
# Returns the library's status code as an `Int`.
function readstat_insert_int16_value(writer, variable, value)
    # `readstat_error_t` is an int-sized enum: declare `Cint`, then widen so the
    # wrapper's return type stays `Int`.
    status = ccall(
        (:readstat_insert_int16_value, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Cshort),
        writer, variable, value,
    )
    return Int(status)
end

# Insert `value` as an 8-bit signed integer into `variable` of the row currently
# being written. Returns the library's status code as an `Int`.
function readstat_insert_int8_value(writer, variable, value)
    # The C parameter is `int8_t`, which is always signed. `Cchar` follows the
    # platform's plain `char` (unsigned on some architectures), so `Int8` is the
    # portable match. The return value is the int-sized `readstat_error_t`.
    status = ccall(
        (:readstat_insert_int8_value, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Int8),
        writer, variable, value,
    )
    return Int(status)
end

# Insert `value` as a NUL-terminated C string into `variable` of the row currently
# being written. Returns the library's status code as an `Int`.
function readstat_insert_string_value(writer, variable, value)
    # `readstat_error_t` is an int-sized enum: declare `Cint`, then widen so the
    # wrapper's return type stays `Int`.
    status = ccall(
        (:readstat_insert_string_value, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}, Cstring),
        writer, variable, value,
    )
    return Int(status)
end

# Record a missing value for `variable` in the row currently being written.
# Returns the library's status code as an `Int`.
function readstat_insert_missing_value(writer, variable)
    # `readstat_error_t` is an int-sized enum: declare `Cint`, then widen so the
    # wrapper's return type stays `Int`.
    status = ccall(
        (:readstat_insert_missing_value, libreadstat),
        Cint,
        (Ptr{Nothing}, Ptr{Nothing}),
        writer, variable,
    )
    return Int(status)
end

# Declare a new variable (column) on `writer`.
#
# `name` is passed as a C string, `type` as the library's int-sized type code and
# `width` as the storage width. Returns the variable handle produced by the
# library as a `Ptr{Nothing}`.
function readstat_add_variable(writer, name, type, width)
    # The C parameter for the storage width is `size_t`, so it is declared as
    # `Csize_t` rather than `Cint` to match the callee's argument size.
    return ccall(
        (:readstat_add_variable, libreadstat),
        Ptr{Nothing},
        (Ptr{Nothing}, Cstring, Cint, Csize_t),
        writer, name, type, width,
    )
end
89 changes: 88 additions & 1 deletion src/ReadStat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ using ReadStat_jll
using DataValues: DataValueVector
import DataValues
using Dates
import Tables

export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat, read_xport
export ReadStatDataFrame, read_dta, read_sav, read_por, read_sas7bdat, read_xport, write_dta, write_sav, write_por, write_sas7bdat, write_xport

##############################################################################
##
Expand Down Expand Up @@ -287,10 +288,96 @@ function parse_data_file!(ds::ReadStatDataFrame, parser::Ptr{Nothing}, filename:
retval == 0 || error("Error parsing $filename: $(error_message(retval))")
end

"""
    handle_write!(data, len, ctx)

Data-writer callback invoked by libreadstat: copy `len` bytes starting at `data`
into the IO object whose address was passed to the library as `ctx`.

Returns `len`, i.e. reports every byte as written.
"""
function handle_write!(data::Ptr{UInt8}, len::Integer, ctx::Ptr)
    io = unsafe_pointer_to_objref(ctx) # restore the IO object behind the context pointer
    # Write straight from the C buffer; no intermediate Julia array is needed.
    unsafe_write(io, data, UInt(len))
    return len
end

"""
    Writer(; filelabel)

Create a libreadstat writer, register [`handle_write!`](@ref) as its data-writer
callback and set the file label to `filelabel`.

Returns the raw writer handle (`Ptr{Nothing}`). Throws if the library fails to
allocate the writer.
"""
function Writer(; filelabel)
    writer = ccall((:readstat_writer_init, libreadstat), Ptr{Nothing}, ())
    writer == C_NULL && error("Could not initialize ReadStat writer.")
    # libreadstat declares the callback as
    #     ssize_t (*readstat_data_writer)(const void *data, size_t len, void *ctx)
    # so the length is `Csize_t` and the result `Cssize_t`; registering it with
    # `Cint` for either leaves the upper half of the 64-bit values undefined.
    write_bytes = @cfunction(handle_write!, Cssize_t, (Ptr{UInt8}, Csize_t, Ptr{Nothing}))
    ccall((:readstat_set_data_writer, libreadstat), Cint, (Ptr{Nothing}, Ptr{Nothing}), writer, write_bytes)
    ccall((:readstat_writer_set_file_label, libreadstat), Cint, (Ptr{Nothing}, Cstring), writer, filelabel)
    return writer
end

"""
    write_data_file(filename::AbstractString, filetype::Val, source; kwargs...)

Open `filename` for writing (truncating any existing content) and write `source`
to it in the format selected by `filetype`. Keyword arguments are forwarded to
the IO-based `write_data_file` method.

Returns `nothing`.
"""
function write_data_file(filename::AbstractString, filetype::Val, source; kwargs...)
    # The do-block form closes the file even when writing throws, so a failed
    # write no longer leaks the open handle.
    open(filename, "w") do io
        write_data_file(filetype, io, source; kwargs...)
    end
    return nothing
end


"""
    write_data_file(filetype::Val, io::IO, source; filelabel = "")

Write the table `source` to `io` in the format selected by `filetype`.

`source` is consumed through `Tables.rows`; its schema must be known, and the
row iterator must support `length`. `filelabel` is stored as the file label.

Returns `nothing`. Throws if the schema cannot be determined or if a column has
an element type that cannot be mapped to a ReadStat type.
"""
function write_data_file(filetype::Val, io::IO, source; filelabel = "")
    rows = Tables.rows(source)
    schema = Tables.schema(rows)
    if schema === nothing
        error("Could not determine table schema for data source.")
    end

    # Create the native writer only after validation, and release it in `finally`
    # so that an exception anywhere below does not leak it.
    writer = Writer(; filelabel = filelabel)
    try
        # libreadstat keeps a raw pointer to `io` and hands it back to the
        # data-writer callback, so `io` has to stay rooted for the whole write.
        GC.@preserve io begin
            variables = map(schema.names, schema.types) do column_name, column_type
                readstat_type, storage_width = readstat_column_type_and_width(source, column_name, nonmissingtype(column_type))
                # TODO: label for a variable, e.g.
                # readstat_variable_set_label(variable, String(field))
                return add_variable!(writer, column_name, readstat_type, storage_width)
            end

            readstat_begin_writing(writer, filetype, io, length(rows))

            for row in rows
                readstat_begin_row(writer)
                Tables.eachcolumn(schema, row) do val, i, name
                    insert_value!(writer, variables[i], val)
                end
                readstat_end_row(writer)
            end

            ccall((:readstat_end_writing, libreadstat), Cint, (Ptr{Nothing},), writer)
        end
    finally
        ccall((:readstat_writer_free, libreadstat), Cvoid, (Ptr{Nothing},), writer)
    end
    return nothing
end

# Map a column's (non-missing) element type to a `(readstat type code, storage width)`
# pair. The first two positional arguments (source and column name) are ignored by
# these methods; every fixed-size numeric type reports a width of 0.

# Fallback for element types without a dedicated method: fail with an explanatory error.
function readstat_column_type_and_width(_, _, other_type)
    error("Cannot handle column with element type $other_type. Is this type supported by ReadStat?")
end

# Integer columns; `Int8` uses the CHAR type code.
readstat_column_type_and_width(_, _, ::Type{Int8}) = (READSTAT_TYPE_CHAR, 0)
readstat_column_type_and_width(_, _, ::Type{Int16}) = (READSTAT_TYPE_INT16, 0)
readstat_column_type_and_width(_, _, ::Type{Int32}) = (READSTAT_TYPE_INT32, 0)

# Floating-point columns.
readstat_column_type_and_width(_, _, ::Type{Float32}) = (READSTAT_TYPE_FLOAT, 0)
readstat_column_type_and_width(_, _, ::Type{Float64}) = (READSTAT_TYPE_DOUBLE, 0)
"""
    readstat_column_type_and_width(source, colname, ::Type{String})

Choose the ReadStat type and storage width for the string column `colname` of
`source`, based on the longest value in the column measured in code units
(`missing` entries count as length 0).

Returns `(READSTAT_TYPE_STRING, maxlen)` when every value fits into a normal
string, and `(READSTAT_TYPE_LONG_STRING, 0)` otherwise. An empty column yields
`(READSTAT_TYPE_STRING, 0)`.
"""
function readstat_column_type_and_width(source, colname, ::Type{String})
    # Go through `Tables.columns` so that sources which only offer row access
    # can be measured as well.
    col = Tables.getcolumn(Tables.columns(source), colname)
    # `init = 0` keeps the reduction defined for an empty column, where a plain
    # `maximum` would throw.
    maxlen = mapreduce(max, col; init = 0) do str
        str === missing ? 0 : ncodeunits(str)
    end
    # 2045 is the maximum length of normal strings, so a value of exactly that
    # length still fits; only longer values need the long-string type.
    if maxlen > 2045
        return READSTAT_TYPE_LONG_STRING, 0
    else
        return READSTAT_TYPE_STRING, maxlen
    end
end

# Register a column called `name` with the given ReadStat `type` on `writer`.
# `width` defaults to 0. Returns whatever `readstat_add_variable` returns.
function add_variable!(writer, name, type, width = 0)
    return readstat_add_variable(writer, name, type, width)
end

# Write one cell into the current row, dispatching on the Julia type of the value
# to the matching libreadstat insert wrapper.

# Missing cells use the dedicated missing-value call.
function insert_value!(writer, variable, ::Missing)
    return readstat_insert_missing_value(writer, variable)
end

# Floating-point cells.
function insert_value!(writer, variable, value::Float64)
    return readstat_insert_double_value(writer, variable, value)
end

function insert_value!(writer, variable, value::Float32)
    return readstat_insert_float_value(writer, variable, value)
end

# Integer cells, one method per supported width.
function insert_value!(writer, variable, value::Int8)
    return readstat_insert_int8_value(writer, variable, value)
end

function insert_value!(writer, variable, value::Int16)
    return readstat_insert_int16_value(writer, variable, value)
end

function insert_value!(writer, variable, value::Int32)
    return readstat_insert_int32_value(writer, variable, value)
end

# String cells.
function insert_value!(writer, variable, value::AbstractString)
    return readstat_insert_string_value(writer, variable, value)
end

# Public readers: each one forwards to `read_data_file` with the `Val` tag that
# selects the file format.
function read_dta(filename::AbstractString)
    return read_data_file(filename, Val(:dta))
end

function read_sav(filename::AbstractString)
    return read_data_file(filename, Val(:sav))
end

function read_por(filename::AbstractString)
    return read_data_file(filename, Val(:por))
end

function read_sas7bdat(filename::AbstractString)
    return read_data_file(filename, Val(:sas7bdat))
end

function read_xport(filename::AbstractString)
    return read_data_file(filename, Val(:xport))
end

# Public writers: each one forwards `source` and any keyword arguments to
# `write_data_file` with the `Val` tag that selects the file format.
function write_dta(filename::AbstractString, source; kwargs...)
    return write_data_file(filename, Val(:dta), source; kwargs...)
end

function write_sav(filename::AbstractString, source; kwargs...)
    return write_data_file(filename, Val(:sav), source; kwargs...)
end

function write_por(filename::AbstractString, source; kwargs...)
    return write_data_file(filename, Val(:por), source; kwargs...)
end

function write_sas7bdat(filename::AbstractString, source; kwargs...)
    return write_data_file(filename, Val(:sas7bdat), source; kwargs...)
end

function write_xport(filename::AbstractString, source; kwargs...)
    return write_data_file(filename, Val(:xport), source; kwargs...)
end

end #module ReadStat
89 changes: 71 additions & 18 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,75 @@ using ReadStat
using DataValues
using Test

@testset "ReadStat: $ext files" for (reader, ext) in
((read_dta, "dta"),
(read_sav, "sav"),
(read_sas7bdat, "sas7bdat"),
(read_xport, "xpt"))

dtafile = joinpath(dirname(@__FILE__), "types.$ext")
rsdf = reader(dtafile)
data = rsdf.data

@test length(data) == 6
@test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring]
@test data[1] == DataValueArray{Float32}([3.14, 7., NA])
@test data[2] == DataValueArray{Float64}([3.14, 7., NA])
@test data[3] == DataValueArray{Int32}([2, 7, NA])
@test data[4] == DataValueArray{Int16}([2, 7, NA])
@test data[5] == DataValueArray{Int8}([2, 7., NA])
@test data[6] == DataValueArray{String}(["2", "7", ""])
# Scratch directory for files produced by the writer tests. Any directory left
# over from a previous run is removed first so every run starts empty.
testdir = joinpath(@__DIR__, "write_tests")
isdir(testdir) && rm(testdir, recursive = true)
mkdir(testdir)

# Top-level test set: runs the same reading and writing checks once per file
# format, pairing each reader with its writer and the file extension it uses.
@testset "ReadStat" begin
@testset "$ext files" for (reader, writer, ext) in
((read_dta, write_dta, "dta"),
(read_sav, write_sav, "sav"),
(read_sas7bdat, write_sas7bdat, "sas7bdat"),
(read_xport, write_xport, "xpt"))

# Read the `types.$ext` fixture stored next to this file and check the column
# names and the values of all six columns.
@testset "Reading" begin
dtafile = joinpath(@__DIR__, "types.$ext")
rsdf = reader(dtafile)
data = rsdf.data

@test length(data) == 6
@test rsdf.headers == [:vfloat, :vdouble, :vlong, :vint, :vbyte, :vstring]
@test data[1] == DataValueArray{Float32}([3.14, 7., NA])
@test data[2] == DataValueArray{Float64}([3.14, 7., NA])
@test data[3] == DataValueArray{Int32}([2, 7, NA])
@test data[4] == DataValueArray{Int16}([2, 7, NA])
@test data[5] == DataValueArray{Int8}([2, 7., NA])
@test data[6] == DataValueArray{String}(["2", "7", ""])
end

# Round trip: write a NamedTuple of columns (each ending in `missing`) into the
# scratch directory `testdir`, read the file back and compare cell by cell.
@testset "Writing" begin
data = (
vdouble = [3.14, 7., missing],
vfloat = [3.14f0, 7.f0, missing],
vint32 = [Int32(2), Int32(7), missing],
vint16 = [Int16(2), Int16(7), missing],
vint8 = [Int8(2), Int8(7), missing],
vstring = ["2", "7", missing],
)
filepath = joinpath(testdir, "testwrite.$ext")
writer(filepath, data)
rsdf = reader(filepath)
data_read = rsdf.data
@test length(data_read) == length(data)
@test rsdf.headers == collect(keys(data))

# `same_value` compares a cell that was read back (a `DataValue`) with the cell
# that was written; values are compared with `==`, not by type.
same_value(a::DataValue, b) = a.hasvalue && get(a) == b # SAS and SPSS only support Float64 and String, so we can't test ===
same_value(a::DataValue, b::Missing) = !a.hasvalue
# missing String appears to be read back in as the empty string ""
same_value(a::DataValue{String}, b::Missing) = a.hasvalue && get(a) == ""

# Every column read back must match the written column element by element.
@test all(zip(data_read, values(data))) do (col_read, col)
all(Base.splat(same_value), zip(col_read, col))
end
end

# A 2046-code-unit string is written and read back; the equality check is marked
# `@test_broken`, i.e. this round trip is currently expected not to succeed.
@testset "Long string" begin
data = (x = ["a" ^ 2046, missing],)
filepath = joinpath(testdir, "testwrite_longstring.$ext")
writer(filepath, data)
rsdf = reader(filepath)
data_read = rsdf.data
@test_broken get(data_read[1][1]) == "a" ^ 2046
end

# The `filelabel` keyword given to the writer must come back as `rsdf.filelabel`.
@testset "File metadata" begin
data = (a = Int32[1, 2, 3],)
filepath = joinpath(testdir, "testwrite_file_metadata.$ext")
writer(filepath, data; filelabel = "Test label")
rsdf = reader(filepath)
@test rsdf.filelabel == "Test label"
end
end
end