Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/ARFFFiles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ function parse_type(data, pos, len, offset, opts1, opts2)
pos, fmt = parse_string(data, pos, len, offset, opts1, opts2)
return pos, ARFFDateType(fmt)
else
-return pos, ARFFDateType(fmt)
+return pos, ARFFDateType()
end
elseif lstr == "relational"
return pos, ARFFRelationalType()
Expand Down Expand Up @@ -406,7 +406,7 @@ function parse_javadateformat(java::AbstractString)
write(io, c)
end
end
-quoted && error("unclosed quote in date format: $(repr(arff))")
+quoted && error("unclosed quote in date format: $(repr(java))")
return DateFormat(String(take!(io)))
end

Expand Down
197 changes: 197 additions & 0 deletions test/load.jl
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,200 @@ end
),
)
end

@testitem "load missingcols strict" begin
    using CategoricalArrays
    using Tables

    # Dense input with one unknown numeric value (`?`). With `missingcols=false`
    # the assertions below expect that value to come back as NaN.
    strict_src = """
    @RELATION strict
    @ATTRIBUTE num NUMERIC
    @ATTRIBUTE str STRING
    @DATA
    % keep this line to cover comment skipping
    1,"alpha"
    ?,"beta"
    """

    # Eager loading into a NamedTuple of columns.
    loaded = ARFFFiles.load(NamedTuple, IOBuffer(strict_src); missingcols=false)
    @test loaded.num[1] == 1.0
    @test isnan(loaded.num[2])
    @test loaded.str == ["alpha", "beta"]

    # The streaming reader is checked against the same input.
    strict_reader = ARFFFiles.loadstreaming(IOBuffer(strict_src); missingcols=false)
    strict_cols = ARFFFiles.readcolumns(
        strict_reader; chunkbytes=nothing, maxbytes=nothing
    )
    @test Tuple(Tables.schema(strict_cols).names) == (:num, :str)
    num_col = Tables.getcolumn(strict_cols, :num)
    @test num_col[1] == 1.0
    @test isnan(num_col[2])
    close(strict_reader)

    # Nominal column combined with a nested relational attribute.
    rel_src = """
    @RELATION combos
    @ATTRIBUTE cat {yes,no}
    @ATTRIBUTE nested RELATIONAL
    @ATTRIBUTE score NUMERIC
    @END nested
    @DATA
    'yes','1.0'
    'no','2.5'
    """
    rel_reader = ARFFFiles.loadstreaming(IOBuffer(rel_src); missingcols=false)
    rel_cols = ARFFFiles.readcolumns(rel_reader)

    cat_col = Tables.getcolumn(rel_cols, :cat)
    want_cats = CategoricalArrays.CategoricalArray(
        ["yes", "no"]; levels=["yes", "no"]
    )
    @test cat_col == want_cats

    # Each row of the relational column is itself a table with a `score` column.
    nested_col = Tables.getcolumn(rel_cols, :nested)
    @test length(nested_col) == 2
    @test all(sub -> Tables.getcolumn(sub, :score)[1] > 0, nested_col)
    close(rel_reader)
end

@testitem "load invalid data" begin
    # An unknown value in a string column is expected to raise when
    # `missingcols=false`.
    missing_str_src = """
    @RELATION strict-missing
    @ATTRIBUTE str STRING
    @DATA
    ?
    """
    @test_throws ErrorException ARFFFiles.load(
        NamedTuple, IOBuffer(missing_str_src); missingcols=false
    )

    # A value outside the declared nominal classes must be reported by name.
    bad_nominal_src = """
    @RELATION cls
    @ATTRIBUTE cls {yes,no}
    @DATA
    yes
    maybe
    """
    nominal_err = try
        ARFFFiles.load(NamedTuple, IOBuffer(bad_nominal_src))
        nothing
    catch exc
        exc
    end
    @test !isnothing(nominal_err)
    @test occursin("Invalid nominal", sprint(showerror, nominal_err))

    # A sparse row that sets the same column index twice.
    dup_index_src = """
    @RELATION dup
    @ATTRIBUTE num NUMERIC
    @ATTRIBUTE str STRING
    @DATA
    {0 1,0 2}
    """
    @test_throws ErrorException ARFFFiles.load(NamedTuple, IOBuffer(dup_index_src))

    # A sparse row that leaves out the date column while `missingcols=false`.
    omitted_date_src = """
    @RELATION strict-sparse
    @ATTRIBUTE num NUMERIC
    @ATTRIBUTE when DATE "yyyy-MM-dd'T'HH:mm:ss"
    @DATA
    {0 1}
    """
    @test_throws ErrorException ARFFFiles.load(
        NamedTuple, IOBuffer(omitted_date_src); missingcols=false
    )

    # A sparse row whose closing brace is missing.
    no_brace_src = """
    @RELATION broken
    @ATTRIBUTE num NUMERIC
    @DATA
    {0 1
    """
    @test_throws ErrorException ARFFFiles.load(NamedTuple, IOBuffer(no_brace_src))

    # A header carrying an attribute type the streaming loader does not know.
    struct DummyType <: ARFFFiles.ARFFType end
    header = ARFFFiles.ARFFHeader("dummy", [ARFFFiles.ARFFAttribute("x", DummyType())])
    @test_throws ErrorException ARFFFiles.loadstreaming(IOBuffer(""); header=header)
end

@testitem "sparse zero fill" begin
    using CategoricalArrays
    using Tables

    # The first sparse row only sets `num`; the assertions below pin what the
    # omitted string and nominal columns are filled with.
    fill_src = """
    @RELATION fill
    @ATTRIBUTE num NUMERIC
    @ATTRIBUTE str STRING
    @ATTRIBUTE cat {foo,bar}
    @DATA
    {0 1.0}
    % inline comment to cover skip loop
    {0 2.0,1 'hi',2 bar}
    """

    # Capture warnings emitted during loading so the message can be inspected.
    warn_io = IOBuffer()
    warn_logger = Base.CoreLogging.SimpleLogger(warn_io, Base.CoreLogging.Warn)
    filled = Base.CoreLogging.with_logger(
        () -> ARFFFiles.load(NamedTuple, IOBuffer(fill_src)), warn_logger
    )
    captured = String(take!(warn_io))
    @test occursin("Value of string column 'str'", captured)

    @test filled.num == [1.0, 2.0]
    @test filled.str == ["", "hi"]

    cat_col = filled.cat
    @test cat_col isa CategoricalArray{String,1,UInt32}
    @test cat_col[1] == "foo"
    @test cat_col[2] == "bar"

    @test Tables.schema(filled).names == (:num, :str, :cat)
end

@testitem "sparse non-missable date errors" begin
    # The sparse row omits the `when` date column; loading is expected to fail
    # with a message that names the column.
    date_src = """
    @RELATION missing-date
    @ATTRIBUTE num NUMERIC
    @ATTRIBUTE when DATE "yyyy-MM-dd"
    @DATA
    {0 1.0}
    """
    load_err = try
        ARFFFiles.load(NamedTuple, IOBuffer(date_src))
        nothing
    catch exc
        exc
    end
    @test !isnothing(load_err)
    rendered = sprint(showerror, load_err)
    @test occursin("Value of non-numeric column 'when'", rendered)
end

@testitem "readcolumns guard rails" begin
    using ARFFFiles
    using Dates

    # Corrupt the reader's column-kind table with an unknown kind; reading
    # columns must then raise instead of silently continuing.
    bogus_src = """
    @RELATION bogus
    @ATTRIBUTE good NUMERIC
    @DATA
    1
    """
    tampered = ARFFFiles.loadstreaming(IOBuffer(bogus_src))
    tampered.colkinds[1] = :Z
    @test_throws ErrorException ARFFFiles.readcolumns(tampered)

    # Call the per-datum reader directly with the same unknown kind tag.
    tiny_src = """
    @RELATION tiny
    @ATTRIBUTE value NUMERIC
    @DATA
    {0 1.0}
    """
    intact = ARFFFiles.loadstreaming(IOBuffer(tiny_src))
    bytes = collect(codeunits("1"))
    quote_opts = ARFFFiles.Parsing.options('\'')
    @test_throws ErrorException ARFFFiles._readcolumns_readdatum(
        intact, Val(:Z), bytes, 1, length(bytes), 0, 1, false, 1, 1,
        quote_opts, quote_opts, Float64[], nothing,
    )
end
124 changes: 124 additions & 0 deletions test/parsing.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
@testitem "parse_header edge cases" begin
    using ARFFFiles
    using ARFFFiles: ARFFNominalType
    using ARFFFiles.Parsing

    # A header line without the leading `@` should produce an "expecting" error.
    no_at_err = try
        Parsing.parse_header(IOBuffer("@RELATION demo\nnot-a-directive\n"))
        nothing
    catch exc
        exc
    end
    @test !isnothing(no_at_err)
    @test occursin("expecting", sprint(showerror, no_at_err))

    # An unrecognised directive should be reported as an invalid header item.
    unknown_err = try
        Parsing.parse_header(IOBuffer("@RELATION demo\n@FOO bar\n"))
        nothing
    catch exc
        exc
    end
    @test !isnothing(unknown_err)
    @test occursin("invalid header item", sprint(showerror, unknown_err))

    # Backslash-escaped quotes inside a quoted string are unescaped.
    single = Parsing.options('\'')
    double = Parsing.options('"')
    rel_bytes = codeunits("'demo\\'s'")
    _, relname = Parsing.parse_string(rel_bytes, 1, length(rel_bytes), 0, single, double)
    @test relname == "demo's"
    attr_bytes = codeunits("'escaped\\'name'")
    _, attrname = Parsing.parse_string(
        attr_bytes, 1, length(attr_bytes), 0, single, double
    )
    @test attrname == "escaped'name"

    # `{}` parses as a nominal type with no classes.
    empty_bytes = codeunits("{}")
    _, nominal = Parsing.parse_type(
        empty_bytes, 1, length(empty_bytes), 0, single, double
    )
    @test nominal isa ARFFNominalType
    @test isempty((nominal::ARFFNominalType).classes)

    # A dangling backslash is an error both in a string and as a lone escape.
    cut_bytes = codeunits("'unfinished\\")
    @test_throws ErrorException Parsing.parse_string(
        cut_bytes, 1, length(cut_bytes), 0, single, double
    )
    @test_throws ErrorException Parsing.parse_escape("\\")
end

@testitem "parse_type validations" begin
    using ARFFFiles.Parsing
    using ARFFFiles

    single = Parsing.options('\'')
    double = Parsing.options('"')

    # Nominal classes separated by a space instead of a comma are rejected.
    spaced_bytes = codeunits("{yes no}")
    sep_err = try
        Parsing.parse_type(spaced_bytes, 1, length(spaced_bytes), 0, single, double)
        nothing
    catch exc
        exc
    end
    @test !isnothing(sep_err)
    @test occursin(",' or '}", sprint(showerror, sep_err))

    # An unknown type keyword is reported as an invalid type.
    bogus_bytes = codeunits("bogus")
    type_err = try
        Parsing.parse_type(bogus_bytes, 1, length(bogus_bytes), 0, single, double)
        nothing
    catch exc
        exc
    end
    @test !isnothing(type_err)
    @test occursin("invalid type", sprint(showerror, type_err))

    # A bare `date` with no format uses the same format as `ARFFDateType()`.
    date_bytes = codeunits("date")
    _, datetype = Parsing.parse_type(
        date_bytes, 1, length(date_bytes), 0, single, double
    )
    @test datetype isa ARFFFiles.ARFFDateType
    @test datetype.format == ARFFFiles.ARFFDateType().format
end

@testitem "parse_javadateformat edge cases" begin
    using ARFFFiles
    using Dates

    # Two consecutive single quotes stand for one literal quote in the output.
    quote_fmt = ARFFFiles.parse_javadateformat("yy''MM")
    @test Dates.format(Date(2020, 7, 1), quote_fmt) == "20'07"

    # A pattern letter with no supported translation is rejected.
    @test_throws ErrorException ARFFFiles.parse_javadateformat("Q")

    # An unterminated quote yields an error that echoes the offending pattern.
    open_err = try
        ARFFFiles.parse_javadateformat("'open")
        nothing
    catch exc
        exc
    end
    @test open_err isa ErrorException
    rendered = sprint(showerror, open_err)
    @test occursin("unclosed quote", rendered)
    @test occursin("'open", rendered)
end

@testitem "parse_datum quote preference" begin
    using ARFFFiles
    using ARFFFiles: Parsing

    single = Parsing.options('\'')
    double = Parsing.options('"')

    # A double-quoted datum must still parse as a valid quoted string when the
    # single-quote options are passed first.
    dq_bytes = codeunits("\"quoted\"")
    dq_res = Parsing.parse_datum(String, dq_bytes, 1, length(dq_bytes), single, double)
    @test !Parsing.Parsers.invalid(dq_res.code)
    @test Parsing.Parsers.quoted(dq_res.code)
    @test Parsing.get_parsed_string(dq_bytes, dq_res) == "quoted"

    # With mismatched quotes the result must carry the same code as a plain
    # single-quote parse of the same bytes.
    bad_bytes = codeunits("'dangling\"")
    single_only = Parsing.Parsers.xparse(String, bad_bytes, 1, length(bad_bytes), single)
    combined = Parsing.parse_datum(
        String, bad_bytes, 1, length(bad_bytes), single, double
    )
    @test Parsing.Parsers.invalid(combined.code)
    @test combined.code == single_only.code
end
34 changes: 34 additions & 0 deletions test/save.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,37 @@
@test String(take!(io)) == expected
end
end

@testitem "save edge cases" begin
    using CategoricalArrays

    # A `SubString` is written as a single-quoted datum.
    sub_io = IOBuffer()
    ARFFFiles.write_datum(sub_io, SubString("quoted", 1, 6))
    @test String(take!(sub_io)) == "'quoted'"

    # A minimal user-defined `AbstractString` that only supports conversion to
    # `String` must also be writable.
    struct UInt8String <: AbstractString
        data::Vector{UInt8}
    end
    Base.eltype(::Type{UInt8String}) = UInt8
    Base.convert(::Type{String}, s::UInt8String) = String(s.data)
    Base.String(s::UInt8String) = convert(String, s)
    wrapped = UInt8String(codeunits("custom"))
    wrapped_io = IOBuffer()
    ARFFFiles.write_datum(wrapped_io, wrapped)
    @test String(take!(wrapped_io)) == "'custom'"

    # A column containing only `missing` is declared as an empty nominal type.
    all_missing_tbl = (allmissing = [missing, missing],)
    missing_io = IOBuffer()
    ARFFFiles.save(missing_io, all_missing_tbl; relation="missing", comment="")
    missing_text = String(take!(missing_io))
    @test occursin("@ATTRIBUTE 'allmissing' {}", missing_text)

    # The same holds for a categorical column with no levels.
    empty_cat_tbl = (cats = CategoricalArray([missing, missing]; levels=String[]),)
    cat_io = IOBuffer()
    ARFFFiles.save(cat_io, empty_cat_tbl; relation="cats", comment="")
    cat_text = String(take!(cat_io))
    @test occursin("@ATTRIBUTE 'cats' {}", cat_text)

    # Element types with no ARFF mapping are rejected.
    complex_tbl = (bad = ComplexF64[1 + 1im],)
    @test_throws ErrorException ARFFFiles.save(
        IOBuffer(), complex_tbl; relation="bad", comment=""
    )
end
Loading