diff --git a/src/ARFFFiles.jl b/src/ARFFFiles.jl
index a45ba93..18a2fa8 100644
--- a/src/ARFFFiles.jl
+++ b/src/ARFFFiles.jl
@@ -255,7 +255,7 @@ function parse_type(data, pos, len, offset, opts1, opts2)
             pos, fmt = parse_string(data, pos, len, offset, opts1, opts2)
             return pos, ARFFDateType(fmt)
         else
-            return pos, ARFFDateType(fmt)
+            return pos, ARFFDateType()
         end
     elseif lstr == "relational"
         return pos, ARFFRelationalType()
@@ -406,7 +406,7 @@ function parse_javadateformat(java::AbstractString)
             write(io, c)
         end
     end
-    quoted && error("unclosed quote in date format: $(repr(arff))")
+    quoted && error("unclosed quote in date format: $(repr(java))")
     return DateFormat(String(take!(io)))
 end
 
diff --git a/test/load.jl b/test/load.jl
index 8393add..bd2ee9d 100644
--- a/test/load.jl
+++ b/test/load.jl
@@ -142,3 +142,208 @@ end
         ),
     )
 end
+
+@testitem "load missingcols strict" begin
+    using CategoricalArrays
+    using Tables
+    # with missingcols=false a missing numeric value ('?') reads back as NaN
+    arff = """
+    @RELATION strict
+    @ATTRIBUTE num NUMERIC
+    @ATTRIBUTE str STRING
+    @DATA
+    % keep this line to cover comment skipping
+    1,"alpha"
+    ?,"beta"
+    """
+    table = ARFFFiles.load(NamedTuple, IOBuffer(arff); missingcols=false)
+    @test table.num[1] == 1.0
+    @test isnan(table.num[2])
+    @test table.str == ["alpha", "beta"]
+
+    reader = ARFFFiles.loadstreaming(IOBuffer(arff); missingcols=false)
+    chunk = ARFFFiles.readcolumns(reader; chunkbytes=nothing, maxbytes=nothing)
+    @test Tuple(Tables.schema(chunk).names) == (:num, :str)
+    nums = Tables.getcolumn(chunk, :num)
+    @test nums[1] == 1.0
+    @test isnan(nums[2])
+    close(reader)
+
+    # nominal and relational columns read through the streaming reader
+    arff_rel = """
+    @RELATION combos
+    @ATTRIBUTE cat {yes,no}
+    @ATTRIBUTE nested RELATIONAL
+    @ATTRIBUTE score NUMERIC
+    @END nested
+    @DATA
+    'yes','1.0'
+    'no','2.5'
+    """
+    reader = ARFFFiles.loadstreaming(IOBuffer(arff_rel); missingcols=false)
+    relchunk = ARFFFiles.readcolumns(reader)
+    cats = Tables.getcolumn(relchunk, :cat)
+    expected_cats = CategoricalArrays.CategoricalArray(["yes", "no"], levels = ["yes", "no"])
+    @test cats == expected_cats
+    nested = Tables.getcolumn(relchunk, :nested)
+    @test length(nested) == 2
+    @test all(x -> Tables.getcolumn(x, :score)[1] > 0, nested)
+    close(reader)
+end
+
+@testitem "load invalid data" begin
+    # strict missing string column should error
+    strict_missing = """
+    @RELATION strict-missing
+    @ATTRIBUTE str STRING
+    @DATA
+    ?
+    """
+    @test_throws ErrorException ARFFFiles.load(NamedTuple, IOBuffer(strict_missing); missingcols=false)
+
+    # invalid nominal choice
+    invalid_nominal = """
+    @RELATION cls
+    @ATTRIBUTE cls {yes,no}
+    @DATA
+    yes
+    maybe
+    """
+    err = try
+        ARFFFiles.load(NamedTuple, IOBuffer(invalid_nominal))
+        nothing
+    catch ex
+        ex
+    end
+    @test err !== nothing
+    @test occursin("Invalid nominal", sprint(showerror, err))
+
+    # sparse row with duplicate column index
+    dup_sparse = """
+    @RELATION dup
+    @ATTRIBUTE num NUMERIC
+    @ATTRIBUTE str STRING
+    @DATA
+    {0 1,0 2}
+    """
+    @test_throws ErrorException ARFFFiles.load(NamedTuple, IOBuffer(dup_sparse))
+
+    # sparse row omitting the DATE column 'when' while missingcols=false
+    strict_sparse = """
+    @RELATION strict-sparse
+    @ATTRIBUTE num NUMERIC
+    @ATTRIBUTE when DATE "yyyy-MM-dd'T'HH:mm:ss"
+    @DATA
+    {0 1}
+    """
+    @test_throws ErrorException ARFFFiles.load(NamedTuple, IOBuffer(strict_sparse); missingcols=false)
+
+    # sparse row missing closing brace
+    unclosed_sparse = """
+    @RELATION broken
+    @ATTRIBUTE num NUMERIC
+    @DATA
+    {0 1
+    """
+    @test_throws ErrorException ARFFFiles.load(NamedTuple, IOBuffer(unclosed_sparse))
+
+    # unsupported column type in header
+    struct DummyType <: ARFFFiles.ARFFType end
+    header = ARFFFiles.ARFFHeader("dummy", [ARFFFiles.ARFFAttribute("x", DummyType())])
+    @test_throws ErrorException ARFFFiles.loadstreaming(IOBuffer(""), header=header)
+end
+
+@testitem "sparse zero fill" begin
+    using CategoricalArrays
+    using Tables
+
+    arff = """
+    @RELATION fill
+    @ATTRIBUTE num NUMERIC
+    @ATTRIBUTE str STRING
+    @ATTRIBUTE cat {foo,bar}
+    @DATA
+    {0 1.0}
+    % inline comment to cover skip loop
+    {0 2.0,1 'hi',2 bar}
+    """
+
+    # capture warnings so the message about the omitted string value can be checked
+    logbuf = IOBuffer()
+    logger = Base.CoreLogging.SimpleLogger(logbuf, Base.CoreLogging.Warn)
+    table = Base.CoreLogging.with_logger(logger) do
+        ARFFFiles.load(NamedTuple, IOBuffer(arff))
+    end
+    @test occursin("Value of string column 'str'", String(take!(logbuf)))
+    @test table.num == [1.0, 2.0]
+    # the first row omits str and cat; they read back as "" and "foo"
+    @test table.str == ["", "hi"]
+    cats = table.cat
+    @test cats isa CategoricalArray{String,1,UInt32}
+    @test cats[1] == "foo"
+    @test cats[2] == "bar"
+    @test Tables.schema(table).names == (:num, :str, :cat)
+end
+
+@testitem "sparse non-missable date errors" begin
+    # a sparse row that leaves out the DATE column must fail and name the column
+    arff = """
+    @RELATION missing-date
+    @ATTRIBUTE num NUMERIC
+    @ATTRIBUTE when DATE "yyyy-MM-dd"
+    @DATA
+    {0 1.0}
+    """
+    err = try
+        ARFFFiles.load(NamedTuple, IOBuffer(arff))
+        nothing
+    catch ex
+        ex
+    end
+    @test err !== nothing
+    message = sprint(showerror, err)
+    @test occursin("Value of non-numeric column 'when'", message)
+end
+
+@testitem "readcolumns guard rails" begin
+    using ARFFFiles
+
+    reader = ARFFFiles.loadstreaming(IOBuffer("""
+    @RELATION bogus
+    @ATTRIBUTE good NUMERIC
+    @DATA
+    1
+    """))
+    # overwrite the column kind with a bogus symbol; readcolumns must throw
+    reader.colkinds[1] = :Z
+    @test_throws ErrorException ARFFFiles.readcolumns(reader)
+    close(reader)
+
+    sparse = """
+    @RELATION tiny
+    @ATTRIBUTE value NUMERIC
+    @DATA
+    {0 1.0}
+    """
+    legit = ARFFFiles.loadstreaming(IOBuffer(sparse))
+    chunk = collect(codeunits("1"))
+    opts = ARFFFiles.Parsing.options('\'')
+    # the per-datum reader must also throw when handed the same bogus kind
+    @test_throws ErrorException ARFFFiles._readcolumns_readdatum(
+        legit,
+        Val(:Z),
+        chunk,
+        1,
+        length(chunk),
+        0,
+        1,
+        false,
+        1,
+        1,
+        opts,
+        opts,
+        Float64[],
+        nothing,
+    )
+    close(legit)
+end
diff --git a/test/parsing.jl b/test/parsing.jl
new file mode 100644
index 0000000..761388c
--- /dev/null
+++ b/test/parsing.jl
@@ -0,0 +1,125 @@
+@testitem "parse_header edge cases" begin
+    using ARFFFiles
+    using ARFFFiles: ARFFNominalType
+    using ARFFFiles.Parsing
+
+    # missing leading @ should trigger expect error
+    err = try
+        Parsing.parse_header(IOBuffer("@RELATION demo\nnot-a-directive\n"))
+        nothing
+    catch ex
+        ex
+    end
+    @test err !== nothing
+    @test occursin("expecting", sprint(showerror, err))
+
+    # unknown directive should trigger invalid header item
+    err = try
+        Parsing.parse_header(IOBuffer("@RELATION demo\n@FOO bar\n"))
+        nothing
+    catch ex
+        ex
+    end
+    @test err !== nothing
+    @test occursin("invalid header item", sprint(showerror, err))
+
+    # string parsing supports escaped quotes
+    single = Parsing.options('\'')
+    double = Parsing.options('"')
+    data = codeunits("'demo\\'s'")
+    _, relation = Parsing.parse_string(data, 1, length(data), 0, single, double)
+    @test relation == "demo's"
+    data = codeunits("'escaped\\'name'")
+    _, attrname = Parsing.parse_string(data, 1, length(data), 0, single, double)
+    @test attrname == "escaped'name"
+
+    # empty nominal list is accepted
+    data = codeunits("{}")
+    _, attrtype = Parsing.parse_type(data, 1, length(data), 0, single, double)
+    @test attrtype isa ARFFNominalType
+    @test isempty((attrtype::ARFFNominalType).classes)
+
+    # malformed escape sequences bubble the parse error
+    data = codeunits("'unfinished\\")
+    @test_throws ErrorException Parsing.parse_string(data, 1, length(data), 0, single, double)
+    @test_throws ErrorException Parsing.parse_escape("\\")
+end
+
+@testitem "parse_type validations" begin
+    using ARFFFiles.Parsing
+    using ARFFFiles
+    # invalid separators inside nominal braces
+    single = Parsing.options('\'')
+    double = Parsing.options('"')
+    data = codeunits("{yes no}")
+    err = try
+        Parsing.parse_type(data, 1, length(data), 0, single, double)
+        nothing
+    catch ex
+        ex
+    end
+    @test err !== nothing
+    @test occursin(",' or '}", sprint(showerror, err))
+
+    # an unrecognised type keyword is reported as an invalid type
+    err = try
+        Parsing.parse_type(codeunits("bogus"), 1, length("bogus"), 0, single, double)
+        nothing
+    catch ex
+        ex
+    end
+    @test err !== nothing
+    @test occursin("invalid type", sprint(showerror, err))
+
+    # default DATE attributes fall back to the module default format
+    data = codeunits("date")
+    _, attrtype = Parsing.parse_type(data, 1, length(data), 0, single, double)
+    @test attrtype isa ARFFFiles.ARFFDateType
+    @test attrtype.format == ARFFFiles.ARFFDateType().format
+end
+
+@testitem "parse_javadateformat edge cases" begin
+    using ARFFFiles
+    using Dates
+
+    # doubled quotes emit a literal quote
+    fmt = ARFFFiles.parse_javadateformat("yy''MM")
+    @test Dates.format(Date(2020, 7, 1), fmt) == "20'07"
+
+    # unsupported format character
+    @test_throws ErrorException ARFFFiles.parse_javadateformat("Q")
+
+    # unclosed quote reports a descriptive error
+    err = try
+        ARFFFiles.parse_javadateformat("'open")
+        nothing
+    catch ex
+        ex
+    end
+    @test err isa ErrorException
+    message = sprint(showerror, err)
+    @test occursin("unclosed quote", message)
+    @test occursin("'open", message)
+end
+
+@testitem "parse_datum quote preference" begin
+    using ARFFFiles
+    using ARFFFiles: Parsing
+
+    single = Parsing.options('\'')
+    double = Parsing.options('"')
+
+    # double-quoted strings fall back from the single-quoted parser
+    data = codeunits("\"quoted\"")
+    res = Parsing.parse_datum(String, data, 1, length(data), single, double)
+    @test !Parsing.Parsers.invalid(res.code)
+    @test Parsing.Parsers.quoted(res.code)
+    @test Parsing.get_parsed_string(data, res) == "quoted"
+
+    # mismatched quotes prefer the single-quoted parse result
+    data = codeunits("'dangling\"")
+    res1 = Parsing.Parsers.xparse(String, data, 1, length(data), single)
+    res = Parsing.parse_datum(String, data, 1, length(data), single, double)
+    @test Parsing.Parsers.invalid(res.code)
+    @test res.code == res1.code
+end
diff --git a/test/save.jl b/test/save.jl
index 8a1c55a..ef09f43 100644
--- a/test/save.jl
+++ b/test/save.jl
@@ -46,3 +46,41 @@
         @test String(take!(io)) == expected
     end
 end
+
+@testitem "save edge cases" begin
+    using CategoricalArrays
+
+    # a SubString is written as a quoted string
+    io = IOBuffer()
+    ARFFFiles.write_datum(io, SubString("quoted", 1, 6))
+    @test String(take!(io)) == "'quoted'"
+
+    # minimal AbstractString subtype defining just eltype and String conversion
+    struct UInt8String <: AbstractString
+        data::Vector{UInt8}
+    end
+    Base.eltype(::Type{UInt8String}) = UInt8
+    Base.convert(::Type{String}, s::UInt8String) = String(s.data)
+    Base.String(s::UInt8String) = convert(String, s)
+    custom = UInt8String(codeunits("custom"))
+    io = IOBuffer()
+    ARFFFiles.write_datum(io, custom)
+    @test String(take!(io)) == "'custom'"
+
+    # a column holding only missing values is declared with an empty nominal set
+    missing_only = (allmissing = [missing, missing],)
+    io = IOBuffer()
+    ARFFFiles.save(io, missing_only, relation = "missing", comment = "")
+    text = String(take!(io))
+    @test occursin("@ATTRIBUTE 'allmissing' {}", text)
+
+    cat_missing = (cats = CategoricalArray([missing, missing], levels = String[]),)
+    io = IOBuffer()
+    ARFFFiles.save(io, cat_missing, relation = "cats", comment = "")
+    text = String(take!(io))
+    @test occursin("@ATTRIBUTE 'cats' {}", text)
+
+    # a ComplexF64 column is expected to be rejected with an ErrorException
+    unsupported = (bad = ComplexF64[1 + 1im],)
+    @test_throws ErrorException ARFFFiles.save(IOBuffer(), unsupported, relation = "bad", comment = "")
+end