diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..5070066 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,9 @@ +# Agent Notes for ARFFFiles.jl + +- This repo uses Julia's workspace-aware test layout. The root `Project.toml` lists the `test` project under `[workspace]`, and the test environment (its dependencies) is declared in `test/Project.toml`, next to the test files themselves. + - Activate and instantiate the full workspace with `julia --project=@. -e 'using Pkg; Pkg.instantiate(workspace=true; allow_autoprecomp=false)'` when dependencies change. + - Tests should be executed with `julia --project=@. -e 'using Pkg; Pkg.test()'` so that the workspace metadata is respected. +- The `test` project is a plain environment (no package). Activate it from the repository root; `TestItemRunner` will still execute the `@testitem` files via `include`. +- The `simple.arff`, `simple_with_missing.arff`, `sparse_example.arff`, and `relational_example.arff` fixtures under `test/data` are generated via `test/data/generate_datasets.jl`. Run that script with Julia instead of editing those `.arff` fixtures by hand, and keep the emitted comment string set to `"test data file"` so the save tests comparing against the fixtures continue to pass. +- If additional tooling or conventions become important, append them here so future agents stay up to date. **Keep this file current with any new discoveries.** +- When working on relational columns, remember that nested `ARFFReader`s get reused; reset their buffers (see the `:R` handling in `src/ARFFFiles.jl`) if you make structural changes. 
diff --git a/Project.toml b/Project.toml index 2090093..bef0613 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,9 @@ uuid = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8" authors = ["Christopher Doris "] version = "1.5.1" +[workspace] +projects = ["test"] + [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" @@ -10,19 +13,8 @@ Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -Aqua = "0 - 999" CategoricalArrays = "0.10, 1.0" Dates = "1" Parsers = "2.8" Tables = "1.12" -Test = "1" -TestItemRunner = "0 - 999" julia = "1.6" - -[extras] -Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" - -[targets] -test = ["Aqua", "Test", "TestItemRunner"] diff --git a/src/ARFFFiles.jl b/src/ARFFFiles.jl index 441e893..a45ba93 100644 --- a/src/ARFFFiles.jl +++ b/src/ARFFFiles.jl @@ -1025,7 +1025,10 @@ end elseif kind == :R || kind == :RX str = Parsing.get_parsed_string(chunk, res) r2 = info - r2.io = IOBuffer(str) + r2.io = IOBuffer(String(str)) + r2.chunk = ARFFTable(_schema([], []), Dict()) + r2.chunklen = 0 + r2.chunkidx = 0 push!(col, readcolumns(r2)) else error() diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..b8e6e24 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,8 @@ +[deps] +ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8" +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" diff --git a/test/data/generate_datasets.jl b/test/data/generate_datasets.jl new file mode 100755 index 0000000..9b138bf --- /dev/null +++ b/test/data/generate_datasets.jl @@ -0,0 +1,126 @@ 
+#!/usr/bin/env julia +using ARFFFiles +using Dates +using CategoricalArrays +#= Regenerates the .arff fixtures that sit next to this script: two through ARFFFiles.save and two written line by line. main() at the bottom writes all four. =# +const DATADIR = @__DIR__ #= directory containing this script; every fixture is written here =# +const COMMENT = "test data file" #= passed as the save comment and written as the leading "% ..." line of the hand-written fixtures =# +#= write_simple_with_missing(path): save a five-column table (nums, ints, strs, cats, dates) in which each column holds one NaN or missing entry to path through ARFFFiles.save, using relation "test-data" and COMMENT. =# +function write_simple_with_missing(path) + data = ( + nums = [1.5, 2.5, NaN], + ints = Union{Missing, Int}[1, missing, 3], + strs = Union{Missing, String}["foo", "bar", missing], + cats = CategoricalArray([missing, "c", "b"], levels = ["a", "b", "c"]), + dates = Union{Missing, Date}[Date(2020, 1, 2), missing, Date(2022, 4, 5)], + ) + ARFFFiles.save( + path, + data, + relation = "test-data", + comment = COMMENT, + ) +end +#= write_simple(path): same five columns and save options as write_simple_with_missing, but with every entry present. =# +function write_simple(path) + data = ( + nums = [1.5, 2.5, 3.5], + ints = [1, 2, 3], + strs = ["foo", "bar", "baz"], + cats = CategoricalArray(["a", "c", "b"], levels = ["a", "b", "c"]), + dates = [Date(2020, 1, 2), Date(2021, 3, 4), Date(2022, 4, 5)], + ) + ARFFFiles.save( + path, + data, + relation = "test-data", + comment = COMMENT, + ) +end +#= write_sparse(path): write the file by hand instead of calling ARFFFiles.save. Emits the comment line, the @RELATION line and three @ATTRIBUTE lines, then @DATA rows of the form {idx value,idx value,...} built from the (idx, value) pairs listed in rows; names and values are printed with ARFFFiles.write_datum. =# +function write_sparse(path) + open(path, "w") do io + println(io, "% $COMMENT") + println(io) + print(io, "@RELATION ") + ARFFFiles.write_datum(io, "sparse-example") + println(io, "\n") + for (name, type) in (("num1", "NUMERIC"), ("str1", "STRING"), ("cat1", "{yes,no}")) + print(io, "@ATTRIBUTE ") + ARFFFiles.write_datum(io, name) + println(io, " $type") + end + println(io) + println(io, "@DATA") + rows = [ + [(0, 1.5), (1, "foo"), (2, "yes")], + [(1, "bar")], + [(0, 3.0), (2, "no")], + ] + for row in rows + print(io, "{") + for (j, (idx, value)) in enumerate(row) + j > 1 && print(io, ",") + print(io, idx, " ") + ARFFFiles.write_datum(io, value) + end + println(io, "}") + end + end +end +#= write_relational(path): hand-write a file whose second attribute, "measurements", is declared RELATIONAL with nested attributes temp (NUMERIC) and flag (STRING) and closed by an @END line; "id" precedes it and "label" follows. Each data row hands its nested rows to ARFFFiles.write_datum as a single string (newline-separated in the first row, empty in the last). =# +function write_relational(path) + open(path, "w") do io + println(io, "% $COMMENT") + println(io) + print(io, "@RELATION ") + ARFFFiles.write_datum(io, "relational-example") + println(io, "\n") + print(io, "@ATTRIBUTE ") + ARFFFiles.write_datum(io, "id") + println(io, " NUMERIC") + print(io, "@ATTRIBUTE ") + ARFFFiles.write_datum(io, "measurements") + println(io, " RELATIONAL") 
+ nested_attrs = [ + ("temp", "NUMERIC"), + ("flag", "STRING"), + ] + for (name, type) in nested_attrs + print(io, " @ATTRIBUTE ") + ARFFFiles.write_datum(io, name) + println(io, " $type") + end + print(io, "@END ") + ARFFFiles.write_datum(io, "measurements") + println(io) + print(io, "@ATTRIBUTE ") + ARFFFiles.write_datum(io, "label") + println(io, " {yes,no}") + println(io) + println(io, "@DATA") + rows = ( + (id = 1, nested = "1.0,hot\n2.0,cold", label = "yes"), + (id = 2, nested = "3.0,warm", label = "no"), + (id = 3, nested = "", label = missing), + ) + for row in rows + ARFFFiles.write_datum(io, row.id) + print(io, ",") + ARFFFiles.write_datum(io, row.nested) + print(io, ",") + ARFFFiles.write_datum(io, row.label) + println(io) + end + end +end + +function main() + mkpath(DATADIR) + write_simple_with_missing(joinpath(DATADIR, "simple_with_missing.arff")) + write_simple(joinpath(DATADIR, "simple.arff")) + write_sparse(joinpath(DATADIR, "sparse_example.arff")) + write_relational(joinpath(DATADIR, "relational_example.arff")) +end + +main() diff --git a/test/data/relational_example.arff b/test/data/relational_example.arff new file mode 100644 index 0000000..686dc30 --- /dev/null +++ b/test/data/relational_example.arff @@ -0,0 +1,15 @@ +% test data file + +@RELATION 'relational-example' + +@ATTRIBUTE 'id' NUMERIC +@ATTRIBUTE 'measurements' RELATIONAL + @ATTRIBUTE 'temp' NUMERIC + @ATTRIBUTE 'flag' STRING +@END 'measurements' +@ATTRIBUTE 'label' {yes,no} + +@DATA +1,'1.0,hot\n2.0,cold','yes' +2,'3.0,warm','no' +3,'',? 
diff --git a/test/data/simple.arff b/test/data/simple.arff new file mode 100644 index 0000000..ccd1264 --- /dev/null +++ b/test/data/simple.arff @@ -0,0 +1,14 @@ +% test data file + +@RELATION 'test-data' + +@ATTRIBUTE 'nums' NUMERIC +@ATTRIBUTE 'ints' NUMERIC +@ATTRIBUTE 'strs' STRING +@ATTRIBUTE 'cats' {'a','b','c'} +@ATTRIBUTE 'dates' DATE "yyyy-MM-dd'T'HH:mm:ss.SSS" + +@DATA +1.5,1,'foo','a','2020-01-02T00:00:00.000' +2.5,2,'bar','c','2021-03-04T00:00:00.000' +3.5,3,'baz','b','2022-04-05T00:00:00.000' diff --git a/test/data/test_1.arff b/test/data/simple_with_missing.arff similarity index 100% rename from test/data/test_1.arff rename to test/data/simple_with_missing.arff diff --git a/test/data/sparse_example.arff b/test/data/sparse_example.arff new file mode 100644 index 0000000..3877dbe --- /dev/null +++ b/test/data/sparse_example.arff @@ -0,0 +1,12 @@ +% test data file + +@RELATION 'sparse-example' + +@ATTRIBUTE 'num1' NUMERIC +@ATTRIBUTE 'str1' STRING +@ATTRIBUTE 'cat1' {yes,no} + +@DATA +{0 1.5,1 'foo',2 'yes'} +{1 'bar'} +{0 3.0,2 'no'} diff --git a/test/load.jl b/test/load.jl index 5ed6c3d..8393add 100644 --- a/test/load.jl +++ b/test/load.jl @@ -1,15 +1,48 @@ @testitem "load_header" begin include("setup.jl") - cases = [( - filename = "openml_32_sample.arff", - header = ARFFHeader( - "pendigits", - [ - [ARFFAttribute("input$i", ARFFNumericType()) for i = 1:16]..., - ARFFAttribute("class", ARFFNominalType(map(string, 0:9))), - ], + using ARFFFiles: ARFFRelationalType + cases = [ + ( + filename = "openml_32_sample.arff", + header = ARFFHeader( + "pendigits", + [ + [ARFFAttribute("input$i", ARFFNumericType()) for i = 1:16]..., + ARFFAttribute("class", ARFFNominalType(map(string, 0:9))), + ], + ), + ), + ( + filename = "simple_with_missing.arff", + header = ARFFHeader( + "test-data", + [ + ARFFAttribute("nums", ARFFNumericType()), + ARFFAttribute("ints", ARFFNumericType()), + ARFFAttribute("strs", ARFFStringType()), + ARFFAttribute("cats", 
ARFFNominalType(["a", "b", "c"])), + ARFFAttribute("dates", ARFFDateType("yyyy-MM-dd'T'HH:mm:ss.SSS")), + ], + ), + ), + ( + filename = "relational_example.arff", + header = ARFFHeader( + "relational-example", + [ + ARFFAttribute("id", ARFFNumericType()), + ARFFAttribute( + "measurements", + ARFFRelationalType([ + ARFFAttribute("temp", ARFFNumericType()), + ARFFAttribute("flag", ARFFStringType()), + ]), + ), + ARFFAttribute("label", ARFFNominalType(["yes", "no"])), + ], + ), ), - )] + ] @testset "$(case.filename)" for case in cases header = ARFFFiles.load_header(joinpath(datadir, case.filename)) @test header isa ARFFHeader @@ -20,40 +53,92 @@ end @testitem "load" begin using CategoricalArrays + using Dates using Tables include("setup.jl") - cases = [( - filename = "openml_32_sample.arff", - df = ( - input1 = [47.0, 0.0, 19.0, 38.0], - input2 = [100.0, 89.0, 100.0, 100.0], - input3 = [27.0, 27.0, 0.0, 37.0], - input4 = [81.0, 100.0, 61.0, 81.0], - input5 = [57.0, 42.0, 3.0, 12.0], - input6 = [37.0, 75.0, 23.0, 55.0], - input7 = [26.0, 29.0, 48.0, 0.0], - input8 = [0.0, 45.0, 0.0, 28.0], - input9 = [0.0, 15.0, 97.0, 52.0], - input10 = [23.0, 15.0, 27.0, 27.0], - input11 = [56.0, 37.0, 100.0, 100.0], - input12 = [53.0, 0.0, 66.0, 42.0], - input13 = [100.0, 69.0, 62.0, 86.0], - input14 = [90.0, 2.0, 97.0, 26.0], - input15 = [40.0, 100.0, 10.0, 65.0], - input16 = [98.0, 6.0, 81.0, 0.0], - class = CategoricalArray(["8", "2", "0", "4"], levels = map(string, 0:9)), - ), - )] + cases = [ + ( + filename = "openml_32_sample.arff", + df = ( + input1 = [47.0, 0.0, 19.0, 38.0], + input2 = [100.0, 89.0, 100.0, 100.0], + input3 = [27.0, 27.0, 0.0, 37.0], + input4 = [81.0, 100.0, 61.0, 81.0], + input5 = [57.0, 42.0, 3.0, 12.0], + input6 = [37.0, 75.0, 23.0, 55.0], + input7 = [26.0, 29.0, 48.0, 0.0], + input8 = [0.0, 45.0, 0.0, 28.0], + input9 = [0.0, 15.0, 97.0, 52.0], + input10 = [23.0, 15.0, 27.0, 27.0], + input11 = [56.0, 37.0, 100.0, 100.0], + input12 = [53.0, 0.0, 66.0, 42.0], 
+ input13 = [100.0, 69.0, 62.0, 86.0], + input14 = [90.0, 2.0, 97.0, 26.0], + input15 = [40.0, 100.0, 10.0, 65.0], + input16 = [98.0, 6.0, 81.0, 0.0], + class = CategoricalArray(["8", "2", "0", "4"], levels = map(string, 0:9)), + ), + ), + ( + filename = "simple_with_missing.arff", + df = ( + nums = Union{Missing, Float64}[1.5, 2.5, missing], + ints = Union{Missing, Float64}[1, missing, 3], + strs = Union{Missing, String}["foo", "bar", missing], + cats = CategoricalArray([missing, "c", "b"], levels = ["a", "b", "c"]), + dates = Union{Missing, DateTime}[ + DateTime(2020, 1, 2), + missing, + DateTime(2022, 4, 5), + ], + ), + ), + ( + filename = "simple.arff", + df = ( + nums = [1.5, 2.5, 3.5], + ints = [1.0, 2.0, 3.0], + strs = ["foo", "bar", "baz"], + cats = CategoricalArray(["a", "c", "b"], levels = ["a", "b", "c"]), + dates = [ + DateTime(2020, 1, 2), + DateTime(2021, 3, 4), + DateTime(2022, 4, 5), + ], + ), + ), + ( + filename = "sparse_example.arff", + df = ( + num1 = [1.5, 0.0, 3.0], + str1 = ["foo", "bar", ""], + cat1 = CategoricalArray(["yes", "yes", "no"], levels = ["yes", "no"]), + ), + ), + ] @testset "$(case.filename)" for case in cases df = ARFFFiles.load(NamedTuple, joinpath(datadir, case.filename)) @test Tables.schema(df) == Tables.schema(case.df) @testset "column $k" for k in propertynames(df) @test typeof(df[k]) == typeof(case.df[k]) - @test df[k] == case.df[k] + @test isequal(df[k], case.df[k]) if case.df[k] isa CategoricalArray @test df[k].pool.levels == case.df[k].pool.levels end end - @test df == case.df + @test isequal(df, case.df) end + relational_path = joinpath(datadir, "relational_example.arff") + @test isequal( + ARFFFiles.load(NamedTuple, relational_path), + ( + id = [1.0, 2.0, 3.0], + measurements = [ + (temp = [1.0, 2.0], flag = ["hot", "cold"]), + (temp = [3.0], flag = ["warm"]), + (temp = Float64[], flag = String[]), + ], + label = CategoricalArray(["yes", "no", missing], levels = ["yes", "no"]), + ), + ) end diff --git a/test/save.jl 
b/test/save.jl index c163681..8a1c55a 100644 --- a/test/save.jl +++ b/test/save.jl @@ -2,16 +2,28 @@ using CategoricalArrays using Dates include("setup.jl") - cases = [( - filename = "test_1.arff", - df = ( - nums = [1.5, 2.5, NaN], - ints = [1, missing, 3], - strs = ["foo", "bar", missing], - cats = CategoricalArray([missing, "c", "b"], levels = ["a", "b", "c"]), - dates = [Date(2020, 1, 2), missing, Date(2022, 4, 5)], + cases = [ + ( + filename = "simple_with_missing.arff", + df = ( + nums = [1.5, 2.5, NaN], + ints = Union{Missing, Int}[1, missing, 3], + strs = Union{Missing, String}["foo", "bar", missing], + cats = CategoricalArray([missing, "c", "b"], levels = ["a", "b", "c"]), + dates = Union{Missing, Date}[Date(2020, 1, 2), missing, Date(2022, 4, 5)], + ), ), - )] + ( + filename = "simple.arff", + df = ( + nums = [1.5, 2.5, 3.5], + ints = [1, 2, 3], + strs = ["foo", "bar", "baz"], + cats = CategoricalArray(["a", "c", "b"], levels = ["a", "b", "c"]), + dates = [Date(2020, 1, 2), Date(2021, 3, 4), Date(2022, 4, 5)], + ), + ), + ] @testset "$(case.filename)" for case in cases path = joinpath(datadir, case.filename) if !ispath(path) @@ -26,4 +38,11 @@ ARFFFiles.save(io, case.df, relation = "test-data", comment = "test data file") @test String(take!(io)) == read(path, String) end + @testset "boolean columns" begin + df = (flags = [true, false],) + io = IOBuffer() + ARFFFiles.save(io, df, relation = "bools", comment = "") + expected = "@RELATION 'bools'\n\n@ATTRIBUTE 'flags' NUMERIC\n\n@DATA\n1\n0\n" + @test String(take!(io)) == expected + end end