Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Agent Notes for ARFFFiles.jl

- This repo uses Julia's workspace-aware test layout. The root `Project.toml` lists the `test` project under `[workspace]`, and the tests themselves live in `test/Project.toml`.
- Activate and instantiate the full workspace with `julia --project=@. -e 'using Pkg; Pkg.instantiate(; workspace=true, allow_autoprecomp=false)'` whenever dependencies change.
- Tests should be executed with `julia --project=@. -e 'using Pkg; Pkg.test()'` so that the workspace metadata is respected.
- The `test` project is a plain environment (no package). Activate it from the repository root; `TestItemRunner` will still execute the `@testitem` files via `include`.
- Test fixtures under `test/data` are generated via `test/data/generate_datasets.jl`. Run that script with Julia instead of editing the `.arff` fixtures by hand, and keep the emitted comment string set to `"test data file"` so the save tests comparing against the fixtures continue to pass.
- If additional tooling or conventions become important, append them here so future agents stay up to date. **Keep this file current with any new discoveries.**
- When working on relational columns, remember that nested `ARFFReader`s get reused; reset their buffers (see the `:R` handling in `src/ARFFFiles.jl`) if you make structural changes.
14 changes: 3 additions & 11 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,18 @@ uuid = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8"
authors = ["Christopher Doris <github.com/cjdoris>"]
version = "1.5.1"

[workspace]
projects = ["test"]

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
Aqua = "0 - 999"
CategoricalArrays = "0.10, 1.0"
Dates = "1"
Parsers = "2.8"
Tables = "1.12"
Test = "1"
TestItemRunner = "0 - 999"
julia = "1.6"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"

[targets]
test = ["Aqua", "Test", "TestItemRunner"]
5 changes: 4 additions & 1 deletion src/ARFFFiles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1025,7 +1025,10 @@ end
elseif kind == :R || kind == :RX
str = Parsing.get_parsed_string(chunk, res)
r2 = info
r2.io = IOBuffer(str)
r2.io = IOBuffer(String(str))
r2.chunk = ARFFTable(_schema([], []), Dict())
r2.chunklen = 0
r2.chunkidx = 0
push!(col, readcolumns(r2))
else
error()
Expand Down
8 changes: 8 additions & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[deps]
ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
126 changes: 126 additions & 0 deletions test/data/generate_datasets.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env julia
using ARFFFiles
using Dates
using CategoricalArrays

# Directory containing this script; `main` writes every fixture into it.
const DATADIR = @__DIR__
# Comment text placed at the top of each generated fixture. The repository's
# AGENTS.md notes that the save tests compare against these fixtures, so this
# value should stay `"test data file"`.
const COMMENT = "test data file"

"""
    write_simple_with_missing(path)

Save a three-row, five-column table to `path` with `ARFFFiles.save`, using the
relation name `test-data` and `COMMENT` as the file comment.

The `ints`, `strs`, `cats` and `dates` columns each hold one `missing` entry, and
the `nums` column holds a `NaN`. Returns whatever `ARFFFiles.save` returns.
"""
function write_simple_with_missing(path)
    # Build each column separately so the element types stay explicit.
    nums = [1.5, 2.5, NaN]
    ints = Union{Missing, Int}[1, missing, 3]
    strs = Union{Missing, String}["foo", "bar", missing]
    cats = CategoricalArray([missing, "c", "b"]; levels = ["a", "b", "c"])
    dates = Union{Missing, Date}[Date(2020, 1, 2), missing, Date(2022, 4, 5)]

    table = (; nums, ints, strs, cats, dates)
    return ARFFFiles.save(path, table; relation = "test-data", comment = COMMENT)
end

"""
    write_simple(path)

Save a three-row, five-column table without any missing values to `path` with
`ARFFFiles.save`, using the relation name `test-data` and `COMMENT` as the file
comment. Returns whatever `ARFFFiles.save` returns.
"""
function write_simple(path)
    # One local per column; the named tuple below fixes the column order.
    nums = [1.5, 2.5, 3.5]
    ints = [1, 2, 3]
    strs = ["foo", "bar", "baz"]
    cats = CategoricalArray(["a", "c", "b"]; levels = ["a", "b", "c"])
    dates = [Date(2020, 1, 2), Date(2021, 3, 4), Date(2022, 4, 5)]

    table = (; nums, ints, strs, cats, dates)
    return ARFFFiles.save(path, table; relation = "test-data", comment = COMMENT)
end

"""
    write_sparse(path)

Write a hand-assembled ARFF file to `path` whose data rows use the brace-delimited
`{index value, ...}` layout.

The header (comment line, relation name, three attributes) is printed directly,
with names and values quoted through `ARFFFiles.write_datum`. Returns `nothing`.
"""
function write_sparse(path)
    attributes = (("num1", "NUMERIC"), ("str1", "STRING"), ("cat1", "{yes,no}"))
    # Each record lists only the (column index, value) pairs that are written out.
    records = [
        [(0, 1.5), (1, "foo"), (2, "yes")],
        [(1, "bar")],
        [(0, 3.0), (2, "no")],
    ]

    open(path, "w") do out
        # Header: comment, blank line, relation name, blank line.
        println(out, "% $COMMENT")
        println(out)
        print(out, "@RELATION ")
        ARFFFiles.write_datum(out, "sparse-example")
        println(out, "\n")

        for (attrname, attrtype) in attributes
            print(out, "@ATTRIBUTE ")
            ARFFFiles.write_datum(out, attrname)
            println(out, " $attrtype")
        end

        println(out)
        println(out, "@DATA")

        for record in records
            print(out, "{")
            needs_separator = false
            for (position, datum) in record
                # A comma goes between entries, never before the first one.
                needs_separator && print(out, ",")
                needs_separator = true
                print(out, position, " ")
                ARFFFiles.write_datum(out, datum)
            end
            println(out, "}")
        end
    end
    return nothing
end

"""
    write_relational(path)

Write a hand-assembled ARFF file to `path` containing a numeric `id` attribute, a
`measurements` relational attribute (with nested `temp` and `flag` attributes) and
a nominal `label` attribute, followed by three data rows.

Each row's nested table is written as a single string datum. All names and values
are quoted through `ARFFFiles.write_datum`. Returns `nothing`.
"""
function write_relational(path)
    subattributes = (("temp", "NUMERIC"), ("flag", "STRING"))
    records = (
        (id = 1, nested = "1.0,hot\n2.0,cold", label = "yes"),
        (id = 2, nested = "3.0,warm", label = "no"),
        (id = 3, nested = "", label = missing),
    )

    open(path, "w") do out
        # Emit `keyword`, then the quoted `name`, then `suffix` and a newline.
        function emit(keyword, name, suffix)
            print(out, keyword)
            ARFFFiles.write_datum(out, name)
            println(out, suffix)
        end

        println(out, "% $COMMENT")
        println(out)
        # The newline suffix plus println's own newline leaves a blank line.
        emit("@RELATION ", "relational-example", "\n")

        emit("@ATTRIBUTE ", "id", " NUMERIC")
        emit("@ATTRIBUTE ", "measurements", " RELATIONAL")
        for (subname, subtype) in subattributes
            emit(" @ATTRIBUTE ", subname, " $subtype")
        end
        emit("@END ", "measurements", "")
        emit("@ATTRIBUTE ", "label", " {yes,no}")

        println(out)
        println(out, "@DATA")

        for record in records
            fields = (record.id, record.nested, record.label)
            for (k, field) in enumerate(fields)
                k > 1 && print(out, ",")
                ARFFFiles.write_datum(out, field)
            end
            println(out)
        end
    end
    return nothing
end

"""
    main()

Ensure `DATADIR` exists, then regenerate the four fixture files inside it by
calling each writer in turn. Returns `nothing`.
"""
function main()
    mkpath(DATADIR)

    # (writer, output file name) pairs, processed in this order.
    jobs = (
        (write_simple_with_missing, "simple_with_missing.arff"),
        (write_simple, "simple.arff"),
        (write_sparse, "sparse_example.arff"),
        (write_relational, "relational_example.arff"),
    )
    for (writer, filename) in jobs
        writer(joinpath(DATADIR, filename))
    end
    return nothing
end

# Regenerate every fixture whenever this script is executed.
main()
15 changes: 15 additions & 0 deletions test/data/relational_example.arff
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
% test data file

@RELATION 'relational-example'

@ATTRIBUTE 'id' NUMERIC
@ATTRIBUTE 'measurements' RELATIONAL
@ATTRIBUTE 'temp' NUMERIC
@ATTRIBUTE 'flag' STRING
@END 'measurements'
@ATTRIBUTE 'label' {yes,no}

@DATA
1,'1.0,hot\n2.0,cold','yes'
2,'3.0,warm','no'
3,'',?
14 changes: 14 additions & 0 deletions test/data/simple.arff
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
% test data file

@RELATION 'test-data'

@ATTRIBUTE 'nums' NUMERIC
@ATTRIBUTE 'ints' NUMERIC
@ATTRIBUTE 'strs' STRING
@ATTRIBUTE 'cats' {'a','b','c'}
@ATTRIBUTE 'dates' DATE "yyyy-MM-dd'T'HH:mm:ss.SSS"

@DATA
1.5,1,'foo','a','2020-01-02T00:00:00.000'
2.5,2,'bar','c','2021-03-04T00:00:00.000'
3.5,3,'baz','b','2022-04-05T00:00:00.000'
File renamed without changes.
12 changes: 12 additions & 0 deletions test/data/sparse_example.arff
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
% test data file

@RELATION 'sparse-example'

@ATTRIBUTE 'num1' NUMERIC
@ATTRIBUTE 'str1' STRING
@ATTRIBUTE 'cat1' {yes,no}

@DATA
{0 1.5,1 'foo',2 'yes'}
{1 'bar'}
{0 3.0,2 'no'}
Loading
Loading