JuliaData · cjdoris · Nov 12, 2025 · Nov 12, 2025
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,9 @@
+# Agent Notes for ARFFFiles.jl
+
+- This repo uses Julia's workspace-aware test layout. The root `Project.toml` lists the `test` project under `[workspace]`, and the test dependencies are declared in `test/Project.toml`.
+  - Activate and instantiate the full workspace with `julia --project=@. -e 'using Pkg; Pkg.instantiate(workspace=true; allow_autoprecomp=false)'` when dependencies change.
+  - Tests should be executed with `julia --project=@. -e 'using Pkg; Pkg.test()'` so that the workspace metadata is respected.
+- The `test` project is a plain environment (no package). Activate it from the repository root; `TestItemRunner` will still execute the `@testitem` files via `include`.
+- Test fixtures under `test/data` are generated via `test/data/generate_datasets.jl`. Run that script with Julia instead of editing the `.arff` fixtures by hand, and keep the emitted comment string set to `"test data file"` so the save tests comparing against the fixtures continue to pass.
+- If additional tooling or conventions become important, append them here so future agents stay up to date. **Keep this file current with any new discoveries.**
+- When working on relational columns, remember that nested `ARFFReader`s get reused; reset their buffers (see the `:R` handling in `src/ARFFFiles.jl`) if you make structural changes.
diff --git a/Project.toml b/Project.toml
@@ -3,26 +3,18 @@ uuid = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8"
 authors = ["Christopher Doris <github.com/cjdoris>"]
 version = "1.5.1"
 
+[workspace]
+projects = ["test"]
+
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-Aqua = "0 - 999"
 CategoricalArrays = "0.10, 1.0"
 Dates = "1"
 Parsers = "2.8"
 Tables = "1.12"
-Test = "1"
-TestItemRunner = "0 - 999"
 julia = "1.6"
-
-[extras]
-Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
-
-[targets]
-test = ["Aqua", "Test", "TestItemRunner"]
diff --git a/src/ARFFFiles.jl b/src/ARFFFiles.jl
@@ -1025,7 +1025,10 @@ end
         elseif kind == :R || kind == :RX
             str = Parsing.get_parsed_string(chunk, res)
             r2 = info
-            r2.io = IOBuffer(str)
+            r2.io = IOBuffer(String(str))
+            r2.chunk = ARFFTable(_schema([], []), Dict())
+            r2.chunklen = 0
+            r2.chunkidx = 0
             push!(col, readcolumns(r2))
         else
             error()

diff --git a/test/Project.toml b/test/Project.toml
@@ -0,0 +1,8 @@
+[deps]
+ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8"
+Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
diff --git a/test/data/generate_datasets.jl b/test/data/generate_datasets.jl
@@ -0,0 +1,126 @@
+#!/usr/bin/env julia
+using ARFFFiles
+using Dates
+using CategoricalArrays
+
+const DATADIR = @__DIR__
+const COMMENT = "test data file"
+
+"""
+    write_simple_with_missing(path)
+
+Save the three-row fixture table containing `NaN`/`missing` entries to `path` via
+`ARFFFiles.save`, using relation name `"test-data"` and the shared `COMMENT`.
+"""
+function write_simple_with_missing(path)
+    nums = [1.5, 2.5, NaN]
+    ints = Union{Missing,Int}[1, missing, 3]
+    strs = Union{Missing,String}["foo", "bar", missing]
+    cats = CategoricalArray([missing, "c", "b"]; levels = ["a", "b", "c"])
+    dates = Union{Missing,Date}[Date(2020, 1, 2), missing, Date(2022, 4, 5)]
+    table = (; nums, ints, strs, cats, dates)
+    return ARFFFiles.save(path, table; relation = "test-data", comment = COMMENT)
+end
+
+"""
+    write_simple(path)
+
+Save the three-row fixture table without missing values to `path` via
+`ARFFFiles.save`, using relation name `"test-data"` and the shared `COMMENT`.
+"""
+function write_simple(path)
+    nums = [1.5, 2.5, 3.5]
+    ints = [1, 2, 3]
+    strs = ["foo", "bar", "baz"]
+    cats = CategoricalArray(["a", "c", "b"]; levels = ["a", "b", "c"])
+    dates = [Date(2020, 1, 2), Date(2021, 3, 4), Date(2022, 4, 5)]
+    table = (; nums, ints, strs, cats, dates)
+    return ARFFFiles.save(path, table; relation = "test-data", comment = COMMENT)
+end
+
+"""
+    write_sparse(path)
+
+Hand-write a sparse-format ARFF fixture (`{index value,...}` data rows) to `path`.
+"""
+function write_sparse(path)
+    records = [[(0, 1.5), (1, "foo"), (2, "yes")], [(1, "bar")], [(0, 3.0), (2, "no")]]
+    open(path, "w") do out
+        println(out, "% $COMMENT")
+        println(out)
+        print(out, "@RELATION ")
+        ARFFFiles.write_datum(out, "sparse-example")
+        println(out, "\n")
+        for (name, type) in (("num1", "NUMERIC"), ("str1", "STRING"), ("cat1", "{yes,no}"))
+            print(out, "@ATTRIBUTE ")
+            ARFFFiles.write_datum(out, name)
+            println(out, " $type")
+        end
+        println(out)
+        println(out, "@DATA")
+        for record in records
+            print(out, "{")
+            for (pos, (idx, value)) in enumerate(record)
+                print(out, pos == 1 ? "" : ",", idx, " ")
+                ARFFFiles.write_datum(out, value)
+            end
+            println(out, "}")
+        end
+    end
+end
+
+"""
+    write_relational(path)
+
+Hand-write an ARFF fixture to `path` with a RELATIONAL `measurements` attribute.
+Nested `temp,flag` records are stored per row as a single string datum.
+"""
+function write_relational(path)
+    # Each record is (id, nested rows as text, label).
+    records = ((1, "1.0,hot\n2.0,cold", "yes"), (2, "3.0,warm", "no"), (3, "", missing))
+    open(path, "w") do out
+        datum(x) = ARFFFiles.write_datum(out, x)
+        println(out, "% $COMMENT")
+        println(out)
+        print(out, "@RELATION ")
+        datum("relational-example")
+        println(out, "\n")
+        print(out, "@ATTRIBUTE ")
+        datum("id")
+        println(out, " NUMERIC")
+        print(out, "@ATTRIBUTE ")
+        datum("measurements")
+        println(out, " RELATIONAL")
+        # Nested attribute declarations, indented under the RELATIONAL attribute.
+        for (name, type) in [("temp", "NUMERIC"), ("flag", "STRING")]
+            print(out, "    @ATTRIBUTE ")
+            datum(name)
+            println(out, " $type")
+        end
+        print(out, "@END ")
+        datum("measurements")
+        println(out)
+        print(out, "@ATTRIBUTE ")
+        datum("label")
+        println(out, " {yes,no}")
+        println(out)
+        println(out, "@DATA")
+        for record in records
+            for (k, field) in enumerate(record)
+                k == 1 || print(out, ",")
+                datum(field)
+            end
+            println(out)
+        end
+    end
+end
+
+function main()
+    mkpath(DATADIR)  # Ensure the fixture directory exists before writing.
+    fixture(name) = joinpath(DATADIR, name)
+    write_simple_with_missing(fixture("simple_with_missing.arff"))
+    write_simple(fixture("simple.arff"))
+    write_sparse(fixture("sparse_example.arff"))
+    return write_relational(fixture("relational_example.arff"))
+end
+main()  # Regenerate all fixtures whenever this script is run or included.
diff --git a/test/data/relational_example.arff b/test/data/relational_example.arff
@@ -0,0 +1,15 @@
+% test data file
+
+@RELATION 'relational-example'
+
+@ATTRIBUTE 'id' NUMERIC
+@ATTRIBUTE 'measurements' RELATIONAL
+    @ATTRIBUTE 'temp' NUMERIC
+    @ATTRIBUTE 'flag' STRING
+@END 'measurements'
+@ATTRIBUTE 'label' {yes,no}
+
+@DATA
+1,'1.0,hot\n2.0,cold','yes'
+2,'3.0,warm','no'
+3,'',?
diff --git a/test/data/simple.arff b/test/data/simple.arff
@@ -0,0 +1,14 @@
+% test data file
+
+@RELATION 'test-data'
+
+@ATTRIBUTE 'nums' NUMERIC
+@ATTRIBUTE 'ints' NUMERIC
+@ATTRIBUTE 'strs' STRING
+@ATTRIBUTE 'cats' {'a','b','c'}
+@ATTRIBUTE 'dates' DATE "yyyy-MM-dd'T'HH:mm:ss.SSS"
+
+@DATA
+1.5,1,'foo','a','2020-01-02T00:00:00.000'
+2.5,2,'bar','c','2021-03-04T00:00:00.000'
+3.5,3,'baz','b','2022-04-05T00:00:00.000'
diff --git a/test/data/test_1.arff → test/data/simple_with_missing.arff b/test/data/test_1.arff → test/data/simple_with_missing.arff
diff --git a/test/data/sparse_example.arff b/test/data/sparse_example.arff
@@ -0,0 +1,12 @@
+% test data file
+
+@RELATION 'sparse-example'
+
+@ATTRIBUTE 'num1' NUMERIC
+@ATTRIBUTE 'str1' STRING
+@ATTRIBUTE 'cat1' {yes,no}
+
+@DATA
+{0 1.5,1 'foo',2 'yes'}
+{1 'bar'}
+{0 3.0,2 'no'}