Skip to content

Commit 1c44fe6

Browse files
authored
Merge pull request #52 from mabel-dev/element-type-for-lists
element type for lists
2 parents 483e362 + f86e510 commit 1c44fe6

File tree

4 files changed

+95
-4
lines changed

4 files changed

+95
-4
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "rugo"
7-
version = "0.1.13"
7+
version = "0.1.14"
88
description = "Parquet Metadata Reader"
99
readme = "README.md"
1010
authors = [

rugo/jsonl/decode.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,14 @@ class JsonParser {
520520
pos_++; // include closing bracket
521521
out_start = start;
522522
out_len = pos_ - start;
523-
type = JsonType::String;
523+
// For slices that are arrays or objects, return the
524+
// appropriate JsonType so schema inference can record
525+
// Array/Object instead of falling back to String.
526+
if (open == '[') {
527+
type = JsonType::Array;
528+
} else {
529+
type = JsonType::Object;
530+
}
524531
out_has_escape = false; // leave as raw JSON slice
525532
// Debug: print short preview of slice
526533
size_t preview_len = out_len < 64 ? out_len : 64;
@@ -620,8 +627,10 @@ std::vector<ColumnSchema> GetJsonlSchema(const uint8_t* data, size_t size, size_
620627
// If this value is an array, attempt a quick element type inference
621628
if (type == JsonType::Array) {
622629
// Simple heuristic: look at the first non-whitespace char after '['
623-
size_t idx = 0;
624-
while (idx < val_len && (val_ptr[idx] == ' ' || val_ptr[idx] == '\t' || val_ptr[idx] == '\r' || val_ptr[idx] == '\n')) idx++;
630+
size_t idx = 0;
631+
// Skip the opening '[' if present
632+
if (idx < val_len && val_ptr[idx] == '[') idx++;
633+
while (idx < val_len && (val_ptr[idx] == ' ' || val_ptr[idx] == '\t' || val_ptr[idx] == '\r' || val_ptr[idx] == '\n')) idx++;
625634
if (idx < val_len) {
626635
char fc = val_ptr[idx];
627636
JsonType elem = JsonType::Null;

tests/data/list_column.jsonl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{ "id": 0, "values": [""] }
2+
{ "id": 1, "values": [] }
3+
{ "id": 2, "values": [null] }
4+
{ "id": 3, "values": null }
5+
{ "id": 4, "values": [] }
6+
{ "id": 5, "values": ["value", null] }
7+
{ "id": 6, "values": [null, "value"] }
8+
{ "id": 7, "values": ["value1", "value2", "value3"] }
9+
{ "id": 8, "values": [null, null] }
10+
{ "id": 9 }

tests/test_jsonl_schema.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import sys
2+
from pathlib import Path
3+
4+
sys.path.insert(0, str(Path(__file__).parent.parent))
5+
6+
import pytest
7+
import rugo.jsonl as rj
8+
9+
def test_get_schema_basic():
    """Test schema extraction from basic JSON lines data.

    Feeds a single flat JSON object to ``rj.get_jsonl_schema`` and checks
    that each reported column has the expected type name. Also verifies
    that every expected column is actually reported, so an empty or
    partial schema cannot pass unnoticed.
    """
    data = b'''{"id": 1, "name": "Alice", "age": 30}'''
    schema = rj.get_jsonl_schema(data)

    expected_types = {
        'id': 'int64',
        'name': 'string',
        'age': 'int64',
    }

    seen = set()
    for col in schema:
        name = col['name']
        if name not in expected_types:
            pytest.fail(f"Unexpected column name: {col['name']}")
        expected_type = expected_types[name]
        assert col['type'] == expected_type, f"Expected '{expected_type}' for '{name}', got {col['type']}"
        seen.add(name)

    # Without this check the loop above passes vacuously on an empty schema.
    missing = sorted(set(expected_types) - seen)
    assert not missing, f"Missing columns in schema: {missing}"
23+
24+
def test_get_schema_with_complex_types():
    """Test schema extraction with varied JSON lines data types.

    The ``values`` field is, across rows, a list of strings, an empty list,
    a list containing nulls, an explicit null, or absent altogether. The
    schema is expected to report it as ``array<string>`` and ``id`` as
    ``int64``. Both columns must be present in the result.
    """
    data = b'''{ "id": 0, "values": [""] }
{ "id": 1, "values": [] }
{ "id": 2, "values": [null] }
{ "id": 3, "values": null }
{ "id": 4, "values": [] }
{ "id": 5, "values": ["value", null] }
{ "id": 6, "values": [null, "value"] }
{ "id": 7, "values": ["value1", "value2", "value3"] }
{ "id": 8, "values": [null, null] }
{ "id": 9 }'''
    schema = rj.get_jsonl_schema(data)

    expected_types = {
        'id': 'int64',
        'values': 'array<string>'
    }

    seen = set()
    for col in schema:
        expected_type = expected_types.get(col['name'])
        assert expected_type is not None, f"Unexpected column name: {col['name']}"
        assert col['type'] == expected_type, f"Expected '{expected_type}' for '{col['name']}', got {col['type']}"
        seen.add(col['name'])

    # Guard against a vacuous pass: the loop alone would succeed even if the
    # 'values' column were never reported.
    missing = sorted(set(expected_types) - seen)
    assert not missing, f"Missing columns in schema: {missing}"
47+
48+
def test_how_nonexistent_values_are_handled():
    """Test how nonexistent values are handled in schema extraction.

    Each row carries ``dict`` and ``nested`` objects whose inner keys vary
    from row to row: keys are missing, null, or the object is empty. The
    top-level columns are still expected to be reported as ``object``, with
    ``id`` as ``int64``, and no other top-level column may appear.
    """
    data = b'''{"id": 1, "dict": {"list": [1, 2, 3], "key": "value"}, "nested": {"level1": {"key": "val"}}}
{"id": 2, "dict": {"list": [4, 5]}, "nested": {"level1": {"key": null}}}
{"id": 3, "dict": {"other_list": [6, 7, 8]}, "nested": {"level1": {}}}
{"id": 4, "dict": {"list": [], "key": "another_value"}, "nested": {}}
{"id": 5, "dict": {}, "nested": {"level1": {"key": "val"}}}
{"id": 6, "dict": {"list": [9], "nested_list": [{"key": "a"}, {"key": "b"}]}, "nested": {"level1": {"key": "val"}}}
'''
    schema = rj.get_jsonl_schema(data)

    expected_types = {
        'id': 'int64',
        'dict': 'object',
        'nested': 'object',
    }

    seen = set()
    for col in schema:
        name = col['name']
        if name not in expected_types:
            pytest.fail(f"Unexpected column name: {col['name']}")
        expected_type = expected_types[name]
        assert col['type'] == expected_type, f"Expected '{expected_type}' for '{name}', got {col['type']}"
        seen.add(name)

    # Report the full schema on failure instead of printing it unconditionally.
    missing = sorted(set(expected_types) - seen)
    assert not missing, f"Missing columns in schema: {missing}; got schema: {schema}"
70+
71+
if __name__ == "__main__":
    # pytest.main returns the run's exit code; pass it to the interpreter so
    # that running this file directly fails (non-zero) when a test fails.
    sys.exit(pytest.main([__file__, "-v"]))

0 commit comments

Comments
 (0)