Merge pull request #52 from mabel-dev/element-type-for-lists

joocer · web-flow · commit 1c44fe6cf05b · 2025-10-22T19:36:15.000+01:00
element type for lists
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "rugo"
-version = "0.1.13"
+version = "0.1.14"
 description = "Parquet Metadata Reader"
 readme = "README.md"
 authors = [
diff --git a/rugo/jsonl/decode.cpp b/rugo/jsonl/decode.cpp
@@ -520,7 +520,14 @@ class JsonParser {
                         pos_++; // include closing bracket
                         out_start = start;
                         out_len = pos_ - start;
-                        type = JsonType::String;
+                        // For slices that are arrays or objects, return the
+                        // appropriate JsonType so schema inference can record
+                        // Array/Object instead of falling back to String.
+                        if (open == '[') {
+                            type = JsonType::Array;
+                        } else {
+                            type = JsonType::Object;
+                        }
                         out_has_escape = false; // leave as raw JSON slice
                         // Debug: print short preview of slice
                         size_t preview_len = out_len < 64 ? out_len : 64;
@@ -620,8 +627,10 @@ std::vector<ColumnSchema> GetJsonlSchema(const uint8_t* data, size_t size, size_
             // If this value is an array, attempt a quick element type inference
             if (type == JsonType::Array) {
                 // Simple heuristic: look at the first non-whitespace char after '['
-                size_t idx = 0;
-                while (idx < val_len && (val_ptr[idx] == ' ' || val_ptr[idx] == '\t' || val_ptr[idx] == '\r' || val_ptr[idx] == '\n')) idx++;
+                    size_t idx = 0;
+                    // Skip the opening '[' if present
+                    if (idx < val_len && val_ptr[idx] == '[') idx++;
+                    while (idx < val_len && (val_ptr[idx] == ' ' || val_ptr[idx] == '\t' || val_ptr[idx] == '\r' || val_ptr[idx] == '\n')) idx++;
                 if (idx < val_len) {
                     char fc = val_ptr[idx];
                     JsonType elem = JsonType::Null;
diff --git a/tests/data/list_column.jsonl b/tests/data/list_column.jsonl
@@ -0,0 +1,10 @@
+{ "id": 0, "values": [""] }
+{ "id": 1, "values": [] }
+{ "id": 2, "values": [null] }
+{ "id": 3, "values": null }
+{ "id": 4, "values": [] }
+{ "id": 5, "values": ["value", null] }
+{ "id": 6, "values": [null, "value"] }
+{ "id": 7, "values": ["value1", "value2", "value3"] }
+{ "id": 8, "values": [null, null] }
+{ "id": 9 }
diff --git a/tests/test_jsonl_schema.py b/tests/test_jsonl_schema.py
@@ -0,0 +1,72 @@
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+import pytest
+import rugo.jsonl as rj
+
+def test_get_schema_basic():
+    """Schema inference on a single flat record reports every column's type.
+
+    The inferred schema is compared as a whole, so a missing column fails the
+    test just like an unexpected column or a wrong type does.
+    """
+    data = b'''{"id": 1, "name": "Alice", "age": 30}'''
+    schema = rj.get_jsonl_schema(data)
+
+    # Each schema entry exposes the column's 'name' and inferred 'type'.
+    actual_types = {col['name']: col['type'] for col in schema}
+    expected_types = {'id': 'int64', 'name': 'string', 'age': 'int64'}
+
+    assert actual_types == expected_types, f"Expected {expected_types}, got {actual_types}"
+
+def test_get_schema_with_complex_types():
+    """A list column mixing empty, null and string items is inferred as array<string>."""
+    data = b'''{ "id": 0, "values": [""] }
+{ "id": 1, "values": [] }
+{ "id": 2, "values": [null] }
+{ "id": 3, "values": null }
+{ "id": 4, "values": [] }
+{ "id": 5, "values": ["value", null] }
+{ "id": 6, "values": [null, "value"] }
+{ "id": 7, "values": ["value1", "value2", "value3"] }
+{ "id": 8, "values": [null, null] }
+{ "id": 9 }'''
+    schema = rj.get_jsonl_schema(data)
+
+    expected_types = {
+        'id': 'int64',
+        'values': 'array<string>'
+    }
+    # Build a name -> type map so an absent column fails instead of being skipped.
+    actual_types = {col['name']: col['type'] for col in schema}
+
+    assert set(actual_types) == set(expected_types), f"Unexpected columns: {sorted(actual_types)}"
+    assert actual_types == expected_types, f"Expected {expected_types}, got {actual_types}"
+
+def test_how_nonexistent_values_are_handled():
+    """Object columns keep type 'object' when nested keys are missing or empty."""
+    data = b'''{"id": 1, "dict": {"list": [1, 2, 3], "key": "value"}, "nested": {"level1": {"key": "val"}}}
+{"id": 2, "dict": {"list": [4, 5]}, "nested": {"level1": {"key": null}}}
+{"id": 3, "dict": {"other_list": [6, 7, 8]}, "nested": {"level1": {}}}
+{"id": 4, "dict": {"list": [], "key": "another_value"}, "nested": {}}
+{"id": 5, "dict": {}, "nested": {"level1": {"key": "val"}}}
+{"id": 6, "dict": {"list": [9], "nested_list": [{"key": "a"}, {"key": "b"}]}, "nested": {"level1": {"key": "val"}}}
+'''
+    schema = rj.get_jsonl_schema(data)
+
+    print(schema)
+
+    # Compare the whole name -> type mapping: iterating the schema alone would
+    # pass vacuously if a column were missing or the schema came back empty.
+    actual_types = {col['name']: col['type'] for col in schema}
+    expected_types = {
+        'id': 'int64',
+        'dict': 'object',
+        'nested': 'object',
+    }
+    assert actual_types == expected_types, f"Expected {expected_types}, got {actual_types}"
+
+if __name__ == "__main__":  # Allow running this test module directly as a script.
+    pytest.main([__file__, "-v"])