|
| 1 | +import sys |
| 2 | +from pathlib import Path |
| 3 | + |
| 4 | +sys.path.insert(0, str(Path(__file__).parent.parent)) |
| 5 | + |
| 6 | +import pytest |
| 7 | +import rugo.jsonl as rj |
| 8 | + |
def test_get_schema_basic():
    """Check the schema inferred from a single flat JSON Lines record.

    The record holds two integer fields and one string field. Every column
    the schema reports must carry the expected type name, and every expected
    column must actually be reported.
    """
    data = b'''{"id": 1, "name": "Alice", "age": 30}'''
    schema = rj.get_jsonl_schema(data)

    expected_types = {
        'id': 'int64',
        'name': 'string',
        'age': 'int64',
    }

    seen = set()
    for col in schema:
        name = col['name']
        if name not in expected_types:
            pytest.fail(f"Unexpected column name: {col['name']}")
        expected_type = expected_types[name]
        assert col['type'] == expected_type, f"Expected '{expected_type}' for '{name}', got {col['type']}"
        seen.add(name)

    # The loop alone passes vacuously when the schema is empty or is missing
    # a column, so confirm that every expected column was actually visited.
    missing = set(expected_types) - seen
    assert not missing, f"Missing columns in schema: {sorted(missing)}"
| 23 | + |
def test_get_schema_with_complex_types():
    """Check schema inference for an array column with sparse/null content.

    The ``values`` field appears across the rows as a list of strings, an
    empty list, a list containing nulls, a bare null, and is absent from the
    last row. The test expects it to be reported as ``array<string>`` and
    ``id`` as ``int64``.
    """
    data = b'''{ "id": 0, "values": [""] }
{ "id": 1, "values": [] }
{ "id": 2, "values": [null] }
{ "id": 3, "values": null }
{ "id": 4, "values": [] }
{ "id": 5, "values": ["value", null] }
{ "id": 6, "values": [null, "value"] }
{ "id": 7, "values": ["value1", "value2", "value3"] }
{ "id": 8, "values": [null, null] }
{ "id": 9 }'''
    schema = rj.get_jsonl_schema(data)

    expected_types = {
        'id': 'int64',
        'values': 'array<string>',
    }

    seen = set()
    for col in schema:
        expected_type = expected_types.get(col['name'])
        assert expected_type is not None, f"Unexpected column name: {col['name']}"
        assert col['type'] == expected_type, f"Expected '{expected_type}' for '{col['name']}', got {col['type']}"
        seen.add(col['name'])

    # The loop alone passes vacuously when the schema is empty or is missing
    # a column, so confirm that every expected column was actually visited.
    missing = set(expected_types) - seen
    assert not missing, f"Missing columns in schema: {sorted(missing)}"
| 47 | + |
def test_how_nonexistent_values_are_handled():
    """Check schema inference when nested keys come and go between rows.

    The ``dict`` and ``nested`` fields are always JSON objects, but the keys
    inside them vary from row to row (missing, null, empty object, nested
    lists). The test expects both to be reported as ``object`` and ``id`` as
    ``int64``.
    """
    data = b'''{"id": 1, "dict": {"list": [1, 2, 3], "key": "value"}, "nested": {"level1": {"key": "val"}}}
{"id": 2, "dict": {"list": [4, 5]}, "nested": {"level1": {"key": null}}}
{"id": 3, "dict": {"other_list": [6, 7, 8]}, "nested": {"level1": {}}}
{"id": 4, "dict": {"list": [], "key": "another_value"}, "nested": {}}
{"id": 5, "dict": {}, "nested": {"level1": {"key": "val"}}}
{"id": 6, "dict": {"list": [9], "nested_list": [{"key": "a"}, {"key": "b"}]}, "nested": {"level1": {"key": "val"}}}
'''
    schema = rj.get_jsonl_schema(data)

    expected_types = {
        'id': 'int64',
        'dict': 'object',
        'nested': 'object',
    }

    seen = set()
    for col in schema:
        name = col['name']
        if name not in expected_types:
            pytest.fail(f"Unexpected column name: {col['name']}")
        expected_type = expected_types[name]
        assert col['type'] == expected_type, f"Expected '{expected_type}' for '{name}', got {col['type']}"
        seen.add(name)

    # The loop alone passes vacuously when the schema is empty or is missing
    # a column, so confirm that every expected column was actually visited.
    missing = set(expected_types) - seen
    assert not missing, f"Missing columns in schema: {sorted(missing)}"
| 70 | + |
if __name__ == "__main__":
    # pytest.main() returns the session's exit code rather than exiting, so
    # pass it to sys.exit(); otherwise a failing run would still exit with 0.
    sys.exit(pytest.main([__file__, "-v"]))
0 commit comments