Skip to content

Commit e71239a

Browse files
committed
Align offsets on struct-list validation
1 parent f592cac commit e71239a

File tree

4 files changed

+167
-25
lines changed

4 files changed

+167
-25
lines changed

src/nested_pandas/series/_storage/struct_list_storage.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
import pyarrow as pa
77

88
from nested_pandas.series.utils import (
9+
align_chunked_struct_list_offsets,
910
table_to_struct_array,
1011
transpose_list_struct_chunked,
11-
validate_struct_list_array_for_equal_lengths,
1212
)
1313

1414
if TYPE_CHECKING:
@@ -25,7 +25,9 @@ class StructListStorage:
2525
Pyarrow struct-array with all fields to be list-arrays.
2626
All list-values must be "aligned", i.e., have the same length.
2727
validate : bool (default True)
28-
Check that all the lists have the same lengths for each struct-value.
28+
Check that all the lists have the same lengths for each struct-value,
29+
and that all list offset arrays are the same. Raises if the lengths differ,
30+
and reallocates the data if only the offsets differ.
2931
"""
3032

3133
_data: pa.ChunkedArray
@@ -37,8 +39,7 @@ def __init__(self, array: pa.StructArray | pa.ChunkedArray, *, validate: bool =
3739
raise ValueError("array must be a StructArray or ChunkedArray")
3840

3941
if validate:
40-
for chunk in array.chunks:
41-
validate_struct_list_array_for_equal_lengths(chunk)
42+
array = align_chunked_struct_list_offsets(array)
4243

4344
self._data = array
4445

src/nested_pandas/series/_storage/table_storage.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
import pyarrow as pa
66

77
from nested_pandas.series.utils import (
8+
align_chunked_struct_list_offsets,
89
table_from_struct_array,
910
table_to_struct_array,
10-
validate_struct_list_array_for_equal_lengths,
1111
)
1212

1313
if TYPE_CHECKING:
@@ -30,8 +30,8 @@ class TableStorage:
3030
def __init__(self, table: pa.Table, validate: bool = True) -> None:
3131
if validate:
3232
struct_array = table_to_struct_array(table)
33-
for chunk in struct_array.iterchunks():
34-
validate_struct_list_array_for_equal_lengths(chunk)
33+
aligned_struct_array = align_chunked_struct_list_offsets(struct_array)
34+
table = table_from_struct_array(aligned_struct_array)
3535

3636
self._data = table
3737

src/nested_pandas/series/utils.py

Lines changed: 81 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -58,36 +58,97 @@ def is_pa_type_is_list_struct(pa_type: pa.DataType) -> bool:
5858
return is_pa_type_a_list(pa_type) and pa.types.is_struct(pa_type.value_type)
5959

6060

61-
def validate_struct_list_array_for_equal_lengths(array: pa.StructArray) -> None:
62-
"""Check if the given struct array has lists of equal length.
61+
def align_struct_list_offsets(array: pa.StructArray) -> pa.StructArray:
62+
"""Checks if all struct-list offsets are the same, and reallocates if needed
6363
6464
Parameters
6565
----------
6666
array : pa.StructArray
6767
Input struct array.
6868
69+
Returns
70+
-------
71+
pa.StructArray
72+
Array with all struct-list offsets aligned. This is the input array
73+
itself if its offsets already matched.
74+
6975
Raises
7076
------
7177
ValueError
72-
If the struct array has lists of unequal length or type of the input
73-
array is not a StructArray or fields are not ListArrays.
78+
If the input is not a valid "nested" StructArray.
7479
"""
7580
if not pa.types.is_struct(array.type):
7681
raise ValueError(f"Expected a StructArray, got {array.type}")
7782

78-
first_list_array: pa.ListArray | None = None
83+
first_offsets: pa.ListArray | None = None
7984
for field in array.type:
8085
inner_array = array.field(field.name)
8186
if not is_pa_type_a_list(inner_array.type):
8287
raise ValueError(f"Expected a ListArray, got {inner_array.type}")
8388
list_array = cast(pa.ListArray, inner_array)
8489

85-
if first_list_array is None:
86-
first_list_array = list_array
90+
if first_offsets is None:
91+
first_offsets = list_array.offsets
8792
continue
8893
# compare offsets from the first list array with the current one
89-
if not first_list_array.offsets.equals(list_array.offsets):
90-
raise ValueError("Offsets of all ListArrays must be the same")
94+
if not first_offsets.equals(list_array.offsets):
95+
break
96+
else:
97+
# Return the original array if all offsets match
98+
return array
99+
100+
new_offsets = pa.compute.subtract(first_offsets, first_offsets[0])
101+
value_lengths = None
102+
list_arrays = []
103+
for field in array.type:
104+
inner_array = array.field(field.name)
105+
list_array = cast(pa.ListArray, inner_array)
106+
107+
if value_lengths is None:
108+
value_lengths = list_array.value_lengths()
109+
elif not value_lengths.equals(list_array.value_lengths()):
110+
raise ValueError(
111+
f"List lengths do not match for struct fields {array.type.fields[0].name} and {field.name}",
112+
)
113+
114+
list_arrays.append(
115+
pa.ListArray.from_arrays(
116+
values=list_array.values[list_array.offsets[0].as_py() : list_array.offsets[-1].as_py()],
117+
offsets=new_offsets,
118+
)
119+
)
120+
new_array = pa.StructArray.from_arrays(
121+
arrays=list_arrays,
122+
type=array.type,
123+
)
124+
return new_array
125+
126+
127+
def align_chunked_struct_list_offsets(array: pa.Array | pa.ChunkedArray) -> pa.ChunkedArray:
128+
"""Checks if all struct-list offsets are the same, and reallocates if needed
129+
130+
Parameters
131+
----------
132+
array : pa.ChunkedArray or pa.Array
133+
Input chunked array. It must be a valid "nested" struct-list array,
134+
i.e. all list lengths must match. Non-chunked arrays are allowed,
135+
but the returned array will always be chunked.
136+
137+
Returns
138+
-------
139+
pa.ChunkedArray
140+
Chunked array with all struct-list offsets aligned.
141+
142+
Raises
143+
------
144+
ValueError
145+
If the input is not a valid "nested" struct-list-array.
146+
"""
147+
if isinstance(array, pa.Array):
148+
array = pa.chunked_array([array])
149+
chunks = [align_struct_list_offsets(chunk) for chunk in array.iterchunks()]
150+
# Provide type for the case of zero-chunks array
151+
return pa.chunked_array(chunks, type=array.type)
91152

92153

93154
def transpose_struct_list_type(t: pa.StructType) -> pa.ListType:
@@ -139,7 +200,7 @@ def transpose_struct_list_array(array: pa.StructArray, validate: bool = True) ->
139200
List array of structs.
140201
"""
141202
if validate:
142-
validate_struct_list_array_for_equal_lengths(array)
203+
array = align_struct_list_offsets(array)
143204

144205
mask = array.is_null()
145206
if not pa.compute.any(mask).as_py():
@@ -220,6 +281,16 @@ def validate_list_struct_type(t: pa.ListType) -> None:
220281
raise ValueError(f"Expected a StructType as a list value type, got {t.value_type}")
221282

222283

284+
def validate_struct_list_type(t: pa.ListType) -> None:
285+
"""Raise a ValueError if not a struct-list-type."""
286+
if not pa.types.is_struct(t):
287+
raise ValueError(f"Expected a StructType, got {t}")
288+
289+
for field in t.fields:
290+
if not is_pa_type_a_list(field.type):
291+
raise ValueError(f"Expected a ListType for field {field.name}, got {field.type}")
292+
293+
223294
def transpose_list_struct_type(t: pa.ListType) -> pa.StructType:
224295
"""Converts a type of list-struct array into a type of struct-list array.
225296

tests/nested_pandas/series/test_series_utils.py

Lines changed: 78 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,34 +3,35 @@
33
import pytest
44
from nested_pandas import NestedDtype
55
from nested_pandas.series.utils import (
6+
align_chunked_struct_list_offsets,
7+
align_struct_list_offsets,
68
nested_types_mapper,
79
struct_field_names,
810
transpose_list_struct_array,
911
transpose_list_struct_scalar,
1012
transpose_list_struct_type,
1113
transpose_struct_list_array,
1214
transpose_struct_list_type,
13-
validate_struct_list_array_for_equal_lengths,
1415
)
1516

1617

17-
def test_validate_struct_list_array_for_equal_lengths():
18-
"""Test validate_struct_list_array_for_equal_lengths function."""
18+
def test_align_struct_list_offsets():
19+
"""Test align_struct_list_offsets function."""
1920
# Raises for wrong types
2021
with pytest.raises(ValueError):
21-
validate_struct_list_array_for_equal_lengths(pa.array([], type=pa.int64()))
22+
align_struct_list_offsets(pa.array([], type=pa.int64()))
2223
with pytest.raises(ValueError):
23-
validate_struct_list_array_for_equal_lengths(pa.array([], type=pa.list_(pa.int64())))
24+
align_struct_list_offsets(pa.array([], type=pa.list_(pa.int64())))
2425

2526
# Raises if one of the fields is not a ListArray
2627
with pytest.raises(ValueError):
27-
validate_struct_list_array_for_equal_lengths(
28+
align_struct_list_offsets(
2829
pa.StructArray.from_arrays([pa.array([[1, 2], [3, 4, 5]]), pa.array([1, 2])], ["a", "b"])
2930
)
3031

3132
# Raises for mismatched lengths
3233
with pytest.raises(ValueError):
33-
validate_struct_list_array_for_equal_lengths(
34+
align_struct_list_offsets(
3435
pa.StructArray.from_arrays(
3536
[pa.array([[1, 2], [3, 4, 5]]), pa.array([[1, 2, 3], [4, 5]])], ["a", "b"]
3637
)
@@ -43,7 +44,76 @@ def test_validate_struct_list_array_for_equal_lengths():
4344
],
4445
names=["a", "b"],
4546
)
46-
assert validate_struct_list_array_for_equal_lengths(input_array) is None
47+
assert align_struct_list_offsets(input_array) is input_array
48+
49+
a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]])[1:]
50+
assert a.offsets[0].as_py() == 3
51+
b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]])
52+
assert b.offsets[0].as_py() == 0
53+
input_array = pa.StructArray.from_arrays(
54+
arrays=[a, b],
55+
names=["a", "b"],
56+
)
57+
aligned_array = align_struct_list_offsets(input_array)
58+
assert aligned_array is not input_array
59+
assert aligned_array.equals(input_array)
60+
61+
62+
def test_align_chunked_struct_list_offsets():
63+
"""Test align_chunked_struct_list_offsets function."""
64+
# Input is an array, output is chunked array
65+
a = pa.array([[1, 2], [3, 4], [], [5, 6, 7]])
66+
b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]])
67+
input_array = pa.StructArray.from_arrays(
68+
arrays=[a, b],
69+
names=["a", "b"],
70+
)
71+
output_array = align_chunked_struct_list_offsets(input_array)
72+
assert isinstance(output_array, pa.ChunkedArray)
73+
assert output_array.equals(pa.chunked_array([input_array]))
74+
75+
# Input is an "aligned" chunked array
76+
input_array = pa.chunked_array(
77+
[
78+
pa.StructArray.from_arrays(
79+
arrays=[a, b],
80+
names=["a", "b"],
81+
)
82+
]
83+
* 2
84+
)
85+
output_array = align_chunked_struct_list_offsets(input_array)
86+
assert output_array.equals(input_array)
87+
88+
# Input is an "aligned" chunked array, but offsets do not start with zero
89+
a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]])[1:]
90+
b = pa.array([["a", "a", "a", "a"], ["x", "y"], ["y", "x"], [], ["d", "e", "f"]])[1:]
91+
input_array = pa.chunked_array(
92+
[
93+
pa.StructArray.from_arrays(
94+
arrays=[a, b],
95+
names=["a", "b"],
96+
)
97+
]
98+
* 3
99+
)
100+
output_array = align_chunked_struct_list_offsets(input_array)
101+
assert output_array.equals(input_array)
102+
103+
# Input is a "non-aligned" chunked array
104+
a = pa.array([[0, 0, 0], [1, 2], [3, 4], [], [5, 6, 7]])[1:]
105+
b = pa.array([["x", "y"], ["y", "x"], [], ["d", "e", "f"]])
106+
input_array = pa.chunked_array(
107+
[
108+
pa.StructArray.from_arrays(
109+
arrays=[a, b],
110+
names=["a", "b"],
111+
)
112+
]
113+
* 4
114+
)
115+
output_array = align_chunked_struct_list_offsets(input_array)
116+
assert output_array.equals(input_array)
47117

48118

49119
def test_transpose_struct_list_type():

0 commit comments

Comments
 (0)