Skip to content

Commit 919fe82

Browse files
authored
Revisit nf.explode() for multiple columns and duplicated index (#332)
* .explode(): multiple columns and non-uniq idx * Fix mypy issue * code coverage
1 parent 46acb8e commit 919fe82

File tree

2 files changed

+178
-41
lines changed

2 files changed

+178
-41
lines changed

src/nested_pandas/nestedframe/core.py

Lines changed: 82 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def repack_row(chunk, header=True):
129129
# Add a row that shows the number of additional rows not shown
130130
len_row = pd.DataFrame(
131131
{
132-
col: [f"<i>+{n_rows-1} rows</i>"] if i == 0 else ["..."]
132+
col: [f"<i>+{n_rows - 1} rows</i>"] if i == 0 else ["..."]
133133
for i, col in enumerate(chunk.columns)
134134
}
135135
)
@@ -983,31 +983,32 @@ def describe(self, exclude_nest: bool = False, percentiles=None, include=None, e
983983
def explode(self, column: IndexLabel, ignore_index: bool = False):
984984
"""
985985
986-
Transform each element of a list-like base column to a row, replicating index value.
987-
Or unnest a specified nested column with the other columns being replicated as part
988-
of the unnest. The exploded columns will be added to the right of the rest of the frame.
986+
Transform each element of a list-like base column to a row, replicating index values.
989987
990988
Parameters
991989
----------
992990
column : IndexLabel
993-
Base column(s) or nested column to explode.
994-
For multiple base columns, specify a non-empty list with each element being a string or tuple.
995-
For all specified base columns, their list-like data on same row of the frame
996-
must have matching length.
997-
Only a single nested column can be exploded at a time. Indicate the nested column as a string.
991+
Column(s) to explode.
992+
For multiple columns, specify a non-empty list in which each element
993+
is a str or tuple; the list-like data of all specified columns
994+
on the same row of the frame must have matching lengths.
998995
ignore_index : bool, default False
999996
If True, the resulting index will be labeled 0, 1, ..., n - 1.
1000997
1001998
Returns
1002999
-------
10031000
NestedFrame
1004-
A new NestedFrame with the specified column(s) exploded.
1001+
Exploded lists to rows of the subset columns;
1002+
index will be duplicated for these rows.
10051003
10061004
Raises
10071005
------
10081006
ValueError
1009-
If specified columns to explode have more than one nested column,
1010-
or contain a mix of nested and base columns.
1007+
It raises if:
1008+
1) columns of the frame are not unique,
1009+
2) the specified list of columns to explode is empty,
1010+
3) specified columns to explode do not have matching counts of
1011+
elements rowwise in the frame.
10111012
10121013
See Also
10131014
--------
@@ -1033,40 +1034,82 @@ def explode(self, column: IndexLabel, ignore_index: bool = False):
10331034
10341035
"""
10351036

1036-
if isinstance(column, list):
1037-
nested_in_list = [col for col in column if col in self.nested_columns]
1038-
# list contains more than 1 nested columns
1039-
if len(nested_in_list) > 1:
1037+
if isinstance(column, str):
1038+
columns = [column]
1039+
elif isinstance(column, list):
1040+
columns = column
1041+
if len(columns) == 0:
1042+
raise ValueError("`column` must not be empty")
1043+
if len(set(columns)) != len(columns):
1044+
raise ValueError("`column` must have unique elements")
1045+
else:
1046+
raise ValueError("`column` must be str or list")
1047+
if len(extra_cols := set(columns) - set(self.columns)) > 0:
1048+
if len(extra_cols) == 1:
10401049
raise ValueError(
1041-
f"Exploding multiple nested columns at once is not supported.\n"
1042-
f"Nested columns: {nested_in_list}"
1050+
f"column {extra_cols.pop()} not found, available columns: {list(self.columns)}"
10431051
)
1052+
raise ValueError(
1053+
f"columns {sorted(extra_cols)} not found, available columns: {list(self.columns)}"
1054+
)
10441055

1045-
# list contains mixing nested & base columns
1046-
if len(nested_in_list) == 1 and len(column) > 1:
1056+
nested_columns = [col for col in columns if col in self.nested_columns]
1057+
base_columns = [col for col in columns if col not in nested_columns]
1058+
1059+
# Shortcut for the base-column-only case
1060+
if len(nested_columns) == 0:
1061+
return NestedFrame(super().explode(columns, ignore_index=ignore_index))
1062+
1063+
# Handle duplicated index use-case: use "ordinal" index, but keep the original one as a column to
1064+
# restore it later.
1065+
default_index_name = "__index_"
1066+
index_col_name = self.index.name or default_index_name
1067+
w_ordinal_idx = self.reset_index(drop=False, names=index_col_name)
1068+
1069+
# Call pandas.DataFrame.explode for non-nested columns
1070+
all_but_requested_nested_columns = [col for col in w_ordinal_idx.columns if col not in nested_columns]
1071+
base_exploded = w_ordinal_idx[all_but_requested_nested_columns]
1072+
if len(all_but_requested_nested_columns) > 0 and len(base_columns) > 0:
1073+
base_exploded = super(NestedFrame, base_exploded).explode(base_columns, ignore_index=False)
1074+
base_exploded = NestedFrame(base_exploded)
1075+
1076+
# Check if it was actually exploded, or no list-columns were there.
1077+
# This check could fail when all lists have only one element; we ignore that edge case here.
1078+
is_base_exploded = not w_ordinal_idx.index.equals(base_exploded.index)
1079+
1080+
# Unnest each requested nested column and store as a "flat" dataframe.
1081+
flat_frames: list[Self] = [] # type: ignore[name-defined] # noqa: F821
1082+
for nested_col in nested_columns:
1083+
# Check if counts (lengths) in nested columns mismatch
1084+
if len(flat_frames) > 0 and np.any(
1085+
w_ordinal_idx[nested_col].nest.list_lengths
1086+
!= w_ordinal_idx[nested_columns[0]].nest.list_lengths
1087+
):
10471088
raise ValueError(
1048-
f"Exploding nested column together with base columns is not supported.\n"
1049-
f"Nested column: {nested_in_list[0]}"
1089+
f"One or few rows of {nested_col} have different element counts from {nested_columns[0]}"
10501090
)
1091+
flat = w_ordinal_idx[nested_col].nest.to_flat()
1092+
# Check if counts (lengths) of this nested column mismatch with one of the list columns.
1093+
if is_base_exploded and not base_exploded.index.equals(flat.index):
1094+
raise ValueError(
1095+
f"One or few rows of {nested_col} have different element counts "
1096+
f"from one or few of these columns: {base_columns}"
1097+
)
1098+
flat_frames.append(flat)
10511099

1052-
# normalize a single-element list to string
1053-
if isinstance(column, list) and len(column) == 1:
1054-
column = column[0]
1055-
1056-
# handle single nested column explode
1057-
if isinstance(column, str) and column in self.nested_columns:
1058-
selected_nested_df = self[column].nest.to_flat()
1059-
other_col = [col for col in self.columns if col != column]
1060-
other_col_df = self[other_col]
1061-
result = other_col_df.join(selected_nested_df)
1062-
1063-
if ignore_index:
1064-
result = result.reset_index(drop=True)
1065-
1066-
return NestedFrame(result)
1100+
if is_base_exploded:
1101+
result = pd.concat([base_exploded] + flat_frames, axis=1)
1102+
else:
1103+
# Join works here, because we used the ordinal index before exploding
1104+
result = base_exploded.join(pd.concat(flat_frames, axis=1))
10671105

1068-
# otherwise just use pandas' explode
1069-
return NestedFrame(super().explode(column=column, ignore_index=ignore_index))
1106+
if ignore_index:
1107+
return result.drop(index_col_name, axis=1).reset_index(drop=True)
1108+
# Restore original index
1109+
result = result.set_index(index_col_name, drop=True)
1110+
if result.index.name == default_index_name:
1111+
result.index.name = None
1112+
return result
10701113

10711114
def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
10721115
"""Evaluate a string describing operations on NestedFrame columns.

tests/nested_pandas/nestedframe/test_nestedframe.py

Lines changed: 96 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from nested_pandas.datasets import generate_data
77
from nested_pandas.nestedframe.core import _SeriesFromNest
88
from nested_pandas.series.packer import pack_lists
9-
from pandas.testing import assert_frame_equal
9+
from pandas.testing import assert_frame_equal, assert_index_equal
1010

1111

1212
def test_nestedframe_construction():
@@ -1638,7 +1638,7 @@ def test_describe():
16381638
assert "top" not in r18.index
16391639

16401640

1641-
def test_explode():
1641+
def test_explode_1():
16421642
"""Test NestedFrame.explode gives correct result for flattening specified nested columns"""
16431643
base = NestedFrame(
16441644
data={
@@ -1693,6 +1693,100 @@ def test_explode():
16931693
assert (r5["f"] == expected5).all()
16941694

16951695

1696+
def test_explode_non_unique_index():
1697+
"""Test NestedFrame.explode function with non-unique index"""
1698+
n_base = 100
1699+
n_layer = 3
1700+
1701+
nf = generate_data(n_base, n_layer)
1702+
# Add a new nested column which has the same element counts as the "nested" column
1703+
nf["aligned_nested.aligned_t"] = nf["nested.t"]
1704+
# Add a new nested column which has different lengths
1705+
nf["unaligned_nested"] = nf.reduce(
1706+
lambda x: {"unaligned_nested.unaligned_t": x[:2]}, "nested.t"
1707+
).reset_index(drop=True)
1708+
# Add a list column which has the same lengths
1709+
nf["aligned_list_t"] = nf["nested"].nest.to_lists("t")["t"]
1710+
# Add a list column which has different lengths
1711+
nf["unaligned_list_t"] = nf["nested"].nest.to_lists("t")["t"].list[:2]
1712+
# Make index non-unique
1713+
nf.index = np.tile(np.arange(10), 10)
1714+
nf.index.name = "my_index"
1715+
1716+
# Check that explode does nothing on a non-list base column
1717+
assert_frame_equal(nf, nf.explode("a"))
1718+
1719+
# Check that explode works on a base column
1720+
assert_frame_equal(
1721+
pd.DataFrame(nf).explode("unaligned_list_t"),
1722+
nf.explode("unaligned_list_t"),
1723+
check_frame_type=False,
1724+
)
1725+
assert_frame_equal(
1726+
pd.DataFrame(nf).explode("aligned_list_t", ignore_index=True),
1727+
nf.explode("aligned_list_t", ignore_index=True),
1728+
check_frame_type=False,
1729+
)
1730+
1731+
# Check that explode works on a single nested column
1732+
nested_exploded = nf.explode("nested")
1733+
assert nested_exploded.shape == (
1734+
n_base * n_layer,
1735+
len(nf.columns) - 1 + len(nf.all_columns["nested"]),
1736+
)
1737+
assert_index_equal(nested_exploded.index, pd.Index(np.repeat(nf.index, n_layer), name="my_index"))
1738+
1739+
# Check that explode works on two nested columns
1740+
two_nested_exploded = nf.explode(["nested", "aligned_nested"])
1741+
assert two_nested_exploded.shape == (
1742+
n_base * n_layer,
1743+
len(nf.columns) - 2 + len(nf.all_columns["nested"]) + len(nf.all_columns["aligned_nested"]),
1744+
)
1745+
assert "t" in two_nested_exploded.columns
1746+
assert "aligned_t" in two_nested_exploded.columns
1747+
assert_index_equal(two_nested_exploded.index, pd.Index(np.repeat(nf.index, n_layer), name="my_index"))
1748+
1749+
# Check that explode works on a mix of list-column and nested column
1750+
list_nested_exploded = nf.explode(["nested", "aligned_list_t"], ignore_index=True)
1751+
assert list_nested_exploded.shape == (
1752+
n_base * n_layer,
1753+
len(nf.columns) - 1 + len(nf.all_columns["nested"]),
1754+
)
1755+
assert_index_equal(list_nested_exploded.index, pd.Index(range(n_base * n_layer)))
1756+
1757+
# Check that explode fails when running on "unaligned" list columns
1758+
with pytest.raises(ValueError):
1759+
nf.explode(["aligned_list_t", "unaligned_list_t"])
1760+
1761+
# Check that explode fails when running on "unaligned" nested columns
1762+
with pytest.raises(ValueError):
1763+
nf.explode(["nested", "unaligned_nested"])
1764+
1765+
# Check that explode fails when running on nested and list columns, which are not aligned
1766+
with pytest.raises(ValueError):
1767+
nf.explode(["nested", "unaligned_list_t"])
1768+
1769+
# Check that explode fails on invalid `columns` inputs
1770+
# Empty input
1771+
with pytest.raises(ValueError):
1772+
nf.explode([])
1773+
# Non-string, non-list values
1774+
with pytest.raises(ValueError):
1775+
nf.explode(b"nested")
1776+
with pytest.raises(ValueError):
1777+
nf.explode(("nested", "aligned_nested"))
1778+
# Repeated column names
1779+
with pytest.raises(ValueError):
1780+
nf.explode(["nested"] * 2)
1781+
# Non-existing columns
1782+
with pytest.raises(ValueError):
1783+
nf.explode("XXX")
1784+
with pytest.raises(ValueError):
1785+
nf.explode(["nested", "XXX"])
1786+
with pytest.raises(ValueError):
1787+
nf.explode(["nested", "XXX", "AAA"])
1788+
1789+
16961790
def test_eval():
16971791
"""
16981792
Test basic behavior of NestedFrame.eval, and that it can handle nested references

0 commit comments

Comments
 (0)