Skip to content
This repository was archived by the owner on Dec 1, 2025. It is now read-only.

Commit cb71a92

Browse files
authored
Merge pull request #49 from lincc-frameworks/from_flat_and_lists
Wrap `from_lists` and `from_flat`
2 parents 24fb348 + 745b5d1 commit cb71a92

File tree

4 files changed

+311
-16
lines changed

4 files changed

+311
-16
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ classifiers = [
1616
dynamic = ["version"]
1717
requires-python = ">=3.9"
1818
dependencies = [
19-
'nested-pandas==0.1.3',
19+
'nested-pandas==0.2.1',
2020
'numpy',
2121
'dask>=2024.3.0',
2222
'dask[distributed]>=2024.3.0',

src/nested_dask/core.py

Lines changed: 154 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,18 @@ def _rebuild(self, graph, func, args): # type: ignore
4646
return collection
4747

4848

49+
def _nested_meta_from_flat(flat, name):
    """Construct meta for a packed series from a flat dataframe.

    Each column's pandas dtype is mapped to an underlying pyarrow type —
    taken directly from the extension dtype when available, otherwise
    converted from the numpy dtype — and the resulting fields are assembled
    into a NestedDtype for an empty meta Series.

    Parameters
    ----------
    flat:
        Flat dataframe whose columns become the fields of the nested dtype.
    name:
        Name to give the returned meta Series.

    Returns
    -------
    pd.Series
        Empty Series carrying the NestedDtype built from ``flat``'s columns.
    """
    pyarrow_fields = {
        field: (dtype.pyarrow_dtype if hasattr(dtype, "pyarrow_dtype") else pa.from_numpy_dtype(dtype))
        for field, dtype in flat.dtypes.to_dict().items()
    }
    return pd.Series(name=name, dtype=NestedDtype.from_fields(pyarrow_fields))
59+
60+
4961
class NestedFrame(
5062
_Frame, dd.DataFrame
5163
): # can use dd.DataFrame instead of dx.DataFrame if the config is set true (default in >=2024.3.0)
@@ -70,17 +82,6 @@ def __getitem__(self, item):
7082
else:
7183
return super().__getitem__(item)
7284

73-
def _nested_meta_from_flat(self, flat, name):
74-
"""construct meta for a packed series from a flat dataframe"""
75-
pd_fields = flat.dtypes.to_dict() # grabbing pandas dtypes
76-
pyarrow_fields = {} # grab underlying pyarrow dtypes
77-
for field, dtype in pd_fields.items():
78-
if hasattr(dtype, "pyarrow_dtype"):
79-
pyarrow_fields[field] = dtype.pyarrow_dtype
80-
else: # or convert from numpy types
81-
pyarrow_fields[field] = pa.from_numpy_dtype(dtype)
82-
return pd.Series(name=name, dtype=NestedDtype.from_fields(pyarrow_fields))
83-
8485
def __setitem__(self, key, value):
8586
"""Adds custom __setitem__ behavior for nested columns"""
8687

@@ -102,8 +103,8 @@ def __setitem__(self, key, value):
102103
new_flat = new_flat.astype({col: pd.ArrowDtype(pa.string())})
103104

104105
# pack the modified df back into a nested column
105-
meta = self._nested_meta_from_flat(new_flat, nested)
106-
packed = new_flat.map_partitions(lambda x: pack(x), meta=meta)
106+
meta = _nested_meta_from_flat(new_flat, nested)
107+
packed = new_flat.map_partitions(lambda x: pack(x, dtype=meta.dtype), meta=meta)
107108
return super().__setitem__(nested, packed)
108109

109110
# Adding a new nested structure from a column
@@ -114,8 +115,8 @@ def __setitem__(self, key, value):
114115
value.name = col
115116
value = value.to_frame()
116117

117-
meta = self._nested_meta_from_flat(value, new_nested)
118-
packed = value.map_partitions(lambda x: pack(x), meta=meta)
118+
meta = _nested_meta_from_flat(value, new_nested)
119+
packed = value.map_partitions(lambda x: pack(x, dtype=meta.dtype), meta=meta)
119120
return super().__setitem__(new_nested, packed)
120121

121122
return super().__setitem__(key, value)
@@ -280,6 +281,144 @@ def from_map(
280281
)
281282
return NestedFrame.from_dask_dataframe(nf)
282283

284+
@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
    """Creates a NestedFrame with base and nested columns from a flat
    dataframe.

    Parameters
    ----------
    df: dd.DataFrame or nd.NestedFrame
        A flat dataframe.
    base_columns: list-like
        The columns that should be used as base (flat) columns in the
        output dataframe.
    nested_columns: list-like, or None
        The columns that should be packed into a nested column. All columns
        in the list will attempt to be packed into a single nested column
        with the name provided in `name`. If None, is defined as all
        columns not in `base_columns`.
    index: str, or None
        The name of a column to use as the new index. Typically, the index
        should have a unique value per row for base columns, and should
        repeat for nested columns. For example, a dataframe with two
        columns; a=[1,1,1,2,2,2] and b=[5,10,15,20,25,30] would want an
        index like [0,0,0,1,1,1] if a is chosen as a base column. If not
        provided the current index will be used.
    name:
        The name of the output column the `nested_columns` are packed into.

    Returns
    -------
    NestedFrame
        A NestedFrame with the specified nesting structure.
    """

    # Build the output meta: base columns stay flat...
    meta = npd.NestedFrame(df[base_columns]._meta)

    # ...and, by default, everything else (except the index column) is nested.
    if nested_columns is None:
        nested_columns = [col for col in df.columns if (col not in base_columns) and col != index]

    # Only add a nested column to the meta when there is something to pack.
    if len(nested_columns) > 0:
        nested_meta = pack(df[nested_columns]._meta, name)
        meta = meta.join(nested_meta)

    # Delegate the per-partition work to nested-pandas' from_flat.
    return df.map_partitions(
        lambda x: npd.NestedFrame.from_flat(
            df=x, base_columns=base_columns, nested_columns=nested_columns, index=index, name=name
        ),
        meta=meta,
    )
333+
334+
@classmethod
def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
    """Creates a NestedFrame with base and nested columns from a dataframe
    with list columns.

    Parameters
    ----------
    df: dd.DataFrame or nd.NestedFrame
        A dataframe with list columns.
    base_columns: list-like, or None
        Any columns that have non-list values in the input df. These will
        simply be kept as identical columns in the result
    list_columns: list-like, or None
        The list-value columns that should be packed into a nested column.
        All columns in the list will attempt to be packed into a single
        nested column with the name provided in `name`. All columns
        in list_columns must have pyarrow list dtypes, otherwise the
        operation will fail. If None, is defined as all columns not in
        `base_columns`.
    name:
        The name of the output column the `list_columns` are packed into.

    Returns
    -------
    NestedFrame
        A NestedFrame with the specified nesting structure.

    Note
    ----
    As noted above, all columns in `list_columns` must have a pyarrow
    ListType dtype. This is needed for proper meta propagation. To convert
    a list column to this dtype, you can use this command structure:
    `nf = nf.astype({"colname": pd.ArrowDtype(pa.list_(pa.int64()))})`

    Where pa.int64 above should be replaced with the correct dtype of the
    underlying data accordingly.

    Additionally, it's a known issue in Dask
    (https://github.com/dask/dask/issues/10139) that columns with list
    values will by default be converted to the string type. This will
    interfere with the ability to recast these to pyarrow lists. We
    recommend setting the following dask config setting to prevent this:
    `dask.config.set({"dataframe.convert-string":False})`

    """

    # Resolve inputs for meta
    if base_columns is None:
        if list_columns is None:
            # with no inputs, assume all columns are list-valued;
            # no base columns remain (None would break df[base_columns] below)
            list_columns = df.columns
            base_columns = []
        else:
            # if list_columns are defined, assume everything else is base
            base_columns = [col for col in df.columns if col not in list_columns]
    else:
        if list_columns is None:
            # with defined base_columns, assume everything else is list
            list_columns = [col for col in df.columns if col not in base_columns]

    # from_lists should have at least one list column defined
    if len(list_columns) == 0:
        raise ValueError("No columns were assigned as list columns.")

    # reject any list columns that are not pyarrow list-dtyped
    for col in list_columns:
        dtype = df[col].dtype
        if not hasattr(dtype, "pyarrow_dtype") or not pa.types.is_list(dtype.pyarrow_dtype):
            raise TypeError(
                f"""List column '{col}' dtype ({dtype}) is not a pyarrow list dtype.
Refer to the docstring for guidance on dtype requirements and assignment."""
            )

    # Build the output meta: flat base columns joined with the packed column.
    meta = npd.NestedFrame(df[base_columns]._meta)

    nested_meta = pack(df[list_columns]._meta, name)
    meta = meta.join(nested_meta)

    # Delegate the per-partition work to nested-pandas' from_lists.
    return df.map_partitions(
        lambda x: npd.NestedFrame.from_lists(
            df=x, base_columns=base_columns, list_columns=list_columns, name=name
        ),
        meta=meta,
    )
421+
283422
def compute(self, **kwargs):
    """Compute this Dask collection, returning the underlying dataframe or series."""
    # Materialize via the parent class, then wrap in a nested-pandas frame.
    computed = super().compute(**kwargs)
    return npd.NestedFrame(computed)

tests/nested_dask/test_io.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import nested_dask as nd
2+
import pandas as pd
3+
import pyarrow as pa
24

35

46
def test_read_parquet(test_dataset, tmp_path):
@@ -19,6 +21,8 @@ def test_read_parquet(test_dataset, tmp_path):
1921
base = nd.read_parquet(test_save_path, calculate_divisions=True)
2022
nested = nd.read_parquet(nested_save_path, calculate_divisions=True)
2123

24+
# this is read as a large_string, just make it a string
25+
nested = nested.astype({"band": pd.ArrowDtype(pa.string())})
2226
base = base.add_nested(nested, "nested")
2327

2428
# Check the loaded dataset against the original

0 commit comments

Comments (0)