@@ -46,6 +46,18 @@ def _rebuild(self, graph, func, args): # type: ignore
4646 return collection
4747
4848
def _nested_meta_from_flat(flat, name):
    """Construct the meta for a packed (nested) series from a flat dataframe.

    Parameters
    ----------
    flat:
        A flat dataframe whose column dtypes describe the fields to nest.
    name:
        Name to give the resulting meta series.

    Returns
    -------
    pd.Series
        An empty series with a `NestedDtype` built from the flat columns'
        pyarrow types.
    """
    pyarrow_fields = {}
    for field, dtype in flat.dtypes.to_dict().items():
        # ArrowDtype columns expose their pyarrow type directly; plain numpy
        # dtypes are converted to their pyarrow equivalents.
        pyarrow_fields[field] = (
            dtype.pyarrow_dtype if hasattr(dtype, "pyarrow_dtype") else pa.from_numpy_dtype(dtype)
        )
    return pd.Series(name=name, dtype=NestedDtype.from_fields(pyarrow_fields))
59+
60+
4961class NestedFrame (
5062 _Frame , dd .DataFrame
5163): # can use dd.DataFrame instead of dx.DataFrame if the config is set true (default in >=2024.3.0)
@@ -70,17 +82,6 @@ def __getitem__(self, item):
7082 else :
7183 return super ().__getitem__ (item )
7284
73- def _nested_meta_from_flat (self , flat , name ):
74- """construct meta for a packed series from a flat dataframe"""
75- pd_fields = flat .dtypes .to_dict () # grabbing pandas dtypes
76- pyarrow_fields = {} # grab underlying pyarrow dtypes
77- for field , dtype in pd_fields .items ():
78- if hasattr (dtype , "pyarrow_dtype" ):
79- pyarrow_fields [field ] = dtype .pyarrow_dtype
80- else : # or convert from numpy types
81- pyarrow_fields [field ] = pa .from_numpy_dtype (dtype )
82- return pd .Series (name = name , dtype = NestedDtype .from_fields (pyarrow_fields ))
83-
8485 def __setitem__ (self , key , value ):
8586 """Adds custom __setitem__ behavior for nested columns"""
8687
@@ -102,8 +103,8 @@ def __setitem__(self, key, value):
102103 new_flat = new_flat .astype ({col : pd .ArrowDtype (pa .string ())})
103104
104105 # pack the modified df back into a nested column
105- meta = self . _nested_meta_from_flat (new_flat , nested )
106- packed = new_flat .map_partitions (lambda x : pack (x ), meta = meta )
106+ meta = _nested_meta_from_flat (new_flat , nested )
107+ packed = new_flat .map_partitions (lambda x : pack (x , dtype = meta . dtype ), meta = meta )
107108 return super ().__setitem__ (nested , packed )
108109
109110 # Adding a new nested structure from a column
@@ -114,8 +115,8 @@ def __setitem__(self, key, value):
114115 value .name = col
115116 value = value .to_frame ()
116117
117- meta = self . _nested_meta_from_flat (value , new_nested )
118- packed = value .map_partitions (lambda x : pack (x ), meta = meta )
118+ meta = _nested_meta_from_flat (value , new_nested )
119+ packed = value .map_partitions (lambda x : pack (x , dtype = meta . dtype ), meta = meta )
119120 return super ().__setitem__ (new_nested , packed )
120121
121122 return super ().__setitem__ (key , value )
@@ -280,6 +281,144 @@ def from_map(
280281 )
281282 return NestedFrame .from_dask_dataframe (nf )
282283
284+ @classmethod
285+ def from_flat (cls , df , base_columns , nested_columns = None , index = None , name = "nested" ):
286+ """Creates a NestedFrame with base and nested columns from a flat
287+ dataframe.
288+
289+ Parameters
290+ ----------
291+ df: dd.DataFrame or nd.NestedFrame
292+ A flat dataframe.
293+ base_columns: list-like
294+ The columns that should be used as base (flat) columns in the
295+ output dataframe.
296+ nested_columns: list-like, or None
297+ The columns that should be packed into a nested column. All columns
298+ in the list will attempt to be packed into a single nested column
299+ with the name provided in `nested_name`. If None, is defined as all
300+ columns not in `base_columns`.
301+ index: str, or None
302+ The name of a column to use as the new index. Typically, the index
303+ should have a unique value per row for base columns, and should
304+ repeat for nested columns. For example, a dataframe with two
305+ columns; a=[1,1,1,2,2,2] and b=[5,10,15,20,25,30] would want an
306+ index like [0,0,0,1,1,1] if a is chosen as a base column. If not
307+ provided the current index will be used.
308+ name:
309+ The name of the output column the `nested_columns` are packed into.
310+
311+ Returns
312+ -------
313+ NestedFrame
314+ A NestedFrame with the specified nesting structure.
315+ """
316+
317+ # Handle meta
318+ meta = npd .NestedFrame (df [base_columns ]._meta )
319+
320+ if nested_columns is None :
321+ nested_columns = [col for col in df .columns if (col not in base_columns ) and col != index ]
322+
323+ if len (nested_columns ) > 0 :
324+ nested_meta = pack (df [nested_columns ]._meta , name )
325+ meta = meta .join (nested_meta )
326+
327+ return df .map_partitions (
328+ lambda x : npd .NestedFrame .from_flat (
329+ df = x , base_columns = base_columns , nested_columns = nested_columns , index = index , name = name
330+ ),
331+ meta = meta ,
332+ )
333+
334+ @classmethod
335+ def from_lists (cls , df , base_columns = None , list_columns = None , name = "nested" ):
336+ """Creates a NestedFrame with base and nested columns from a flat
337+ dataframe.
338+
339+ Parameters
340+ ----------
341+ df: dd.DataFrame or nd.NestedFrame
342+ A dataframe with list columns.
343+ base_columns: list-like, or None
344+ Any columns that have non-list values in the input df. These will
345+ simply be kept as identical columns in the result
346+ list_columns: list-like, or None
347+ The list-value columns that should be packed into a nested column.
348+ All columns in the list will attempt to be packed into a single
349+ nested column with the name provided in `nested_name`. All columns
350+ in list_columns must have pyarrow list dtypes, otherwise the
351+ operation will fail. If None, is defined as all columns not in
352+ `base_columns`.
353+ name:
354+ The name of the output column the `nested_columns` are packed into.
355+
356+ Returns
357+ -------
358+ NestedFrame
359+ A NestedFrame with the specified nesting structure.
360+
361+ Note
362+ ----
363+ As noted above, all columns in `list_columns` must have a pyarrow
364+ ListType dtype. This is needed for proper meta propagation. To convert
365+ a list column to this dtype, you can use this command structure:
366+ `nf= nf.astype({"colname": pd.ArrowDtype(pa.list_(pa.int64()))})`
367+
368+ Where pa.int64 above should be replaced with the correct dtype of the
369+ underlying data accordingly.
370+
371+ Additionally, it's a known issue in Dask
372+ (https://github.com/dask/dask/issues/10139) that columns with list
373+ values will by default be converted to the string type. This will
374+ interfere with the ability to recast these to pyarrow lists. We
375+ recommend setting the following dask config setting to prevent this:
376+ `dask.config.set({"dataframe.convert-string":False})`
377+
378+ """
379+
380+ # Resolve inputs for meta
381+ if base_columns is None :
382+ if list_columns is None :
383+ # with no inputs, assume all columns are list-valued
384+ list_columns = df .columns
385+ else :
386+ # if list_columns are defined, assume everything else is base
387+ base_columns = [col for col in df .columns if col not in list_columns ]
388+ else :
389+ if list_columns is None :
390+ # with defined base_columns, assume everything else is list
391+ list_columns = [col for col in df .columns if col not in base_columns ]
392+
393+ # from_lists should have at least one list column defined
394+ if len (list_columns ) == 0 :
395+ raise ValueError ("No columns were assigned as list columns." )
396+ else :
397+ # reject any list columns that are not pyarrow dtyped
398+ for col in list_columns :
399+ if not hasattr (df [col ].dtype , "pyarrow_dtype" ):
400+ raise TypeError (
401+ f"""List column '{ col } ' dtype ({ df [col ].dtype } ) is not a pyarrow list dtype.
402+ Refer to the docstring for guidance on dtype requirements and assignment."""
403+ )
404+ elif not pa .types .is_list (df [col ].dtype .pyarrow_dtype ):
405+ raise TypeError (
406+ f"""List column '{ col } ' dtype ({ df [col ].dtype } ) is not a pyarrow list dtype.
407+ Refer to the docstring for guidance on dtype requirements and assignment."""
408+ )
409+
410+ meta = npd .NestedFrame (df [base_columns ]._meta )
411+
412+ nested_meta = pack (df [list_columns ]._meta , name )
413+ meta = meta .join (nested_meta )
414+
415+ return df .map_partitions (
416+ lambda x : npd .NestedFrame .from_lists (
417+ df = x , base_columns = base_columns , list_columns = list_columns , name = name
418+ ),
419+ meta = meta ,
420+ )
421+
283422 def compute (self , ** kwargs ):
284423 """Compute this Dask collection, returning the underlying dataframe or series."""
285424 return npd .NestedFrame (super ().compute (** kwargs ))
0 commit comments