Merge pull request #31 from lincc-frameworks/count_nested

dougbrn · web-flow · commit bd4f3ced91e4 · 2024-07-16T11:27:15.000-07:00
Wrapper for count_nested
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,7 +16,7 @@ classifiers = [
 dynamic = ["version"]
 requires-python = ">=3.9"
 dependencies = [
-    'nested-pandas==0.1.1',
+    'nested-pandas==0.1.2',
     'numpy',
     'dask>=2024.3.0',
     'dask[distributed]>=2024.3.0',
diff --git a/src/nested_dask/__init__.py b/src/nested_dask/__init__.py
@@ -2,4 +2,5 @@
 from .core import NestedFrame  # noqa
 from .io import read_parquet  # noqa
 from .datasets import generate_data  # noqa
+from .utils import count_nested  # noqa
 from ._version import __version__  # noqa
diff --git a/src/nested_dask/utils/__init__.py b/src/nested_dask/utils/__init__.py
@@ -0,0 +1 @@
+from .utils import *  # noqa
diff --git a/src/nested_dask/utils/utils.py b/src/nested_dask/utils/utils.py
@@ -0,0 +1,54 @@
+import nested_pandas as npd
+import pandas as pd
+from nested_pandas import utils as npd_utils
+
+from ..core import NestedFrame
+
+
+def count_nested(df, nested, by=None, join=True) -> NestedFrame:
+    """Counts the number of rows of a nested dataframe.
+
+    Wraps Nested-Pandas count_nested, applying it to every partition of `df`.
+
+    Parameters
+    ----------
+    df: NestedFrame
+        A NestedFrame that contains the desired `nested` series
+        to count.
+    nested: 'str'
+        The label of the nested series to count.
+    by: 'str', optional
+        Specifies a column within nested to count by, returning
+        a count for each unique value in `by`.
+    join: bool, optional
+        Join the output count columns to df and return df, otherwise
+        just return a NestedFrame containing only the count columns.
+
+    Returns
+    -------
+    NestedFrame
+        The count columns, joined to `df` when `join` is True.
+    """
+
+    # The meta varies depending on the parameters: first depending on by
+    if by is not None:
+        # will have one column per unique value of the specified column
+        # requires some computation to determine these values
+        # TODO: Requires modification of nested-pandas to always produce
+        # sorted output columns for meta
+        by_cols = sorted(df[nested].nest.to_flat()[by].unique())
+        out_cols = [f"n_{nested}_{col}" for col in by_cols]
+    else:
+        # otherwise just have a single column output
+        out_cols = [f"n_{nested}"]
+
+    # empty frame carrying the integer count dtypes, indexed like df's meta
+    meta = npd.NestedFrame({col: 0 for col in out_cols}, index=df._meta.index)
+
+    # and second depending on join
+    if join:
+        # append the count columns to the existing meta column-wise; a row-wise
+        # concat NaN-fills the missing columns, which can upcast int to float
+        meta = pd.concat([df._meta, meta], axis=1)
+
+    return df.map_partitions(lambda x: npd_utils.count_nested(x, nested, by=by, join=join), meta=meta)
diff --git a/tests/nested_dask/test_utils.py b/tests/nested_dask/test_utils.py
@@ -0,0 +1,17 @@
+import nested_dask as nd
+import pytest
+from nested_pandas.utils import count_nested
+
+
+@pytest.mark.parametrize("join", [True, False])
+@pytest.mark.parametrize("by", [None, "band"])
+def test_count_nested(test_dataset, join, by):
+    """Check that the dask wrapper reproduces nested-pandas count_nested."""
+
+    # The counting logic itself is exercised on the nested-pandas side,
+    # so only agreement between the two code paths is verified here.
+    expected = count_nested(test_dataset.compute(), "nested", join=join, by=by)
+    lazy_counts = nd.utils.count_nested(test_dataset, "nested", join=join, by=by)
+    observed = lazy_counts.compute()
+
+    assert observed.equals(expected)