Skip to content
This repository was archived by the owner on Dec 1, 2025. It is now read-only.

Commit bd4f3ce

Browse files
authored
Merge pull request #31 from lincc-frameworks/count_nested
Wrapper for count_nested
2 parents 73e988c + 23810a6 commit bd4f3ce

File tree

5 files changed

+74
-1
lines changed

5 files changed

+74
-1
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ classifiers = [
1616
dynamic = ["version"]
1717
requires-python = ">=3.9"
1818
dependencies = [
19-
'nested-pandas==0.1.1',
19+
'nested-pandas==0.1.2',
2020
'numpy',
2121
'dask>=2024.3.0',
2222
'dask[distributed]>=2024.3.0',

src/nested_dask/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
from .core import NestedFrame # noqa
33
from .io import read_parquet # noqa
44
from .datasets import generate_data # noqa
5+
from .utils import count_nested # noqa
56
from ._version import __version__ # noqa

src/nested_dask/utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .utils import * # noqa

src/nested_dask/utils/utils.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import nested_pandas as npd
2+
import pandas as pd
3+
from nested_pandas import utils as npd_utils
4+
5+
from ..core import NestedFrame
6+
7+
8+
def count_nested(df, nested, by=None, join=True) -> NestedFrame:
    """Count the rows of a nested column for each row of a NestedFrame.

    Wraps Nested-Pandas ``count_nested`` by applying it to every partition
    of `df` through ``map_partitions``.

    Parameters
    ----------
    df: NestedFrame
        A NestedFrame that contains the desired `nested` series
        to count.
    nested: 'str'
        The label of the nested series to count.
    by: 'str', optional
        Specifies a column within nested to count by, returning
        a count for each unique value in `by`.
    join: bool, optional
        Join the output count columns to df and return df, otherwise
        just return a NestedFrame containing only the count columns.

    Returns
    -------
    NestedFrame
        Lazily evaluated result. When `by` is None it has a single
        ``n_<nested>`` column, otherwise one ``n_<nested>_<value>`` column
        per unique value of `by`. When `join` is True these columns follow
        the original columns of `df`.

    Notes
    -----
    When `by` is given, the unique values of that column must be known to
    build the output metadata, so they are evaluated when this function is
    called rather than when the result is computed.
    """

    # The output schema (meta) has to be declared up front and it varies
    # with the parameters: first with `by` ...
    if by is not None:
        # One output column per unique value of the specified column;
        # determining these values requires some computation.
        # TODO: Requires modification of nested-pandas to always produce
        # sorted output columns for meta
        by_cols = sorted(df[nested].nest.to_flat()[by].unique())
        out_cols = [f"n_{nested}_{col}" for col in by_cols]
    else:
        # Otherwise there is just a single output column.
        out_cols = [f"n_{nested}"]

    # Zero-row frame of count columns, typed from the scalar 0. It is built
    # on the input meta's own (empty) index so the declared index matches
    # the real output and aligns exactly with `df._meta` below.
    meta = npd.NestedFrame({col: 0 for col in out_cols}, index=df._meta.index)

    # ... and second with `join`.
    if join:
        # Joining appends columns, so the metas are concatenated
        # column-wise. A row-wise concat would pad the columns missing from
        # each frame with NA and upcast the integer count columns to float
        # in the declared meta.
        meta = pd.concat([df._meta, meta], axis=1)

    return df.map_partitions(lambda x: npd_utils.count_nested(x, nested, by=by, join=join), meta=meta)

tests/nested_dask/test_utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import nested_dask as nd
2+
import pytest
3+
from nested_pandas.utils import count_nested
4+
5+
6+
@pytest.mark.parametrize("join", [True, False])
@pytest.mark.parametrize("by", [None, "band"])
def test_count_nested(test_dataset, join, by):
    """Check that the Dask count_nested wrapper matches nested-pandas."""

    # The counting logic itself is exercised by the nested-pandas test
    # suite; here we only verify that the wrapper yields the same frame.
    options = {"join": join, "by": by}

    expected = count_nested(test_dataset.compute(), "nested", **options)
    actual = nd.utils.count_nested(test_dataset, "nested", **options).compute()

    assert actual.equals(expected)

0 commit comments

Comments
 (0)