Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions superset/utils/pandas_postprocessing/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ def histogram(
if groupby is None:
groupby = []

# drop empty values from the target column
df.dropna(subset=[column], inplace=True)
if df.empty:
return df
Comment on lines +52 to +54
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Input DataFrame modified in-place category Design

Tell me more
What is the issue?

The function modifies the input DataFrame in-place by dropping rows with NULLs, which violates the principle of not mutating input parameters and can cause unexpected side effects for the caller.

Why this matters

Callers may not expect their original DataFrame to be modified, leading to data loss in the calling code and potential bugs in downstream processing that depends on the original DataFrame structure.

Suggested change ∙ Feature Preview

Create a copy of the DataFrame before dropping NULLs:

# drop empty values from the target column
df = df.dropna(subset=[column])
if df.empty:
    return df
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.


# convert to numeric, coercing errors to NaN
df[column] = to_numeric(df[column], errors="coerce")

Expand Down
67 changes: 67 additions & 0 deletions tests/unit_tests/pandas_postprocessing/test_histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,70 @@ def test_histogram_with_some_non_numeric_values():
histogram(data_with_non_numeric, "a", ["group"], bins)
except ValueError as e:
assert str(e) == "Column 'group' contains non-numeric values" # noqa: PT017


def test_histogram_with_groupby_and_some_null_values():
data_with_groupby_and_some_nulls = DataFrame(
{
"group": ["A", "A", "B", "B", "A", "A", "B", "B", "A", "A"],
"a": [1, 2, 3, 4, 5, None, 7, 8, 9, 10],
"b": [1, 2, 3, 4, 5, None, 7, 8, 9, 10],
}
)

result = histogram(data_with_groupby_and_some_nulls, "a", ["group"], bins)
assert result.shape == (2, bins + 1)
assert result.columns.tolist() == [
"group",
"1.0 - 2.8",
"2.8 - 4.6",
"4.6 - 6.4",
"6.4 - 8.2",
"8.2 - 10.0",
]
assert result.values.tolist() == [["A", 2, 0, 1, 0, 2], ["B", 0, 2, 0, 2, 0]]


def test_histogram_with_no_groupby_and_some_null_values():
data_with_no_groupby_and_some_nulls = DataFrame(
{
"a": [1, 2, 3, 4, 5, None, 7, 8, 9, 10],
"b": [1, 2, 3, 4, 5, None, 7, 8, 9, 10],
}
)

result = histogram(data_with_no_groupby_and_some_nulls, "a", [], bins)
assert result.shape == (1, bins)
assert result.columns.tolist() == [
"1.0 - 2.8",
"2.8 - 4.6",
"4.6 - 6.4",
"6.4 - 8.2",
"8.2 - 10.0",
]
assert result.values.tolist() == [[2, 2, 1, 2, 2]]


def test_histogram_with_groupby_and_all_null_values():
data_with_groupby_and_all_nulls = DataFrame(
{
"group": ["A", "A", "B", "B", "A", "A", "B", "B", "A", "A"],
"a": [None, None, None, None, None, None, None, None, None, None],
"b": [None, None, None, None, None, None, None, None, None, None],
}
)

result = histogram(data_with_groupby_and_all_nulls, "a", ["group"], bins)
assert result.empty


def test_histogram_with_no_groupby_and_all_null_values():
data_with_no_groupby_and_all_nulls = DataFrame(
{
"a": [None, None, None, None, None, None, None, None, None, None],
"b": [None, None, None, None, None, None, None, None, None, None],
}
)

result = histogram(data_with_no_groupby_and_all_nulls, "a", [], bins)
assert result.empty
Loading