Skip to content

Commit 3065773

Browse files
committed
test: enhance null DataFrame tests to include date32 and date64 columns
1 parent b140523 commit 3065773

File tree

2 files changed

+140
-21
lines changed

2 files changed

+140
-21
lines changed

python/tests/test_dataframe.py

Lines changed: 139 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import os
1818
import re
1919
from typing import Any
20-
20+
import datetime
2121
import pyarrow as pa
2222
import pyarrow.parquet as pq
2323
import pytest
@@ -128,7 +128,9 @@ def null_df():
128128
pa.array([4.5, 6.7, None, None], type=pa.float64()),
129129
pa.array(["a", None, "c", None], type=pa.string()),
130130
pa.array([True, None, False, None], type=pa.bool_()),
131-
], names=["int_col", "float_col", "str_col", "bool_col"])
131+
pa.array([10957, None, 18993, None], type=pa.date32()), # 2000-01-01, null, 2022-01-01, null
132+
pa.array([946684800000, None, 1640995200000, None], type=pa.date64()), # 2000-01-01, null, 2022-01-01, null
133+
], names=["int_col", "float_col", "str_col", "bool_col", "date32_col", "date64_col"])
132134

133135
return ctx.create_dataframe([[batch]])
134136

@@ -1524,7 +1526,7 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame:
15241526
def test_dataframe_repr_html_structure(df) -> None:
15251527
"""Test that DataFrame._repr_html_ produces expected HTML output structure."""
15261528
import re
1527-
1529+
15281530
output = df._repr_html_()
15291531

15301532
# Since we've added a fair bit of processing to the html output, let's just verify
@@ -1658,14 +1660,12 @@ def test_html_formatter_manual_format_html(clean_formatter_state):
16581660
local_formatter = DataFrameHtmlFormatter(use_shared_styles=False)
16591661

16601662
# Both calls should include styles
1661-
16621663
local_html_1 = local_formatter.format_html([batch], batch.schema)
16631664
local_html_2 = local_formatter.format_html([batch], batch.schema)
16641665

16651666
assert "<style>" in local_html_1
16661667
assert "<style>" in local_html_2
1667-
1668-
1668+
16691669
def test_fill_null_basic(null_df):
16701670
"""Test basic fill_null functionality with a single value."""
16711671
# Fill all nulls with 0
@@ -1674,12 +1674,12 @@ def test_fill_null_basic(null_df):
16741674
result = filled_df.collect()[0]
16751675

16761676
# Check that nulls were filled with 0 (or equivalent)
1677-
assert result.column(0).to_pylist() == [1, 0, 3, 0]
1678-
assert result.column(1).to_pylist() == [4.5, 6.7, 0.0, 0.0]
1677+
assert result.column(0) == pa.array([1, 0, 3, 0])
1678+
assert result.column(1) == pa.array([4.5, 6.7, 0.0, 0.0])
16791679
# String column should be filled with "0"
1680-
assert result.column(2).to_pylist() == ["a", "0", "c", "0"]
1680+
assert result.column(2) == pa.array(["a", "0", "c", "0"])
16811681
# Boolean column should be filled with False (0 converted to bool)
1682-
assert result.column(3).to_pylist() == [True, False, False, False]
1682+
assert result.column(3) == pa.array([True, False, False, False])
16831683

16841684

16851685
def test_fill_null_subset(null_df):
@@ -1690,11 +1690,131 @@ def test_fill_null_subset(null_df):
16901690
result = filled_df.collect()[0]
16911691

16921692
# Check that nulls were filled only in specified columns
1693-
assert result.column(0).to_pylist() == [1, 0, 3, 0]
1694-
assert result.column(1).to_pylist() == [4.5, 6.7, 0.0, 0.0]
1693+
assert result.column(0) == pa.array([1, 0, 3, 0])
1694+
assert result.column(1) == pa.array([4.5, 6.7, 0.0, 0.0])
16951695
# These should still have nulls
16961696
assert None in result.column(2).to_pylist()
16971697
assert None in result.column(3).to_pylist()
1698+
1699+
def test_fill_null_str_column(null_df):
1700+
"""Test filling nulls in string columns with different values."""
1701+
# Fill string nulls with a replacement string
1702+
filled_df = null_df.fill_null("N/A", subset=["str_col"])
1703+
1704+
result = filled_df.collect()[0]
1705+
1706+
# Check that string nulls were filled with "N/A"
1707+
assert result.column(2).to_pylist() == ["a", "N/A", "c", "N/A"]
1708+
1709+
# Other columns should be unchanged
1710+
assert None in result.column(0).to_pylist()
1711+
assert None in result.column(1).to_pylist()
1712+
assert None in result.column(3).to_pylist()
1713+
1714+
# Fill with an empty string
1715+
filled_df = null_df.fill_null("", subset=["str_col"])
1716+
result = filled_df.collect()[0]
1717+
assert result.column(2).to_pylist() == ["a", "", "c", ""]
1718+
1719+
1720+
def test_fill_null_bool_column(null_df):
1721+
"""Test filling nulls in boolean columns with different values."""
1722+
# Fill bool nulls with True
1723+
filled_df = null_df.fill_null(True, subset=["bool_col"])
1724+
1725+
result = filled_df.collect()[0]
1726+
1727+
# Check that bool nulls were filled with True
1728+
assert result.column(3).to_pylist() == [True, True, False, True]
1729+
1730+
# Other columns should be unchanged
1731+
assert None in result.column(0).to_pylist()
1732+
1733+
# Fill bool nulls with False
1734+
filled_df = null_df.fill_null(False, subset=["bool_col"])
1735+
result = filled_df.collect()[0]
1736+
assert result.column(3).to_pylist() == [True, False, False, False]
1737+
1738+
1739+
def test_fill_null_date32_column(null_df):
1740+
"""Test filling nulls in date32 columns."""
1741+
1742+
# Fill date32 nulls with a specific date (1970-01-01)
1743+
epoch_date = datetime.date(1970, 1, 1)
1744+
filled_df = null_df.fill_null(epoch_date, subset=["date32_col"])
1745+
1746+
result = filled_df.collect()[0]
1747+
1748+
# Check that date32 nulls were filled with epoch date
1749+
dates = result.column(4).to_pylist()
1750+
assert dates[0] == datetime.date(2000, 1, 1) # Original value
1751+
assert dates[1] == epoch_date # Filled value
1752+
assert dates[2] == datetime.date(2022, 1, 1) # Original value
1753+
assert dates[3] == epoch_date # Filled value
1754+
1755+
# Other date column should be unchanged
1756+
assert None in result.column(5).to_pylist()
1757+
1758+
1759+
def test_fill_null_date64_column(null_df):
1760+
"""Test filling nulls in date64 columns."""
1761+
1762+
# Fill date64 nulls with a specific date (1970-01-01)
1763+
epoch_date = datetime.date(1970, 1, 1)
1764+
filled_df = null_df.fill_null(epoch_date, subset=["date64_col"])
1765+
1766+
result = filled_df.collect()[0]
1767+
1768+
# Check that date64 nulls were filled with epoch date
1769+
dates = result.column(5).to_pylist()
1770+
assert dates[0] == datetime.date(2000, 1, 1) # Original value
1771+
assert dates[1] == epoch_date # Filled value
1772+
assert dates[2] == datetime.date(2022, 1, 1) # Original value
1773+
assert dates[3] == epoch_date # Filled value
1774+
1775+
# Other date column should be unchanged
1776+
assert None in result.column(4).to_pylist()
1777+
1778+
1779+
def test_fill_null_type_coercion(null_df):
1780+
"""Test type coercion when filling nulls with values of different types."""
1781+
# Try to fill string nulls with a number
1782+
filled_df = null_df.fill_null(42, subset=["str_col"])
1783+
1784+
result = filled_df.collect()[0]
1785+
1786+
# String nulls should be filled with string representation of the number
1787+
assert result.column(2).to_pylist() == ["a", "42", "c", "42"]
1788+
1789+
# Try to fill bool nulls with a string that converts to True
1790+
filled_df = null_df.fill_null("true", subset=["bool_col"])
1791+
result = filled_df.collect()[0]
1792+
1793+
# This behavior depends on the implementation - check it works without error
1794+
# but don't make assertions about exact conversion behavior
1795+
assert None not in result.column(3).to_pylist()
1796+
1797+
1798+
def test_fill_null_multiple_date_columns(null_df):
1799+
"""Test filling nulls in both date column types simultaneously."""
1800+
1801+
# Fill both date column types with the same date
1802+
test_date = datetime.date(2023, 12, 31)
1803+
filled_df = null_df.fill_null(test_date, subset=["date32_col", "date64_col"])
1804+
1805+
result = filled_df.collect()[0]
1806+
1807+
# Check both date columns were filled correctly
1808+
date32_vals = result.column(4).to_pylist()
1809+
date64_vals = result.column(5).to_pylist()
1810+
1811+
assert None not in date32_vals
1812+
assert None not in date64_vals
1813+
1814+
assert date32_vals[1] == test_date
1815+
assert date32_vals[3] == test_date
1816+
assert date64_vals[1] == test_date
1817+
assert date64_vals[3] == test_date
16981818

16991819

17001820
def test_fill_null_specific_types(null_df):
@@ -1705,10 +1825,13 @@ def test_fill_null_specific_types(null_df):
17051825
result = filled_df.collect()[0]
17061826

17071827
# Check that nulls were filled appropriately by type
1708-
assert result.column(0).to_pylist() == [1, 0, 3, 0] # Int gets 0 from "missing" conversion
1709-
assert result.column(1).to_pylist() == [4.5, 6.7, 0.0, 0.0] # Float gets 0.0
1710-
assert result.column(2).to_pylist() == ["a", "missing", "c", "missing"] # String gets "missing"
1711-
assert result.column(3).to_pylist() == [True, False, False, False] # Bool gets False
1828+
1829+
assert result.column(0).to_pylist() == [1, None, 3, None]
1830+
assert result.column(1).to_pylist() == [4.5, 6.7, None, None]
1831+
assert result.column(2).to_pylist() == ["a", "missing", "c", "missing"]
1832+
assert result.column(3).to_pylist() == [True, None, False, None] # Bool nulls are left unfilled
1833+
assert result.column(4).to_pylist() == [datetime.date(2000, 1, 1), None, datetime.date(2022, 1, 1), None]
1834+
assert result.column(5).to_pylist() == [datetime.date(2000, 1, 1), None, datetime.date(2022, 1, 1), None]
17121835

17131836

17141837
def test_fill_null_immutability(null_df):
@@ -1763,7 +1886,3 @@ def test_fill_null_all_null_column(ctx):
17631886
# Check that all nulls were filled
17641887
result = filled_df.collect()[0]
17651888
assert result.column(1).to_pylist() == ["filled", "filled", "filled"]
1766-
1767-
# Original should be unchanged
1768-
original = all_null_df.collect()[0]
1769-
assert original.column(1).null_count == 3

src/dataframe.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ impl PyDataFrame {
9494

9595
#[pymethods]
9696
impl PyDataFrame {
97-
/// Enable selection for `df[col]`, `df[col1, col2, col2]`, and `df[[col1, col2, col3]]`
97+
/// Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]`
9898
fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyDataFusionResult<Self> {
9999
if let Ok(key) = key.extract::<PyBackedStr>() {
100100
// df[col]

0 commit comments

Comments
 (0)