Skip to content

Commit ca6e8e8

Browse files
committed
Down to 40 failing tests
1 parent d257666 commit ca6e8e8

File tree

9 files changed

+118
-30
lines changed

9 files changed

+118
-30
lines changed

pandas/_config/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,8 @@
3333
def using_string_dtype() -> bool:
3434
_mode_options = _global_config["future"]
3535
return _mode_options["infer_string"]
36+
37+
38+
def using_pyarrow_strict_nans() -> bool:
39+
_mode_options = _global_config["mode"]
40+
return _mode_options["pyarrow_strict_nans"]

pandas/_libs/missing.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ...
1414
def checknull(val: object) -> bool: ...
1515
def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
1616
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
17+
def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...

pandas/_libs/missing.pyx

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj):
249249
return checknull_with_nat(obj) or obj is C_NA
250250

251251

252+
@cython.wraparound(False)
253+
@cython.boundscheck(False)
254+
def is_pdna_or_none(values: ndarray) -> ndarray:
255+
cdef:
256+
ndarray[uint8_t] result
257+
Py_ssize_t i, N
258+
object val
259+
260+
N = len(values)
261+
result = np.zeros(N, dtype=np.uint8)
262+
263+
for i in range(N):
264+
val = values[i]
265+
if val is None or val is C_NA:
266+
result[i] = True
267+
return result.view(bool)
268+
269+
252270
@cython.wraparound(False)
253271
@cython.boundscheck(False)
254272
def is_numeric_na(values: ndarray) -> ndarray:

pandas/core/arrays/_utils.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77

88
import numpy as np
99

10+
from pandas._config import using_pyarrow_strict_nans
11+
1012
from pandas._libs import lib
13+
from pandas._libs.missing import NA
1114
from pandas.errors import LossySetitemError
1215

1316
from pandas.core.dtypes.cast import np_can_hold_element
@@ -21,7 +24,11 @@
2124

2225

2326
def to_numpy_dtype_inference(
24-
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
27+
arr: ArrayLike,
28+
dtype: npt.DTypeLike | None,
29+
na_value,
30+
hasna: bool,
31+
is_pyarrow: bool = True,
2532
) -> tuple[npt.DTypeLike, Any]:
2633
if dtype is None and is_numeric_dtype(arr.dtype):
2734
dtype_given = False
@@ -34,7 +41,11 @@ def to_numpy_dtype_inference(
3441
else:
3542
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
3643
if na_value is lib.no_default:
37-
na_value = np.nan
44+
if is_pyarrow and using_pyarrow_strict_nans():
45+
na_value = NA
46+
dtype = np.dtype(object)
47+
else:
48+
na_value = np.nan
3849
else:
3950
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
4051
elif dtype is not None:

pandas/core/arrays/arrow/array.py

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515

1616
import numpy as np
1717

18+
from pandas._config import using_pyarrow_strict_nans
19+
1820
from pandas._libs import lib
19-
from pandas._libs.missing import NA
21+
from pandas._libs.missing import is_pdna_or_none
2022
from pandas._libs.tslibs import (
2123
Timedelta,
2224
Timestamp,
@@ -324,6 +326,11 @@ def _from_sequence_of_strings(
324326
"""
325327
Construct a new ExtensionArray from a sequence of strings.
326328
"""
329+
mask = isna(strings)
330+
331+
if isinstance(strings, cls):
332+
strings = strings._pa_array
333+
327334
pa_type = to_pyarrow_type(dtype)
328335
if (
329336
pa_type is None
@@ -342,22 +349,35 @@ def _from_sequence_of_strings(
342349
from pandas.core.tools.datetimes import to_datetime
343350

344351
scalars = to_datetime(strings, errors="raise").date
352+
353+
if isinstance(strings, cls):
354+
# Avoid an object path
355+
# TODO: this assumes that pyarrows str->date casting is the
356+
# same as to_datetime. Is that a fair assumption?
357+
scalars = strings._pa_array.cast(pa_type)
358+
else:
359+
scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)
360+
345361
elif pa.types.is_duration(pa_type):
346362
from pandas.core.tools.timedeltas import to_timedelta
347363

348364
scalars = to_timedelta(strings, errors="raise")
365+
349366
if pa_type.unit != "ns":
350367
# GH51175: test_from_sequence_of_strings_pa_array
351368
# attempt to parse as int64 reflecting pyarrow's
352369
# duration to string casting behavior
353370
mask = isna(scalars)
354-
if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
355-
strings = pa.array(strings, type=pa.string())
371+
if isinstance(strings, cls):
372+
strings = strings._pa_array
373+
elif not isinstance(strings, (pa.Array, pa.ChunkedArray)):
374+
strings = pa.array(strings, type=pa.string(), mask=mask)
356375
strings = pc.if_else(mask, None, strings)
357376
try:
358377
scalars = strings.cast(pa.int64())
359378
except pa.ArrowInvalid:
360379
pass
380+
361381
elif pa.types.is_time(pa_type):
362382
from pandas.core.tools.times import to_time
363383

@@ -373,7 +393,7 @@ def _from_sequence_of_strings(
373393
if isinstance(strings, (pa.Array, pa.ChunkedArray)):
374394
scalars = strings
375395
else:
376-
scalars = pa.array(strings, type=pa.string())
396+
scalars = pa.array(strings, type=pa.string(), mask=mask)
377397
scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
378398
scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
379399
scalars = scalars.cast(pa.bool_())
@@ -385,12 +405,16 @@ def _from_sequence_of_strings(
385405
from pandas.core.tools.numeric import to_numeric
386406

387407
scalars = to_numeric(strings, errors="raise")
388-
if not pa.types.is_decimal(pa_type):
408+
if not pa.types.is_decimal(pa_type) and isinstance(
409+
strings, (pa.Array, pa.ChunkedArray)
410+
):
389411
# TODO: figure out why doing this cast breaks with decimal dtype
390412
# in test_from_sequence_of_strings_pa_array
391413
mask = strings.is_null()
392414
scalars = pa.array(scalars, mask=np.array(mask), type=pa_type)
393415
# TODO: could we just do strings.cast(pa_type)?
416+
elif mask is not None:
417+
scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)
394418

395419
else:
396420
raise NotImplementedError(
@@ -544,23 +568,20 @@ def _box_pa_array(
544568
return pa_array
545569

546570
mask = None
547-
if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM":
548-
# similar to isna(value) but exclude NaN
549-
# TODO: cythonize!
550-
mask = np.array([x is NA or x is None for x in value], dtype=bool)
551-
552-
from_pandas = False
553-
if pa.types.is_integer(pa_type):
554-
# If user specifically asks to cast a numpy float array with NaNs
555-
# to pyarrow integer, we'll treat those NaNs as NA
556-
from_pandas = True
571+
if getattr(value, "dtype", None) is None or value.dtype.kind not in "mMf":
572+
try:
573+
arr_value = np.asarray(value)
574+
except ValueError:
575+
# e.g. list dtype with mixed-length lists
576+
arr_value = np.asarray(value, dtype=object)
577+
# similar to isna(value) but exclude NaN, NaT, nat-like, nan-like
578+
mask = is_pdna_or_none(arr_value)
579+
557580
try:
558-
pa_array = pa.array(
559-
value, type=pa_type, mask=mask, from_pandas=from_pandas
560-
)
581+
pa_array = pa.array(value, type=pa_type, mask=mask)
561582
except (pa.ArrowInvalid, pa.ArrowTypeError):
562583
# GH50430: let pyarrow infer type, then cast
563-
pa_array = pa.array(value, mask=mask, from_pandas=from_pandas)
584+
pa_array = pa.array(value, mask=mask)
564585

565586
if pa_type is None and pa.types.is_duration(pa_array.type):
566587
# Workaround https://github.com/apache/arrow/issues/37291
@@ -1496,7 +1517,11 @@ def to_numpy(
14961517
pa.types.is_floating(pa_type)
14971518
and (
14981519
na_value is np.nan
1499-
or (original_na_value is lib.no_default and is_float_dtype(dtype))
1520+
or (
1521+
original_na_value is lib.no_default
1522+
and is_float_dtype(dtype)
1523+
and not using_pyarrow_strict_nans()
1524+
)
15001525
)
15011526
):
15021527
result = data._pa_array.to_numpy()
@@ -1964,8 +1989,10 @@ def _explode(self):
19641989
fill_value = pa.scalar([None], type=self._pa_array.type)
19651990
mask = counts == 0
19661991
if mask.any():
1967-
values = values.copy()
1968-
values[mask] = fill_value
1992+
# pc.if_else here is similar to `values[mask] = fill_value`
1993+
# but this avoids a object-dtype round-trip.
1994+
pa_values = pc.if_else(~mask, values._pa_array, fill_value)
1995+
values = type(self)(pa_values)
19691996
counts = counts.copy()
19701997
counts[mask] = 1
19711998
values = values.fillna(fill_value)
@@ -2367,6 +2394,7 @@ def _replace_with_mask(
23672394
replacements = np.array(replacements, dtype=object)
23682395
elif isinstance(replacements, pa.Scalar):
23692396
replacements = replacements.as_py()
2397+
23702398
result = np.array(values, dtype=object)
23712399
result[mask] = replacements
23722400
return pa.array(result, type=values.type)

pandas/core/arrays/base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -778,6 +778,9 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
778778

779779
return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy)
780780

781+
# if dtype.kind == "U":
782+
# dtype = np.dtype(object)
783+
# return self.to_numpy(dtype=dtype, copy=copy)
781784
if not copy:
782785
return np.asarray(self, dtype=dtype)
783786
else:

pandas/core/arrays/masked.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,9 @@ def to_numpy(
484484
array([ True, False, False])
485485
"""
486486
hasna = self._hasna
487-
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna)
487+
dtype, na_value = to_numpy_dtype_inference(
488+
self, dtype, na_value, hasna, is_pyarrow=False
489+
)
488490
if dtype is None:
489491
dtype = object
490492

pandas/core/config_init.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,15 @@ def is_terminal() -> bool:
427427
validator=is_one_of_factory([True, False, "warn"]),
428428
)
429429

430+
with cf.config_prefix("mode"):
431+
cf.register_option(
432+
"pyarrow_strict_nans",
433+
True,
434+
# TODO: Change this to False before merging
435+
"Whether to make ArrowDtype arrays consistently treat NaN as distinct from NA",
436+
validator=is_one_of_factory([True, False]),
437+
)
438+
430439

431440
# user warnings
432441
chained_assignment = """

pandas/tests/extension/test_arrow.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import numpy as np
3333
import pytest
3434

35+
from pandas._config import using_pyarrow_strict_nans
36+
3537
from pandas._libs import lib
3638
from pandas._libs.tslibs import timezones
3739
from pandas.compat import (
@@ -717,7 +719,10 @@ def test_EA_types(self, engine, data, dtype_backend, request):
717719
pytest.mark.xfail(reason="CSV parsers don't correctly handle binary")
718720
)
719721
df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
720-
csv_output = df.to_csv(index=False, na_rep=np.nan) # should be NA?
722+
if using_pyarrow_strict_nans():
723+
csv_output = df.to_csv(index=False, na_rep="NA")
724+
else:
725+
csv_output = df.to_csv(index=False, na_rep=np.nan)
721726
if pa.types.is_binary(pa_dtype):
722727
csv_output = BytesIO(csv_output)
723728
else:
@@ -1508,7 +1513,8 @@ def test_pickle_roundtrip(data):
15081513

15091514
def test_astype_from_non_pyarrow(data):
15101515
# GH49795
1511-
pd_array = data._pa_array.to_pandas().array
1516+
np_arr = data.to_numpy()
1517+
pd_array = pd.array(np_arr, dtype=np_arr.dtype)
15121518
result = pd_array.astype(data.dtype)
15131519
assert not isinstance(pd_array.dtype, ArrowDtype)
15141520
assert isinstance(result.dtype, ArrowDtype)
@@ -1542,7 +1548,9 @@ def test_to_numpy_with_defaults(data):
15421548
else:
15431549
expected = np.array(data._pa_array)
15441550

1545-
if data._hasna and not is_numeric_dtype(data.dtype):
1551+
if data._hasna and (
1552+
not is_numeric_dtype(data.dtype) or using_pyarrow_strict_nans()
1553+
):
15461554
expected = expected.astype(object)
15471555
expected[pd.isna(data)] = pd.NA
15481556

@@ -2864,7 +2872,7 @@ def test_dt_components():
28642872
)
28652873
result = ser.dt.components
28662874
expected = pd.DataFrame(
2867-
[[1, 0, 0, 2, 0, 3, 4], [None, None, None, None, None, None, None]],
2875+
[[1, 0, 0, 2, 0, 3, 4], [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA]],
28682876
columns=[
28692877
"days",
28702878
"hours",
@@ -2889,7 +2897,10 @@ def test_dt_components_large_values():
28892897
)
28902898
result = ser.dt.components
28912899
expected = pd.DataFrame(
2892-
[[365, 23, 59, 59, 999, 0, 0], [None, None, None, None, None, None, None]],
2900+
[
2901+
[365, 23, 59, 59, 999, 0, 0],
2902+
[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
2903+
],
28932904
columns=[
28942905
"days",
28952906
"hours",

0 commit comments

Comments
 (0)