From 31e65e0eec173ab6fcc48d0d686d0c2ace6d19af Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Jul 2025 15:47:02 -0700 Subject: [PATCH 01/14] BUG: Decimal(NaN) incorrectly allowed in ArrowEA constructor with timestamp type --- doc/source/whatsnew/v3.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 10fb9503ffb3d..2c209f521b274 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -709,6 +709,8 @@ Datetimelike - Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`) - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) +- Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`??`) +- Timedelta ^^^^^^^^^ From 9dcd8fbf7bc4d6bac039364bf9ad2da4d3502b0a Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Jul 2025 15:49:04 -0700 Subject: [PATCH 02/14] GH ref --- doc/source/whatsnew/v3.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2c209f521b274..f159282717109 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -709,7 +709,6 @@ Datetimelike - Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`) - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) -- Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`??`) - Timedelta From 3fb47c78fb54b0ebd34f58aedc11403458042802 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Jul 2025 08:21:07 -0700 Subject: [PATCH 03/14] BUG: ArrowEA constructor with timestamp type --- doc/source/whatsnew/v3.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f159282717109..10fb9503ffb3d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -709,7 +709,6 @@ Datetimelike - Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`) - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) -- Timedelta ^^^^^^^^^ From c18ab05d9c01eeff2d588e1ed50c729819801824 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Jun 2025 10:07:44 -0700 Subject: [PATCH 04/14] POC: consistent NaN treatment for pyarrow dtypes --- pandas/_libs/parsers.pyx | 2 +- pandas/core/arrays/arrow/array.py | 54 ++++++++++++++++++------ pandas/core/arrays/string_.py | 8 +++- pandas/core/generic.py | 19 ++++++++- pandas/tests/extension/test_arrow.py | 2 +- pandas/tests/groupby/test_reductions.py | 6 ++- pandas/tests/series/methods/test_rank.py | 9 ++++ 7 files changed, 81 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 43670abca2fac..e115fc67adc2a 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1453,7 +1453,7 @@ def _maybe_upcast( if isinstance(arr, IntegerArray) and arr.isna().all(): # use null instead of int64 in pyarrow arr = arr.to_numpy(na_value=None) - arr = ArrowExtensionArray(pa.array(arr, from_pandas=True)) + arr = ArrowExtensionArray(pa.array(arr)) return arr diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 919453b29b7f9..3f6559b98a7a4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -16,6 +16,7 @@ import numpy as np from pandas._libs import lib +from pandas._libs.missing import NA from pandas._libs.tslibs import ( Timedelta, Timestamp, @@ -351,7 +352,7 @@ def _from_sequence_of_strings( # duration to string casting behavior mask = isna(scalars) if not isinstance(strings, (pa.Array, pa.ChunkedArray)): - strings = pa.array(strings, type=pa.string(), from_pandas=True) + strings = pa.array(strings, type=pa.string()) strings = pc.if_else(mask, None, strings) try: scalars = strings.cast(pa.int64()) @@ -372,7 +373,7 @@ def _from_sequence_of_strings( if isinstance(strings, (pa.Array, pa.ChunkedArray)): scalars = strings else: - scalars = pa.array(strings, type=pa.string(), from_pandas=True) + scalars = pa.array(strings, type=pa.string()) scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars) scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars) scalars = scalars.cast(pa.bool_()) @@ -384,6 +385,13 @@ def _from_sequence_of_strings( from pandas.core.tools.numeric import to_numeric scalars = to_numeric(strings, errors="raise") + if not pa.types.is_decimal(pa_type): + # TODO: figure out why doing this cast breaks with decimal dtype + # in test_from_sequence_of_strings_pa_array + mask = strings.is_null() + scalars = pa.array(scalars, mask=np.array(mask), type=pa_type) + # TODO: could we just do strings.cast(pa_type)? + else: raise NotImplementedError( f"Converting strings to {pa_type} is not implemented." @@ -426,7 +434,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: """ if isinstance(value, pa.Scalar): pa_scalar = value - elif isna(value): + elif isna(value) and not lib.is_float(value): pa_scalar = pa.scalar(None, type=pa_type) else: # Workaround https://github.com/apache/arrow/issues/37291 @@ -443,7 +451,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: value = value.as_unit(pa_type.unit) value = value._value - pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) + pa_scalar = pa.scalar(value, type=pa_type) if pa_type is not None and pa_scalar.type != pa_type: pa_scalar = pa_scalar.cast(pa_type) @@ -475,6 +483,13 @@ def _box_pa_array( if copy: value = value.copy() pa_array = value.__arrow_array__() + + elif hasattr(value, "__arrow_array__"): + # e.g. StringArray + if copy: + value = value.copy() + pa_array = value.__arrow_array__() + else: if ( isinstance(value, np.ndarray) @@ -528,11 +543,24 @@ def _box_pa_array( pa_array = pa.array(dta._ndarray, type=pa_type, mask=dta_mask) return pa_array + mask = None + if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM": + # similar to isna(value) but exclude NaN + # TODO: cythonize! + mask = np.array([x is NA or x is None for x in value], dtype=bool) + + from_pandas = False + if pa.types.is_integer(pa_type): + # If user specifically asks to cast a numpy float array with NaNs + # to pyarrow integer, we'll treat those NaNs as NA + from_pandas = True try: - pa_array = pa.array(value, type=pa_type, from_pandas=True) + pa_array = pa.array( + value, type=pa_type, mask=mask, from_pandas=from_pandas + ) except (pa.ArrowInvalid, pa.ArrowTypeError): # GH50430: let pyarrow infer type, then cast - pa_array = pa.array(value, from_pandas=True) + pa_array = pa.array(value, mask=mask, from_pandas=from_pandas) if pa_type is None and pa.types.is_duration(pa_array.type): # Workaround https://github.com/apache/arrow/issues/37291 @@ -540,7 +568,7 @@ def _box_pa_array( value = to_timedelta(value) value = value.to_numpy() - pa_array = pa.array(value, type=pa_type, from_pandas=True) + pa_array = pa.array(value, type=pa_type) if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0: # GH52843: upstream bug for duration types when originally @@ -1187,7 +1215,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: if not len(values): return np.zeros(len(self), dtype=bool) - result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True)) + result = pc.is_in(self._pa_array, value_set=pa.array(values)) # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) @@ -1994,7 +2022,7 @@ def __setitem__(self, key, value) -> None: raise ValueError("Length of indexer and values mismatch") chunks = [ *self._pa_array[:key].chunks, - pa.array([value], type=self._pa_array.type, from_pandas=True), + pa.array([value], type=self._pa_array.type), *self._pa_array[key + 1 :].chunks, ] data = pa.chunked_array(chunks).combine_chunks() @@ -2048,7 +2076,7 @@ def _rank_calc( pa_type = pa.float64() else: pa_type = pa.uint64() - result = pa.array(ranked, type=pa_type, from_pandas=True) + result = pa.array(ranked, type=pa_type) return result data = self._pa_array.combine_chunks() @@ -2300,7 +2328,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: right, right_type = _to_numpy_and_type(right) pa_type = left_type or right_type result = np.where(cond, left, right) - return pa.array(result, type=pa_type, from_pandas=True) + return pa.array(result, type=pa_type) @classmethod def _replace_with_mask( @@ -2343,7 +2371,7 @@ def _replace_with_mask( replacements = replacements.as_py() result = np.array(values, dtype=object) result[mask] = replacements - return pa.array(result, type=values.type, from_pandas=True) + return pa.array(result, type=values.type) # ------------------------------------------------------------------ # GroupBy Methods @@ -2422,7 +2450,7 @@ def _groupby_op( return type(self)(pa_result) else: # DatetimeArray, TimedeltaArray - pa_result = pa.array(result, from_pandas=True) + pa_result = pa.array(result) return type(self)(pa_result) def _apply_elementwise(self, func: Callable) -> list[list[Any]]: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f52b709a59de9..7f65463a97815 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -481,6 +481,12 @@ def _str_map_str_or_object( if self.dtype.storage == "pyarrow": import pyarrow as pa + # TODO: shouldn't this already be caught my passed mask? + # it isn't in test_extract_expand_capture_groups_index + # mask = mask | np.array( + # [x is libmissing.NA for x in result], dtype=bool + # ) + result = pa.array( result, mask=mask, type=pa.large_string(), from_pandas=True ) @@ -733,7 +739,7 @@ def __arrow_array__(self, type=None): values = self._ndarray.copy() values[self.isna()] = None - return pa.array(values, type=type, from_pandas=True) + return pa.array(values, type=type) def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override] arr = self._ndarray diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7f1ccc482f70f..4a788638bae45 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9873,7 +9873,7 @@ def where( def where( self, cond, - other=np.nan, + other=lib.no_default, *, inplace: bool = False, axis: Axis | None = None, @@ -10031,6 +10031,23 @@ def where( stacklevel=2, ) + if other is lib.no_default: + if self.ndim == 1: + if isinstance(self.dtype, ExtensionDtype): + other = self.dtype.na_value + else: + other = np.nan + else: + if self._mgr.nblocks == 1 and isinstance( + self._mgr.blocks[0].values.dtype, ExtensionDtype + ): + # FIXME: checking this is kludgy! + other = self._mgr.blocks[0].values.dtype.na_value + else: + # FIXME: the same problem we had with Series will now + # show up column-by-column! + other = np.nan + other = common.apply_if_callable(other, self) return self._where(cond, other, inplace=inplace, axis=axis, level=level) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7e7cd8fb13456..14b65a56f8c05 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -721,7 +721,7 @@ def test_EA_types(self, engine, data, dtype_backend, request): pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") ) df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) - csv_output = df.to_csv(index=False, na_rep=np.nan) + csv_output = df.to_csv(index=False, na_rep=np.nan) # should be NA? if pa.types.is_binary(pa_dtype): csv_output = BytesIO(csv_output) else: diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 014558bbf4bba..08cf1047f316c 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -381,8 +381,10 @@ def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): df = DataFrame( { "a": [2, 1, 1, 2, 3, 3], - "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], - "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + # TODO: test that has mixed na_value and NaN either working for + # float or raising for int? + "b": [na_value, 3.0, na_value, 4.0, na_value, na_value], + "c": [na_value, 3.0, na_value, 4.0, na_value, na_value], }, dtype=any_real_nullable_dtype, ) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 7c6a7893ba3a0..8363ba118d4d3 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -276,6 +276,13 @@ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) + if dtype == "float64[pyarrow]": + # the NaNs are not treated as NA + exp = exp.copy() + if method == "average": + exp[np.isnan(ser)] = 9.5 + elif method == "dense": + exp[np.isnan(ser)] = 6 tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) @@ -321,6 +328,8 @@ def test_rank_tie_methods_on_infs_nans( order = [ranks[1], ranks[0], ranks[2]] elif na_option == "bottom": order = [ranks[0], ranks[2], ranks[1]] + elif dtype == "float64[pyarrow]": + order = [ranks[0], [NA] * chunk, ranks[1]] else: order = [ranks[0], [np.nan] * chunk, ranks[1]] expected = order if ascending else order[::-1] From 74a22486c394b1ba8de5f0705ffdaba67dd50e58 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Jun 2025 10:23:00 -0700 Subject: [PATCH 05/14] comment --- pandas/tests/extension/base/setitem.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 185d6d750cace..99ab5d2f7e86f 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -422,6 +422,7 @@ def test_setitem_frame_2d_values(self, data): df.iloc[:-1] = df.iloc[:-1].copy() tm.assert_frame_equal(df, orig) + # FIXME: Breaks for pyarrow float dtype bc df.values changes NAs to NaN df.iloc[:] = df.values tm.assert_frame_equal(df, orig) From 9d8fef493f599ff5342a4fa2c96ee1ad953828c1 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Jul 2025 09:41:02 -0700 Subject: [PATCH 06/14] Down to 40 failing tests --- pandas/_config/__init__.py | 5 +++ pandas/_libs/missing.pyi | 1 + pandas/_libs/missing.pyx | 18 ++++++++ pandas/core/arrays/_utils.py | 15 ++++++- pandas/core/arrays/arrow/array.py | 66 +++++++++++++++++++--------- pandas/core/arrays/base.py | 3 ++ pandas/core/arrays/masked.py | 4 +- pandas/core/config_init.py | 9 ++++ pandas/tests/extension/test_arrow.py | 21 ++++++--- 9 files changed, 114 insertions(+), 28 deletions(-) diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 463e8af7cc561..fbf388224254f 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -33,3 +33,8 @@ def using_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] + + +def using_pyarrow_strict_nans() -> bool: + _mode_options = _global_config["mode"] + return _mode_options["pyarrow_strict_nans"] diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index 6bf30a03cef32..6c76fe49330b6 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ... def checknull(val: object) -> bool: ... def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... +def is_pdna_or_none(values: npt.ndarray) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index c7f905c4d0be0..164a47cb5adb7 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj): return checknull_with_nat(obj) or obj is C_NA +@cython.wraparound(False) +@cython.boundscheck(False) +def is_pdna_or_none(values: ndarray) -> ndarray: + cdef: + ndarray[uint8_t] result + Py_ssize_t i, N + object val + + N = len(values) + result = np.zeros(N, dtype=np.uint8) + + for i in range(N): + val = values[i] + if val is None or val is C_NA: + result[i] = True + return result.view(bool) + + @cython.wraparound(False) @cython.boundscheck(False) def is_numeric_na(values: ndarray) -> ndarray: diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index 6b46396d5efdf..9adde3846ca03 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -7,7 +7,10 @@ import numpy as np +from pandas._config import using_pyarrow_strict_nans + from pandas._libs import lib +from pandas._libs.missing import NA from pandas.errors import LossySetitemError from pandas.core.dtypes.cast import np_can_hold_element @@ -21,7 +24,11 @@ def to_numpy_dtype_inference( - arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool + arr: ArrayLike, + dtype: npt.DTypeLike | None, + na_value, + hasna: bool, + is_pyarrow: bool = True, ) -> tuple[npt.DTypeLike, Any]: if dtype is None and is_numeric_dtype(arr.dtype): dtype_given = False @@ -34,7 +41,11 @@ def to_numpy_dtype_inference( else: dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] if na_value is lib.no_default: - na_value = np.nan + if is_pyarrow and using_pyarrow_strict_nans(): + na_value = NA + dtype = np.dtype(object) + else: + na_value = np.nan else: dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] elif dtype is not None: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3f6559b98a7a4..a81f69fc314aa 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -15,8 +15,10 @@ import numpy as np +from pandas._config import using_pyarrow_strict_nans + from pandas._libs import lib -from pandas._libs.missing import NA +from pandas._libs.missing import is_pdna_or_none from pandas._libs.tslibs import ( Timedelta, Timestamp, @@ -324,6 +326,11 @@ def _from_sequence_of_strings( """ Construct a new ExtensionArray from a sequence of strings. """ + mask = isna(strings) + + if isinstance(strings, cls): + strings = strings._pa_array + pa_type = to_pyarrow_type(dtype) if ( pa_type is None @@ -342,22 +349,35 @@ def _from_sequence_of_strings( from pandas.core.tools.datetimes import to_datetime scalars = to_datetime(strings, errors="raise").date + + if isinstance(strings, cls): + # Avoid an object path + # TODO: this assumes that pyarrows str->date casting is the + # same as to_datetime. Is that a fair assumption? + scalars = strings._pa_array.cast(pa_type) + else: + scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) + elif pa.types.is_duration(pa_type): from pandas.core.tools.timedeltas import to_timedelta scalars = to_timedelta(strings, errors="raise") + if pa_type.unit != "ns": # GH51175: test_from_sequence_of_strings_pa_array # attempt to parse as int64 reflecting pyarrow's # duration to string casting behavior mask = isna(scalars) - if not isinstance(strings, (pa.Array, pa.ChunkedArray)): - strings = pa.array(strings, type=pa.string()) + if isinstance(strings, cls): + strings = strings._pa_array + elif not isinstance(strings, (pa.Array, pa.ChunkedArray)): + strings = pa.array(strings, type=pa.string(), mask=mask) strings = pc.if_else(mask, None, strings) try: scalars = strings.cast(pa.int64()) except pa.ArrowInvalid: pass + elif pa.types.is_time(pa_type): from pandas.core.tools.times import to_time @@ -373,7 +393,7 @@ def _from_sequence_of_strings( if isinstance(strings, (pa.Array, pa.ChunkedArray)): scalars = strings else: - scalars = pa.array(strings, type=pa.string()) + scalars = pa.array(strings, type=pa.string(), mask=mask) scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars) scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars) scalars = scalars.cast(pa.bool_()) @@ -385,12 +405,16 @@ def _from_sequence_of_strings( from pandas.core.tools.numeric import to_numeric scalars = to_numeric(strings, errors="raise") - if not pa.types.is_decimal(pa_type): + if not pa.types.is_decimal(pa_type) and isinstance( + strings, (pa.Array, pa.ChunkedArray) + ): # TODO: figure out why doing this cast breaks with decimal dtype # in test_from_sequence_of_strings_pa_array mask = strings.is_null() scalars = pa.array(scalars, mask=np.array(mask), type=pa_type) # TODO: could we just do strings.cast(pa_type)? + elif mask is not None: + scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) else: raise NotImplementedError( @@ -544,23 +568,20 @@ def _box_pa_array( return pa_array mask = None - if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM": - # similar to isna(value) but exclude NaN - # TODO: cythonize! - mask = np.array([x is NA or x is None for x in value], dtype=bool) - - from_pandas = False - if pa.types.is_integer(pa_type): - # If user specifically asks to cast a numpy float array with NaNs - # to pyarrow integer, we'll treat those NaNs as NA - from_pandas = True + if getattr(value, "dtype", None) is None or value.dtype.kind not in "mMf": + try: + arr_value = np.asarray(value) + except ValueError: + # e.g. list dtype with mixed-length lists + arr_value = np.asarray(value, dtype=object) + # similar to isna(value) but exclude NaN, NaT, nat-like, nan-like + mask = is_pdna_or_none(arr_value) + try: - pa_array = pa.array( - value, type=pa_type, mask=mask, from_pandas=from_pandas - ) + pa_array = pa.array(value, type=pa_type, mask=mask) except (pa.ArrowInvalid, pa.ArrowTypeError): # GH50430: let pyarrow infer type, then cast - pa_array = pa.array(value, mask=mask, from_pandas=from_pandas) + pa_array = pa.array(value, mask=mask) if pa_type is None and pa.types.is_duration(pa_array.type): # Workaround https://github.com/apache/arrow/issues/37291 @@ -1496,7 +1517,11 @@ def to_numpy( pa.types.is_floating(pa_type) and ( na_value is np.nan - or (original_na_value is lib.no_default and is_float_dtype(dtype)) + or ( + original_na_value is lib.no_default + and is_float_dtype(dtype) + and not using_pyarrow_strict_nans() + ) ) ): result = data._pa_array.to_numpy() @@ -2369,6 +2394,7 @@ def _replace_with_mask( replacements = np.array(replacements, dtype=object) elif isinstance(replacements, pa.Scalar): replacements = replacements.as_py() + result = np.array(values, dtype=object) result[mask] = replacements return pa.array(result, type=values.type) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d0048e122051a..959a2acc8601f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -778,6 +778,9 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) + # if dtype.kind == "U": + # dtype = np.dtype(object) + # return self.to_numpy(dtype=dtype, copy=copy) if not copy: return np.asarray(self, dtype=dtype) else: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e7a6b207363c3..6438a967eae42 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -484,7 +484,9 @@ def to_numpy( array([ True, False, False]) """ hasna = self._hasna - dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) + dtype, na_value = to_numpy_dtype_inference( + self, dtype, na_value, hasna, is_pyarrow=False + ) if dtype is None: dtype = object diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 20fe8cbab1c9f..6e2ecae593d99 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -427,6 +427,15 @@ def is_terminal() -> bool: validator=is_one_of_factory([True, False, "warn"]), ) +with cf.config_prefix("mode"): + cf.register_option( + "pyarrow_strict_nans", + True, + # TODO: Change this to False before merging + "Whether to make ArrowDtype arrays consistently treat NaN as distinct from NA", + validator=is_one_of_factory([True, False]), + ) + # user warnings chained_assignment = """ diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 14b65a56f8c05..d16c11265e1ca 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -32,6 +32,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_strict_nans + from pandas._libs import lib from pandas._libs.tslibs import timezones from pandas.compat import ( @@ -721,7 +723,10 @@ def test_EA_types(self, engine, data, dtype_backend, request): pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") ) df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) - csv_output = df.to_csv(index=False, na_rep=np.nan) # should be NA? + if using_pyarrow_strict_nans(): + csv_output = df.to_csv(index=False, na_rep="NA") + else: + csv_output = df.to_csv(index=False, na_rep=np.nan) if pa.types.is_binary(pa_dtype): csv_output = BytesIO(csv_output) else: @@ -1512,7 +1517,8 @@ def test_pickle_roundtrip(data): def test_astype_from_non_pyarrow(data): # GH49795 - pd_array = data._pa_array.to_pandas().array + np_arr = data.to_numpy() + pd_array = pd.array(np_arr, dtype=np_arr.dtype) result = pd_array.astype(data.dtype) assert not isinstance(pd_array.dtype, ArrowDtype) assert isinstance(result.dtype, ArrowDtype) @@ -1546,7 +1552,9 @@ def test_to_numpy_with_defaults(data): else: expected = np.array(data._pa_array) - if data._hasna and not is_numeric_dtype(data.dtype): + if data._hasna and ( + not is_numeric_dtype(data.dtype) or using_pyarrow_strict_nans() + ): expected = expected.astype(object) expected[pd.isna(data)] = pd.NA @@ -2868,7 +2876,7 @@ def test_dt_components(): ) result = ser.dt.components expected = pd.DataFrame( - [[1, 0, 0, 2, 0, 3, 4], [None, None, None, None, None, None, None]], + [[1, 0, 0, 2, 0, 3, 4], [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA]], columns=[ "days", "hours", @@ -2893,7 +2901,10 @@ def test_dt_components_large_values(): ) result = ser.dt.components expected = pd.DataFrame( - [[365, 23, 59, 59, 999, 0, 0], [None, None, None, None, None, None, None]], + [ + [365, 23, 59, 59, 999, 0, 0], + [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + ], columns=[ "days", "hours", From f47c746a770dbd2794801c4c90f2a5f2389e452a Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jul 2025 10:17:51 -0700 Subject: [PATCH 07/14] Fix rank, json tests --- pandas/io/json/_json.py | 14 +++++++++++++ pandas/tests/extension/test_arrow.py | 5 ++++- pandas/tests/series/methods/test_rank.py | 25 ++++++++++++++++++++---- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 6b4f6c05c3123..f8170dd843793 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -994,6 +994,13 @@ def _read_ujson(self) -> DataFrame | Series: else: obj = self._get_object_parser(self.data) if self.dtype_backend is not lib.no_default: + if self.dtype_backend == "pyarrow": + # The construction above takes "null" to NaN, which we want to + # convert to NA. But .convert_dtypes to pyarrow doesn't allow + # that, so we do a 2-step conversion through numpy-nullable. + obj = obj.convert_dtypes( + infer_objects=False, dtype_backend="numpy_nullable" + ) return obj.convert_dtypes( infer_objects=False, dtype_backend=self.dtype_backend ) @@ -1071,6 +1078,13 @@ def __next__(self) -> DataFrame | Series: raise ex if self.dtype_backend is not lib.no_default: + if self.dtype_backend == "pyarrow": + # The construction above takes "null" to NaN, which we want to + # convert to NA. But .convert_dtypes to pyarrow doesn't allow + # that, so we do a 2-step conversion through numpy-nullable. + obj = obj.convert_dtypes( + infer_objects=False, dtype_backend="numpy_nullable" + ) return obj.convert_dtypes( infer_objects=False, dtype_backend=self.dtype_backend ) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d16c11265e1ca..482754a9b5f18 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -285,7 +285,10 @@ def test_map(self, data_missing, na_action): tm.assert_numpy_array_equal(result, expected) else: result = data_missing.map(lambda x: x, na_action=na_action) - if data_missing.dtype == "float32[pyarrow]": + if ( + data_missing.dtype == "float32[pyarrow]" + and not using_pyarrow_strict_nans() + ): # map roundtrips through objects, which converts to float64 expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) else: diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 8363ba118d4d3..85b99c87e7cc8 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -271,7 +271,12 @@ def test_rank_signature(self): def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results - if dtype == "int64" or (not using_infer_string and dtype == "str"): + if ( + dtype == "int64" + or dtype == "int64[pyarrow]" + or dtype == "uint64[pyarrow]" + or (not using_infer_string and dtype == "str") + ): pytest.skip("int64/str does not support NaN") ser = ser if dtype is None else ser.astype(dtype) @@ -283,7 +288,15 @@ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): exp[np.isnan(ser)] = 9.5 elif method == "dense": exp[np.isnan(ser)] = 6 - tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) + elif method == "max": + exp[np.isnan(ser)] = 10 + elif method == "min": + exp[np.isnan(ser)] = 9 + elif method == "first": + exp[np.isnan(ser)] = [9, 10] + + expected = Series(exp, dtype=expected_dtype(dtype, method)) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) @pytest.mark.parametrize( @@ -395,8 +408,12 @@ def test_rank_dense_method(self, dtype, ser, exp): def test_rank_descending(self, ser, results, dtype, using_infer_string): method, _ = results - if dtype == "int64" or (not using_infer_string and dtype == "str"): - s = ser.dropna() + if ( + dtype == "int64" + or dtype == "int64[pyarrow]" + or (not using_infer_string and dtype == "str") + ): + s = ser.dropna().astype(dtype) else: s = ser.astype(dtype) From 083f7057dd74a271c0c36dadd1c901bda22416f1 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jul 2025 10:33:51 -0700 Subject: [PATCH 08/14] CLN: remove outdated --- pandas/core/arrays/arrow/array.py | 12 ++---------- pandas/core/arrays/base.py | 3 --- pandas/tests/extension/base/setitem.py | 1 - 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a81f69fc314aa..6c4aba95c0c04 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -350,13 +350,7 @@ def _from_sequence_of_strings( scalars = to_datetime(strings, errors="raise").date - if isinstance(strings, cls): - # Avoid an object path - # TODO: this assumes that pyarrows str->date casting is the - # same as to_datetime. Is that a fair assumption? - scalars = strings._pa_array.cast(pa_type) - else: - scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) + scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) elif pa.types.is_duration(pa_type): from pandas.core.tools.timedeltas import to_timedelta @@ -368,9 +362,7 @@ def _from_sequence_of_strings( # attempt to parse as int64 reflecting pyarrow's # duration to string casting behavior mask = isna(scalars) - if isinstance(strings, cls): - strings = strings._pa_array - elif not isinstance(strings, (pa.Array, pa.ChunkedArray)): + if not isinstance(strings, (pa.Array, pa.ChunkedArray)): strings = pa.array(strings, type=pa.string(), mask=mask) strings = pc.if_else(mask, None, strings) try: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 959a2acc8601f..d0048e122051a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -778,9 +778,6 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) - # if dtype.kind == "U": - # dtype = np.dtype(object) - # return self.to_numpy(dtype=dtype, copy=copy) if not copy: return np.asarray(self, dtype=dtype) else: diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 99ab5d2f7e86f..185d6d750cace 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -422,7 +422,6 @@ def test_setitem_frame_2d_values(self, data): df.iloc[:-1] = df.iloc[:-1].copy() tm.assert_frame_equal(df, orig) - # FIXME: Breaks for pyarrow float dtype bc df.values changes NAs to NaN df.iloc[:] = df.values tm.assert_frame_equal(df, orig) From a34020325cb8c90750cc7d7c7c0f7d8b331b0172 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jul 2025 14:12:03 -0700 Subject: [PATCH 09/14] Fix where kludge --- pandas/core/arrays/arrow/array.py | 2 ++ pandas/core/generic.py | 17 ----------------- pandas/tests/extension/test_arrow.py | 7 +++++-- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6c4aba95c0c04..568859d8416c7 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -405,6 +405,8 @@ def _from_sequence_of_strings( mask = strings.is_null() scalars = pa.array(scalars, mask=np.array(mask), type=pa_type) # TODO: could we just do strings.cast(pa_type)? + elif isinstance(strings, (pa.Array, pa.ChunkedArray)): + scalars = strings.cast(pa_type) elif mask is not None: scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4a788638bae45..8c471e0f5ece7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10031,23 +10031,6 @@ def where( stacklevel=2, ) - if other is lib.no_default: - if self.ndim == 1: - if isinstance(self.dtype, ExtensionDtype): - other = self.dtype.na_value - else: - other = np.nan - else: - if self._mgr.nblocks == 1 and isinstance( - self._mgr.blocks[0].values.dtype, ExtensionDtype - ): - # FIXME: checking this is kludgy! - other = self._mgr.blocks[0].values.dtype.na_value - else: - # FIXME: the same problem we had with Series will now - # show up column-by-column! - other = np.nan - other = common.apply_if_callable(other, self) return self._where(cond, other, inplace=inplace, axis=axis, level=level) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 482754a9b5f18..229c0c8070a4f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1569,8 +1569,11 @@ def test_to_numpy_int_with_na(): data = [1, None] arr = pd.array(data, dtype="int64[pyarrow]") result = arr.to_numpy() - expected = np.array([1, np.nan]) - assert isinstance(result[0], float) + if using_pyarrow_strict_nans(): + expected = np.array([1, pd.NA], dtype=object) + else: + expected = np.array([1, np.nan]) + assert isinstance(result[0], float) tm.assert_numpy_array_equal(result, expected) From 587e53f739d88267e6cafd7f4a9c37fde9d738d7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jul 2025 14:21:09 -0700 Subject: [PATCH 10/14] update tests --- pandas/tests/extension/test_arrow.py | 5 ++++- pandas/tests/frame/methods/test_convert_dtypes.py | 6 +++++- pandas/tests/groupby/methods/test_kurt.py | 2 +- pandas/tests/tools/test_to_numeric.py | 2 +- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 229c0c8070a4f..49a60c917ece0 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3537,7 +3537,10 @@ def test_cast_dictionary_different_value_dtype(arrow_type): def test_map_numeric_na_action(): ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") result = ser.map(lambda x: 42, na_action="ignore") - expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + if using_pyarrow_strict_nans(): + expected = pd.Series([42.0, 42.0, pd.NA], dtype="object") + else: + expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index ab847e2f8e81e..21f7811100d43 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_strict_nans + import pandas.util._test_decorators as td import pandas as pd @@ -73,6 +75,8 @@ def test_pyarrow_dtype_backend(self): } ) result = df.convert_dtypes(dtype_backend="pyarrow") + + item = None if not using_pyarrow_strict_nans() else np.nan expected = pd.DataFrame( { "a": pd.arrays.ArrowExtensionArray( @@ -80,7 +84,7 @@ def test_pyarrow_dtype_backend(self): ), "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])), "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])), - "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])), + "d": pd.arrays.ArrowExtensionArray(pa.array([item, 100.5, 200.0])), "e": pd.arrays.ArrowExtensionArray( pa.array( [ diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py index 21b7c50c3c5aa..7aac23c2147fb 100644 --- a/pandas/tests/groupby/methods/test_kurt.py +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -43,7 +43,7 @@ def test_groupby_kurt_arrow_float64(dtype): # Test groupby.kurt() with float64[pyarrow] and Float64 dtypes df = pd.DataFrame( { - "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], + "x": [1.0, pd.NA, 3.2, 4.8, 2.3, 1.9, 8.9], "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0], }, dtype=dtype, diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 893f526fb3eb0..e3471c2e3ac0d 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -898,7 +898,7 @@ def test_to_numeric_dtype_backend_error(dtype_backend): dtype = "double[pyarrow]" else: dtype = "Float64" - expected = Series([np.nan, np.nan, np.nan], dtype=dtype) + expected = Series([pd.NA, pd.NA, pd.NA], dtype=dtype) tm.assert_series_equal(result, expected) From 734465c3d59748b60be39d451d7e3d6d9c39b403 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jul 2025 14:32:52 -0700 Subject: [PATCH 11/14] Fix remaining tests --- pandas/core/arrays/base.py | 8 ++++++++ pandas/tests/io/test_stata.py | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d0048e122051a..52be1a76363d6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2539,6 +2539,14 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): if result is not NotImplemented: return result + # TODO: putting this here is hacky as heck + if self.dtype == "float64[pyarrow]": + # e.g. test_log_arrow_backed_missing_value + new_inputs = [ + x if x is not self else x.to_numpy(na_value=np.nan) for x in inputs + ] + return getattr(ufunc, method)(*new_inputs, **kwargs) + return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) def map(self, mapper, na_action: Literal["ignore"] | None = None): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 90fda2c10962b..3ebf4416f7289 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2056,9 +2056,10 @@ def test_writer_118_exceptions(self, temp_file): ["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], ) def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path): + dtype = "Int64" if dtype_backend == "numpy_nullable" else "int64[pyarrow]" df = DataFrame( { - "a": [1, 2, None], + "a": pd.array([1, 2, None], dtype=dtype), "b": ["a", "b", "c"], "c": [True, False, None], "d": [1.5, 2.5, 3.5], From d2aeeff8b8ad29842de22ee6b661771cf23dd115 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jul 2025 07:49:33 -0700 Subject: [PATCH 12/14] mypy fixup --- pandas/_libs/missing.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index 6c76fe49330b6..64256ae4b36ad 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -14,4 +14,4 @@ def isneginf_scalar(val: object) -> bool: ... def checknull(val: object) -> bool: ... def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... -def is_pdna_or_none(values: npt.ndarray) -> npt.NDArray[np.bool_]: ... +def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ... From 73a95d2ce3ff4f7891389d85bc0c2496091855fa Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jul 2025 12:59:54 -0700 Subject: [PATCH 13/14] old-numpy compat --- pandas/core/arrays/arrow/array.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 568859d8416c7..83b4f6517a3a5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -562,12 +562,8 @@ def _box_pa_array( return pa_array mask = None - if getattr(value, "dtype", None) is None or value.dtype.kind not in "mMf": - try: - arr_value = np.asarray(value) - except ValueError: - # e.g. list dtype with mixed-length lists - arr_value = np.asarray(value, dtype=object) + if getattr(value, "dtype", None) is None or value.dtype.kind not in "iumMf": + arr_value = np.asarray(value, dtype=object) # similar to isna(value) but exclude NaN, NaT, nat-like, nan-like mask = is_pdna_or_none(arr_value) From ce28027b3b7eca6bae1f138e72257795860c63e4 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jul 2025 15:45:04 -0700 Subject: [PATCH 14/14] simplify --- pandas/core/arrays/arrow/array.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 83b4f6517a3a5..90c9a38b43b1d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -369,7 +369,6 @@ def _from_sequence_of_strings( scalars = strings.cast(pa.int64()) except pa.ArrowInvalid: pass - elif pa.types.is_time(pa_type): from pandas.core.tools.times import to_time @@ -397,18 +396,10 @@ def _from_sequence_of_strings( from pandas.core.tools.numeric import to_numeric scalars = to_numeric(strings, errors="raise") - if not pa.types.is_decimal(pa_type) and isinstance( - strings, (pa.Array, pa.ChunkedArray) - ): - # TODO: figure out why doing this cast breaks with decimal dtype - # in test_from_sequence_of_strings_pa_array - mask = strings.is_null() - scalars = pa.array(scalars, mask=np.array(mask), type=pa_type) - # TODO: could we just do strings.cast(pa_type)? - elif isinstance(strings, (pa.Array, pa.ChunkedArray)): + if isinstance(strings, (pa.Array, pa.ChunkedArray)): scalars = strings.cast(pa_type) elif mask is not None: - scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) + scalars = pa.array(scalars, mask=mask, type=pa_type) else: raise NotImplementedError(