Skip to content

Commit ca6e8e8

Browse files
committed
Down to 40 failing tests
1 parent d257666 commit ca6e8e8

File tree

9 files changed

+118
-30
lines changed

9 files changed

+118
-30
lines changed

pandas/_config/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,8 @@
3333
def using_string_dtype() -> bool:
3434
_mode_options = _global_config["future"]
3535
return _mode_options["infer_string"]
36+
37+
38+
def using_pyarrow_strict_nans() -> bool:
39+
_mode_options = _global_config["mode"]
40+
return _mode_options["pyarrow_strict_nans"]

pandas/_libs/missing.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ...
1414
def checknull(val: object) -> bool: ...
1515
def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
1616
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
17+
def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...

pandas/_libs/missing.pyx

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj):
249249
return checknull_with_nat(obj) or obj is C_NA
250250

251251

252+
@cython.wraparound(False)
253+
@cython.boundscheck(False)
254+
def is_pdna_or_none(values: ndarray) -> ndarray:
255+
cdef:
256+
ndarray[uint8_t] result
257+
Py_ssize_t i, N
258+
object val
259+
260+
N = len(values)
261+
result = np.zeros(N, dtype=np.uint8)
262+
263+
for i in range(N):
264+
val = values[i]
265+
if val is None or val is C_NA:
266+
result[i] = True
267+
return result.view(bool)
268+
269+
252270
@cython.wraparound(False)
253271
@cython.boundscheck(False)
254272
def is_numeric_na(values: ndarray) -> ndarray:

pandas/core/arrays/_utils.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77

88
import numpy as np
99

10+
from pandas._config import using_pyarrow_strict_nans
11+
1012
from pandas._libs import lib
13+
from pandas._libs.missing import NA
1114
from pandas.errors import LossySetitemError
1215

1316
from pandas.core.dtypes.cast import np_can_hold_element
@@ -21,7 +24,11 @@
2124

2225

2326
def to_numpy_dtype_inference(
24-
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
27+
arr: ArrayLike,
28+
dtype: npt.DTypeLike | None,
29+
na_value,
30+
hasna: bool,
31+
is_pyarrow: bool = True,
2532
) -> tuple[npt.DTypeLike, Any]:
2633
if dtype is None and is_numeric_dtype(arr.dtype):
2734
dtype_given = False
@@ -34,7 +41,11 @@ def to_numpy_dtype_inference(
3441
else:
3542
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
3643
if na_value is lib.no_default:
37-
na_value = np.nan
44+
if is_pyarrow and using_pyarrow_strict_nans():
45+
na_value = NA
46+
dtype = np.dtype(object)
47+
else:
48+
na_value = np.nan
3849
else:
3950
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
4051
elif dtype is not None:

pandas/core/arrays/arrow/array.py

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515

1616
import numpy as np
1717

18+
from pandas._config import using_pyarrow_strict_nans
19+
1820
from pandas._libs import lib
19-
from pandas._libs.missing import NA
21+
from pandas._libs.missing import is_pdna_or_none
2022
from pandas._libs.tslibs import (
2123
Timedelta,
2224
Timestamp,
@@ -324,6 +326,11 @@ def _from_sequence_of_strings(
324326
"""
325327
Construct a new ExtensionArray from a sequence of strings.
326328
"""
329+
mask = isna(strings)
330+
331+
if isinstance(strings, cls):
332+
strings = strings._pa_array
333+
327334
pa_type = to_pyarrow_type(dtype)
328335
if (
329336
pa_type is None
@@ -342,22 +349,35 @@ def _from_sequence_of_strings(
342349
from pandas.core.tools.datetimes import to_datetime
343350

344351
scalars = to_datetime(strings, errors="raise").date
352+
353+
if isinstance(strings, cls):
354+
# Avoid an object path
355+
# TODO: this assumes that pyarrows str->date casting is the
356+
# same as to_datetime. Is that a fair assumption?
357+
scalars = strings._pa_array.cast(pa_type)
358+
else:
359+
scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)
360+
345361
elif pa.types.is_duration(pa_type):
346362
from pandas.core.tools.timedeltas import to_timedelta
347363

348364
scalars = to_timedelta(strings, errors="raise")
365+
349366
if pa_type.unit != "ns":
350367
# GH51175: test_from_sequence_of_strings_pa_array
351368
# attempt to parse as int64 reflecting pyarrow's
352369
# duration to string casting behavior
353370
mask = isna(scalars)
354-
if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
355-
strings = pa.array(strings, type=pa.string())
371+
if isinstance(strings, cls):
372+
strings = strings._pa_array
373+
elif not isinstance(strings, (pa.Array, pa.ChunkedArray)):
374+
strings = pa.array(strings, type=pa.string(), mask=mask)
356375
strings = pc.if_else(mask, None, strings)
357376
try:
358377
scalars = strings.cast(pa.int64())
359378
except pa.ArrowInvalid:
360379
pass
380+
361381
elif pa.types.is_time(pa_type):
362382
from pandas.core.tools.times import to_time
363383

@@ -373,7 +393,7 @@ def _from_sequence_of_strings(
373393
if isinstance(strings, (pa.Array, pa.ChunkedArray)):
374394
scalars = strings
375395
else:
376-
scalars = pa.array(strings, type=pa.string())
396+
scalars = pa.array(strings, type=pa.string(), mask=mask)
377397
scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
378398
scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
379399
scalars = scalars.cast(pa.bool_())
@@ -385,12 +405,16 @@ def _from_sequence_of_strings(
385405
from pandas.core.tools.numeric import to_numeric
386406

387407
scalars = to_numeric(strings, errors="raise")
388-
if not pa.types.is_decimal(pa_type):
408+
if not pa.types.is_decimal(pa_type) and isinstance(
409+
strings, (pa.Array, pa.ChunkedArray)
410+
):
389411
# TODO: figure out why doing this cast breaks with decimal dtype
390412
# in test_from_sequence_of_strings_pa_array
391413
mask = strings.is_null()
392414
scalars = pa.array(scalars, mask=np.array(mask), type=pa_type)
393415
# TODO: could we just do strings.cast(pa_type)?
416+
elif mask is not None:
417+
scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)
394418

395419
else:
396420
raise NotImplementedError(
@@ -544,23 +568,20 @@ def _box_pa_array(
544568
return pa_array
545569

546570
mask = None
547-
if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM":
548-
# similar to isna(value) but exclude NaN
549-
# TODO: cythonize!
550-
mask = np.array([x is NA or x is None for x in value], dtype=bool)
551-
552-
from_pandas = False
553-
if pa.types.is_integer(pa_type):
554-
# If user specifically asks to cast a numpy float array with NaNs
555-
# to pyarrow integer, we'll treat those NaNs as NA
556-
from_pandas = True
571+
if getattr(value, "dtype", None) is None or value.dtype.kind not in "mMf":
572+
try:
573+
arr_value = np.asarray(value)
574+
except ValueError:
575+
# e.g. list dtype with mixed-length lists
576+
arr_value = np.asarray(value, dtype=object)
577+
# similar to isna(value) but exclude NaN, NaT, nat-like, nan-like
578+
mask = is_pdna_or_none(arr_value)
579+
557580
try:
558-
pa_array = pa.array(
559-
value, type=pa_type, mask=mask, from_pandas=from_pandas
560-
)
581+
pa_array = pa.array(value, type=pa_type, mask=mask)
561582
except (pa.ArrowInvalid, pa.ArrowTypeError):
562583
# GH50430: let pyarrow infer type, then cast
563-
pa_array = pa.array(value, mask=mask, from_pandas=from_pandas)
584+
pa_array = pa.array(value, mask=mask)
564585

565586
if pa_type is None and pa.types.is_duration(pa_array.type):
566587
# Workaround https://github.com/apache/arrow/issues/37291
@@ -1496,7 +1517,11 @@ def to_numpy(
14961517
pa.types.is_floating(pa_type)
14971518
and (
14981519
na_value is np.nan
1499-
or (original_na_value is lib.no_default and is_float_dtype(dtype))
1520+
or (
1521+
original_na_value is lib.no_default
1522+
and is_float_dtype(dtype)
1523+
and not using_pyarrow_strict_nans()
1524+
)
15001525
)
15011526
):
15021527
result = data._pa_array.to_numpy()
@@ -1964,8 +1989,10 @@ def _explode(self):
19641989
fill_value = pa.scalar([None], type=self._pa_array.type)
19651990
mask = counts == 0
19661991
if mask.any():
1967-
values = values.copy()
1968-
values[mask] = fill_value
1992+
# pc.if_else here is similar to `values[mask] = fill_value`
1993+
# but this avoids a object-dtype round-trip.
1994+
pa_values = pc.if_else(~mask, values._pa_array, fill_value)
1995+
values = type(self)(pa_values)
19691996
counts = counts.copy()
19701997
counts[mask] = 1
19711998
values = values.fillna(fill_value)
@@ -2367,6 +2394,7 @@ def _replace_with_mask(
23672394
replacements = np.array(replacements, dtype=object)
23682395
elif isinstance(replacements, pa.Scalar):
23692396
replacements = replacements.as_py()
2397+
23702398
result = np.array(values, dtype=object)
23712399
result[mask] = replacements
23722400
return pa.array(result, type=values.type)

pandas/core/arrays/base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -778,6 +778,9 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
778778

779779
return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy)
780780

781+
# if dtype.kind == "U":
782+
# dtype = np.dtype(object)
783+
# return self.to_numpy(dtype=dtype, copy=copy)
781784
if not copy:
782785
return np.asarray(self, dtype=dtype)
783786
else:

pandas/core/arrays/masked.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,9 @@ def to_numpy(
484484
array([ True, False, False])
485485
"""
486486
hasna = self._hasna
487-
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna)
487+
dtype, na_value = to_numpy_dtype_inference(
488+
self, dtype, na_value, hasna, is_pyarrow=False
489+
)
488490
if dtype is None:
489491
dtype = object
490492

pandas/core/config_init.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,15 @@ def is_terminal() -> bool:
427427
validator=is_one_of_factory([True, False, "warn"]),
428428
)
429429

430+
with cf.config_prefix("mode"):
431+
cf.register_option(
432+
"pyarrow_strict_nans",
433+
True,
434+
# TODO: Change this to False before merging
435+
"Whether to make ArrowDtype arrays consistently treat NaN as distinct from NA",
436+
validator=is_one_of_factory([True, False]),
437+
)
438+
430439

431440
# user warnings
432441
chained_assignment = """

pandas/tests/extension/test_arrow.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import numpy as np
3333
import pytest
3434

35+
from pandas._config import using_pyarrow_strict_nans
36+
3537
from pandas._libs import lib
3638
from pandas._libs.tslibs import timezones
3739
from pandas.compat import (
@@ -717,7 +719,10 @@ def test_EA_types(self, engine, data, dtype_backend, request):
717719
pytest.mark.xfail(reason="CSV parsers don't correctly handle binary")
718720
)
719721
df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
720-
csv_output = df.to_csv(index=False, na_rep=np.nan) # should be NA?
722+
if using_pyarrow_strict_nans():
723+
csv_output = df.to_csv(index=False, na_rep="NA")
724+
else:
725+
csv_output = df.to_csv(index=False, na_rep=np.nan)
721726
if pa.types.is_binary(pa_dtype):
722727
csv_output = BytesIO(csv_output)
723728
else:
@@ -1508,7 +1513,8 @@ def test_pickle_roundtrip(data):
15081513

15091514
def test_astype_from_non_pyarrow(data):
15101515
# GH49795
1511-
pd_array = data._pa_array.to_pandas().array
1516+
np_arr = data.to_numpy()
1517+
pd_array = pd.array(np_arr, dtype=np_arr.dtype)
15121518
result = pd_array.astype(data.dtype)
15131519
assert not isinstance(pd_array.dtype, ArrowDtype)
15141520
assert isinstance(result.dtype, ArrowDtype)
@@ -1542,7 +1548,9 @@ def test_to_numpy_with_defaults(data):
15421548
else:
15431549
expected = np.array(data._pa_array)
15441550

1545-
if data._hasna and not is_numeric_dtype(data.dtype):
1551+
if data._hasna and (
1552+
not is_numeric_dtype(data.dtype) or using_pyarrow_strict_nans()
1553+
):
15461554
expected = expected.astype(object)
15471555
expected[pd.isna(data)] = pd.NA
15481556

@@ -2864,7 +2872,7 @@ def test_dt_components():
28642872
)
28652873
result = ser.dt.components
28662874
expected = pd.DataFrame(
2867-
[[1, 0, 0, 2, 0, 3, 4], [None, None, None, None, None, None, None]],
2875+
[[1, 0, 0, 2, 0, 3, 4], [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA]],
28682876
columns=[
28692877
"days",
28702878
"hours",
@@ -2889,7 +2897,10 @@ def test_dt_components_large_values():
28892897
)
28902898
result = ser.dt.components
28912899
expected = pd.DataFrame(
2892-
[[365, 23, 59, 59, 999, 0, 0], [None, None, None, None, None, None, None]],
2900+
[
2901+
[365, 23, 59, 59, 999, 0, 0],
2902+
[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
2903+
],
28932904
columns=[
28942905
"days",
28952906
"hours",

0 commit comments

Comments
 (0)