Skip to content

Commit cb30719

Browse files
committed
POC: PDEP16 default to masked nullable dtypes
1 parent ebca3c5 commit cb30719

File tree

16 files changed

+145
-92
lines changed

16 files changed

+145
-92
lines changed

pandas/_testing/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@
8989
NpDtype,
9090
)
9191

92+
# Alias so we can update old `assert obj.dtype == np_dtype` checks to PDEP16
93+
# behavior.
94+
to_dtype = pd.core.dtypes.common.pandas_dtype
9295

9396
UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
9497
UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]

pandas/_testing/asserters.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -323,13 +323,19 @@ def _check_types(left, right, obj: str = "Index") -> None:
323323
elif check_exact and check_categorical:
324324
if not left.equals(right):
325325
mismatch = left._values != right._values
326+
if isinstance(left, RangeIndex) and not mismatch.any():
327+
# TODO: probably need to fix RangeIndex.equals?
328+
pass
329+
elif isinstance(right, RangeIndex) and not mismatch.any():
330+
# TODO: probably need to fix some other equals method?
331+
pass
332+
else:
333+
if not isinstance(mismatch, np.ndarray):
334+
mismatch = cast("ExtensionArray", mismatch).fillna(True)
326335

327-
if not isinstance(mismatch, np.ndarray):
328-
mismatch = cast("ExtensionArray", mismatch).fillna(True)
329-
330-
diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)
331-
msg = f"{obj} values are different ({np.round(diff, 5)} %)"
332-
raise_assert_detail(obj, msg, left, right)
336+
diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)
337+
msg = f"{obj} values are different ({np.round(diff, 5)} %)"
338+
raise_assert_detail(obj, msg, left, right)
333339
else:
334340
# if we have "equiv", this becomes True
335341
exact_bool = bool(exact)

pandas/core/arrays/_mixins.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike:
127127
# pass those through to the underlying ndarray
128128
return self._ndarray.view(dtype)
129129

130-
dtype = pandas_dtype(dtype)
130+
dtype = pandas_dtype(dtype, allow_numpy_dtypes=True)
131131
arr = self._ndarray
132132

133133
if isinstance(dtype, PeriodDtype):

pandas/core/arrays/string_.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -837,7 +837,7 @@ def astype(self, dtype, copy: bool = True):
837837
arr_ea = self.copy()
838838
mask = self.isna()
839839
arr_ea[mask] = "0"
840-
values = arr_ea.astype(dtype.numpy_dtype)
840+
values = arr_ea.to_numpy(dtype=dtype.numpy_dtype)
841841
return FloatingArray(values, mask, copy=False)
842842
elif isinstance(dtype, ExtensionDtype):
843843
# Skip the NumpyExtensionArray.astype method

pandas/core/config_init.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,15 @@ def is_terminal() -> bool:
427427
validator=is_one_of_factory([True, False, "warn"]),
428428
)
429429

430+
with cf.config_prefix("mode"):
431+
cf.register_option(
432+
"pdep16_data_types",
433+
True,
434+
"Whether to default to numpy-nullable dtypes for integer, float, "
435+
"and bool dtypes",
436+
validator=is_one_of_factory([True, False]),
437+
)
438+
430439

431440
# user warnings
432441
chained_assignment = """

pandas/core/construction.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616
import numpy as np
1717
from numpy import ma
1818

19-
from pandas._config import using_string_dtype
19+
from pandas._config import (
20+
get_option,
21+
using_string_dtype,
22+
)
2023

2124
from pandas._libs import lib
2225
from pandas._libs.tslibs import (
@@ -612,7 +615,9 @@ def sanitize_array(
612615
if dtype is None:
613616
subarr = data
614617
if data.dtype == object and infer_object:
615-
subarr = maybe_infer_to_datetimelike(data)
618+
subarr = maybe_infer_to_datetimelike(
619+
data, convert_to_nullable_dtype=get_option("mode.pdep16_data_types")
620+
)
616621
elif data.dtype.kind == "U" and using_string_dtype():
617622
from pandas.core.arrays.string_ import StringDtype
618623

@@ -659,7 +664,10 @@ def sanitize_array(
659664
subarr = maybe_convert_platform(data)
660665
if subarr.dtype == object:
661666
subarr = cast(np.ndarray, subarr)
662-
subarr = maybe_infer_to_datetimelike(subarr)
667+
subarr = maybe_infer_to_datetimelike(
668+
subarr,
669+
convert_to_nullable_dtype=get_option("mode.pdep16_data_types"),
670+
)
663671

664672
subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
665673

pandas/core/dtypes/cast.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@
1818

1919
import numpy as np
2020

21-
from pandas._config import using_string_dtype
21+
from pandas._config import (
22+
get_option,
23+
using_string_dtype,
24+
)
2225

2326
from pandas._libs import (
2427
Interval,
@@ -135,7 +138,9 @@ def maybe_convert_platform(
135138

136139
if arr.dtype == _dtype_obj:
137140
arr = cast(np.ndarray, arr)
138-
arr = lib.maybe_convert_objects(arr)
141+
arr = lib.maybe_convert_objects(
142+
arr, convert_to_nullable_dtype=get_option("mode.pdep16_data_types")
143+
)
139144

140145
return arr
141146

pandas/core/dtypes/common.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@
1212

1313
import numpy as np
1414

15-
from pandas._config import using_string_dtype
15+
from pandas._config import (
16+
get_option,
17+
using_string_dtype,
18+
)
1619

1720
from pandas._libs import (
1821
Interval,
@@ -1793,14 +1796,36 @@ def validate_all_hashable(*args, error_name: str | None = None) -> None:
17931796
raise TypeError("All elements must be hashable")
17941797

17951798

1796-
def pandas_dtype(dtype) -> DtypeObj:
1799+
def _map_np_dtype(dtype: np.dtype) -> DtypeObj:
1800+
if dtype.kind in "iu":
1801+
from pandas.core.arrays.integer import NUMPY_INT_TO_DTYPE
1802+
1803+
return NUMPY_INT_TO_DTYPE[dtype]
1804+
elif dtype.kind == "f":
1805+
from pandas.core.arrays.floating import NUMPY_FLOAT_TO_DTYPE
1806+
1807+
if dtype.itemsize != 2:
1808+
# TODO: What do we do for float16? float128?
1809+
return NUMPY_FLOAT_TO_DTYPE[dtype]
1810+
1811+
elif dtype.kind == "b":
1812+
from pandas import BooleanDtype
1813+
1814+
return BooleanDtype()
1815+
1816+
return dtype
1817+
1818+
1819+
def pandas_dtype(dtype, allow_numpy_dtypes: bool = False) -> DtypeObj:
17971820
"""
17981821
Convert input into a pandas only dtype object or a numpy dtype object.
17991822
18001823
Parameters
18011824
----------
18021825
dtype : object
18031826
The object to be converted into a dtype.
1827+
allow_numpy_dtypes : bool, default False
1828+
Whether to return pre-PDEP16 numpy dtypes for ints, floats, and bools.
18041829
18051830
Returns
18061831
-------
@@ -1820,10 +1845,18 @@ def pandas_dtype(dtype) -> DtypeObj:
18201845
>>> pd.api.types.pandas_dtype(int)
18211846
dtype('int64')
18221847
"""
1848+
allow_numpy_dtypes = allow_numpy_dtypes or not get_option("mode.pdep16_data_types")
1849+
18231850
# short-circuit
18241851
if isinstance(dtype, np.ndarray):
1825-
return dtype.dtype
1826-
elif isinstance(dtype, (np.dtype, ExtensionDtype)):
1852+
if allow_numpy_dtypes:
1853+
return dtype.dtype
1854+
return _map_np_dtype(dtype.dtype)
1855+
elif isinstance(dtype, np.dtype):
1856+
if allow_numpy_dtypes:
1857+
return dtype
1858+
return _map_np_dtype(dtype)
1859+
elif isinstance(dtype, ExtensionDtype):
18271860
return dtype
18281861

18291862
# builtin aliases
@@ -1879,7 +1912,9 @@ def pandas_dtype(dtype) -> DtypeObj:
18791912
elif npdtype.kind == "O":
18801913
raise TypeError(f"dtype '{dtype}' not understood")
18811914

1882-
return npdtype
1915+
if allow_numpy_dtypes:
1916+
return npdtype
1917+
return _map_np_dtype(npdtype)
18831918

18841919

18851920
def is_all_strings(value: ArrayLike) -> bool:

pandas/core/dtypes/dtypes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1777,7 +1777,7 @@ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:
17771777
)
17781778
from pandas.core.dtypes.missing import na_value_for_dtype
17791779

1780-
dtype = pandas_dtype(dtype)
1780+
dtype = pandas_dtype(dtype, allow_numpy_dtypes=True)
17811781
if is_string_dtype(dtype):
17821782
dtype = np.dtype("object")
17831783
if not isinstance(dtype, np.dtype):

pandas/core/indexes/base.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@
172172
)
173173
import pandas.core.common as com
174174
from pandas.core.construction import (
175+
array as pd_array,
175176
ensure_wrapped_if_datetimelike,
176177
extract_array,
177178
sanitize_array,
@@ -576,6 +577,8 @@ def __new__(
576577
raise ValueError("Index data must be 1-dimensional") from err
577578
raise
578579
arr = ensure_wrapped_if_datetimelike(arr)
580+
if arr.dtype.kind in "iufb" and get_option("mode.pdep16_data_types"):
581+
arr = pd_array(arr, copy=False)
579582

580583
klass = cls._dtype_to_subclass(arr.dtype)
581584

@@ -5391,6 +5394,8 @@ def putmask(self, mask, value) -> Index:
53915394

53925395
# See also: Block.coerce_to_target_dtype
53935396
dtype = self._find_common_type_compat(value)
5397+
assert self.dtype != dtype, (self.dtype, value)
5398+
# FIXME: should raise with useful message to report a bug!
53945399
return self.astype(dtype).putmask(mask, value)
53955400

53965401
values = self._values.copy()

0 commit comments

Comments
 (0)