Skip to content
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,7 @@ Missing
- Bug in :meth:`DataFrame.fillna` not accepting a dictionary for the ``downcast`` keyword (:issue:`40809`)
- Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`)
- Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting instead of retaining the ``NaN`` (:issue:`26919`)
- Bug in :meth:`Series.isin` and :meth:`MultiIndex.isin` not treating all ``NaN`` values as equivalent when they were inside tuples (:issue:`41836`)

MultiIndex
^^^^^^^^^^
Expand Down
84 changes: 78 additions & 6 deletions pandas/_libs/src/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,18 +163,90 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
#define kh_exist_complex128(h, k) (kh_exist(h, k))


// Equality predicate for two Python float objects.
// Returns non-zero when both values are NaN, or when the two underlying
// doubles compare equal with ==; returns 0 otherwise.
// A plain == on doubles is never true for NaN, hence the explicit
// Py_IS_NAN test on both operands.
// NaN-floats should be in the same equivalency class, see GH 22119
int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add some comments before these of what is going on / why doing is & the issue reference

return (
Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
Py_IS_NAN(PyFloat_AS_DOUBLE(b))
)
||
( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) );
}


// Equality predicate for two Python complex objects in which NaN components
// belong to the same equivalency class, see GH 41836.
// PyObject_RichCompareBool behaves differently for complex objects, so it is
// replaced by this component-wise comparison.
//
// A component (real or imaginary) matches when it is NaN in both operands or
// when the two doubles compare equal with ==. The result is non-zero only
// when the real components match and the imaginary components match.
int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){
    const double a_re = a->cval.real;
    const double b_re = b->cval.real;
    const double a_im = a->cval.imag;
    const double b_im = b->cval.imag;

    const int real_matches =
        (Py_IS_NAN(a_re) && Py_IS_NAN(b_re)) || (a_re == b_re);
    const int imag_matches =
        (Py_IS_NAN(a_im) && Py_IS_NAN(b_im)) || (a_im == b_im);

    return real_matches && imag_matches;
}

// Forward declaration: tupleobject_cmp and pyobject_cmp call each other.
int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b);


// Item-by-item tuple equality built on pyobject_cmp (NaN==NaN) instead of
// PyObject_RichCompareBool (NaN!=NaN), so that NaNs inside tuples are
// treated as equivalent, see GH 41836.
// Returns 1 when both tuples have the same size and every pair of items at
// the same index is equal according to pyobject_cmp; returns 0 otherwise.
int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){
    const Py_ssize_t size = Py_SIZE(a);
    Py_ssize_t pos = 0;

    // tuples of different length can never be equal
    if (size != Py_SIZE(b)) {
        return 0;
    }

    while (pos < size) {
        if (!pyobject_cmp(PyTuple_GET_ITEM(a, pos), PyTuple_GET_ITEM(b, pos))) {
            return 0;
        }
        ++pos;
    }
    return 1;
}


// Equality predicate for two Python objects that treats NaNs as equivalent.
//
// When both objects have the same type and that type is exactly float,
// complex or tuple, the comparison is delegated to floatobject_cmp,
// complexobject_cmp or tupleobject_cmp respectively, because the usual
// PyObject_RichCompareBool would return False for NaNs we want to be equal.
// Every other pair falls back to PyObject_RichCompareBool(a, b, Py_EQ); if
// that call fails, the pending Python error is cleared and the objects are
// reported as not equal.
//
// Returns 1 when the objects are considered equal, 0 otherwise.
int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
    int result;

    if (Py_TYPE(a) == Py_TYPE(b)) {
        // special handling for some built-in types which could have NaNs
        // as we would like to have them equivalent, but the usual
        // PyObject_RichCompareBool would return False
        if (PyFloat_CheckExact(a)) {
            return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b);
        }
        if (PyComplex_CheckExact(a)) {
            return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b);
        }
        if (PyTuple_CheckExact(a)) {
            return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b);
        }
        // frozenset isn't yet supported
    }

    result = PyObject_RichCompareBool(a, b, Py_EQ);
    if (result < 0) {
        // comparison raised: swallow the error and treat as "not equal"
        PyErr_Clear();
        return 0;
    }
    // No extra float-NaN check is needed at this point: two exact floats
    // always share the same type and have already been handled by
    // floatobject_cmp in the branch above.
    return result;
}

Expand Down
14 changes: 1 addition & 13 deletions pandas/tests/indexes/multi/test_isin.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import numpy as np
import pytest

from pandas.compat import PYPY

from pandas import MultiIndex
import pandas._testing as tm


@pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy")
def test_isin_nan_pypy():
def test_isin_nan():
idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]])
tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, True]))
tm.assert_numpy_array_equal(
Expand All @@ -31,15 +28,6 @@ def test_isin():
assert result.dtype == np.bool_


@pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy")
def test_isin_nan_not_pypy():
    """Expect ``isin`` to report no match for a tuple holding NaN.

    Skipped when ``PYPY`` is true.
    """
    index = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]])
    no_match = np.array([False, False])
    # exercise both the numpy NaN singleton and a freshly created float NaN
    for nan in (np.nan, float("nan")):
        tm.assert_numpy_array_equal(index.isin([("bar", nan)]), no_match)


def test_isin_level_kwarg():
idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)])

Expand Down
71 changes: 71 additions & 0 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import pandas as pd
import pandas._testing as tm
from pandas.core.algorithms import isin


@contextmanager
Expand Down Expand Up @@ -178,6 +179,67 @@ def test_no_reallocation(self, table_type, dtype):
assert n_buckets_start == clean_table.get_state()["n_buckets"]


class TestPyObjectHashTableWithNans:
    """Lookups in ``ht.PyObjectHashTable`` with keys that contain NaN.

    Each test stores 42 under one key and reads it back through a distinct
    but equivalent key; some tests also check that a non-equivalent key
    raises ``KeyError``.
    """

    def test_nan_float(self):
        """Two distinct float NaN objects address the same entry."""
        stored_key, lookup_key = float("nan"), float("nan")
        assert stored_key is not lookup_key
        table = ht.PyObjectHashTable()
        table.set_item(stored_key, 42)
        assert table.get_item(lookup_key) == 42

    def test_nan_complex_both(self):
        """Complex keys with NaN in both components address the same entry."""
        stored_key = complex(float("nan"), float("nan"))
        lookup_key = complex(float("nan"), float("nan"))
        assert stored_key is not lookup_key
        table = ht.PyObjectHashTable()
        table.set_item(stored_key, 42)
        assert table.get_item(lookup_key) == 42

    def test_nan_complex_real(self):
        """NaN real part: the imaginary part still has to be equal."""
        stored_key = complex(float("nan"), 1)
        lookup_key = complex(float("nan"), 1)
        missing_key = complex(float("nan"), 2)
        assert stored_key is not lookup_key
        table = ht.PyObjectHashTable()
        table.set_item(stored_key, 42)
        assert table.get_item(lookup_key) == 42
        with pytest.raises(KeyError, match=None) as excinfo:
            table.get_item(missing_key)
        assert str(excinfo.value) == str(missing_key)

    def test_nan_complex_imag(self):
        """NaN imaginary part: the real part still has to be equal."""
        stored_key = complex(1, float("nan"))
        lookup_key = complex(1, float("nan"))
        missing_key = complex(2, float("nan"))
        assert stored_key is not lookup_key
        table = ht.PyObjectHashTable()
        table.set_item(stored_key, 42)
        assert table.get_item(lookup_key) == 42
        with pytest.raises(KeyError, match=None) as excinfo:
            table.get_item(missing_key)
        assert str(excinfo.value) == str(missing_key)

    def test_nan_in_tuple(self):
        """One-element tuples holding distinct NaN objects are the same key."""
        stored_key = (float("nan"),)
        lookup_key = (float("nan"),)
        assert stored_key[0] is not lookup_key[0]
        table = ht.PyObjectHashTable()
        table.set_item(stored_key, 42)
        assert table.get_item(lookup_key) == 42

    def test_nan_in_nested_tuple(self):
        """NaN nested several tuples deep is still treated as equivalent."""
        stored_key = (1, (2, (float("nan"),)))
        lookup_key = (1, (2, (float("nan"),)))
        missing_key = (1, 2)
        table = ht.PyObjectHashTable()
        table.set_item(stored_key, 42)
        assert table.get_item(lookup_key) == 42
        with pytest.raises(KeyError, match=None) as excinfo:
            table.get_item(missing_key)
        assert str(excinfo.value) == str(missing_key)


def test_get_labels_groupby_for_Int64(writable):
table = ht.Int64HashTable()
vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)
Expand Down Expand Up @@ -426,3 +488,12 @@ def test_mode(self, dtype, type_suffix):
values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype)
assert mode(values, True) == 42
assert np.isnan(mode(values, False))


def test_ismember_tuple_with_nans():
    """``isin`` matches a tuple containing NaN against an equal tuple that
    holds a different NaN object."""
    # GH-41836
    # Build a 1-D object array explicitly so each tuple stays a single
    # element, instead of handing ``isin`` a plain list and relying on its
    # implicit list-to-array coercion (deprecated in later pandas releases).
    values = np.empty(2, dtype=object)
    values[:] = [("a", float("nan")), ("b", 1)]
    comps = [("a", float("nan"))]

    result = isin(values, comps)
    expected = np.array([True, False], dtype=np.bool_)
    tm.assert_numpy_array_equal(result, expected)