From 91dfbee68166583de400e2a302f76be650d5f2ab Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 27 Jun 2025 15:43:35 -0700 Subject: [PATCH 1/6] TST: update expecteds for using_string_dtype to fix xfails --- .../arrays/categorical/test_constructors.py | 13 ++---- pandas/tests/arrays/categorical/test_repr.py | 27 ++++++++---- pandas/tests/frame/methods/test_astype.py | 12 +---- pandas/tests/groupby/test_timegrouper.py | 12 +++-- .../tests/indexes/base_class/test_formats.py | 17 ++++--- pandas/tests/io/formats/test_format.py | 44 +++++++++++++++---- 6 files changed, 77 insertions(+), 48 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index d7eb6800e5d07..cf2de894cc0c0 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,10 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -444,13 +440,12 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings" - ) def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects - cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) - assert all(isinstance(x, np.str_) for x in cat.categories) + # We can't pass all-strings because the constructor would cast + # those to StringDtype post-PDEP14 + cat = Categorical(["1", "0", "1", 2], [np.str_("0"), np.str_("1"), 2]) + assert all(isinstance(x, (np.str_, int)) for x in cat.categories) def test_constructor_from_categorical_with_dtype(self): dtype = CategoricalDtype(["a", "b", "c"], ordered=True) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 3a2c489920eb0..a82ba24a2c732 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,7 +1,4 @@ import numpy as np -import pytest - -from pandas._config import using_string_dtype from pandas import ( Categorical, @@ -77,17 +74,19 @@ def test_print_none_width(self): with option_context("display.width", None): assert exp == repr(a) - @pytest.mark.skipif( - using_string_dtype(), - reason="Change once infer_string is set to True by default", - ) - def test_unicode_print(self): + def test_unicode_print(self, using_infer_string): c = Categorical(["aaaaa", "bb", "cccc"] * 20) expected = """\ ['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc'] Length: 60 Categories (3, object): ['aaaaa', 'bb', 'cccc']""" + if using_infer_string: + expected = expected.replace( + "(3, object): ['aaaaa', 'bb', 'cccc']", + "(3, str): [aaaaa, bb, cccc]", + ) + assert repr(c) == expected c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) @@ -96,6 +95,12 @@ def test_unicode_print(self): Length: 60 Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "(3, object): ['ああああ', 'いいいいい', 'ううううううう']", + "(3, str): [ああああ, いいいいい, ううううううう]", + ) + assert repr(c) == expected # unicode option should not affect to Categorical, as it doesn't care @@ -106,6 +111,12 @@ def test_unicode_print(self): Length: 60 Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "(3, object): ['ああああ', 'いいいいい', 'ううううううう']", + "(3, str): [ああああ, いいいいい, ううううううう]", + ) + assert repr(c) == expected def test_categorical_repr(self): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index eb1ee4e7b2970..c428bd1820cb1 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -745,10 +743,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) GH#60639") - def test_astype_dt64_to_string( - self, frame_or_series, tz_naive_fixture, using_infer_string - ): + def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): # GH#41409 tz = tz_naive_fixture @@ -766,10 +761,7 @@ def test_astype_dt64_to_string( item = result.iloc[0] if frame_or_series is DataFrame: item = item.iloc[0] - if using_infer_string: - assert item is np.nan - else: - assert item is pd.NA + assert item is pd.NA # For non-NA values, we should match what we get for non-EA str alt = obj.astype(str) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 550efe9187fe8..a64b15c211908 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -76,10 +74,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: - # TODO(infer_string) resample sum introduces 0's - # https://github.com/pandas-dev/pandas/issues/60229 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_groupby_with_timegrouper(self): + def test_groupby_with_timegrouper(self, using_infer_string): # GH 4161 # TimeGrouper requires a sorted index # also verifies that the resultant index has the correct name @@ -116,8 +111,11 @@ def test_groupby_with_timegrouper(self): {"Buyer": 0, "Quantity": 0}, index=exp_dti, ) - # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" + # Cast to object/str to avoid implicit cast when setting + # entry to "CarlCarlCarl" expected = expected.astype({"Buyer": object}) + if using_infer_string: + expected = expected.astype({"Buyer": "str"}) expected.iloc[0, 0] = "CarlCarlCarl" expected.iloc[6, 0] = "CarlCarl" expected.iloc[18, 0] = "Joe" diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 260b4203a4f04..2368b8bce2d9e 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,7 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import Index @@ -16,7 +15,6 @@ def test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) - @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -77,11 +75,13 @@ def test_repr_is_valid_construction_code(self): ), ], ) - def test_string_index_repr(self, index, expected): + def test_string_index_repr(self, index, expected, using_infer_string): result = repr(index) + if using_infer_string: + expected = expected.replace("dtype='object'", "dtype='str'") + assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -121,11 +121,16 @@ def test_string_index_repr(self, index, expected): ), ], ) - def test_string_index_repr_with_unicode_option(self, index, expected): + def test_string_index_repr_with_unicode_option( + self, index, expected, using_infer_string + ): # Enable Unicode option ----------------------------------------- with cf.option_context("display.unicode.east_asian_width", True): result = repr(index) - assert result == expected + + if using_infer_string: + expected = expected.replace("dtype='object'", "dtype='str'") + assert result == expected def test_repr_summary(self): with cf.option_context("display.max_seq_items", 10): diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 86682e8160762..a485578b139dc 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -1395,8 +1393,7 @@ def test_unicode_name_in_footer(self): sf = fmt.SeriesFormatter(s, name="\u05e2\u05d1\u05e8\u05d9\u05ea") sf._get_footer() # should not raise exception - @pytest.mark.xfail(using_string_dtype(), reason="Fixup when arrow is default") - def test_east_asian_unicode_series(self): + def test_east_asian_unicode_series(self, using_infer_string): # not aligned properly because of east asian width # unicode index @@ -1409,6 +1406,8 @@ def test_east_asian_unicode_series(self): "ええええ D\ndtype: object", ] ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # unicode values @@ -1422,7 +1421,8 @@ def test_east_asian_unicode_series(self): "dtype: object", ] ) - + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # both @@ -1439,7 +1439,8 @@ def test_east_asian_unicode_series(self): "dtype: object", ] ) - + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # unicode footer @@ -1452,6 +1453,8 @@ def test_east_asian_unicode_series(self): "ああ あ\nいいいい いい\nう ううう\n" "えええ ええええ\nName: おおおおおおお, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # MultiIndex @@ -1495,6 +1498,8 @@ def test_east_asian_unicode_series(self): "3 ええええ\n" "Name: おおおおおおお, Length: 4, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected s.index = ["ああ", "いいいい", "う", "えええ"] @@ -1503,6 +1508,8 @@ def test_east_asian_unicode_series(self): "えええ ええええ\n" "Name: おおおおおおお, Length: 4, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # Enable Unicode option ----------------------------------------- @@ -1516,6 +1523,8 @@ def test_east_asian_unicode_series(self): "あ a\nいい bb\nううう CCC\n" "ええええ D\ndtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # unicode values @@ -1527,6 +1536,8 @@ def test_east_asian_unicode_series(self): "a あ\nbb いい\nc ううう\n" "ddd ええええ\ndtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # both s = Series( @@ -1539,6 +1550,8 @@ def test_east_asian_unicode_series(self): "う ううう\n" "えええ ええええ\ndtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # unicode footer @@ -1554,6 +1567,8 @@ def test_east_asian_unicode_series(self): "えええ ええええ\n" "Name: おおおおおおお, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # MultiIndex @@ -1599,6 +1614,8 @@ def test_east_asian_unicode_series(self): "3 ええええ\n" "Name: おおおおおおお, Length: 4, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected s.index = ["ああ", "いいいい", "う", "えええ"] @@ -1608,6 +1625,8 @@ def test_east_asian_unicode_series(self): "えええ ええええ\n" "Name: おおおおおおお, Length: 4, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # ambiguous unicode @@ -1621,6 +1640,8 @@ def test_east_asian_unicode_series(self): "¡¡ ううう\n" "えええ ええええ\ndtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected def test_float_trim_zeros(self): @@ -1770,27 +1791,34 @@ def chck_ncols(self, s): ncolsizes = len({len(line.strip()) for line in lines}) assert ncolsizes == 1 - @pytest.mark.xfail(using_string_dtype(), reason="change when arrow is default") - def test_format_explicit(self): + def test_format_explicit(self, using_infer_string): test_sers = gen_series_formatting() with option_context("display.max_rows", 4, "display.show_dimensions", False): res = repr(test_sers["onel"]) exp = "0 a\n1 a\n ..\n98 a\n99 a\ndtype: object" + if using_infer_string: + exp = exp.replace("dtype: object", "dtype: str") assert exp == res res = repr(test_sers["twol"]) exp = "0 ab\n1 ab\n ..\n98 ab\n99 ab\ndtype: object" + if using_infer_string: + exp = exp.replace("dtype: object", "dtype: str") assert exp == res res = repr(test_sers["asc"]) exp = ( "0 a\n1 ab\n ... \n4 abcde\n5 " "abcdef\ndtype: object" ) + if using_infer_string: + exp = exp.replace("dtype: object", "dtype: str") assert exp == res res = repr(test_sers["desc"]) exp = ( "5 abcdef\n4 abcde\n ... \n1 ab\n0 " "a\ndtype: object" ) + if using_infer_string: + exp = exp.replace("dtype: object", "dtype: str") assert exp == res def test_ncols(self): From 3ae22561f39f2f239795a09ae779a7e4ca0a3010 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 30 Jun 2025 14:39:49 -0700 Subject: [PATCH 2/6] Update to_dict_of_blocks test to hardcode object dtype --- pandas/tests/frame/methods/test_to_dict_of_blocks.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 4f621b4643b70..070cb36035a9a 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -1,11 +1,10 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, MultiIndex, + Series, ) import pandas._testing as tm from pandas.core.arrays import NumpyExtensionArray @@ -27,10 +26,9 @@ def test_no_copy_blocks(self, float_frame): assert _last_df is not None and not _last_df[column].equals(df[column]) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_dict_of_blocks_item_cache(): # Calling to_dict_of_blocks should not poison item_cache - df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + df = DataFrame({"a": [1, 2, 3, 4], "b": Series(["a", "b", "c", "d"], dtype=object)}) df["c"] = NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object)) mgr = df._mgr assert len(mgr.blocks) == 3 # i.e. not consolidated From 6c8fc8973a925c48e5f2fb8fa10d5d9bc3d5db6e Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 30 Jun 2025 14:40:04 -0700 Subject: [PATCH 3/6] Comment --- pandas/tests/indexes/categorical/test_category.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index d9c9fdc62b0bc..262b043adaf58 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -199,6 +199,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) + # TODO(3.0): remove this test once using_string_dtype() is always True @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) From 94deaba8010002834786f5fa08f9d9212013a1c9 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 30 Jun 2025 14:40:45 -0700 Subject: [PATCH 4/6] Split test, update expected, targeted xfails --- .../tests/indexes/categorical/test_formats.py | 99 ++++++++++++++++--- 1 file changed, 88 insertions(+), 11 deletions(-) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index b1361b3e8106e..b100740b064ce 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -10,78 +10,132 @@ from pandas import CategoricalIndex -class TestCategoricalIndexRepr: - @pytest.mark.xfail(using_string_dtype(), reason="repr different") - def test_string_categorical_index_repr(self): +class TestCategoricalIndexReprStringCategories: + def test_string_categorical_index_repr(self, using_infer_string): # short idx = CategoricalIndex(["a", "bb", "ccc"]) expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['a', 'bb', 'ccc']", + "categories=[a, bb, ccc]", + ) assert repr(idx) == expected + @pytest.mark.xfail(using_string_dtype(), reason="Different padding on multi-line") + def test_categorical_index_repr_multiline(self, using_infer_string): # multiple lines idx = CategoricalIndex(["a", "bb", "ccc"] * 10) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['a', 'bb', 'ccc']", + "categories=[a, bb, ccc]", + ) assert repr(idx) == expected + @pytest.mark.xfail(using_string_dtype(), reason="Different padding on multi-line") + def test_categorical_index_repr_truncated(self, using_infer_string): # truncated idx = CategoricalIndex(["a", "bb", "ccc"] * 100) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['a', 'bb', 'ccc']", + "categories=[a, bb, ccc]", + ) assert repr(idx) == expected + def test_categorical_index_repr_many_categories(self, using_infer_string): # larger categories idx = CategoricalIndex(list("abcdefghijklmmo")) expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'm', 'o'], categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o'], ordered=False, dtype='category')""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o']", + "categories=[a, b, c, d, ..., k, l, m, o]", + ) assert repr(idx) == expected + def test_categorical_index_repr_unicode(self, using_infer_string): # short idx = CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected + @pytest.mark.xfail(using_string_dtype(), reason="Different padding on multi-line") + def test_categorical_index_repr_unicode_multiline(self, using_infer_string): # multiple lines idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected + @pytest.mark.xfail(using_string_dtype(), reason="Different padding on multi-line") + def test_categorical_index_repr_unicode_truncated(self, using_infer_string): # truncated idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected + def test_categorical_index_repr_unicode_many_categories(self, using_infer_string): # larger categories idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ']", + "categories=[あ, い, う, え, ..., し, す, せ, そ]", + ) assert repr(idx) == expected - # Enable Unicode option ----------------------------------------- + def test_categorical_index_repr_east_asian_width(self, using_infer_string): with cf.option_context("display.unicode.east_asian_width", True): # short idx = CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected + @pytest.mark.xfail(using_string_dtype(), reason="Different padding on multi-line") + def test_categorical_index_repr_east_asian_width_multiline( + self, using_infer_string + ): + with cf.option_context("display.unicode.east_asian_width", True): # multiple lines idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', @@ -90,8 +144,18 @@ def test_string_categorical_index_repr(self): 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected + @pytest.mark.xfail(using_string_dtype(), reason="Different padding on multi-line") + def test_categorical_index_repr_east_asian_width_truncated( + self, using_infer_string + ): + with cf.option_context("display.unicode.east_asian_width", True): # truncated idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', @@ -101,12 +165,25 @@ def test_string_categorical_index_repr(self): 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected - # larger categories + def test_categorical_index_repr_east_asian_width_many_categories( + self, using_infer_string + ): + with cf.option_context("display.unicode.east_asian_width", True): idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ']", + "categories=[あ, い, う, え, ..., し, す, せ, そ]", + ) assert repr(idx) == expected From 6d11c3a3c28a1c9f94ba9b47597f569ae02e70f6 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 30 Jun 2025 14:45:09 -0700 Subject: [PATCH 5/6] Update json test --- pandas/tests/io/json/test_pandas.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 32eeb30de4b69..b09f23cf97f7c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1566,11 +1566,8 @@ def test_from_json_to_json_table_dtypes(self): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) - # TODO: We are casting to string which coerces None to NaN before casting back - # to object, ending up with incorrect na values - @pytest.mark.xfail(using_string_dtype(), reason="incorrect na conversion") @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) - def test_to_json_from_json_columns_dtypes(self, orient): + def test_to_json_from_json_columns_dtypes(self, orient, using_infer_string): # GH21892 GH33205 expected = DataFrame.from_dict( { @@ -1591,6 +1588,11 @@ def test_to_json_from_json_columns_dtypes(self, orient): with tm.assert_produces_warning(FutureWarning, match=msg): dfjson = expected.to_json(orient=orient) + if using_infer_string: + # When this is read back in it is inferred to "str" dtype which + # uses NaN instead of None. + expected.loc[0, "Object"] = np.nan + result = read_json( StringIO(dfjson), orient=orient, @@ -1849,11 +1851,11 @@ def test_to_json_indent(self, indent): assert result == expected - @pytest.mark.skipif( - using_string_dtype(), - reason="Adjust expected when infer_string is default, no bug here, " - "just a complicated parametrization", - ) + # @pytest.mark.skipif( + # using_string_dtype(), + # reason="Adjust expected when infer_string is default, no bug here, " + # "just a complicated parametrization", + # ) @pytest.mark.parametrize( "orient,expected", [ From 3102045693fcc43a9d3127bcb6ce61f683ac96c6 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 30 Jun 2025 14:53:09 -0700 Subject: [PATCH 6/6] revert commented-out --- pandas/tests/io/json/test_pandas.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b09f23cf97f7c..1bb6522bd7386 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1851,11 +1851,11 @@ def test_to_json_indent(self, indent): assert result == expected - # @pytest.mark.skipif( - # using_string_dtype(), - # reason="Adjust expected when infer_string is default, no bug here, " - # "just a complicated parametrization", - # ) + @pytest.mark.skipif( + using_string_dtype(), + reason="Adjust expected when infer_string is default, no bug here, " + "just a complicated parametrization", + ) @pytest.mark.parametrize( "orient,expected", [