From 6d249e838aefa37f58810fd908f630b4e87b9451 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 7 Jul 2025 09:15:10 +0200 Subject: [PATCH] TST: assert reading of legacy pickles against current data --- .../tests/io/generate_legacy_storage_files.py | 42 ++++++++++++++----- pandas/tests/io/test_pickle.py | 29 ++++++++++++- 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 9bfd8eb9d51d5..08c31c9b0a3c2 100644 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -133,7 +133,8 @@ def create_pickle_data(): data = { "A": [0.0, 1.0, 2.0, 3.0, np.nan], "B": [0, 1, 0, 1, 0], - "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + # "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "C": Series(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), "D": date_range("1/1/2009", periods=5), "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], } @@ -180,8 +181,16 @@ def create_pickle_data(): tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"] ), ), - "dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]), - "cat": Series(Categorical(["foo", "bar", "baz"])), + "dup": Series( + np.arange(5).astype(np.float64), + index=Index(["A", "B", "C", "D", "A"], dtype=object), + ), + # "cat": Series(Categorical(["foo", "bar", "baz"])), + "cat": Series( + Categorical.from_codes( + [2, 0, 1], categories=Index(["bar", "baz", "foo"], dtype="object") + ) + ), "dt": Series(date_range("20130101", periods=5)), "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")), "period": Series([Period("2000Q1")] * 5), @@ -210,26 +219,36 @@ def create_pickle_data(): "dup": DataFrame( np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"] ), - "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}), + # "cat_onecol": DataFrame({"A": Categorical(["foo", 
"bar"])}), + "cat_onecol": DataFrame( + { + "A": Categorical.from_codes( + [1, 0], categories=Index(["bar", "foo"], dtype="object") + ) + } + ), "cat_and_float": DataFrame( { - "A": Categorical(["foo", "bar", "baz"]), + # "A": Categorical(["foo", "bar", "baz"]), + "A": Categorical.from_codes( + [2, 0, 1], categories=Index(["bar", "baz", "foo"], dtype="object") + ), "B": np.arange(3).astype(np.int64), } ), "mixed_dup": mixed_dup_df, "dt_mixed_tzs": DataFrame( { - "A": Timestamp("20130102", tz="US/Eastern"), - "B": Timestamp("20130603", tz="CET"), + "A": Timestamp("20130102", tz="US/Eastern").as_unit("ns"), + "B": Timestamp("20130603", tz="CET").as_unit("ns"), }, index=range(5), ), "dt_mixed2_tzs": DataFrame( { - "A": Timestamp("20130102", tz="US/Eastern"), - "B": Timestamp("20130603", tz="CET"), - "C": Timestamp("20130603", tz="UTC"), + "A": Timestamp("20130102", tz="US/Eastern").as_unit("ns"), + "B": Timestamp("20130603", tz="CET").as_unit("ns"), + "C": Timestamp("20130603", tz="UTC").as_unit("ns"), }, index=range(5), ), @@ -245,6 +264,9 @@ def create_pickle_data(): "normal": Timestamp("2011-01-01"), "nat": NaT, "tz": Timestamp("2011-01-01", tz="US/Eastern"), + # kept because those are present in the legacy pickles (<= 1.4) + "freq": Timestamp("2011-01-01"), + "both": Timestamp("2011-01-01", tz="Asia/Tokyo"), } off = { diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index bab2c1561eb99..4f54ee0c7f4c0 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -43,6 +43,7 @@ ) import pandas._testing as tm from pandas.tests.io.generate_legacy_storage_files import create_pickle_data +from pandas.util.version import Version import pandas.io.common as icom from pandas.tseries.offsets import ( @@ -56,7 +57,7 @@ # --------------------- def compare_element(result, expected, typ): if isinstance(expected, Index): - tm.assert_index_equal(expected, result) + tm.assert_index_equal(result, expected) return if 
typ.startswith("sp_"): @@ -81,15 +82,39 @@ def test_pickles(datapath): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") + current_data = create_pickle_data() + # For loop for compat with --strict-data-files for legacy_pickle in Path(__file__).parent.glob("data/legacy_pickle/*/*.p*kl*"): + legacy_version = Version(legacy_pickle.parent.name) legacy_pickle = datapath(legacy_pickle) data = pd.read_pickle(legacy_pickle) for typ, dv in data.items(): for dt, result in dv.items(): - expected = data[typ][dt] + expected = current_data[typ][dt] + + if ( + typ == "timestamp" + and dt in ("tz", "both") + and legacy_version < Version("1.3.0") + ): + # convert to tz-naive UTC + # (bug since pandas 2.0 that tz gets dropped for older pickle files) + expected = expected.tz_convert(None) + + if typ in ("frame", "sp_frame"): + expected.columns = expected.columns.astype("object") + + if typ == "frame" and dt == "mi": + expected.index = expected.index.set_levels( + [level.astype("object") for level in expected.index.levels], + ) + if typ == "mi": + expected = expected.set_levels( + [level.astype("object") for level in expected.levels], + ) if typ == "series" and dt == "ts": # GH 7748