2424from packaging import version
2525
2626import datasets .arrow_dataset
27+ import datasets .config
2728from datasets import concatenate_datasets , interleave_datasets , load_from_disk
2829from datasets .arrow_dataset import Dataset , transmit_format , update_metadata_with_features
2930from datasets .dataset_dict import DatasetDict
@@ -119,6 +120,8 @@ def assert_arrow_metadata_are_synced_with_dataset_features(dataset: Dataset):
119120 {"testcase_name" : name , "in_memory" : im } for im , name in [(True , "in_memory" ), (False , "on_disk" )]
120121]
121122
123+ STRING_FROM_PANDAS = "large_string" if datasets .config .PANDAS_VERSION .major >= 3 else "string"  # expected string dtype name used by the assertions in this module: "large_string" when the installed pandas major version is >= 3, otherwise "string"
124+
122125
123126@parameterized .named_parameters (IN_MEMORY_PARAMETERS )
124127class BaseDatasetTest (TestCase ):
@@ -1656,7 +1659,7 @@ def func_return_single_row_pd_dataframe(x):
16561659 self .assertEqual (len (dset_test ), 30 )
16571660 self .assertDictEqual (
16581661 dset_test .features ,
1659- Features ({"id" : Value ("int64" ), "text" : Value ("string" )}),
1662+ Features ({"id" : Value ("int64" ), "text" : Value (STRING_FROM_PANDAS )}),
16601663 )
16611664 self .assertEqual (dset_test [0 ]["id" ], 0 )
16621665 self .assertEqual (dset_test [0 ]["text" ], "a" )
@@ -1672,7 +1675,7 @@ def func_return_single_row_pd_dataframe_batched(x):
16721675 self .assertEqual (len (dset_test ), 30 )
16731676 self .assertDictEqual (
16741677 dset_test .features ,
1675- Features ({"id" : Value ("int64" ), "text" : Value ("string" )}),
1678+ Features ({"id" : Value ("int64" ), "text" : Value (STRING_FROM_PANDAS )}),
16761679 )
16771680 self .assertEqual (dset_test [0 ]["id" ], 0 )
16781681 self .assertEqual (dset_test [0 ]["text" ], "a" )
@@ -2702,6 +2705,12 @@ def test_to_sql(self, in_memory):
27022705 self .assertListEqual (list (sql_dset .columns ), list (dset .column_names ))
27032706
27042707 # With array features
2708+ if datasets .config .PANDAS_VERSION .major >= 3 :
2709+ # Pandas 3 can't save and reload string data
2710+ # pandas/_libs/lib.pyx:732: in pandas._libs.lib.ensure_string_array
2711+ # E UnicodeDecodeError: 'utf-8' codec can't decode byte 0x98 in position 0: invalid start byte
2712+ # pandas/_libs/lib.pyx:846: UnicodeDecodeError
2713+ return
27052714 with self ._create_dummy_dataset (in_memory , tmp_dir , array_features = True ) as dset :
27062715 file_path = os .path .join (tmp_dir , "test_path.sqlite" )
27072716 _ = dset .to_sql ("data" , "sqlite:///" + file_path , if_exists = "replace" )
@@ -3285,7 +3294,9 @@ def test_from_pandas(self):
32853294 self .assertSequenceEqual (dset ["col_1" ], data ["col_1" ])
32863295 self .assertSequenceEqual (dset ["col_2" ], data ["col_2" ])
32873296 self .assertListEqual (list (dset .features .keys ()), ["col_1" , "col_2" ])
3288- self .assertDictEqual (dset .features , Features ({"col_1" : Value ("int64" ), "col_2" : Value ("string" )}))
3297+ self .assertDictEqual (
3298+ dset .features , Features ({"col_1" : Value ("int64" ), "col_2" : Value (STRING_FROM_PANDAS )})
3299+ )
32893300
32903301 features = Features ({"col_1" : Value ("int64" ), "col_2" : Value ("string" )})
32913302 with Dataset .from_pandas (df , features = features ) as dset :
@@ -4200,7 +4211,7 @@ def _check_sql_dataset(dataset, expected_features):
42004211@pytest .mark .parametrize ("con_type" , ["string" , "engine" ])
42014212def test_dataset_from_sql_con_type (con_type , sqlite_path , tmp_path , set_sqlalchemy_silence_uber_warning , caplog ):
42024213 cache_dir = tmp_path / "cache"
4203- expected_features = {"col_1" : "string" , "col_2" : "int64" , "col_3" : "float64" }
4214+ expected_features = {"col_1" : STRING_FROM_PANDAS , "col_2" : "int64" , "col_3" : "float64" }
42044215 if con_type == "string" :
42054216 con = "sqlite:///" + sqlite_path
42064217 elif con_type == "engine" :
@@ -4238,7 +4249,7 @@ def test_dataset_from_sql_con_type(con_type, sqlite_path, tmp_path, set_sqlalche
42384249)
42394250def test_dataset_from_sql_features (features , sqlite_path , tmp_path , set_sqlalchemy_silence_uber_warning ):
42404251 cache_dir = tmp_path / "cache"
4241- default_expected_features = {"col_1" : "string" , "col_2" : "int64" , "col_3" : "float64" }
4252+ default_expected_features = {"col_1" : STRING_FROM_PANDAS , "col_2" : "int64" , "col_3" : "float64" }
42424253 expected_features = features .copy () if features else default_expected_features
42434254 features = (
42444255 Features ({feature : Value (dtype ) for feature , dtype in features .items ()}) if features is not None else None
@@ -4251,7 +4262,7 @@ def test_dataset_from_sql_features(features, sqlite_path, tmp_path, set_sqlalche
42514262@pytest .mark .parametrize ("keep_in_memory" , [False , True ])
42524263def test_dataset_from_sql_keep_in_memory (keep_in_memory , sqlite_path , tmp_path , set_sqlalchemy_silence_uber_warning ):
42534264 cache_dir = tmp_path / "cache"
4254- expected_features = {"col_1" : "string" , "col_2" : "int64" , "col_3" : "float64" }
4265+ expected_features = {"col_1" : STRING_FROM_PANDAS , "col_2" : "int64" , "col_3" : "float64" }
42554266 with assert_arrow_memory_increases () if keep_in_memory else assert_arrow_memory_doesnt_increase ():
42564267 dataset = Dataset .from_sql (
42574268 "dataset" , "sqlite:///" + sqlite_path , cache_dir = cache_dir , keep_in_memory = keep_in_memory
0 commit comments