@@ -141,7 +141,6 @@ def create_config_id(
141
141
self ,
142
142
config_kwargs : dict ,
143
143
custom_features : Optional [Features ] = None ,
144
- fingerprint : Optional [str ] = None ,
145
144
) -> str :
146
145
"""
147
146
The config id is used to build the cache directory.
@@ -156,47 +155,43 @@ def create_config_id(
156
155
"""
157
156
# Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
158
157
suffix : Optional [str ] = None
159
-
160
- if fingerprint is not None :
161
- suffix = fingerprint
162
- else :
163
- config_kwargs_to_add_to_suffix = config_kwargs .copy ()
164
- # name and version are already used to build the cache directory
165
- config_kwargs_to_add_to_suffix .pop ("name" , None )
166
- config_kwargs_to_add_to_suffix .pop ("version" , None )
167
- # data dir handling (when specified it points to the manually downloaded data):
168
- # it was previously ignored before the introduction of config id because we didn't want
169
- # to change the config name. Now it's fine to take it into account for the config id.
170
- # config_kwargs_to_add_to_suffix.pop("data_dir", None)
171
- if "data_dir" in config_kwargs_to_add_to_suffix :
172
- if config_kwargs_to_add_to_suffix ["data_dir" ] is None :
173
- config_kwargs_to_add_to_suffix .pop ("data_dir" , None )
174
- else :
175
- # canonicalize the data dir to avoid two paths to the same location having different
176
- # hashes
177
- data_dir = config_kwargs_to_add_to_suffix ["data_dir" ]
178
- data_dir = os .path .normpath (data_dir )
179
- config_kwargs_to_add_to_suffix ["data_dir" ] = data_dir
180
- if config_kwargs_to_add_to_suffix :
181
- # we don't care about the order of the kwargs
182
- config_kwargs_to_add_to_suffix = {
183
- k : config_kwargs_to_add_to_suffix [k ] for k in sorted (config_kwargs_to_add_to_suffix )
184
- }
185
- if all (isinstance (v , (str , bool , int , float )) for v in config_kwargs_to_add_to_suffix .values ()):
186
- suffix = "," .join (
187
- str (k ) + "=" + urllib .parse .quote_plus (str (v )) for k , v in config_kwargs_to_add_to_suffix .items ()
188
- )
189
- if len (suffix ) > 32 : # hash if too long
190
- suffix = Hasher .hash (config_kwargs_to_add_to_suffix )
191
- else :
158
+ config_kwargs_to_add_to_suffix = config_kwargs .copy ()
159
+ # name and version are already used to build the cache directory
160
+ config_kwargs_to_add_to_suffix .pop ("name" , None )
161
+ config_kwargs_to_add_to_suffix .pop ("version" , None )
162
+ # data dir handling (when specified it points to the manually downloaded data):
163
+ # it was previously ignored before the introduction of config id because we didn't want
164
+ # to change the config name. Now it's fine to take it into account for the config id.
165
+ # config_kwargs_to_add_to_suffix.pop("data_dir", None)
166
+ if "data_dir" in config_kwargs_to_add_to_suffix :
167
+ if config_kwargs_to_add_to_suffix ["data_dir" ] is None :
168
+ config_kwargs_to_add_to_suffix .pop ("data_dir" , None )
169
+ else :
170
+ # canonicalize the data dir to avoid two paths to the same location having different
171
+ # hashes
172
+ data_dir = config_kwargs_to_add_to_suffix ["data_dir" ]
173
+ data_dir = os .path .normpath (data_dir )
174
+ config_kwargs_to_add_to_suffix ["data_dir" ] = data_dir
175
+ if config_kwargs_to_add_to_suffix :
176
+ # we don't care about the order of the kwargs
177
+ config_kwargs_to_add_to_suffix = {
178
+ k : config_kwargs_to_add_to_suffix [k ] for k in sorted (config_kwargs_to_add_to_suffix )
179
+ }
180
+ if all (isinstance (v , (str , bool , int , float )) for v in config_kwargs_to_add_to_suffix .values ()):
181
+ suffix = "," .join (
182
+ str (k ) + "=" + urllib .parse .quote_plus (str (v )) for k , v in config_kwargs_to_add_to_suffix .items ()
183
+ )
184
+ if len (suffix ) > 32 : # hash if too long
192
185
suffix = Hasher .hash (config_kwargs_to_add_to_suffix )
186
+ else :
187
+ suffix = Hasher .hash (config_kwargs_to_add_to_suffix )
193
188
194
- if custom_features is not None :
195
- m = Hasher ()
196
- if suffix :
197
- m .update (suffix )
198
- m .update (custom_features )
199
- suffix = m .hexdigest ()
189
+ if custom_features is not None :
190
+ m = Hasher ()
191
+ if suffix :
192
+ m .update (suffix )
193
+ m .update (custom_features )
194
+ suffix = m .hexdigest ()
200
195
201
196
if suffix :
202
197
config_id = self .name + "-" + suffix
@@ -318,7 +313,7 @@ def __init__(
318
313
data_dir : Optional [str ] = None ,
319
314
storage_options : Optional [dict ] = None ,
320
315
writer_batch_size : Optional [int ] = None ,
321
- fingerprint : Optional [str ] = None ,
316
+ config_id : Optional [str ] = None ,
322
317
** config_kwargs ,
323
318
):
324
319
# DatasetBuilder name
@@ -349,7 +344,7 @@ def __init__(
349
344
self .config , self .config_id = self ._create_builder_config (
350
345
config_name = config_name ,
351
346
custom_features = features ,
352
- fingerprint = fingerprint ,
347
+ config_id = config_id ,
353
348
** config_kwargs ,
354
349
)
355
350
@@ -540,7 +535,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
540
535
return self .get_all_exported_dataset_infos ().get (self .config .name , DatasetInfo ())
541
536
542
537
def _create_builder_config (
543
- self , config_name = None , custom_features = None , fingerprint = None , ** config_kwargs
538
+ self , config_name = None , custom_features = None , config_id = None , ** config_kwargs
544
539
) -> tuple [BuilderConfig , str ]:
545
540
"""Create and validate BuilderConfig object as well as a unique config id for this config.
546
541
Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.
@@ -608,11 +603,13 @@ def _create_builder_config(
608
603
)
609
604
610
605
# compute the config id that is going to be used for caching
611
- config_id = builder_config .create_config_id (
612
- config_kwargs ,
613
- custom_features = custom_features ,
614
- fingerprint = fingerprint ,
615
- )
606
+ if config_id is not None :
607
+ config_id = builder_config .name + "-" + config_id
608
+ else :
609
+ config_id = builder_config .create_config_id (
610
+ config_kwargs ,
611
+ custom_features = custom_features ,
612
+ )
616
613
is_custom = (config_id not in self .builder_configs ) and config_id != "default"
617
614
if is_custom :
618
615
logger .info (f"Using custom data configuration { config_id } " )
0 commit comments