Skip to content

Commit 726e16d

Browse files
committed
Changed name to config_id in builder
1 parent 494dc23 commit 726e16d

File tree

4 files changed

+47
-51
lines changed

4 files changed

+47
-51
lines changed

src/datasets/arrow_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1078,7 +1078,7 @@ def from_generator(
10781078
Fingerprint that will be used to generate dataset ID.
10791079
By default, `fingerprint` is generated by hashing all the args, which can be slow for a large dataset.
10801080
1081-
<Added version="3.6.0"/>
1081+
<Added version="3.6.1"/>
10821082
**kwargs (additional keyword arguments):
10831083
Keyword arguments to be passed to [`GeneratorConfig`].
10841084

src/datasets/builder.py

Lines changed: 45 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,6 @@ def create_config_id(
141141
self,
142142
config_kwargs: dict,
143143
custom_features: Optional[Features] = None,
144-
fingerprint: Optional[str] = None,
145144
) -> str:
146145
"""
147146
The config id is used to build the cache directory.
@@ -156,47 +155,43 @@ def create_config_id(
156155
"""
157156
# Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
158157
suffix: Optional[str] = None
159-
160-
if fingerprint is not None:
161-
suffix = fingerprint
162-
else:
163-
config_kwargs_to_add_to_suffix = config_kwargs.copy()
164-
# name and version are already used to build the cache directory
165-
config_kwargs_to_add_to_suffix.pop("name", None)
166-
config_kwargs_to_add_to_suffix.pop("version", None)
167-
# data dir handling (when specified it points to the manually downloaded data):
168-
# it was previously ignored before the introduction of config id because we didn't want
169-
# to change the config name. Now it's fine to take it into account for the config id.
170-
# config_kwargs_to_add_to_suffix.pop("data_dir", None)
171-
if "data_dir" in config_kwargs_to_add_to_suffix:
172-
if config_kwargs_to_add_to_suffix["data_dir"] is None:
173-
config_kwargs_to_add_to_suffix.pop("data_dir", None)
174-
else:
175-
# canonicalize the data dir to avoid two paths to the same location having different
176-
# hashes
177-
data_dir = config_kwargs_to_add_to_suffix["data_dir"]
178-
data_dir = os.path.normpath(data_dir)
179-
config_kwargs_to_add_to_suffix["data_dir"] = data_dir
180-
if config_kwargs_to_add_to_suffix:
181-
# we don't care about the order of the kwargs
182-
config_kwargs_to_add_to_suffix = {
183-
k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
184-
}
185-
if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
186-
suffix = ",".join(
187-
str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
188-
)
189-
if len(suffix) > 32: # hash if too long
190-
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
191-
else:
158+
config_kwargs_to_add_to_suffix = config_kwargs.copy()
159+
# name and version are already used to build the cache directory
160+
config_kwargs_to_add_to_suffix.pop("name", None)
161+
config_kwargs_to_add_to_suffix.pop("version", None)
162+
# data dir handling (when specified it points to the manually downloaded data):
163+
# it was previously ignored before the introduction of config id because we didn't want
164+
# to change the config name. Now it's fine to take it into account for the config id.
165+
# config_kwargs_to_add_to_suffix.pop("data_dir", None)
166+
if "data_dir" in config_kwargs_to_add_to_suffix:
167+
if config_kwargs_to_add_to_suffix["data_dir"] is None:
168+
config_kwargs_to_add_to_suffix.pop("data_dir", None)
169+
else:
170+
# canonicalize the data dir to avoid two paths to the same location having different
171+
# hashes
172+
data_dir = config_kwargs_to_add_to_suffix["data_dir"]
173+
data_dir = os.path.normpath(data_dir)
174+
config_kwargs_to_add_to_suffix["data_dir"] = data_dir
175+
if config_kwargs_to_add_to_suffix:
176+
# we don't care about the order of the kwargs
177+
config_kwargs_to_add_to_suffix = {
178+
k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
179+
}
180+
if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
181+
suffix = ",".join(
182+
str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
183+
)
184+
if len(suffix) > 32: # hash if too long
192185
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
186+
else:
187+
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
193188

194-
if custom_features is not None:
195-
m = Hasher()
196-
if suffix:
197-
m.update(suffix)
198-
m.update(custom_features)
199-
suffix = m.hexdigest()
189+
if custom_features is not None:
190+
m = Hasher()
191+
if suffix:
192+
m.update(suffix)
193+
m.update(custom_features)
194+
suffix = m.hexdigest()
200195

201196
if suffix:
202197
config_id = self.name + "-" + suffix
@@ -318,7 +313,7 @@ def __init__(
318313
data_dir: Optional[str] = None,
319314
storage_options: Optional[dict] = None,
320315
writer_batch_size: Optional[int] = None,
321-
fingerprint: Optional[str] = None,
316+
config_id: Optional[str] = None,
322317
**config_kwargs,
323318
):
324319
# DatasetBuilder name
@@ -349,7 +344,7 @@ def __init__(
349344
self.config, self.config_id = self._create_builder_config(
350345
config_name=config_name,
351346
custom_features=features,
352-
fingerprint=fingerprint,
347+
config_id=config_id,
353348
**config_kwargs,
354349
)
355350

@@ -540,7 +535,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
540535
return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())
541536

542537
def _create_builder_config(
543-
self, config_name=None, custom_features=None, fingerprint=None, **config_kwargs
538+
self, config_name=None, custom_features=None, config_id=None, **config_kwargs
544539
) -> tuple[BuilderConfig, str]:
545540
"""Create and validate BuilderConfig object as well as a unique config id for this config.
546541
Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.
@@ -608,11 +603,13 @@ def _create_builder_config(
608603
)
609604

610605
# compute the config id that is going to be used for caching
611-
config_id = builder_config.create_config_id(
612-
config_kwargs,
613-
custom_features=custom_features,
614-
fingerprint=fingerprint,
615-
)
606+
if config_id is not None:
607+
config_id = builder_config.name + "-" + config_id
608+
else:
609+
config_id = builder_config.create_config_id(
610+
config_kwargs,
611+
custom_features=custom_features,
612+
)
616613
is_custom = (config_id not in self.builder_configs) and config_id != "default"
617614
if is_custom:
618615
logger.info(f"Using custom data configuration {config_id}")

src/datasets/io/generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def __init__(
3333
generator=generator,
3434
gen_kwargs=gen_kwargs,
3535
split=split,
36-
fingerprint=fingerprint,
36+
config_id=fingerprint,
3737
**kwargs,
3838
)
3939

src/datasets/packaged_modules/generator/generator.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ class GeneratorConfig(datasets.BuilderConfig):
1010
gen_kwargs: Optional[dict] = None
1111
features: Optional[datasets.Features] = None
1212
split: datasets.NamedSplit = datasets.Split.TRAIN
13-
fingerprint: Optional[str] = None
1413

1514
def __post_init__(self):
1615
super().__post_init__()

0 commit comments

Comments
 (0)