Skip to content

Commit e6fc266

Browse files
author
Fahad Alghanim
committed
Fix duplicate kwargs in load_dataset_builder
Avoid TypeError when builder_kwargs and config_kwargs share keys like base_path. Fixes #4910
1 parent 518bf32 commit e6fc266

2 files changed

Lines changed: 25 additions & 0 deletions

File tree

src/datasets/load.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,6 +1149,14 @@ def load_dataset_builder(
11491149
dataset_name = builder_kwargs.pop("dataset_name", None)
11501150
info = dataset_module.dataset_infos.get(config_name) if dataset_module.dataset_infos else None
11511151

1152+
# Avoid passing duplicate keyword arguments to the builder.
1153+
# `builder_kwargs` can contain keys like `base_path`, and users may also pass them via `config_kwargs`.
1154+
# In that case, Python raises: "TypeError: got multiple values for keyword argument ...".
1155+
# Keep the user-provided values (config_kwargs) by dropping overlaps from builder_kwargs.
1156+
if config_kwargs:
1157+
for key in set(builder_kwargs).intersection(config_kwargs):
1158+
builder_kwargs.pop(key, None)
1159+
11521160
if (
11531161
path in _PACKAGED_DATASETS_MODULES
11541162
and data_files is None
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import pytest
2+
3+
from datasets.load import load_dataset_builder
4+
5+
6+
def test_load_dataset_builder_does_not_fail_with_duplicate_builder_kwargs(tmp_path):
7+
# Regression test for https://github.com/huggingface/datasets/issues/4910
8+
# Some module factories provide `base_path` in `builder_kwargs`, and users can also pass `base_path`
9+
# via `config_kwargs` (here: as an extra keyword argument to `load_dataset_builder`), which used to raise:
10+
# TypeError: ... got multiple values for keyword argument 'base_path'
11+
# Write a minimal one-column CSV so the `csv` builder has a real data file to point at.
train_csv = tmp_path / "train.csv"
12+
train_csv.write_text("col\n1\n", encoding="utf-8")
13+
14+
# The directory is never created; only the string value is compared below.
custom_base_path = str(tmp_path / "custom_base_path")
15+
# Passing `base_path` explicitly is the call shape that triggered the duplicate-keyword TypeError.
builder = load_dataset_builder("csv", data_files=str(train_csv), base_path=custom_base_path)
16+
# The call must succeed, and the builder must expose the caller's `base_path` unchanged.
assert builder.base_path == custom_base_path
17+

0 commit comments

Comments
 (0)