
Commit 350f4fd

Authored by lhoestq, with Steven Liu (stevhliu) and Polina Kazakova
Rename "pattern" to "path" in YAML data_files configs (#6044)
* rename pattern to path
* docs
* better _raise_if_data_files_field_not_valid
* Apply suggestions from code review (co-authored by Steven Liu <[email protected]>)
* fix check
* fix
* only "path" (removed plural)
* Apply suggestions from code review (co-authored by Polina Kazakova <[email protected]>)
* style

Co-authored-by: Steven Liu <[email protected]>
Co-authored-by: Polina Kazakova <[email protected]>
1 parent 4472a87 commit 350f4fd

File tree

6 files changed: +68 -60 lines

- docs/source/repository_structure.mdx
- src/datasets/arrow_dataset.py
- src/datasets/data_files.py
- src/datasets/dataset_dict.py
- src/datasets/utils/metadata.py
- tests/test_upstream_hub.py

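In short, the commit renames the `pattern` key of `data_files` entries in YAML dataset configs to `path`. A minimal sketch of the before/after (using third-party PyYAML purely for illustration; the repository layout is made up):

```python
import yaml  # third-party PyYAML, used here only to show the parsed shape

old_header = """
configs:
- config_name: default
  data_files:
  - split: train
    pattern: "data/*.csv"  # key before this commit
"""

new_header = old_header.replace("pattern:", "path:")  # key after this commit

# Both parse to the same structure; only the key name changes.
print(yaml.safe_load(old_header)["configs"][0]["data_files"])
# [{'split': 'train', 'pattern': 'data/*.csv'}]
print(yaml.safe_load(new_header)["configs"][0]["data_files"])
# [{'split': 'train', 'path': 'data/*.csv'}]
```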

docs/source/repository_structure.mdx (26 additions & 30 deletions)

````diff
@@ -24,17 +24,15 @@ In this simple case, you'll get a dataset with two splits: `train` (containing e
 
 ## Splits
 
-If you have multiple files and want to define which file goes into which split, you can use the YAML `configs` field at the top of your README.md using glob patterns.
+If you have multiple files and want to define which file goes into which split, you can use the YAML `configs` field at the top of your README.md.
 
 For example, given a repository like this one:
 
 ```
 my_dataset_repository/
 ├── README.md
-├── directory1/
-│   └── bees.csv
-└── directory2/
-    └── more_bees.csv
+├── data.csv
+└── holdout.csv
 ```
 
 You can define your splits by adding the `configs` field in the YAML block at the top of your README.md:
@@ -45,27 +43,23 @@ configs:
 - config_name: default
   data_files:
   - split: train
-    pattern: "directory1/*.csv"
+    path: "data.csv"
   - split: test
-    pattern: "directory2/*.csv"
+    path: "holdout.csv"
 ---
 ```
 
-<Tip warning={true}>
-Note that `config_name` field is required even if you have a single configuration.
-</Tip>
 
-Having several patterns per split is also supported:
+You can select multiple files per split using a list of paths:
 
 ```
 my_dataset_repository/
 ├── README.md
-├── directory1/
-│   └── bees.csv
-├── directory1bis/
-│   └── more_bees.csv
-└── directory2/
-    └── even_more_bees.csv
+├── data/
+│   ├── abc.csv
+│   └── def.csv
+└── holdout/
+    └── ghi.csv
 ```
 
 ```yaml
@@ -74,32 +68,34 @@ configs:
 - config_name: default
   data_files:
   - split: train
-    pattern:
-    - "directory1/*.csv"
-    - "directory1bis/*.csv"
+    path:
+    - "data/abc.csv"
+    - "data/def.csv"
   - split: test
-    pattern:
-    - "directory2/*.csv"
+    path: "holdout/ghi.csv"
 ---
 ```
 
-Custom split names are also supported:
+Or you can use glob patterns to automatically list all the files you need:
+
 ```yaml
+---
 configs:
 - config_name: default
   data_files:
-  - split: random
-    pattern:
-    - "directory1bis/*.csv"
   - split: train
-    pattern:
-    - "directory1/*.csv"
+    path: "data/*.csv"
   - split: test
-    pattern:
-    - "directory2/*.csv"
+    path: "holdout/*.csv"
 ---
 ```
 
+<Tip warning={true}>
+
+Note that `config_name` field is required even if you have a single configuration.
+
+</Tip>
+
 ## Configurations
 
 Your dataset might have several subsets of data that you want to be able to load separately. In that case you can define a list of configurations inside the `configs` field in YAML:
````
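Once a README.md declares splits with `path` entries as in the updated docs above, loading works as usual. A minimal sketch, assuming a hypothetical repository id:

```python
from datasets import load_dataset

# "user/my_dataset_repository" is a placeholder for a real Hub repository
# whose README.md declares train/test splits via "path" entries.
ds = load_dataset("user/my_dataset_repository")
print(ds)  # DatasetDict with "train" and "test" splits

train_ds = load_dataset("user/my_dataset_repository", split="train")
```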

src/datasets/arrow_dataset.py (4 additions & 4 deletions)

```diff
@@ -5516,7 +5516,7 @@ def push_to_hub(
             }
             default_metadata_configs_to_dump = {
                 "data_files": [
-                    {"split": _resolved_split, "pattern": f"data/{_resolved_split}-*"}
+                    {"split": _resolved_split, "path": f"data/{_resolved_split}-*"}
                     for _resolved_split in _resolved_splits
                 ]
             }
@@ -5529,18 +5529,18 @@ def push_to_hub(
             else:
                 data_files_to_dump = {}
             # add the new split
-            data_files_to_dump[split] = f"{data_dir}/{split}-*"
+            data_files_to_dump[split] = [f"{data_dir}/{split}-*"]
             metadata_config_to_dump = {
                 "data_files": [
                     {
                         "split": _split,
-                        "pattern": _pattern[0] if isinstance(_pattern, list) and len(_pattern) == 1 else _pattern,
+                        "path": _pattern[0] if len(_pattern) == 1 else _pattern,
                     }
                     for _split, _pattern in data_files_to_dump.items()
                 ]
             }
         else:
-            metadata_config_to_dump = {"data_files": [{"split": split, "pattern": f"{data_dir}/{split}-*"}]}
+            metadata_config_to_dump = {"data_files": [{"split": split, "path": f"{data_dir}/{split}-*"}]}
         # push to the deprecated dataset_infos.json
         if config.DATASETDICT_INFOS_FILENAME in repo_files:
             download_config = DownloadConfig()
```
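The user-visible effect of this `push_to_hub` change is the metadata written to the README. A sketch with a hypothetical repository id (requires being logged in to the Hub):

```python
from datasets import Dataset

ds = Dataset.from_dict({"text": ["a", "b", "c"]})

# After this commit, pushing a split records a "path" entry (not "pattern")
# in the README YAML, roughly:
#   configs:
#   - config_name: default
#     data_files:
#     - split: train
#       path: data/train-*
ds.push_to_hub("user/my_dataset")  # hypothetical repository id
```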

src/datasets/data_files.py (8 additions & 5 deletions)

```diff
@@ -102,17 +102,20 @@ def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[
     elif isinstance(patterns, list):
         if any(isinstance(pattern, dict) for pattern in patterns):
             for pattern in patterns:
-                if not isinstance(pattern, dict) or sorted(pattern) != ["pattern", "split"]:
+                if not (
+                    isinstance(pattern, dict)
+                    and len(pattern) == 2
+                    and "split" in pattern
+                    and isinstance(pattern.get("path"), (str, list))
+                ):
                     raise ValueError(
-                        f"Expected each pattern in a list of patterns to be a string or a list, but got {pattern}"
+                        f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}"
                     )
             splits = [pattern["split"] for pattern in patterns]
             if len(set(splits)) != len(splits):
                 raise ValueError(f"Some splits are duplicated in data_files: {splits}")
             return {
-                str(pattern["split"]): pattern["pattern"]
-                if isinstance(pattern["pattern"], list)
-                else [pattern["pattern"]]
+                str(pattern["split"]): pattern["path"] if isinstance(pattern["path"], list) else [pattern["path"]]
                 for pattern in patterns
             }
         else:
```
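Per the validation rewritten above, `sanitize_patterns` now accepts `split`/`path` dicts and normalizes each `path` to a list. A small sketch derived from the code in this hunk:

```python
from datasets.data_files import sanitize_patterns

data_files = [
    {"split": "train", "path": ["data/abc.csv", "data/def.csv"]},
    {"split": "test", "path": "holdout/ghi.csv"},  # a bare string is wrapped in a list
]
print(sanitize_patterns(data_files))
# {'train': ['data/abc.csv', 'data/def.csv'], 'test': ['holdout/ghi.csv']}

# A leftover "pattern" key (or any extra/missing key) now raises ValueError:
# sanitize_patterns([{"split": "train", "pattern": "data/*.csv"}])
```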

src/datasets/dataset_dict.py (2 additions & 2 deletions)

```diff
@@ -1660,7 +1660,7 @@ def push_to_hub(
         info_to_dump.size_in_bytes = total_uploaded_size + total_dataset_nbytes
 
         metadata_config_to_dump = {
-            "data_files": [{"split": split, "pattern": f"{data_dir}/{split}-*"} for split in self.keys()],
+            "data_files": [{"split": split, "path": f"{data_dir}/{split}-*"} for split in self.keys()],
         }
 
         api = HfApi(endpoint=config.HF_ENDPOINT)
@@ -1704,7 +1704,7 @@ def push_to_hub(
             }
             default_metadata_configs_to_dump = {
                 "data_files": [
-                    {"split": _resolved_split, "pattern": f"data/{_resolved_split}-*"}
+                    {"split": _resolved_split, "path": f"data/{_resolved_split}-*"}
                     for _resolved_split in _resolved_splits
                 ]
             }
```
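The `DatasetDict` side mirrors this: one `{"split": ..., "path": ...}` entry is dumped per split. A sketch with a hypothetical repository id, matching the expectations in the tests below:

```python
from datasets import Dataset, DatasetDict

dds = DatasetDict(
    {
        "train": Dataset.from_dict({"x": [1, 2]}),
        "random": Dataset.from_dict({"x": [3]}),
    }
)

# Expected README metadata, per the tests further down:
#   {"split": "train", "path": "data/train-*"}
#   {"split": "random", "path": "data/random-*"}
dds.push_to_hub("user/my_dataset")  # hypothetical repository id
```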

src/datasets/utils/metadata.py (19 additions & 10 deletions)

```diff
@@ -128,7 +128,7 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
     yaml_error_message = textwrap.dedent(
         f"""
         Expected data_files in YAML to be either a string or a list of strings
-        or a list of dicts with two keys: 'split' and 'pattern', but got {yaml_data_files}
+        or a list of dicts with two keys: 'split' and 'path', but got {yaml_data_files}
         Examples of data_files in YAML:
 
            data_files: data.csv
@@ -141,22 +141,31 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
 
            data_files:
            - split: train
-             pattern: train/*
+             path: train/*
            - split: test
-             pattern: test/*
+             path: test/*
+
+           data_files:
+           - split: train
+             path:
+             - train/part1/*
+             - train/part2/*
+           - split: test
+             path: test/*
         """
     )
     if not isinstance(yaml_data_files, (list, str)):
         raise ValueError(yaml_error_message)
     if isinstance(yaml_data_files, list):
         for yaml_data_files_item in yaml_data_files:
-            if not isinstance(yaml_data_files_item, str) and not (
-                isinstance(yaml_data_files_item, dict)
-                and sorted(yaml_data_files_item)
-                == [
-                    "pattern",
-                    "split",
-                ]
+            if (
+                not isinstance(yaml_data_files_item, (str, dict))
+                or isinstance(yaml_data_files_item, dict)
+                and not (
+                    len(yaml_data_files_item) == 2
+                    and "split" in yaml_data_files_item
+                    and isinstance(yaml_data_files_item.get("path"), (str, list))
+                )
             ):
                 raise ValueError(yaml_error_message)
```
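To make the new rules concrete, a sketch exercising `_raise_if_data_files_field_not_valid`. This is a private helper, and the sketch assumes it reads the `data_files` key of the dict it receives, as the hunk context suggests:

```python
from datasets.utils.metadata import _raise_if_data_files_field_not_valid

# Accepted: a string, a list of strings, or dicts with exactly "split" and "path"
_raise_if_data_files_field_not_valid({"data_files": "data.csv"})
_raise_if_data_files_field_not_valid(
    {"data_files": [{"split": "train", "path": ["train/part1/*", "train/part2/*"]}]}
)

# Rejected: the old "pattern" key no longer validates
try:
    _raise_if_data_files_field_not_valid({"data_files": [{"split": "train", "pattern": "train/*"}]})
except ValueError:
    print("the pre-commit 'pattern' key is rejected")
```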

tests/test_upstream_hub.py (9 additions & 9 deletions)

```diff
@@ -614,19 +614,19 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(self, temp
             {
                 "config_name": "config1",
                 "data_files": [
-                    {"split": "train", "pattern": "config1/train-*"},
+                    {"split": "train", "path": "config1/train-*"},
                 ],
             },
             {
                 "config_name": "config2",
                 "data_files": [
-                    {"split": "train", "pattern": "config2/train-*"},
+                    {"split": "train", "path": "config2/train-*"},
                 ],
             },
             {
                 "config_name": "default",
                 "data_files": [
-                    {"split": "train", "pattern": "data/train-*"},
+                    {"split": "train", "path": "data/train-*"},
                 ],
             },
         ]
@@ -743,22 +743,22 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(self,
             {
                 "config_name": "config1",
                 "data_files": [
-                    {"split": "train", "pattern": "config1/train-*"},
-                    {"split": "random", "pattern": "config1/random-*"},
+                    {"split": "train", "path": "config1/train-*"},
+                    {"split": "random", "path": "config1/random-*"},
                 ],
             },
             {
                 "config_name": "config2",
                 "data_files": [
-                    {"split": "train", "pattern": "config2/train-*"},
-                    {"split": "random", "pattern": "config2/random-*"},
+                    {"split": "train", "path": "config2/train-*"},
+                    {"split": "random", "path": "config2/random-*"},
                 ],
             },
             {
                 "config_name": "default",
                 "data_files": [
-                    {"split": "train", "pattern": "data/train-*"},
-                    {"split": "random", "pattern": "data/random-*"},
+                    {"split": "train", "path": "data/train-*"},
+                    {"split": "random", "path": "data/random-*"},
                 ],
             },
         ]
```
