
Commit a84b716

willmj and dushyantbehl authored
build(deps): upgrade trl (#540)
* deps: upgrade trl
* docs: offline data preprocessing note
* fix: None for tokenized to temp comply with trl

Signed-off-by: Will Johnson <[email protected]>
Co-authored-by: Dushyant Behl <[email protected]>
1 parent 0585642 commit a84b716

4 files changed: +11 additions, -38 deletions


docs/offline-data-preprocessing.md

Lines changed: 2 additions & 0 deletions

@@ -37,6 +37,8 @@ python scripts/offline_data_processing.py \

 Additionally, once the offline data processing is complete, users can leverage the shards stored in `output_dir` for tuning by passing it through the `--training_data_path` flag or passing it via `data_paths` argument in data config yaml, provided they find the sharded datasets beneficial for training.

+**NOTE**: The offline data preprocessing script is not compatible with processing image datasets for vision models.
+
 ## Example Usage

 ### Applying Chat Template
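As context for the doc above, a data config that consumes the preprocessed shards might look like the sketch below. The dataset name and path are placeholders, and the exact schema should be verified against the fms-hf-tuning data config documentation:

```yaml
# Illustrative data config (field values are hypothetical) pointing
# data_paths at shards produced by scripts/offline_data_processing.py.
datasets:
  - name: preprocessed_shards
    data_paths:
      - "/path/to/output_dir"
```

Alternatively, the same directory can be passed directly on the command line via `--training_data_path /path/to/output_dir`.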

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ dependencies = [
     "sentencepiece>=0.1.99,<0.3",
     "tokenizers>=0.13.3,<1.0",
     "tqdm>=4.66.2,<5.0",
-    "trl>=0.13,<0.17",
+    "trl>=0.13,<0.18",
     "peft>=0.8.0,<0.14",
     "protobuf>=5.28.0,<6.0.0",
     "datasets>=2.15.0,<4.0",

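The only change above widens the trl ceiling from <0.17 to <0.18. As a quick sanity check (not part of this commit, and assuming the `packaging` library is installed), the installed version can be validated against the new range:

```python
# Illustrative check that the installed trl satisfies ">=0.13,<0.18".
from importlib.metadata import version
from packaging.version import Version

trl_version = Version(version("trl"))
assert Version("0.13") <= trl_version < Version("0.18"), (
    f"trl {trl_version} is outside the supported range >=0.13,<0.18"
)
```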
tests/data/test_data_preprocessing.py

Lines changed: 0 additions & 34 deletions

@@ -562,23 +562,6 @@ def test_is_pretokenized_data(data, result):
             False,
             DataCollatorForCompletionOnlyLM,
         ),
-        (
-            False,
-            None,
-            Dataset.from_list(
-                [
-                    {
-                        "input_ids": [9437, 29, 210],
-                        "attention_mask": [1, 1, 1],
-                        "labels": [1, 20, 30],
-                    }
-                ]
-            ),
-            1024,
-            None,
-            False,
-            DataCollatorForSeq2Seq,
-        ),
         (
             False,
             "\n### Label:",
@@ -592,23 +575,6 @@ def test_is_pretokenized_data(data, result):
             False,
             DataCollatorForCompletionOnlyLM,
         ),
-        (
-            False,
-            None,
-            Dataset.from_list(
-                [
-                    {
-                        "input_ids": [9437, 29, 210],
-                        "attention_mask": [1, 1, 1],
-                        "labels": [1, 20, 30],
-                    }
-                ]
-            ),
-            1024,
-            "\n### Text:",
-            False,
-            DataCollatorForSeq2Seq,
-        ),
         (
             False,
             None,

tuning/data/data_preprocessing_utils.py

Lines changed: 8 additions & 3 deletions

@@ -94,9 +94,14 @@ def get_data_collator(
     if is_traindata_tokenized:
         # Note that this automatically pads labels with -100
         # TODO check if this is sufficient for preprocessed
-        return DataCollatorForSeq2Seq(
-            tokenizer=tokenizer, padding=True, max_length=max_seq_length
-        )
+        # TODO with the release of trl v0.17.0, DataCollatorForSeq2Seq
+        # was removed from tokenized data processing, should eventually
+        # be added back in with support directly in fms-hf-tuning, not
+        # dependent on trl.
+        # return DataCollatorForSeq2Seq(
+        #     tokenizer=tokenizer, padding=True, max_length=max_seq_length
+        # )
+        return None

     # TODO: near term - how response template ids are parsed out needs to be cleaned.
     # The [2:] here applies if response template has \n prefix, it is needed to strip \n,
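With this change, `get_data_collator` returns None for pretokenized data. For callers that still need explicit seq2seq-style collation, one caller-side workaround is to build the collator directly from transformers, independent of trl. A minimal sketch; the `resolve_collator` helper is hypothetical and not part of this commit:

```python
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

def resolve_collator(collator, tokenizer, max_seq_length=1024):
    """Hypothetical caller-side fallback: if get_data_collator() yields
    None for pretokenized data, construct the collator directly so the
    behavior no longer depends on trl's removed tokenized-data path."""
    if collator is not None:
        return collator
    # Like the commented-out call above, this pads inputs and fills
    # label padding with -100 (DataCollatorForSeq2Seq's default).
    return DataCollatorForSeq2Seq(
        tokenizer=tokenizer, padding=True, max_length=max_seq_length
    )

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works here
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default
collator = resolve_collator(None, tokenizer)
```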

Comments (0)