
Commit a84b716

willmj and dushyantbehl authored
build(deps): upgrade trl (#540)
* deps: upgrade trl
* docs: offline data preprocessing note
* fix: None for tokenized to temp comply with trl

Signed-off-by: Will Johnson <[email protected]>
Co-authored-by: Dushyant Behl <[email protected]>
1 parent 0585642 commit a84b716

4 files changed: +11 additions, -38 deletions


docs/offline-data-preprocessing.md

Lines changed: 2 additions & 0 deletions

@@ -37,6 +37,8 @@ python scripts/offline_data_processing.py \

 Additionally, once the offline data processing is complete, users can leverage the shards stored in `output_dir` for tuning by passing it through the `--training_data_path` flag or passing it via `data_paths` argument in data config yaml, provided they find the sharded datasets beneficial for training.

+**NOTE**: The offline data preprocessing script is not compatible with processing image datasets for vision models.
+
 ## Example Usage

 ### Applying Chat Template
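As context for the doc above, a data config that consumes the preprocessed shards might look like the sketch below. The dataset name and path are placeholders, and the exact schema should be verified against the fms-hf-tuning data config documentation:

```yaml
# Illustrative data config (field values are hypothetical) pointing
# data_paths at shards produced by scripts/offline_data_processing.py.
datasets:
  - name: preprocessed_shards
    data_paths:
      - "/path/to/output_dir"
```

Alternatively, the same directory can be passed directly on the command line via `--training_data_path /path/to/output_dir`.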

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ dependencies = [
     "sentencepiece>=0.1.99,<0.3",
     "tokenizers>=0.13.3,<1.0",
     "tqdm>=4.66.2,<5.0",
-    "trl>=0.13,<0.17",
+    "trl>=0.13,<0.18",
     "peft>=0.8.0,<0.14",
     "protobuf>=5.28.0,<6.0.0",
     "datasets>=2.15.0,<4.0",

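The only change above widens the trl ceiling from <0.17 to <0.18. As a quick sanity check (not part of this commit, and assuming the `packaging` library is installed), the installed version can be validated against the new range:

```python
# Illustrative check that the installed trl satisfies ">=0.13,<0.18".
from importlib.metadata import version
from packaging.version import Version

trl_version = Version(version("trl"))
assert Version("0.13") <= trl_version < Version("0.18"), (
    f"trl {trl_version} is outside the supported range >=0.13,<0.18"
)
```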
tests/data/test_data_preprocessing.py

Lines changed: 0 additions & 34 deletions

@@ -562,23 +562,6 @@ def test_is_pretokenized_data(data, result):
             False,
             DataCollatorForCompletionOnlyLM,
         ),
-        (
-            False,
-            None,
-            Dataset.from_list(
-                [
-                    {
-                        "input_ids": [9437, 29, 210],
-                        "attention_mask": [1, 1, 1],
-                        "labels": [1, 20, 30],
-                    }
-                ]
-            ),
-            1024,
-            None,
-            False,
-            DataCollatorForSeq2Seq,
-        ),
         (
             False,
             "\n### Label:",
@@ -592,23 +575,6 @@ def test_is_pretokenized_data(data, result):
             False,
             DataCollatorForCompletionOnlyLM,
         ),
-        (
-            False,
-            None,
-            Dataset.from_list(
-                [
-                    {
-                        "input_ids": [9437, 29, 210],
-                        "attention_mask": [1, 1, 1],
-                        "labels": [1, 20, 30],
-                    }
-                ]
-            ),
-            1024,
-            "\n### Text:",
-            False,
-            DataCollatorForSeq2Seq,
-        ),
         (
             False,
             None,

tuning/data/data_preprocessing_utils.py

Lines changed: 8 additions & 3 deletions

@@ -94,9 +94,14 @@ def get_data_collator(
     if is_traindata_tokenized:
         # Note that this automatically pads labels with -100
         # TODO check if this is sufficient for preprocessed
-        return DataCollatorForSeq2Seq(
-            tokenizer=tokenizer, padding=True, max_length=max_seq_length
-        )
+        # TODO with the release of trl v0.17.0, DataCollatorForSeq2Seq
+        # was removed from tokenized data processing, should eventually
+        # be added back in with support directly in fms-hf-tuning, not
+        # dependent on trl.
+        # return DataCollatorForSeq2Seq(
+        #     tokenizer=tokenizer, padding=True, max_length=max_seq_length
+        # )
+        return None

     # TODO: near term - how response template ids are parsed out needs to be cleaned.
     # The [2:] here applies if response template has \n prefix, it is needed to strip \n,
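With this change, `get_data_collator` returns None for pretokenized data. For callers that still need explicit seq2seq-style collation, one caller-side workaround is to build the collator directly from transformers, independent of trl. A minimal sketch; the `resolve_collator` helper is hypothetical and not part of this commit:

```python
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

def resolve_collator(collator, tokenizer, max_seq_length=1024):
    """Hypothetical caller-side fallback: if get_data_collator() yields
    None for pretokenized data, construct the collator directly so the
    behavior no longer depends on trl's removed tokenized-data path."""
    if collator is not None:
        return collator
    # Like the commented-out call above, this pads inputs and fills
    # label padding with -100 (DataCollatorForSeq2Seq's default).
    return DataCollatorForSeq2Seq(
        tokenizer=tokenizer, padding=True, max_length=max_seq_length
    )

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works here
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default
collator = resolve_collator(None, tokenizer)
```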

Comments (0)