ServiceNow · jlamypoirier · Mar 28, 2025 · Mar 7, 2025 · Mar 19, 2025 · Mar 19, 2025
diff --git a/docs/quick-start.md b/docs/quick-start.md
@@ -492,9 +492,10 @@ Save the following as `fast-llm-tutorial/train-config.yaml`:
       train_iters: 100  # (1)!
       logs:
         interval: 10
-      validation:
-        iterations: 25
-        interval: 100
+      evaluations:
+        validation:
+          iterations: 25
+          interval: 100
       export:  # (2)!
         format: llama
         interval: 100
@@ -508,10 +509,10 @@ Save the following as `fast-llm-tutorial/train-config.yaml`:
       batch_size: 480  # (5)!
     data:
       datasets:
-        Training:
+        training:
           type: file
           path: fast-llm-tutorial/dataset/fast_llm_config_training.yaml  # (6)!
-        Validation:
+        validation:
           type: file
           path: fast-llm-tutorial/dataset/fast_llm_config_validation.yaml  # (6)!
     optimizer:
@@ -549,9 +550,10 @@ Save the following as `fast-llm-tutorial/train-config.yaml`:
       train_iters: 100_000  # (1)!
       logs:
         interval: 10
-      validation:
-        iterations: 25
-        interval: 1000
+      evaluations:
+        validation:
+          iterations: 25
+          interval: 1000
       checkpoint:
         interval: 1000
         keep: 5
@@ -569,10 +571,10 @@ Save the following as `fast-llm-tutorial/train-config.yaml`:
       batch_size: 512  # (5)!
     data:
       datasets:
-        Training:
+        training:
           type: file
           path: fast-llm-tutorial/dataset/fast_llm_config_training.yaml  # (6)!
-        Validation:
+        validation:
           type: file
           path: fast-llm-tutorial/dataset/fast_llm_config_validation.yaml  # (6)!
     optimizer:  # (7)!

diff --git a/docs/recipes/continue-training.md b/docs/recipes/continue-training.md
@@ -33,9 +33,10 @@ This is not much different from a pretraining config. We will:
       train_iters: 100_000
       logs:
         interval: 10
-      validation:
-        iterations: 25
-        interval: 1000
+      evaluations:
+        validation:
+          iterations: 25
+          interval: 1000
       checkpoint:
         interval: 1000
         keep: 5
@@ -48,9 +49,13 @@ This is not much different from a pretraining config. We will:
       sequence_length: 4096
       batch_size: 256
     data:
-      format: file
-      path: fast-llm-tutorial/dataset.json  # (2)!
-      split: [99, 1, 0]  
+      datasets:
+        training:
+          type: file
+          path: fast-llm-tutorial/dataset/fast_llm_config_training.yaml  # (2)!
+        validation:
+          type: file
+          path: fast-llm-tutorial/dataset/fast_llm_config_validation.yaml  # (2)!
     optimizer:  
       weight_decay: 0.1
       beta_1: 0.9
@@ -84,8 +89,9 @@ This is not much different from a pretraining config. We will:
       logs:
         interval: 10
-      validation:
-        iterations: 25
-        interval: 1000
+      evaluations:
+        validation:
+          iterations: 25
+          interval: 1000
       checkpoint:
         interval: 1000
         keep: 5
@@ -98,9 +104,13 @@ This is not much different from a pretraining config. We will:
       sequence_length: 8192
       batch_size: 256
     data:
-      format: file
-      path: fast-llm-tutorial/dataset.json  # (2)!
-      split: [99, 1, 0]  
+      datasets:
+        training:
+          type: file
+          path: fast-llm-tutorial/dataset/fast_llm_config_training.yaml  # (2)!
+        validation:
+          type: file
+          path: fast-llm-tutorial/dataset/fast_llm_config_validation.yaml  # (2)!
     optimizer:  
       weight_decay: 0.1
       beta_1: 0.9
@@ -129,7 +139,7 @@ This is not much different from a pretraining config. We will:
     ```
 
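-1.  A the model will be saved in Hugging Face format to `~/results` directory every 20,000 iterations.
+1.  The model will be saved in Hugging Face format to the `~/results` directory every 20,000 iterations.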
-2.  Location of the dataset metadata file generated in Step 4.
+2.  Location of the dataset metadata file generated in Step 4 of the quick-start guide.
 3.  The learning-rate can be used to trade-off between learning and forgetting. A higher learning-rate will learn quickly on our new dataset but will cause forgetting. A lower learning-rate will instead retain more of the pretrained model's knowledge, but will slow down adapting to the new domain.
 4.  Config of the pretrained model. We load the model downloaded from the repository earlier.
 5.  This tells Fast-LLM to load the weights of the pretrained model. If we wanted to use the model's configuration, but train from scratch, we could use the same config but set this to `no`.

diff --git a/docs/recipes/data-configuration.md b/docs/recipes/data-configuration.md
@@ -13,10 +13,10 @@ We already saw an example dataset configuration in the [quick-start guide](../qu
     ```yaml
     data:
       datasets:
-        Training:
+        training:
           type: file
           path: fast-llm-tutorial/dataset/fast_llm_config_training.yaml
-        Validation:
+        validation:
           type: file
           path: fast-llm-tutorial/dataset/fast_llm_config_validation.yaml
     ```
@@ -25,14 +25,24 @@ We already saw an example dataset configuration in the [quick-start guide](../qu
 
 In this section we are interested in generalizing step 3. For more details on steps 1 and 2, please refer to the quick-start guide or [this example](data-configuration.md).
 
+The section `data.datasets` holds descriptions of datasets used in training, validation, and testing.  
+
+The Training and Testing phases must have predetermined dataset names: `training` and `testing`, respectively. Each of these phases can have only one dataset.  
+
+For validation datasets, the rules are different. There can be as many validation datasets as needed, and their names are arbitrary. In the example above, the dataset name `validation` is chosen for simplicity. The dataset names used for validation, and the details of how each one is applied, are specified in the `training.evaluations` section of the training config.
+
+Adding multiple validation datasets increases flexibility in tracking the accuracy of your trained model. One possible scenario is using a separate validation dataset for each blended training dataset, allowing you to track training progress on each subset separately and observe how the model performs in real time on different subsets of your training data.  
+
+Below are examples of how to configure various aspects of training and validation datasets.
+
 ## Example 1: Blending multiple datasets
 
 In this example, we have three datasets and want to sample from each of them during training with probabilities 0.70, 0.25 and 0.05. For this, we use the `blended` type which takes other datasets as arguments:
 
 ```yaml
 data:
   datasets:
-    Training:
+    training:
       type: blended
       datasets:
         - type: file
@@ -54,7 +64,7 @@ In this example, we have a large dataset that comes pre-shuffled, so shuffling i
 ```yaml
 data:
   datasets:
-    Training:
+    training:
       type: file
       path: path/to/dataset.yaml
   sampling:
@@ -68,10 +78,10 @@ In this example, we want to disable shuffling entirely, but only for the validat
 ```yaml
 data:
   datasets:
-    Training:
+    training:
       type: file
       path: path/to/training_dataset.yaml
-    Validation:
+    validation:
       type: sampled
       dataset:
         type: file
@@ -91,7 +101,7 @@ In this example, we have a blend of datasets as in example 1, but we wish to set
 ```yaml
 data:
   datasets:
-    Training:
+    training:
       type: blended
       datasets:
         - type: sampled
@@ -118,7 +128,34 @@ data:
 !!! note "Default seed"
     In the absence of explicit seed, Fast-LLM uses a default seed (`data.sampling`'s default) instead, and uses seed shifts to ensure different seeds for each phase and for the various blended datasets.
 
-## Example 5: Advanced scenario
+
+## Example 5: Specifying Multiple Validation Datasets  
+
+In this example, we show how to specify multiple validation datasets and configure how often they are applied, along with their usage attributes in the `training.evaluations` section.  
+
+Please note that the same dataset names must be used in the `training.evaluations` section. If a validation dataset is specified in the `datasets` section but not in `training.evaluations`, it will not be used for validation.  
+
+```yaml
+training:
+  evaluations:
+    the_stack:
+      iterations: 25
+      interval: 50
+    fineweb:
+      iterations: 25
+      interval: 100
+data:
+  datasets:
+    the_stack:
+      type: file
+      path: path/to/validation_the_stack_dataset.yaml
+    fineweb:
+      type: file
+      path: path/to/validation_fineweb_dataset.yaml
+
+```
+
+## Example 6: Advanced scenario
 
 In this example, we combine everything we learned so far to create a complex scenario, where:
 
@@ -129,7 +166,7 @@ In this example, we combine everything we learned so far to create a complex sce
 ```yaml
 data:
   datasets:
-    Training:
+    training:
       type: blended
       datasets:
         - type: sampled
@@ -156,7 +193,7 @@ data:
           # Seed = default + train_shift + 2 * blend_shift, shuffle = skip_first_epoch
           path: path/to/dataset_3.yaml
       weights: [0.70, 0.25, 0.05]
-    Validation:
+    validation:
         type: sampled
         dataset:
           type: file
@@ -174,10 +211,10 @@ data:
     ```yaml
     data:
       datasets:
-        Training:
+        training:
           type: file
           path: path/to/training_dataset_config.yaml
-        Validation:
+        validation:
           type: file
           path: path/to/validation_dataset_config.yaml
       sampling:

diff --git a/docs/recipes/instruction-finetuning.md b/docs/recipes/instruction-finetuning.md
@@ -114,9 +114,10 @@ training:
   train_iters: 5_000
   logs:
     interval: 1
-  validation:
-    iterations: 25
-    interval: 1000
+  evaluations:
+    validation:
+      iterations: 25
+      interval: 1000
   checkpoint:
     interval: 1000
     keep: 5
@@ -131,10 +132,10 @@ batch:
   cross_document_attention: no # (1)!
 data:
   datasets:
-    Training:
+    training:
       type: file
       path: ./sft-tutorial/tokenized/Llama-3.1-8B/fast_llm_config_training.yaml
-    Validation:
+    validation:
       type: file
       path: ./sft-tutorial/tokenized/Llama-3.1-8B/fast_llm_config_validation.yaml
   truncate_documents: no # (2)!

diff --git a/docs/recipes/train.md b/docs/recipes/train.md
@@ -19,9 +19,10 @@ Let's start from the following training configuration:
       train_iters: 100_000
       logs:
         interval: 10
-      validation:
-        iterations: 25
-        interval: 1000
+      evaluations:
+        validation:
+          iterations: 25
+          interval: 1000
       checkpoint:
         interval: 1000
         keep: 5
@@ -34,9 +35,13 @@ Let's start from the following training configuration:
       sequence_length: 4096
       batch_size: 256
     data:
-      format: file
-      path: fast-llm-tutorial/dataset/fast_llm_dataset.json
-      split: [99, 1, 0]
+      datasets:
+        training:
+          type: file
+          path: path/to/training_dataset_config.yaml
+        validation:
+          type: file
+          path: path/to/validation_dataset_config.yaml
     optimizer:
       weight_decay: 0.1
       beta_1: 0.9
@@ -63,9 +68,10 @@ Let's start from the following training configuration:
       train_iters: 100_000
       logs:
         interval: 10
-      validation:
-        iterations: 25
-        interval: 1000
+      evaluations:
+        validation:
+          iterations: 25
+          interval: 1000
       checkpoint:
         interval: 1000
         keep: 5
@@ -78,9 +84,13 @@ Let's start from the following training configuration:
       sequence_length: 8192
       batch_size: 256
     data:
-      format: file
-      path: fast-llm-tutorial/dataset/fast_llm_dataset.json
-      split: [99, 1, 0]
+      datasets:
+        training:
+          type: file
+          path: path/to/training_dataset_config.yaml
+        validation:
+          type: file
+          path: path/to/validation_dataset_config.yaml
     optimizer:
       weight_decay: 0.1
       beta_1: 0.9

diff --git a/examples/mistral.yaml b/examples/mistral.yaml
@@ -3,16 +3,17 @@ training:
   num_workers: 8
   logs:
     interval: 10
-  validation:
-    iterations: null
+  evaluations:
+    validation:
+      iterations: null
   test_iters: 0
 batch:
   sequence_length: 4096
   micro_batch_size: 2
   batch_size: 64
 data:
   datasets:
-    Training:
+    training:
       type: random
 optimizer:
   learning_rate: