From 146b25a020aaa60adc2f723ea3890aef84de5084 Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Wed, 11 Dec 2024 16:28:48 -0500
Subject: [PATCH] Split `generate_data` into multiple discrete steps

This doesn't move things out into separate files yet, but it does
split the existing functionality of `generate_data` into multiple
discrete steps and changes `generate_data` to just call those steps.

This is a step towards cleaner separation between the steps and
towards creating top-level Python APIs for each discrete step, for
advanced use cases that don't just want to run the entire generation
pipeline as a single step.

Signed-off-by: Ben Browning
---
 src/instructlab/sdg/__init__.py               |   4 +-
 ...y_to_samples.py => preprocess_taxonomy.py} |   6 +-
 src/instructlab/sdg/datamixing.py             |  23 +-
 src/instructlab/sdg/generate_data.py          | 428 +++++++++++-------
 src/instructlab/sdg/pipeline.py               |   5 +-
 src/instructlab/sdg/taxonomy.py               |  81 +++-
 src/instructlab/sdg/utils/taxonomy.py         |   4 +
 tests/conftest.py                             |   2 +-
 8 files changed, 367 insertions(+), 186 deletions(-)
 rename src/instructlab/sdg/cli/{taxonomy_to_samples.py => preprocess_taxonomy.py} (94%)

diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py
index a3576662..ccef90d8 100644
--- a/src/instructlab/sdg/__init__.py
+++ b/src/instructlab/sdg/__init__.py
@@ -29,7 +29,7 @@
     "FULL_PIPELINES_PACKAGE",
     "SIMPLE_PIPELINES_PACKAGE",
     "generate_data",
-    "taxonomy_to_samples",
+    "preprocess_taxonomy",
 )
 
 # Local
@@ -62,6 +62,6 @@
     PipelineContext,
 )
 from .registry import BlockRegistry, PromptRegistry
-from .taxonomy import taxonomy_to_samples
+from .taxonomy import preprocess_taxonomy
 from .utils import GenerateException
 from .utils.taxonomy import TaxonomyReadingException
diff --git a/src/instructlab/sdg/cli/taxonomy_to_samples.py b/src/instructlab/sdg/cli/preprocess_taxonomy.py
similarity index 94%
rename from src/instructlab/sdg/cli/taxonomy_to_samples.py
rename to src/instructlab/sdg/cli/preprocess_taxonomy.py
index 112f764b..b7d22532 100644
--- a/src/instructlab/sdg/cli/taxonomy_to_samples.py
+++ b/src/instructlab/sdg/cli/preprocess_taxonomy.py
@@ -8,7 +8,7 @@
     DEFAULT_CHUNK_WORD_COUNT,
     DEFAULT_SERVER_CTX_SIZE,
     DEFAULT_TAXONOMY_BASE,
-    taxonomy_to_samples,
+    preprocess_taxonomy,
 )
 from instructlab.sdg.utils.logging import setup_logger
 
@@ -68,7 +68,7 @@
     args = parser.parse_args()
     setup_logger(args.log_level)
 
-    taxonomy_to_samples(
+    preprocess_taxonomy(
         args.taxonomy_path,
         args.output_dir,
         chunk_word_count=args.chunk_word_count,
@@ -78,5 +78,5 @@
 )
 
 """
-python -m instructlab.sdg.cli.taxonomy_to_samples --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output
+python -m instructlab.sdg.cli.preprocess_taxonomy --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output
"""
diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py
index e6ca8675..de31e136 100644
--- a/src/instructlab/sdg/datamixing.py
+++ b/src/instructlab/sdg/datamixing.py
@@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
         Create the final mixed dataset by loading, sampling, and
         concatenating all datasets in this recipe
         """
-        if not self.dataset_added:
+        if not self.datasets:
             logger.error("No dataset added to the recipe")
 
         mixed_ds = self._load_and_sample_datasets(num_proc)
@@ -726,19 +726,36 @@ def collect(
                 sampling_size=self.NUM_SYNTH_SKILLS,
             )
 
+    def _write_mixed_recipe(self, recipe, output_file_recipe):
+        """
+        Write the recipe created during data mixing without writing the
+        actual mixed dataset to disk.
+ """ + full_recipe_path = os.path.join(self.output_dir, output_file_recipe) + recipe.save_recipe(full_recipe_path) + def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data): """ Mix the generated leaf node data into a single dataset and write it to disk. The heavy lifting is delegated to the Recipe class. """ + self._write_mixed_recipe(recipe, output_file_recipe) if recipe.dataset_added: - full_recipe_path = os.path.join(self.output_dir, output_file_recipe) - recipe.save_recipe(full_recipe_path) recipe.save_mixed_dataset( os.path.join(self.output_dir, output_file_data), self.num_procs, ) + def write_recipes(self): + self._write_mixed_recipe( + self.knowledge_recipe, + self.output_file_knowledge_recipe, + ) + self._write_mixed_recipe( + self.skills_recipe, + self.output_file_skills_recipe, + ) + def generate(self): self._gen_mixed_data( self.knowledge_recipe, diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 31643457..f0593dea 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -5,10 +5,11 @@ from importlib import resources from pathlib import Path from typing import Optional -import dataclasses +import glob import json import logging import os +import shutil import time # Third Party @@ -19,7 +20,7 @@ # First Party from instructlab.sdg.blocks.llmblock import DEFAULT_MAX_NUM_TOKENS -from instructlab.sdg.datamixing import DataMixer, _get_question_hack, _get_response_hack +from instructlab.sdg.datamixing import DataMixer, Recipe, _get_question_hack, _get_response_hack from instructlab.sdg.eval_data import generate_eval_task_data, mmlubench_pipe_init from instructlab.sdg.pipeline import ( FULL_PIPELINES_PACKAGE, @@ -27,8 +28,9 @@ Pipeline, PipelineContext, ) -from instructlab.sdg.taxonomy import taxonomy_to_samples +from instructlab.sdg.taxonomy import preprocess_taxonomy from instructlab.sdg.utils import GenerateException, models +from instructlab.sdg.utils.taxonomy import _unescape from instructlab.sdg.utils.json import jldump, jlload logger = logging.getLogger(__name__) @@ -36,10 +38,6 @@ _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant." -def _unescape(s): - return bytes(s, "utf-8").decode("utf-8").strip() - - def _convert_to_messages(sample): """ Convert a sample dictionary to contain 'messages' and 'metadata' columns required for training. @@ -110,56 +108,6 @@ def _gen_train_data( jldump(messages_data, output_file_messages) -def _knowledge_seed_example_to_test_data(seed_example, system_prompt): - res = [] - for i in range(3): - idx = i + 1 - user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"] - res.append( - { - "system": system_prompt, - "user": _unescape(user), - "assistant": _unescape(seed_example[f"icl_response_{idx}"]), - } - ) - return res - - -def _gen_test_data( - seed_examples, - output_file_test, - system_prompt, -): - """ - Generate test data in the format needed by the legacy Linux training - in instructlab/instructlab. 
- """ - test_data = [] - for seed_example in seed_examples: - if "icl_query_1" in seed_example: - test_data.extend( - _knowledge_seed_example_to_test_data(seed_example, system_prompt) - ) - continue - - # skill seed example - - user = seed_example["seed_question"] # question - - if seed_example["leaf_node_type"] == "grounded_skill": - user += "\n" + seed_example["seed_context"] # context - - test_data.append( - { - "system": system_prompt, - "user": _unescape(user), - "assistant": _unescape(seed_example["seed_response"]), # answer - } - ) - - jldump(test_data, output_file_test) - - def _check_pipeline_dir(pipeline): for file in ["knowledge.yaml", "freeform_skills.yaml", "grounded_skills.yaml"]: if not os.path.exists(os.path.join(pipeline, file)): @@ -240,7 +188,7 @@ def load_pipeline(yaml_basename): def _mixer_init( - ctx, + num_procs, output_dir, date_suffix, knowledge_auxiliary_inst, @@ -254,95 +202,36 @@ def _mixer_init( output_dir, date_suffix, system_prompt, - ctx.dataset_num_procs, + num_procs, knowledge_auxiliary_inst, ) -# This is part of the public API, and used by instructlab. -# TODO - parameter removal needs to be done in sync with a CLI change. -# to be removed: logger -def generate_data( +def _extract_leaf_node_path_and_type(sample): + leaf_node_path = sample.get("leaf_node_path", "unknown") + leaf_node_type = sample.get("leaf_node_type") + return leaf_node_path, leaf_node_type + + +def generate_taxonomy( client: openai.OpenAI, + input_dir: str, + output_dir: str, logger: logging.Logger = logger, # pylint: disable=redefined-outer-name - system_prompt: Optional[str] = None, - use_legacy_pretraining_format: Optional[bool] = True, model_family: Optional[str] = None, - model_name: Optional[str] = None, + model_id: Optional[str] = None, num_cpus: Optional[int] = None, num_instructions_to_generate: Optional[int] = 30, - taxonomy: Optional[str] = None, # TODO rename to taxonomy_path to match config - taxonomy_base: Optional[str] = None, - output_dir: Optional[str] = None, console_output=True, - yaml_rules: Optional[str] = None, - chunk_word_count=None, - server_ctx_size=None, pipeline: Optional[str] = "simple", batch_size: Optional[int] = None, checkpoint_dir: Optional[str] = None, max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS, -) -> None: - """Generate data for training and testing a model. - - This currently serves as the primary interface from the `ilab` CLI to the `sdg` library. - It is somewhat a transitionary measure, as this function existed back when all of the - functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to - use the SDG library constructs directly, and this function will likely be removed. - - Args: - pipeline: This argument may be either an alias defined in a user or site "data directory" - or an alias defined by the sdg library ("simple", "full")(if the data directory has no matches), - or an absolute path to a directory containing the pipeline YAML files. - We expect three files to be present in this directory: "knowledge.yaml", - "freeform_skills.yaml", and "grounded_skills.yaml". 
- """ - generate_start = time.time() - - system_prompt = system_prompt if system_prompt is not None else _SYS_PROMPT - - # FIXME: remove this when ilab knows to pass batch_size=0 with llama.cpp - if batch_size is None: - batch_size = 0 - - output_dir = Path(output_dir) - output_dir.mkdir(exist_ok=True) - date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_") - preprocessed_output_dir = output_dir.joinpath(f"preprocessed_{date_suffix}") - - # This writes samples to disk in our output_dir and returns the - # list of files created - sample_files = taxonomy_to_samples( - taxonomy, - preprocessed_output_dir, - chunk_word_count=chunk_word_count, - server_ctx_size=server_ctx_size, - taxonomy_base=taxonomy_base, - yaml_rules=yaml_rules, - ) - - name = Path(model_name).stem # Just in case it is a file path - output_file_messages = f"messages_{name}_{date_suffix}.jsonl" - output_file_test = f"test_{name}_{date_suffix}.jsonl" - output_file_train = f"train_{name}_{date_suffix}.jsonl" - - all_samples = [] - for sample_file in sample_files: - all_samples.extend(jlload(sample_file)) - _gen_test_data( - all_samples, - os.path.join(output_dir, output_file_test), - system_prompt, - ) - - logger.debug(f"Generating to: {os.path.join(output_dir, output_file_test)}") - - model_family = models.get_model_family(model_family, model_name) - +): ctx = _context_init( client, model_family, - model_name, + model_id, num_instructions_to_generate, checkpoint_dir, 1, # save_freq @@ -355,89 +244,290 @@ def generate_data( ctx, pipeline ) - # Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline) - mmlu_ctx = dataclasses.replace(ctx, checkpoint_dir=None) - mmlu_bench_pipe = mmlubench_pipe_init(mmlu_ctx) - - mixer = _mixer_init( - ctx, - output_dir, - date_suffix, - knowledge_pipe.auxiliary_inst, - system_prompt, - ) - if console_output: logger.info( "Synthesizing new instructions. If you aren't satisfied with the generated instructions, interrupt training (Ctrl-C) and try adjusting your YAML files. Adding more examples may help." 
         )
 
-    generated_data = []
-    empty_input_sample_files = []
-    for sample_file in sample_files:
-        logger.debug("Generating data from input sample file: %s", sample_file)
-        samples = jlload(sample_file)
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    empty_input_files = []
+    for input_file in input_files:
+        logger.debug("Generating data from input file: %s", input_file)
+        samples = jlload(input_file)
         if not samples:
             raise GenerateException(
-                "Error: No samples found in input file {sample_file}"
+                f"Error: No samples found in input file {input_file}"
             )
 
         # For now we assume every sample in the file is the same type
         first_sample = samples[0]
-        leaf_node_path = first_sample["leaf_node_path"]
-        leaf_node_type = first_sample["leaf_node_type"]
-        is_knowledge = False
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
         if leaf_node_type == "knowledge":
             pipe = knowledge_pipe
-            is_knowledge = True
         elif leaf_node_type == "grounded_skill":
             pipe = grounded_skills_pipe
         else:
             pipe = freeform_skills_pipe
 
         samples_ds = Dataset.from_list(samples)
-        logger.debug("Samples: %s", samples_ds)
+        logger.debug("Generating from samples: %s", samples_ds)
         new_generated_data = pipe.generate(samples_ds, leaf_node_path)
         if len(new_generated_data) == 0:
-            empty_input_sample_files.append(sample_file)
-            logger.warning("Empty generated dataset for sample file: %s", sample_file)
+            empty_input_files.append(input_file)
+            logger.warning("Empty generated dataset for input file: %s", input_file)
             continue
-        generated_data.append(new_generated_data)
-    logger.info("Generated %d samples", len(generated_data))
-    logger.debug("Generated data: %s", generated_data)
+        output_file = os.path.join(output_dir, os.path.basename(input_file))
+        jldump(new_generated_data, output_file)
+        logger.info("Generated %d samples", len(new_generated_data))
+        logger.debug("Generated data: %s", new_generated_data)
+
+    if len(empty_input_files) > 0:
+        logger.warning(
+            "Input sample files with empty sdg output: {}".format(
+                " ".join(empty_input_files)
+            )
+        )
+
+def generate_taxonomy_eval(
+    client: openai.OpenAI,
+    input_dir: str,
+    output_dir: str,
+    date_suffix: str,
+    model_family: Optional[str] = None,
+    model_id: Optional[str] = None,
+    num_cpus: Optional[int] = None,
+    num_instructions_to_generate: Optional[int] = 30,
+    batch_size: Optional[int] = None,
+    max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
+):
+    ctx = _context_init(
+        client,
+        model_family,
+        model_id,
+        num_instructions_to_generate,
+        None,  # disable checkpoints for eval pipeline
+        1,  # save_freq
+        batch_size=batch_size,
+        batch_num_workers=num_cpus,
+        max_num_tokens=max_num_tokens,
+    )
+    mmlu_bench_pipe = mmlubench_pipe_init(ctx)
+
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    for input_file in input_files:
+        logger.debug("Generating eval data from input file: %s", input_file)
+        samples = jlload(input_file)
+        if not samples:
+            raise GenerateException(
+                f"Error: No samples found in input file {input_file}"
+            )
+        samples_ds = Dataset.from_list(samples)
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
+        is_knowledge = False
+        if leaf_node_type == "knowledge":
+            is_knowledge = True
         if is_knowledge:
-            # generate mmlubench data for the current leaf node
             generate_eval_task_data(
                 mmlu_bench_pipe,
                 leaf_node_path,
-                samples,
+                samples_ds,
                 output_dir,
                 date_suffix,
             )
+
+def postprocess_taxonomy(
+    input_dir: str,
+    output_dir: str,
+    date_suffix: str,
+    pipeline: Optional[str] = "simple",
+    num_procs: Optional[int] = PipelineContext.DEFAULT_DATASET_NUM_PROCS,
+    system_prompt: Optional[str] = _SYS_PROMPT,
+    use_legacy_pretraining_format: Optional[bool] = True,
+):
+    knowledge_pipe, _, _ = _sdg_init(None, pipeline)
+    mixer = _mixer_init(
+        num_procs,
+        output_dir,
+        date_suffix,
+        knowledge_pipe.auxiliary_inst,
+        system_prompt,
+    )
+
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    output_file_messages = f"messages_{date_suffix}.jsonl"
+    output_file_train = f"train_{date_suffix}.jsonl"
+
+    all_generated_data = []
+    for input_file in input_files:
+        logger.debug("Postprocessing generated taxonomy data in input file: %s", input_file)
+        samples = jlload(input_file)
+        if not samples:
+            raise GenerateException(
+                f"Error: No samples found in input file {input_file}"
+            )
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
+        is_knowledge = False
+        if leaf_node_type == "knowledge":
+            is_knowledge = True
+
+        samples_ds = Dataset.from_list(samples)
+        logger.debug("Postprocessing from samples: %s", samples_ds)
+        all_generated_data.append(samples_ds)
+
         mixer.collect(
             leaf_node_path,
-            new_generated_data,
+            samples_ds,
             is_knowledge,
             use_legacy_pretraining_format,
         )
 
     _gen_train_data(
-        generated_data,
+        all_generated_data,
         os.path.join(output_dir, output_file_train),
         os.path.join(output_dir, output_file_messages),
         system_prompt,
     )
 
-    mixer.generate()
+    mixer.write_recipes()
+
+def mix_datasets(
+    recipe_file: str,
+    output_file: str,
+    num_proc: Optional[int] = 8,
+):
+    recipe = Recipe(recipe_file)
+    if recipe.datasets:
+        recipe.save_mixed_dataset(output_file, num_proc)
+    else:
+        logger.info("Not mixing empty recipe file: %s", recipe_file)
+
+# This is part of the public API, and used by instructlab.
+# TODO - parameter removal needs to be done in sync with a CLI change.
+# to be removed: logger
+def generate_data(
+    client: openai.OpenAI,
+    logger: logging.Logger = logger,  # pylint: disable=redefined-outer-name
+    system_prompt: Optional[str] = None,
+    use_legacy_pretraining_format: Optional[bool] = True,
+    model_family: Optional[str] = None,
+    model_name: Optional[str] = None,
+    num_cpus: Optional[int] = None,
+    num_instructions_to_generate: Optional[int] = 30,
+    taxonomy: Optional[str] = None,  # TODO rename to taxonomy_path to match config
+    taxonomy_base: Optional[str] = None,
+    output_dir: Optional[str] = None,
+    console_output=True,
+    yaml_rules: Optional[str] = None,
+    chunk_word_count=None,
+    server_ctx_size=None,
+    pipeline: Optional[str] = "simple",
+    batch_size: Optional[int] = None,
+    checkpoint_dir: Optional[str] = None,
+    max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
+) -> None:
+    """Generate data for training and testing a model.
+
+    This currently serves as the primary interface from the `ilab` CLI to the `sdg` library.
+    It is something of a transitional measure, as this function existed back when all of the
+    functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to
+    use the SDG library constructs directly, and this function will likely be removed.
+
+    Args:
+        pipeline: This argument may be either an alias defined in a user or site "data directory",
+                  an alias defined by the sdg library ("simple", "full") if the data directory has
+                  no matches, or an absolute path to a directory containing the pipeline YAML files.
+                  We expect three files to be present in this directory: "knowledge.yaml",
+                  "freeform_skills.yaml", and "grounded_skills.yaml".
+    """
+    generate_start = time.time()
+
+    system_prompt = system_prompt if system_prompt is not None else _SYS_PROMPT
+
+    # FIXME: remove this when ilab knows to pass batch_size=0 with llama.cpp
+    if batch_size is None:
+        batch_size = 0
+
+    date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_file_test = output_dir.joinpath(f"test_{date_suffix}.jsonl")
+    preprocessed_dir = output_dir.joinpath(f"preprocessed_{date_suffix}")
+    generated_dir = output_dir.joinpath(f"generated_{date_suffix}")
+
+    # This writes the preprocessed samples and the test data file to
+    # disk in our output_dir
+    preprocess_taxonomy(
+        taxonomy,
+        output_dir=preprocessed_dir,
+        chunk_word_count=chunk_word_count,
+        server_ctx_size=server_ctx_size,
+        taxonomy_base=taxonomy_base,
+        yaml_rules=yaml_rules,
+        test_output_file=output_file_test,
+        system_prompt=system_prompt,
+    )
+
+    generate_taxonomy(
+        client,
+        input_dir=preprocessed_dir,
+        output_dir=generated_dir,
+        logger=logger,
+        model_family=model_family,
+        model_id=model_name,
+        num_cpus=num_cpus,
+        num_instructions_to_generate=num_instructions_to_generate,
+        console_output=console_output,
+        pipeline=pipeline,
+        batch_size=batch_size,
+        checkpoint_dir=checkpoint_dir,
+        max_num_tokens=max_num_tokens,
+    )
+
+    generate_taxonomy_eval(
+        input_dir=preprocessed_dir,
+        output_dir=output_dir,
+        date_suffix=date_suffix,
+        client=client,
+        model_family=model_family,
+        model_id=model_name,
+        num_cpus=num_cpus,
+        num_instructions_to_generate=num_instructions_to_generate,
+        batch_size=batch_size,
+        max_num_tokens=max_num_tokens,
+    )
+
+    postprocess_taxonomy(
+        input_dir=generated_dir,
+        output_dir=output_dir,
+        date_suffix=date_suffix,
+        pipeline=pipeline,
+        system_prompt=system_prompt,
+        use_legacy_pretraining_format=use_legacy_pretraining_format,
+    )
+
+    mix_datasets(
+        recipe_file=f"{output_dir}/skills_recipe_{date_suffix}.yaml",
+        output_file=f"{output_dir}/skills_train_msgs_{date_suffix}.jsonl",
+    )
+    mix_datasets(
+        recipe_file=f"{output_dir}/knowledge_recipe_{date_suffix}.yaml",
+        output_file=f"{output_dir}/knowledge_train_msgs_{date_suffix}.jsonl",
+    )
 
     generate_duration = time.time() - generate_start
     logger.info(f"Generation took {generate_duration:.2f}s")
-    if len(empty_input_sample_files) > 0:
-        logger.warning(
-            "Input sample files with empty sdg output: {}".format(
-                " ".join(empty_input_sample_files)
-            )
-        )
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 59613a8e..ce362668 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -16,7 +16,7 @@
 
 # First Party
 from instructlab.sdg.checkpointing import Checkpointer
-from instructlab.sdg.utils import pandas
+from instructlab.sdg.utils import models, pandas
 
 # Local
 from .blocks import llmblock
@@ -71,6 +71,9 @@ class PipelineContext:  # pylint: disable=too-many-instance-attributes
     batch_size: int = DEFAULT_BATCH_SIZE
     batch_num_workers: Optional[int] = None
 
+    def __post_init__(self):
+        self.model_family = models.get_model_family(self.model_family,
self.model_id) + @property def batching_enabled(self) -> bool: """Batching is enabled IFF the batch size is specified and the number of diff --git a/src/instructlab/sdg/taxonomy.py b/src/instructlab/sdg/taxonomy.py index bc017fa5..88d4726d 100644 --- a/src/instructlab/sdg/taxonomy.py +++ b/src/instructlab/sdg/taxonomy.py @@ -17,6 +17,7 @@ from instructlab.sdg.utils.taxonomy import ( leaf_node_to_samples, read_taxonomy_leaf_nodes, + _unescape, ) logger = logging.getLogger(__name__) @@ -51,14 +52,66 @@ def _locate_docling_models(): return docling_model_path -def taxonomy_to_samples( - taxonomy_path, +def _knowledge_seed_example_to_test_data(seed_example, system_prompt): + res = [] + for i in range(3): + idx = i + 1 + user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"] + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example[f"icl_response_{idx}"]), + } + if system_prompt: + test_sample["system"] = system_prompt + res.append(test_sample) + return res + + +def _gen_test_data( + seed_examples, + output_file_test, + system_prompt, +): + """ + Generate test data in the format needed by the legacy Linux training + in instructlab/instructlab. + """ + test_data = [] + for seed_example in seed_examples: + if "icl_query_1" in seed_example: + test_data.extend( + _knowledge_seed_example_to_test_data(seed_example, system_prompt) + ) + continue + + # skill seed example + + user = seed_example["seed_question"] # question + + if seed_example["leaf_node_type"] == "grounded_skill": + user += "\n" + seed_example["seed_context"] # context + + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example["seed_response"]), # answer + } + if system_prompt: + test_sample["system"] = system_prompt + test_data.append(test_sample) + + jldump(test_data, output_file_test) + + +def preprocess_taxonomy( + taxonomy_dir, output_dir, chunk_word_count=DEFAULT_CHUNK_WORD_COUNT, # TODO: Remove chunk_word_count param server_ctx_size=DEFAULT_SERVER_CTX_SIZE, # TODO: Remove server_ctx_size param taxonomy_base=DEFAULT_TAXONOMY_BASE, teacher_model_path: Optional[str] = None, yaml_rules: Optional[str] = None, + test_output_file: Optional[str] = None, + system_prompt: Optional[str] = None, ): """ Preprocess a taxonomy into input samples suitable for use with @@ -76,8 +129,9 @@ def taxonomy_to_samples( - Write these samples to disk, with one file per taxonomy leaf node. Args: - taxonomy_path: The path to the taxonomy + taxonomy_dir: The path to the taxonomy output_dir: Where to write the samples create for use with data generation + test_output_file: Path to write the test samples jsonl file chunk_word_count: The target number of words per document chunk server_ctx_size: The maximum number of tokens the inference server used during data generation can handle @@ -87,6 +141,7 @@ def taxonomy_to_samples( teacher_model_path: Path to the teacher model on disk, which we'll use to load its tokenizer for use with document chunking. yaml_rules: Path to a custom YAML rules file for YAML linting. + system_prompt: System prompt to use when generating test samples Returns: List[str]: The list of output sample files written to disk. 
@@ -97,23 +152,27 @@
     output_dir.mkdir(exist_ok=True)
     output_files = []
 
-    if not (taxonomy_path and os.path.exists(taxonomy_path)):
-        raise GenerateException(f"Error: taxonomy ({taxonomy_path}) does not exist.")
+    if not (taxonomy_dir and os.path.exists(taxonomy_dir)):
+        raise GenerateException(f"Error: taxonomy ({taxonomy_dir}) does not exist.")
 
     document_output_dir = output_dir.joinpath("documents")
     docling_model_path = _locate_docling_models()
 
     leaf_nodes = read_taxonomy_leaf_nodes(
-        taxonomy_path, taxonomy_base, yaml_rules, document_output_dir
+        taxonomy_dir, taxonomy_base, yaml_rules, document_output_dir
     )
     if not leaf_nodes:
         raise GenerateException("Error: No new leaf nodes found in the taxonomy.")
 
+    # TODO: This is all a temporary hack here, as we either need to
+    # remove, deprecate, or otherwise determine the right way to
+    # support test samples
+    all_samples = []
     for leaf_node in leaf_nodes.values():
         leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
         samples = leaf_node_to_samples(
             leaf_node,
-            taxonomy_path,
+            taxonomy_dir,
             server_ctx_size,
             chunk_word_count,
             document_output_dir,
@@ -127,8 +186,16 @@
         logger.debug("Samples: %s", samples)
 
         output_file = output_dir.joinpath(f"{leaf_node_path}.jsonl")
+        all_samples.extend(samples)
         jldump(samples, output_file)
         output_files.append(str(output_file))
 
+    if test_output_file:
+        _gen_test_data(
+            all_samples,
+            test_output_file,
+            system_prompt,
+        )
+        logger.debug(f"Wrote test data to: {test_output_file}")
+
     logger.info("Taxonomy converted to samples and written to %s", output_dir)
     return output_files
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index c8c1faf6..409eb198 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -491,3 +491,7 @@ def leaf_node_to_samples(
         samples = _skill_leaf_node_to_samples(leaf_node)
     samples = _enrich_metadata(samples, leaf_node)
     return Dataset.from_list(samples)
+
+
+def _unescape(s):
+    # Round-trips the string through UTF-8 and strips surrounding whitespace
+    return bytes(s, "utf-8").decode("utf-8").strip()
diff --git a/tests/conftest.py b/tests/conftest.py
index ed3fd8c4..be3f249a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -30,7 +30,7 @@ def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
 
 def get_ctx(**kwargs) -> PipelineContext:
     kwargs.setdefault("client", mock.MagicMock())
-    kwargs.setdefault("model_family", "test")
+    kwargs.setdefault("model_family", "merlinite")
     kwargs.setdefault("model_id", "test-model")
     kwargs.setdefault("num_instructions_to_generate", 10)
     kwargs.setdefault("dataset_num_procs", 1)
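
Usage sketch (reviewer note): with this split, the discrete steps can be driven
directly instead of only through `generate_data`. Below is a minimal sketch of
that flow, mirroring the call sequence the refactored `generate_data` now uses.
The server URL, API key, model id, and taxonomy/output paths are illustrative
placeholders rather than values defined by this patch; the recipe and output
file names simply mirror the ones `generate_data` derives from its date suffix.

# sketch.py -- hypothetical driver script, not part of this patch
from datetime import datetime
from pathlib import Path

import openai

from instructlab.sdg import preprocess_taxonomy
from instructlab.sdg.generate_data import (
    generate_taxonomy,
    generate_taxonomy_eval,
    mix_datasets,
    postprocess_taxonomy,
)

# Placeholder client; any OpenAI-compatible endpoint serving the teacher model works.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="no-key")

taxonomy_dir = "/path/to/my/taxonomy"  # placeholder, as in the CLI example above
output_dir = "/path/to/my/output"      # placeholder
date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
preprocessed_dir = f"{output_dir}/preprocessed_{date_suffix}"
generated_dir = f"{output_dir}/generated_{date_suffix}"

# The individual steps assume the top-level output directory already exists.
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Step 1: convert the taxonomy into input samples, one .jsonl file per leaf
# node, optionally writing the legacy test data file as well.
preprocess_taxonomy(
    taxonomy_dir,
    output_dir=preprocessed_dir,
    test_output_file=f"{output_dir}/test_{date_suffix}.jsonl",
)

# Step 2: run the generation pipeline over each preprocessed sample file.
generate_taxonomy(
    client,
    input_dir=preprocessed_dir,
    output_dir=generated_dir,
    model_id="my-teacher-model",  # placeholder
    pipeline="simple",
)

# Step 3: generate mmlubench eval data for any knowledge leaf nodes.
generate_taxonomy_eval(
    client,
    input_dir=preprocessed_dir,
    output_dir=output_dir,
    date_suffix=date_suffix,
    model_id="my-teacher-model",  # placeholder
)

# Step 4: build the train/messages files and write the data mixing recipes.
postprocess_taxonomy(
    input_dir=generated_dir,
    output_dir=output_dir,
    date_suffix=date_suffix,
    pipeline="simple",
)

# Step 5: mix each recipe into its final training dataset.
for kind in ("skills", "knowledge"):
    mix_datasets(
        recipe_file=f"{output_dir}/{kind}_recipe_{date_suffix}.yaml",
        output_file=f"{output_dir}/{kind}_train_msgs_{date_suffix}.jsonl",
    )

Each step communicates through plain .jsonl files (and YAML recipes) on disk,
so any stage can be rerun or replaced independently, which is the motivation
for the split.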