From 146b25a020aaa60adc2f723ea3890aef84de5084 Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Wed, 11 Dec 2024 16:28:48 -0500
Subject: [PATCH] Split `generate_data` into multiple discrete steps

This doesn't move things out into separate files yet, but it does
split the existing functionality of `generate_data` into multiple
discrete steps and changes `generate_data` to just call those steps.

This is a step towards cleaner separation between the steps and
towards creating top-level Python APIs for each discrete step, for
advanced use cases that don't just want to run the entire generation
pipeline as a single step.

Signed-off-by: Ben Browning
---
 src/instructlab/sdg/__init__.py               |   4 +-
 ...y_to_samples.py => preprocess_taxonomy.py} |   6 +-
 src/instructlab/sdg/datamixing.py             |  23 +-
 src/instructlab/sdg/generate_data.py          | 428 +++++++++++-------
 src/instructlab/sdg/pipeline.py               |   5 +-
 src/instructlab/sdg/taxonomy.py               |  81 +++-
 src/instructlab/sdg/utils/taxonomy.py         |   4 +
 tests/conftest.py                             |   2 +-
 8 files changed, 367 insertions(+), 186 deletions(-)
 rename src/instructlab/sdg/cli/{taxonomy_to_samples.py => preprocess_taxonomy.py} (94%)

diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py
index a3576662..ccef90d8 100644
--- a/src/instructlab/sdg/__init__.py
+++ b/src/instructlab/sdg/__init__.py
@@ -29,7 +29,7 @@
     "FULL_PIPELINES_PACKAGE",
     "SIMPLE_PIPELINES_PACKAGE",
     "generate_data",
-    "taxonomy_to_samples",
+    "preprocess_taxonomy",
 )
 
 # Local
@@ -62,6 +62,6 @@
     PipelineContext,
 )
 from .registry import BlockRegistry, PromptRegistry
-from .taxonomy import taxonomy_to_samples
+from .taxonomy import preprocess_taxonomy
 from .utils import GenerateException
 from .utils.taxonomy import TaxonomyReadingException
diff --git a/src/instructlab/sdg/cli/taxonomy_to_samples.py b/src/instructlab/sdg/cli/preprocess_taxonomy.py
similarity index 94%
rename from src/instructlab/sdg/cli/taxonomy_to_samples.py
rename to src/instructlab/sdg/cli/preprocess_taxonomy.py
index 112f764b..b7d22532 100644
--- a/src/instructlab/sdg/cli/taxonomy_to_samples.py
+++ b/src/instructlab/sdg/cli/preprocess_taxonomy.py
@@ -8,7 +8,7 @@
     DEFAULT_CHUNK_WORD_COUNT,
     DEFAULT_SERVER_CTX_SIZE,
     DEFAULT_TAXONOMY_BASE,
-    taxonomy_to_samples,
+    preprocess_taxonomy,
 )
 from instructlab.sdg.utils.logging import setup_logger
 
@@ -68,7 +68,7 @@
     args = parser.parse_args()
     setup_logger(args.log_level)
 
-    taxonomy_to_samples(
+    preprocess_taxonomy(
         args.taxonomy_path,
         args.output_dir,
         chunk_word_count=args.chunk_word_count,
@@ -78,5 +78,5 @@
 )
 
 """
-python -m instructlab.sdg.cli.taxonomy_to_samples --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output
+python -m instructlab.sdg.cli.preprocess_taxonomy --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output
"""
diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py
index e6ca8675..de31e136 100644
--- a/src/instructlab/sdg/datamixing.py
+++ b/src/instructlab/sdg/datamixing.py
@@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
         Create the final mixed dataset by loading, sampling, and
         concatenating all datasets in this recipe
         """
-        if not self.dataset_added:
+        if not self.datasets:
             logger.error("No dataset added to the recipe")
 
         mixed_ds = self._load_and_sample_datasets(num_proc)
@@ -726,19 +726,36 @@ def collect(
                 sampling_size=self.NUM_SYNTH_SKILLS,
             )
 
+    def _write_mixed_recipe(self, recipe, output_file_recipe):
+        """
+        Write the recipe created during data mixing without writing the
+        actual mixed dataset to disk.
+ """ + full_recipe_path = os.path.join(self.output_dir, output_file_recipe) + recipe.save_recipe(full_recipe_path) + def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data): """ Mix the generated leaf node data into a single dataset and write it to disk. The heavy lifting is delegated to the Recipe class. """ + self._write_mixed_recipe(recipe, output_file_recipe) if recipe.dataset_added: - full_recipe_path = os.path.join(self.output_dir, output_file_recipe) - recipe.save_recipe(full_recipe_path) recipe.save_mixed_dataset( os.path.join(self.output_dir, output_file_data), self.num_procs, ) + def write_recipes(self): + self._write_mixed_recipe( + self.knowledge_recipe, + self.output_file_knowledge_recipe, + ) + self._write_mixed_recipe( + self.skills_recipe, + self.output_file_skills_recipe, + ) + def generate(self): self._gen_mixed_data( self.knowledge_recipe, diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 31643457..f0593dea 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -5,10 +5,11 @@ from importlib import resources from pathlib import Path from typing import Optional -import dataclasses +import glob import json import logging import os +import shutil import time # Third Party @@ -19,7 +20,7 @@ # First Party from instructlab.sdg.blocks.llmblock import DEFAULT_MAX_NUM_TOKENS -from instructlab.sdg.datamixing import DataMixer, _get_question_hack, _get_response_hack +from instructlab.sdg.datamixing import DataMixer, Recipe, _get_question_hack, _get_response_hack from instructlab.sdg.eval_data import generate_eval_task_data, mmlubench_pipe_init from instructlab.sdg.pipeline import ( FULL_PIPELINES_PACKAGE, @@ -27,8 +28,9 @@ Pipeline, PipelineContext, ) -from instructlab.sdg.taxonomy import taxonomy_to_samples +from instructlab.sdg.taxonomy import preprocess_taxonomy from instructlab.sdg.utils import GenerateException, models +from instructlab.sdg.utils.taxonomy import _unescape from instructlab.sdg.utils.json import jldump, jlload logger = logging.getLogger(__name__) @@ -36,10 +38,6 @@ _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant." -def _unescape(s): - return bytes(s, "utf-8").decode("utf-8").strip() - - def _convert_to_messages(sample): """ Convert a sample dictionary to contain 'messages' and 'metadata' columns required for training. @@ -110,56 +108,6 @@ def _gen_train_data( jldump(messages_data, output_file_messages) -def _knowledge_seed_example_to_test_data(seed_example, system_prompt): - res = [] - for i in range(3): - idx = i + 1 - user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"] - res.append( - { - "system": system_prompt, - "user": _unescape(user), - "assistant": _unescape(seed_example[f"icl_response_{idx}"]), - } - ) - return res - - -def _gen_test_data( - seed_examples, - output_file_test, - system_prompt, -): - """ - Generate test data in the format needed by the legacy Linux training - in instructlab/instructlab. 
- """ - test_data = [] - for seed_example in seed_examples: - if "icl_query_1" in seed_example: - test_data.extend( - _knowledge_seed_example_to_test_data(seed_example, system_prompt) - ) - continue - - # skill seed example - - user = seed_example["seed_question"] # question - - if seed_example["leaf_node_type"] == "grounded_skill": - user += "\n" + seed_example["seed_context"] # context - - test_data.append( - { - "system": system_prompt, - "user": _unescape(user), - "assistant": _unescape(seed_example["seed_response"]), # answer - } - ) - - jldump(test_data, output_file_test) - - def _check_pipeline_dir(pipeline): for file in ["knowledge.yaml", "freeform_skills.yaml", "grounded_skills.yaml"]: if not os.path.exists(os.path.join(pipeline, file)): @@ -240,7 +188,7 @@ def load_pipeline(yaml_basename): def _mixer_init( - ctx, + num_procs, output_dir, date_suffix, knowledge_auxiliary_inst, @@ -254,95 +202,36 @@ def _mixer_init( output_dir, date_suffix, system_prompt, - ctx.dataset_num_procs, + num_procs, knowledge_auxiliary_inst, ) -# This is part of the public API, and used by instructlab. -# TODO - parameter removal needs to be done in sync with a CLI change. -# to be removed: logger -def generate_data( +def _extract_leaf_node_path_and_type(sample): + leaf_node_path = sample.get("leaf_node_path", "unknown") + leaf_node_type = sample.get("leaf_node_type") + return leaf_node_path, leaf_node_type + + +def generate_taxonomy( client: openai.OpenAI, + input_dir: str, + output_dir: str, logger: logging.Logger = logger, # pylint: disable=redefined-outer-name - system_prompt: Optional[str] = None, - use_legacy_pretraining_format: Optional[bool] = True, model_family: Optional[str] = None, - model_name: Optional[str] = None, + model_id: Optional[str] = None, num_cpus: Optional[int] = None, num_instructions_to_generate: Optional[int] = 30, - taxonomy: Optional[str] = None, # TODO rename to taxonomy_path to match config - taxonomy_base: Optional[str] = None, - output_dir: Optional[str] = None, console_output=True, - yaml_rules: Optional[str] = None, - chunk_word_count=None, - server_ctx_size=None, pipeline: Optional[str] = "simple", batch_size: Optional[int] = None, checkpoint_dir: Optional[str] = None, max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS, -) -> None: - """Generate data for training and testing a model. - - This currently serves as the primary interface from the `ilab` CLI to the `sdg` library. - It is somewhat a transitionary measure, as this function existed back when all of the - functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to - use the SDG library constructs directly, and this function will likely be removed. - - Args: - pipeline: This argument may be either an alias defined in a user or site "data directory" - or an alias defined by the sdg library ("simple", "full")(if the data directory has no matches), - or an absolute path to a directory containing the pipeline YAML files. - We expect three files to be present in this directory: "knowledge.yaml", - "freeform_skills.yaml", and "grounded_skills.yaml". 
- """ - generate_start = time.time() - - system_prompt = system_prompt if system_prompt is not None else _SYS_PROMPT - - # FIXME: remove this when ilab knows to pass batch_size=0 with llama.cpp - if batch_size is None: - batch_size = 0 - - output_dir = Path(output_dir) - output_dir.mkdir(exist_ok=True) - date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_") - preprocessed_output_dir = output_dir.joinpath(f"preprocessed_{date_suffix}") - - # This writes samples to disk in our output_dir and returns the - # list of files created - sample_files = taxonomy_to_samples( - taxonomy, - preprocessed_output_dir, - chunk_word_count=chunk_word_count, - server_ctx_size=server_ctx_size, - taxonomy_base=taxonomy_base, - yaml_rules=yaml_rules, - ) - - name = Path(model_name).stem # Just in case it is a file path - output_file_messages = f"messages_{name}_{date_suffix}.jsonl" - output_file_test = f"test_{name}_{date_suffix}.jsonl" - output_file_train = f"train_{name}_{date_suffix}.jsonl" - - all_samples = [] - for sample_file in sample_files: - all_samples.extend(jlload(sample_file)) - _gen_test_data( - all_samples, - os.path.join(output_dir, output_file_test), - system_prompt, - ) - - logger.debug(f"Generating to: {os.path.join(output_dir, output_file_test)}") - - model_family = models.get_model_family(model_family, model_name) - +): ctx = _context_init( client, model_family, - model_name, + model_id, num_instructions_to_generate, checkpoint_dir, 1, # save_freq @@ -355,89 +244,290 @@ def generate_data( ctx, pipeline ) - # Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline) - mmlu_ctx = dataclasses.replace(ctx, checkpoint_dir=None) - mmlu_bench_pipe = mmlubench_pipe_init(mmlu_ctx) - - mixer = _mixer_init( - ctx, - output_dir, - date_suffix, - knowledge_pipe.auxiliary_inst, - system_prompt, - ) - if console_output: logger.info( "Synthesizing new instructions. If you aren't satisfied with the generated instructions, interrupt training (Ctrl-C) and try adjusting your YAML files. Adding more examples may help." 
         )
 
-    generated_data = []
-    empty_input_sample_files = []
-    for sample_file in sample_files:
-        logger.debug("Generating data from input sample file: %s", sample_file)
-        samples = jlload(sample_file)
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    empty_input_files = []
+    for input_file in input_files:
+        logger.debug("Generating data from input file: %s", input_file)
+        samples = jlload(input_file)
         if not samples:
             raise GenerateException(
-                "Error: No samples found in input file {sample_file}"
+                f"Error: No samples found in input file {input_file}"
             )
 
         # For now we assume every sample in the file is the same type
         first_sample = samples[0]
-        leaf_node_path = first_sample["leaf_node_path"]
-        leaf_node_type = first_sample["leaf_node_type"]
-        is_knowledge = False
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
         if leaf_node_type == "knowledge":
             pipe = knowledge_pipe
-            is_knowledge = True
         elif leaf_node_type == "grounded_skill":
             pipe = grounded_skills_pipe
         else:
             pipe = freeform_skills_pipe
 
         samples_ds = Dataset.from_list(samples)
-        logger.debug("Samples: %s", samples_ds)
+        logger.debug("Generating from samples: %s", samples_ds)
         new_generated_data = pipe.generate(samples_ds, leaf_node_path)
         if len(new_generated_data) == 0:
-            empty_input_sample_files.append(sample_file)
-            logger.warning("Empty generated dataset for sample file: %s", sample_file)
+            empty_input_files.append(input_file)
+            logger.warning("Empty generated dataset for input file: %s", input_file)
             continue
-        generated_data.append(new_generated_data)
-    logger.info("Generated %d samples", len(generated_data))
-    logger.debug("Generated data: %s", generated_data)
+        output_file = os.path.join(output_dir, os.path.basename(input_file))
+        jldump(new_generated_data, output_file)
+        logger.info("Generated %d samples", len(new_generated_data))
+        logger.debug("Generated data: %s", new_generated_data)
+
+    if len(empty_input_files) > 0:
+        logger.warning(
+            "Input sample files with empty sdg output: {}".format(
+                " ".join(empty_input_files)
+            )
+        )
+
+def generate_taxonomy_eval(
+    client: openai.OpenAI,
+    input_dir: str,
+    output_dir: str,
+    date_suffix: str,
+    model_family: Optional[str] = None,
+    model_id: Optional[str] = None,
+    num_cpus: Optional[int] = None,
+    num_instructions_to_generate: Optional[int] = 30,
+    batch_size: Optional[int] = None,
+    max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
+):
+    ctx = _context_init(
+        client,
+        model_family,
+        model_id,
+        num_instructions_to_generate,
+        None,  # disable checkpoints for eval pipeline
+        1,  # save_freq
+        batch_size=batch_size,
+        batch_num_workers=num_cpus,
+        max_num_tokens=max_num_tokens,
+    )
+    mmlu_bench_pipe = mmlubench_pipe_init(ctx)
+
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    for input_file in input_files:
+        logger.debug("Generating eval data from input file: %s", input_file)
+        samples = jlload(input_file)
+        if not samples:
+            raise GenerateException(
+                f"Error: No samples found in input file {input_file}"
+            )
+        samples_ds = Dataset.from_list(samples)
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
+        is_knowledge = False
+        if leaf_node_type == "knowledge":
+            is_knowledge = True
         if is_knowledge:
-            # generate mmlubench data for the current leaf node
             generate_eval_task_data(
                 mmlu_bench_pipe,
                 leaf_node_path,
-                samples,
+                samples_ds,
                 output_dir,
                 date_suffix,
             )
+
+def postprocess_taxonomy(
+    input_dir: str,
+    output_dir: str,
+    date_suffix: str,
+    pipeline: Optional[str] = "simple",
+    num_procs: Optional[int] = PipelineContext.DEFAULT_DATASET_NUM_PROCS,
+    system_prompt: Optional[str] = _SYS_PROMPT,
+    use_legacy_pretraining_format: Optional[bool] = True,
+):
+    knowledge_pipe, _, _ = _sdg_init(None, pipeline)
+    mixer = _mixer_init(
+        num_procs,
+        output_dir,
+        date_suffix,
+        knowledge_pipe.auxiliary_inst,
+        system_prompt,
+    )
+
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    output_file_messages = f"messages_{date_suffix}.jsonl"
+    output_file_train = f"train_{date_suffix}.jsonl"
+
+    all_generated_data = []
+    for input_file in input_files:
+        logger.debug("Postprocessing generated taxonomy data in input file: %s", input_file)
+        samples = jlload(input_file)
+        if not samples:
+            raise GenerateException(
+                f"Error: No samples found in input file {input_file}"
+            )
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
+        is_knowledge = False
+        if leaf_node_type == "knowledge":
+            is_knowledge = True
+
+        samples_ds = Dataset.from_list(samples)
+        logger.debug("Postprocessing from samples: %s", samples_ds)
+        all_generated_data.append(samples_ds)
+
         mixer.collect(
             leaf_node_path,
-            new_generated_data,
+            samples_ds,
             is_knowledge,
             use_legacy_pretraining_format,
         )
 
     _gen_train_data(
-        generated_data,
+        all_generated_data,
         os.path.join(output_dir, output_file_train),
         os.path.join(output_dir, output_file_messages),
         system_prompt,
     )
 
-    mixer.generate()
+    mixer.write_recipes()
+
+def mix_datasets(
+    recipe_file: str,
+    output_file: str,
+    num_proc: Optional[int] = 8,
+):
+    recipe = Recipe(recipe_file)
+    if recipe.datasets:
+        recipe.save_mixed_dataset(output_file, num_proc)
+    else:
+        logger.info("Not mixing empty recipe file: %s", recipe_file)
+
+# This is part of the public API, and used by instructlab.
+# TODO - parameter removal needs to be done in sync with a CLI change.
+# to be removed: logger
+def generate_data(
+    client: openai.OpenAI,
+    logger: logging.Logger = logger,  # pylint: disable=redefined-outer-name
+    system_prompt: Optional[str] = None,
+    use_legacy_pretraining_format: Optional[bool] = True,
+    model_family: Optional[str] = None,
+    model_name: Optional[str] = None,
+    num_cpus: Optional[int] = None,
+    num_instructions_to_generate: Optional[int] = 30,
+    taxonomy: Optional[str] = None,  # TODO rename to taxonomy_path to match config
+    taxonomy_base: Optional[str] = None,
+    output_dir: Optional[str] = None,
+    console_output=True,
+    yaml_rules: Optional[str] = None,
+    chunk_word_count=None,
+    server_ctx_size=None,
+    pipeline: Optional[str] = "simple",
+    batch_size: Optional[int] = None,
+    checkpoint_dir: Optional[str] = None,
+    max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
+) -> None:
+    """Generate data for training and testing a model.
+
+    This currently serves as the primary interface from the `ilab` CLI to the `sdg` library.
+    It is something of a transitional measure, as this function existed back when all of the
+    functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to
+    use the SDG library constructs directly, and this function will likely be removed.
+
+    Args:
+        pipeline: This argument may be either an alias defined in a user or site "data directory",
+                  an alias defined by the sdg library ("simple", "full") if the data directory has
+                  no matches, or an absolute path to a directory containing the pipeline YAML files.
+                  We expect three files to be present in this directory: "knowledge.yaml",
+                  "freeform_skills.yaml", and "grounded_skills.yaml".
+    """
+    generate_start = time.time()
+
+    system_prompt = system_prompt if system_prompt is not None else _SYS_PROMPT
+
+    # FIXME: remove this when ilab knows to pass batch_size=0 with llama.cpp
+    if batch_size is None:
+        batch_size = 0
+
+    date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_file_test = output_dir.joinpath(f"test_{date_suffix}.jsonl")
+    preprocessed_dir = output_dir.joinpath(f"preprocessed_{date_suffix}")
+    generated_dir = output_dir.joinpath(f"generated_{date_suffix}")
+
+    # This writes the preprocessed samples and the test data file to
+    # disk in our output_dir
+    preprocess_taxonomy(
+        taxonomy,
+        output_dir=preprocessed_dir,
+        chunk_word_count=chunk_word_count,
+        server_ctx_size=server_ctx_size,
+        taxonomy_base=taxonomy_base,
+        yaml_rules=yaml_rules,
+        test_output_file=output_file_test,
+        system_prompt=system_prompt,
+    )
+
+    generate_taxonomy(
+        client,
+        input_dir=preprocessed_dir,
+        output_dir=generated_dir,
+        logger=logger,
+        model_family=model_family,
+        model_id=model_name,
+        num_cpus=num_cpus,
+        num_instructions_to_generate=num_instructions_to_generate,
+        console_output=console_output,
+        pipeline=pipeline,
+        batch_size=batch_size,
+        checkpoint_dir=checkpoint_dir,
+        max_num_tokens=max_num_tokens,
+    )
+
+    generate_taxonomy_eval(
+        input_dir=preprocessed_dir,
+        output_dir=output_dir,
+        date_suffix=date_suffix,
+        client=client,
+        model_family=model_family,
+        model_id=model_name,
+        num_cpus=num_cpus,
+        num_instructions_to_generate=num_instructions_to_generate,
+        batch_size=batch_size,
+        max_num_tokens=max_num_tokens,
+    )
+
+    postprocess_taxonomy(
+        input_dir=generated_dir,
+        output_dir=output_dir,
+        date_suffix=date_suffix,
+        pipeline=pipeline,
+        system_prompt=system_prompt,
+        use_legacy_pretraining_format=use_legacy_pretraining_format,
+    )
+
+    mix_datasets(
+        recipe_file=f"{output_dir}/skills_recipe_{date_suffix}.yaml",
+        output_file=f"{output_dir}/skills_train_msgs_{date_suffix}.jsonl",
+    )
+    mix_datasets(
+        recipe_file=f"{output_dir}/knowledge_recipe_{date_suffix}.yaml",
+        output_file=f"{output_dir}/knowledge_train_msgs_{date_suffix}.jsonl",
+    )
 
     generate_duration = time.time() - generate_start
     logger.info(f"Generation took {generate_duration:.2f}s")
-    if len(empty_input_sample_files) > 0:
-        logger.warning(
-            "Input sample files with empty sdg output: {}".format(
-                " ".join(empty_input_sample_files)
-            )
-        )
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 59613a8e..ce362668 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -16,7 +16,7 @@
 
 # First Party
 from instructlab.sdg.checkpointing import Checkpointer
-from instructlab.sdg.utils import pandas
+from instructlab.sdg.utils import models, pandas
 
 # Local
 from .blocks import llmblock
@@ -71,6 +71,9 @@ class PipelineContext:  # pylint: disable=too-many-instance-attributes
     batch_size: int = DEFAULT_BATCH_SIZE
     batch_num_workers: Optional[int] = None
 
+    def __post_init__(self):
+        self.model_family = models.get_model_family(self.model_family,
self.model_id) + @property def batching_enabled(self) -> bool: """Batching is enabled IFF the batch size is specified and the number of diff --git a/src/instructlab/sdg/taxonomy.py b/src/instructlab/sdg/taxonomy.py index bc017fa5..88d4726d 100644 --- a/src/instructlab/sdg/taxonomy.py +++ b/src/instructlab/sdg/taxonomy.py @@ -17,6 +17,7 @@ from instructlab.sdg.utils.taxonomy import ( leaf_node_to_samples, read_taxonomy_leaf_nodes, + _unescape, ) logger = logging.getLogger(__name__) @@ -51,14 +52,66 @@ def _locate_docling_models(): return docling_model_path -def taxonomy_to_samples( - taxonomy_path, +def _knowledge_seed_example_to_test_data(seed_example, system_prompt): + res = [] + for i in range(3): + idx = i + 1 + user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"] + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example[f"icl_response_{idx}"]), + } + if system_prompt: + test_sample["system"] = system_prompt + res.append(test_sample) + return res + + +def _gen_test_data( + seed_examples, + output_file_test, + system_prompt, +): + """ + Generate test data in the format needed by the legacy Linux training + in instructlab/instructlab. + """ + test_data = [] + for seed_example in seed_examples: + if "icl_query_1" in seed_example: + test_data.extend( + _knowledge_seed_example_to_test_data(seed_example, system_prompt) + ) + continue + + # skill seed example + + user = seed_example["seed_question"] # question + + if seed_example["leaf_node_type"] == "grounded_skill": + user += "\n" + seed_example["seed_context"] # context + + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example["seed_response"]), # answer + } + if system_prompt: + test_sample["system"] = system_prompt + test_data.append(test_sample) + + jldump(test_data, output_file_test) + + +def preprocess_taxonomy( + taxonomy_dir, output_dir, chunk_word_count=DEFAULT_CHUNK_WORD_COUNT, # TODO: Remove chunk_word_count param server_ctx_size=DEFAULT_SERVER_CTX_SIZE, # TODO: Remove server_ctx_size param taxonomy_base=DEFAULT_TAXONOMY_BASE, teacher_model_path: Optional[str] = None, yaml_rules: Optional[str] = None, + test_output_file: Optional[str] = None, + system_prompt: Optional[str] = None, ): """ Preprocess a taxonomy into input samples suitable for use with @@ -76,8 +129,9 @@ def taxonomy_to_samples( - Write these samples to disk, with one file per taxonomy leaf node. Args: - taxonomy_path: The path to the taxonomy + taxonomy_dir: The path to the taxonomy output_dir: Where to write the samples create for use with data generation + test_output_file: Path to write the test samples jsonl file chunk_word_count: The target number of words per document chunk server_ctx_size: The maximum number of tokens the inference server used during data generation can handle @@ -87,6 +141,7 @@ def taxonomy_to_samples( teacher_model_path: Path to the teacher model on disk, which we'll use to load its tokenizer for use with document chunking. yaml_rules: Path to a custom YAML rules file for YAML linting. + system_prompt: System prompt to use when generating test samples Returns: List[str]: The list of output sample files written to disk. 
@@ -97,23 +152,27 @@
     output_dir.mkdir(exist_ok=True)
     output_files = []
 
-    if not (taxonomy_path and os.path.exists(taxonomy_path)):
-        raise GenerateException(f"Error: taxonomy ({taxonomy_path}) does not exist.")
+    if not (taxonomy_dir and os.path.exists(taxonomy_dir)):
+        raise GenerateException(f"Error: taxonomy ({taxonomy_dir}) does not exist.")
 
     document_output_dir = output_dir.joinpath("documents")
     docling_model_path = _locate_docling_models()
 
     leaf_nodes = read_taxonomy_leaf_nodes(
-        taxonomy_path, taxonomy_base, yaml_rules, document_output_dir
+        taxonomy_dir, taxonomy_base, yaml_rules, document_output_dir
     )
     if not leaf_nodes:
         raise GenerateException("Error: No new leaf nodes found in the taxonomy.")
 
+    # TODO: This is all a temporary hack here, as we either need to
+    # remove, deprecate, or otherwise determine the right way to
+    # support test samples
+    all_samples = []
     for leaf_node in leaf_nodes.values():
         leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
         samples = leaf_node_to_samples(
             leaf_node,
-            taxonomy_path,
+            taxonomy_dir,
             server_ctx_size,
             chunk_word_count,
             document_output_dir,
@@ -127,8 +186,16 @@
         logger.debug("Samples: %s", samples)
 
         output_file = output_dir.joinpath(f"{leaf_node_path}.jsonl")
+        all_samples.extend(samples)
         jldump(samples, output_file)
         output_files.append(str(output_file))
 
+    if test_output_file:
+        _gen_test_data(
+            all_samples,
+            test_output_file,
+            system_prompt,
+        )
+        logger.debug(f"Wrote test data to: {test_output_file}")
+
     logger.info("Taxonomy converted to samples and written to %s", output_dir)
     return output_files
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index c8c1faf6..409eb198 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -491,3 +491,7 @@ def leaf_node_to_samples(
         samples = _skill_leaf_node_to_samples(leaf_node)
     samples = _enrich_metadata(samples, leaf_node)
     return Dataset.from_list(samples)
+
+
+def _unescape(s):
+    # Round-trips the string through UTF-8 and strips surrounding whitespace
+    return bytes(s, "utf-8").decode("utf-8").strip()
diff --git a/tests/conftest.py b/tests/conftest.py
index ed3fd8c4..be3f249a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -30,7 +30,7 @@ def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
 
 def get_ctx(**kwargs) -> PipelineContext:
     kwargs.setdefault("client", mock.MagicMock())
-    kwargs.setdefault("model_family", "test")
+    kwargs.setdefault("model_family", "merlinite")
     kwargs.setdefault("model_id", "test-model")
     kwargs.setdefault("num_instructions_to_generate", 10)
     kwargs.setdefault("dataset_num_procs", 1)
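
Usage sketch (reviewer note): with this split, the discrete steps can be driven
directly instead of only through `generate_data`. Below is a minimal sketch of
that flow, mirroring the call sequence the refactored `generate_data` now uses.
The server URL, API key, model id, and taxonomy/output paths are illustrative
placeholders rather than values defined by this patch; the recipe and output
file names simply mirror the ones `generate_data` derives from its date suffix.

# sketch.py -- hypothetical driver script, not part of this patch
from datetime import datetime
from pathlib import Path

import openai

from instructlab.sdg import preprocess_taxonomy
from instructlab.sdg.generate_data import (
    generate_taxonomy,
    generate_taxonomy_eval,
    mix_datasets,
    postprocess_taxonomy,
)

# Placeholder client; any OpenAI-compatible endpoint serving the teacher model works.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="no-key")

taxonomy_dir = "/path/to/my/taxonomy"  # placeholder, as in the CLI example above
output_dir = "/path/to/my/output"      # placeholder
date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
preprocessed_dir = f"{output_dir}/preprocessed_{date_suffix}"
generated_dir = f"{output_dir}/generated_{date_suffix}"

# The individual steps assume the top-level output directory already exists.
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Step 1: convert the taxonomy into input samples, one .jsonl file per leaf
# node, optionally writing the legacy test data file as well.
preprocess_taxonomy(
    taxonomy_dir,
    output_dir=preprocessed_dir,
    test_output_file=f"{output_dir}/test_{date_suffix}.jsonl",
)

# Step 2: run the generation pipeline over each preprocessed sample file.
generate_taxonomy(
    client,
    input_dir=preprocessed_dir,
    output_dir=generated_dir,
    model_id="my-teacher-model",  # placeholder
    pipeline="simple",
)

# Step 3: generate mmlubench eval data for any knowledge leaf nodes.
generate_taxonomy_eval(
    client,
    input_dir=preprocessed_dir,
    output_dir=output_dir,
    date_suffix=date_suffix,
    model_id="my-teacher-model",  # placeholder
)

# Step 4: build the train/messages files and write the data mixing recipes.
postprocess_taxonomy(
    input_dir=generated_dir,
    output_dir=output_dir,
    date_suffix=date_suffix,
    pipeline="simple",
)

# Step 5: mix each recipe into its final training dataset.
for kind in ("skills", "knowledge"):
    mix_datasets(
        recipe_file=f"{output_dir}/{kind}_recipe_{date_suffix}.yaml",
        output_file=f"{output_dir}/{kind}_train_msgs_{date_suffix}.jsonl",
    )

Each step communicates through plain .jsonl files (and YAML recipes) on disk,
so any stage can be rerun or replaced independently, which is the motivation
for the split.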