Skip to content

Commit

Permalink
Split generate_data into multiple discrete steps
Browse files Browse the repository at this point in the history
This doesn't move things out into separate files yet, but it does
split the existing functionality of `generate_data` into multiple
discrete steps and changes `generate_data` to just call those steps.

This is a step towards cleaner separation between the steps and
creating top-level Python APIs for each discrete step, for advanced
use cases that want more than just the entire single-step generation pipeline.

Signed-off-by: Ben Browning <[email protected]>
  • Loading branch information
bbrowning committed Dec 11, 2024
1 parent a80a3f7 commit 146b25a
Show file tree
Hide file tree
Showing 8 changed files with 367 additions and 186 deletions.
4 changes: 2 additions & 2 deletions src/instructlab/sdg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"FULL_PIPELINES_PACKAGE",
"SIMPLE_PIPELINES_PACKAGE",
"generate_data",
"taxonomy_to_samples",
"preprocess_taxonomy",
)

# Local
Expand Down Expand Up @@ -62,6 +62,6 @@
PipelineContext,
)
from .registry import BlockRegistry, PromptRegistry
from .taxonomy import taxonomy_to_samples
from .taxonomy import preprocess_taxonomy
from .utils import GenerateException
from .utils.taxonomy import TaxonomyReadingException
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
DEFAULT_CHUNK_WORD_COUNT,
DEFAULT_SERVER_CTX_SIZE,
DEFAULT_TAXONOMY_BASE,
taxonomy_to_samples,
preprocess_taxonomy,
)
from instructlab.sdg.utils.logging import setup_logger

Expand Down Expand Up @@ -68,7 +68,7 @@

args = parser.parse_args()
setup_logger(args.log_level)
taxonomy_to_samples(
preprocess_taxonomy(
args.taxonomy_path,
args.output_dir,
chunk_word_count=args.chunk_word_count,
Expand All @@ -78,5 +78,5 @@
)

"""
python -m instructlab.sdg.cli.taxonomy_to_samples --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output
python -m instructlab.sdg.cli.preprocess_taxonomy --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output
"""
23 changes: 20 additions & 3 deletions src/instructlab/sdg/datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
Create the final mixed dataset by loading, sampling, and
concatenating all datasets in this recipe
"""
if not self.dataset_added:
if not self.datasets:
logger.error("No dataset added to the recipe")

mixed_ds = self._load_and_sample_datasets(num_proc)
Expand Down Expand Up @@ -726,19 +726,36 @@ def collect(
sampling_size=self.NUM_SYNTH_SKILLS,
)

def _write_mixed_recipe(self, recipe, output_file_recipe):
"""
Write the recipes created during data mixing without writing the actual
mixed datasets to disk.
"""
full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
recipe.save_recipe(full_recipe_path)

def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
"""
Mix the generated leaf node data into a single dataset and write it to
disk. The heavy lifting is delegated to the Recipe class.
"""
self._write_mixed_recipe(recipe, output_file_recipe)
if recipe.dataset_added:
full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
recipe.save_recipe(full_recipe_path)
recipe.save_mixed_dataset(
os.path.join(self.output_dir, output_file_data),
self.num_procs,
)

def write_recipes(self):
self._write_mixed_recipe(
self.knowledge_recipe,
self.output_file_knowledge_recipe,
)
self._write_mixed_recipe(
self.skills_recipe,
self.output_file_skills_recipe,
)

def generate(self):
self._gen_mixed_data(
self.knowledge_recipe,
Expand Down
Loading

0 comments on commit 146b25a

Please sign in to comment.