# Master configuration file for Synthetic Data Kit
# Workshop-ready configuration with Cerebras defaults

# Global paths configuration
paths:
  # Input data location (directory containing files to process)
  input: "data/input"  # Directory containing PDF, HTML, DOCX, PPT, TXT files

  # Output locations (4-stage pipeline directories)
  output:
    parsed: "data/parsed"  # Stage 1: Where parsed text files are saved (ingest output)
vllm:
  # NOTE(review): earlier vllm settings elided in this excerpt — confirm against full file
  max_retries: 3  # Number of retries for API calls
  retry_delay: 1.0  # Initial delay between retries (seconds)
  sleep_time: 0.1  # Small delay in seconds between batches to avoid rate limits

# API endpoint configuration (Cerebras defaults)
api-endpoint:
  api_base: "https://api.cerebras.ai/v1"  # Cerebras API endpoint
  api_key: "YOUR_CEREBRAS_API_KEY"  # Replace with your Cerebras API key
  model: "llama3.3-70b"  # Cerebras Llama 3.3 70B model
  max_retries: 3  # Number of retries for API calls
  retry_delay: 1.0  # Initial delay between retries (seconds)
  sleep_time: 0.5  # Small delay in seconds between batches to avoid rate limits

# Ingest configuration
ingest:
  default_format: "txt"  # Default output format for parsed files (txt for text-based workflows)
  youtube_captions: "auto"  # Options: "auto", "manual" - caption preference

# LLM generation parameters
generation:
  # NOTE(review): earlier generation settings elided in this excerpt — confirm against full file
  chunk_size: 4000  # Size of text chunks for processing
  overlap: 200  # Overlap between chunks to maintain context
  max_tokens: 4096  # Maximum tokens in LLM responses
  num_pairs: 25  # Default number of QA pairs to generate per chunk
  num_cot_examples: 5  # Default number of Chain of Thought examples to generate
  num_cot_enhance_examples: null  # Maximum number of conversations to enhance (null = enhance all)
  batch_size: 5  # Number of requests to batch together (optimized for Cerebras)
  max_context_length: 8000  # Context Length of the MODEL. Useful while Generating Summary
  summary_overlap: 0  # Overlap between chunks to maintain context. Useful while Generating Summary

# Content curation parameters
curate:
  threshold: 7.5  # Default quality threshold (1-10) - keep only high-quality pairs
  batch_size: 3  # Number of items per batch for rating (smaller batches for API stability)
  inference_batch: 5  # Number of batches to process at once with VLLM
  temperature: 0.1  # Temperature for rating (lower = more consistent)

# Prompt templates
prompts:
  # NOTE(review): any earlier prompt entries elided in this excerpt — confirm against full file

  # Summary generation prompt
  summary: |
    Summarize this document in 3-5 sentences, focusing on the main topic and key concepts.

  # QA pair generation prompt
  qa_generation: |
    Create {num_pairs} question-answer pairs from this text for LLM training.

    Document context: {summary}

    Rules:
    1. Questions must be about important facts in the text
    2. Answers must be directly supported by the text
    3. Make questions diverse and educational
    4. Return ONLY valid JSON, no markdown, no explanation, no code blocks

    Expected format:
    [
      {{
        "question": "Question 1?",
        "answer": "Answer 1."
      }},
      {{
        "question": "Question 2?",
        "answer": "Answer 2."
      }}
    ]

    Text to create questions from:
    {text}

  # QA pair rating prompt
  qa_rating: |
    Rate each question-answer pair on a scale from 1-10, based on:
    - Accuracy (0-3): factual correctness
    - Relevance (0-2): relevance to content
    - Clarity (0-2): clear language
    - Usefulness (0-3): value for model learning

    YOU MUST RETURN A VALID JSON OBJECT OR ARRAY WITH THIS EXACT SCHEMA:
    {{
      "question": "Exact question text",
      "answer": "Exact answer text",
      "rating": 8
    }}

    OR FOR MULTIPLE PAIRS:
    [
      {{"question": "Q1", "answer": "A1", "rating": 8}},
      {{"question": "Q2", "answer": "A2", "rating": 9}}
    ]

    *** YOUR RESPONSE MUST BE VALID JSON AND NOTHING ELSE - NO EXPLANATION, NO MARKDOWN ***

    QA pairs to rate:
    {pairs}

  # Chain of Thought generation prompt
  cot_generation: |
    Create {num_examples} complex reasoning examples from this text that demonstrate chain-of-thought thinking.

    Each example should have:
    1. A challenging question that requires step-by-step reasoning
    2. Detailed reasoning steps that break down the problem
    3. A concise final answer

    Rules:
    - Return ONLY valid JSON, no markdown, no explanation, no code blocks
    - Each example must have "question", "reasoning", and "answer" fields

    Expected format:
    [
      {{
        "question": "Complex question about the text?",
        "reasoning": "Step 1: First consideration...\nStep 2: Next analysis...\nStep 3: Conclusion...",
        "answer": "Final answer based on the reasoning."
      }},
      {{
        "question": "Another complex question?",
        "reasoning": "Step 1: ...\nStep 2: ...",
        "answer": "Final answer drawn from the reasoning."
      }}
    ]

    Text to create reasoning examples from:
    {text}

  # Chain of Thought enhancement prompt
  cot_enhancement: |
    You are an expert reasoning assistant. Your task is to enhance the given conversations by adding chain-of-thought reasoning.

    For each conversation, add detailed step-by-step reasoning to the assistant's responses while preserving the original answer.

    {include_simple_steps} = Whether to add reasoning to simple responses too. If false, only add reasoning to complex responses.

    Return the enhanced conversations as a JSON array matching this format:
    [
      [
        {{"role": "system", "content": "System message (if any)"}},
        {{"role": "user", "content": "User question"}},
        {{"role": "assistant", "content": "Let me work through this:\n\n1. I'll start by...\n2. Next...\n\nIn conclusion, [original answer]"}}
      ]
    ]

    Original conversations:
    {conversations}