# Master configuration file for Synthetic Data Kit
# Workshop-ready configuration with Cerebras defaults

# Global paths configuration
paths:
  # Input data location (directory containing files to process)
  input: "data/input"  # Directory containing PDF, HTML, DOCX, PPT, TXT files

  # Output locations (4-stage pipeline directories)
  output:
    parsed: "data/parsed"  # Stage 1: Where parsed text files are saved (ingest output)
vllm:
  # NOTE(review): earlier vllm settings elided in this excerpt — confirm against full file
  max_retries: 3  # Number of retries for API calls
  retry_delay: 1.0  # Initial delay between retries (seconds)
  sleep_time: 0.1  # Small delay in seconds between batches to avoid rate limits

# API endpoint configuration (Cerebras defaults)
api-endpoint:
  api_base: "https://api.cerebras.ai/v1"  # Cerebras API endpoint
  api_key: "YOUR_CEREBRAS_API_KEY"  # Replace with your Cerebras API key
  model: "llama3.3-70b"  # Cerebras Llama 3.3 70B model
  max_retries: 3  # Number of retries for API calls
  retry_delay: 1.0  # Initial delay between retries (seconds)
  sleep_time: 0.5  # Small delay in seconds between batches to avoid rate limits

# Ingest configuration
ingest:
  default_format: "txt"  # Default output format for parsed files (txt for text-based workflows)
  youtube_captions: "auto"  # Options: "auto", "manual" - caption preference

# LLM generation parameters
generation:
  # NOTE(review): earlier generation settings elided in this excerpt — confirm against full file
  chunk_size: 4000  # Size of text chunks for processing
  overlap: 200  # Overlap between chunks to maintain context
  max_tokens: 4096  # Maximum tokens in LLM responses
  num_pairs: 25  # Default number of QA pairs to generate per chunk
  num_cot_examples: 5  # Default number of Chain of Thought examples to generate
  num_cot_enhance_examples: null  # Maximum number of conversations to enhance (null = enhance all)
  batch_size: 5  # Number of requests to batch together (optimized for Cerebras)
  max_context_length: 8000  # Context Length of the MODEL. Useful while Generating Summary
  summary_overlap: 0  # Overlap between chunks to maintain context. Useful while Generating Summary

# Content curation parameters
curate:
  threshold: 7.5  # Default quality threshold (1-10) - keep only high-quality pairs
  batch_size: 3  # Number of items per batch for rating (smaller batches for API stability)
  inference_batch: 5  # Number of batches to process at once with VLLM
  temperature: 0.1  # Temperature for rating (lower = more consistent)

# Prompt templates
prompts:
  # NOTE(review): any earlier prompt entries elided in this excerpt — confirm against full file

  # Summary generation prompt
  summary: |
    Summarize this document in 3-5 sentences, focusing on the main topic and key concepts.

  # QA pair generation prompt
  qa_generation: |
    Create {num_pairs} question-answer pairs from this text for LLM training.

    Document context: {summary}

    Rules:
    1. Questions must be about important facts in the text
    2. Answers must be directly supported by the text
    3. Make questions diverse and educational
    4. Return ONLY valid JSON, no markdown, no explanation, no code blocks

    Expected format:
    [
      {{
        "question": "Question 1?",
        "answer": "Answer 1."
      }},
      {{
        "question": "Question 2?",
        "answer": "Answer 2."
      }}
    ]

    Text to create questions from:
    {text}

  # QA pair rating prompt
  qa_rating: |
    Rate each question-answer pair on a scale from 1-10, based on:
    - Accuracy (0-3): factual correctness
    - Relevance (0-2): relevance to content
    - Clarity (0-2): clear language
    - Usefulness (0-3): value for model learning

    YOU MUST RETURN A VALID JSON OBJECT OR ARRAY WITH THIS EXACT SCHEMA:
    {{
      "question": "Exact question text",
      "answer": "Exact answer text",
      "rating": 8
    }}

    OR FOR MULTIPLE PAIRS:
    [
      {{"question": "Q1", "answer": "A1", "rating": 8}},
      {{"question": "Q2", "answer": "A2", "rating": 9}}
    ]

    *** YOUR RESPONSE MUST BE VALID JSON AND NOTHING ELSE - NO EXPLANATION, NO MARKDOWN ***

    QA pairs to rate:
    {pairs}

  # Chain of Thought generation prompt
  cot_generation: |
    Create {num_examples} complex reasoning examples from this text that demonstrate chain-of-thought thinking.

    Each example should have:
    1. A challenging question that requires step-by-step reasoning
    2. Detailed reasoning steps that break down the problem
    3. A concise final answer

    Rules:
    - Return ONLY valid JSON, no markdown, no explanation, no code blocks
    - Each example must have "question", "reasoning", and "answer" fields

    Expected format:
    [
      {{
        "question": "Complex question about the text?",
        "reasoning": "Step 1: First consideration...\nStep 2: Next analysis...\nStep 3: Conclusion...",
        "answer": "Final answer based on the reasoning."
      }},
      {{
        "question": "Another complex question?",
        "reasoning": "Step 1: ...\nStep 2: ...",
        "answer": "Final answer drawn from the reasoning."
      }}
    ]

    Text to create reasoning examples from:
    {text}

  # Chain of Thought enhancement prompt
  cot_enhancement: |
    You are an expert reasoning assistant. Your task is to enhance the given conversations by adding chain-of-thought reasoning.

    For each conversation, add detailed step-by-step reasoning to the assistant's responses while preserving the original answer.

    {include_simple_steps} = Whether to add reasoning to simple responses too. If false, only add reasoning to complex responses.

    Return the enhanced conversations as a JSON array matching this format:
    [
      [
        {{"role": "system", "content": "System message (if any)"}},
        {{"role": "user", "content": "User question"}},
        {{"role": "assistant", "content": "Let me work through this:\n\n1. I'll start by...\n2. Next...\n\nIn conclusion, [original answer]"}}
      ]
    ]

    Original conversations:
    {conversations}