4 changes: 3 additions & 1 deletion .gitignore
@@ -8,4 +8,6 @@ __pycache__
data/
*.pdf
.venv-ci/
data/
example_output
!/docs/Why_lang_models_hallucinate.pdf
vllm/
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default.

6 changes: 6 additions & 0 deletions .idea/copilot.data.migration.agent.xml

6 changes: 6 additions & 0 deletions .idea/copilot.data.migration.ask.xml

6 changes: 6 additions & 0 deletions .idea/copilot.data.migration.ask2agent.xml

6 changes: 6 additions & 0 deletions .idea/copilot.data.migration.edit.xml

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

8 changes: 8 additions & 0 deletions .idea/misc.xml

8 changes: 8 additions & 0 deletions .idea/modules.xml

27 changes: 27 additions & 0 deletions .idea/synthetic-data-kit.iml

8 changes: 8 additions & 0 deletions .idea/vcs.xml

12 changes: 11 additions & 1 deletion README.md
@@ -72,7 +72,17 @@ mkdir -p data/{input,parsed,generated,curated,final}
mkdir -p data/{pdf,html,youtube,docx,ppt,txt,output,generated,cleaned,final}
```

- You also need a LLM backend that you will utilize for generating your dataset, if using vLLM:
- You also need an LLM backend to generate your dataset:
- If using Ollama:
```bash
# Download Ollama from https://ollama.com/download
# Pull the Llama 3 model:
ollama pull llama3
# Run the Ollama server
ollama serve
# The server is now running at http://localhost:11434
```
- If using vLLM:

```bash
# Start vLLM server
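For the Ollama route shown just above, a quick way to confirm the backend is reachable before running the kit is to hit Ollama's OpenAI-compatible endpoint directly. This is a minimal sketch, not part of the PR; it assumes the default address `http://localhost:11434/v1` and that `ollama pull llama3` has completed:

```python
# Sanity check for the Ollama backend described in the README above.
# Assumes Ollama is serving at its default address and the llama3 model has been pulled.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:11434/v1",  # Ollama's OpenAI-compatible endpoint
    api_key="not-needed",                  # Ollama ignores the key, but the client requires a value
)

response = client.chat.completions.create(
    model="llama3",
    messages=[{"role": "user", "content": "Reply with the single word: ready"}],
    max_tokens=5,
)
print(response.choices[0].message.content)
```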
55 changes: 54 additions & 1 deletion configs/config.yaml
@@ -14,8 +14,11 @@ paths:

# LLM Provider configuration
llm:
provider: "ollama" #Using api-endpoint for Ollama
# Provider selection: "vllm", "api-endpoint", or "ollama"
provider: "api-endpoint"
# provider: "api-endpoint" #Using api-endpoint for Llama API



# VLLM server configuration
vllm:
@@ -35,6 +38,15 @@ api-endpoint:
retry_delay: 1.0 # Initial delay between retries (seconds)
sleep_time: 0.5 # Small delay in seconds between batches to avoid rate limits

# Ollama server configuration (for Ollama via OpenAI-compatible API)
ollama:
api_base: "http://localhost:11434/v1" # Ollama's OpenAI-compatible endpoint
api_key: "not-needed" # Ollama doesn't require an API key
model: "llama3:latest" # Your Ollama model
max_retries: 3 # Number of retries for API calls
retry_delay: 1.0 # Initial delay between retries (seconds)


# Ingest configuration
ingest:
default_format: "txt" # Default output format for parsed files
@@ -67,6 +79,39 @@ format:
include_metadata: true # Include metadata in output files
pretty_json: true # Use indentation in JSON output

# Provider-specific settings for different use cases
provider_configs:
# For local development and testing (using Ollama via api-endpoint)
local_dev:
provider: "api-endpoint"
api_base: "http://localhost:11434/v1"
api_key: "not-needed"
model: "llama3:latest"
generation:
temperature: 0.8
max_tokens: 2048
batch_size: 5 # Smaller batch for local resources

# For production with Llama API
production:
provider: "api-endpoint"
api_base: "https://api.llama.com/v1"
api_key: "llama_api_key"
model: "Llama-4-Maverick-17B-128E-Instruct-FP8"
generation:
temperature: 0.7
max_tokens: 4096
batch_size: 32

# For high-performance local inference
local_vllm:
provider: "vllm"
model: "meta-llama/Llama-3.3-70B-Instruct"
generation:
temperature: 0.7
max_tokens: 4096
batch_size: 64

# Prompts for different tasks
prompts:
# Summary generation prompt
@@ -173,3 +218,11 @@ prompts:

Original conversations:
{conversations}

# Environment variable mappings (optional)
env_vars:
API_ENDPOINT_KEY: "not-needed" # For api-endpoint provider (Ollama doesn't need a key)
OLLAMA_HOST: "http://localhost:11434" # For reference
VLLM_HOST: "http://localhost:8000" # For vllm provider
# SDK_VERBOSE: 'True'
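To illustrate how the new `ollama` block above maps onto an actual client, the snippet below reads `configs/config.yaml` and builds an OpenAI-compatible client from it. This is illustrative only; it is not the project's config loader, and the fallback values simply mirror the defaults shown in the YAML:

```python
# Illustrative only: consume the `ollama` block from configs/config.yaml shown above.
# Not the project's actual loader; defaults mirror the values in the YAML.
import yaml
from openai import OpenAI

with open("configs/config.yaml") as f:
    config = yaml.safe_load(f)

ollama_cfg = config.get("ollama", {}) or {}
client = OpenAI(
    base_url=ollama_cfg.get("api_base", "http://localhost:11434/v1"),
    api_key=ollama_cfg.get("api_key", "not-needed"),
)
print("Using model:", ollama_cfg.get("model", "llama3:latest"))
```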

Binary file added docs/Why_lang_models_hallucinate.pdf
Binary file not shown.
18 changes: 17 additions & 1 deletion synthetic_data_kit/cli.py
@@ -13,7 +13,7 @@
from rich.console import Console
from rich.table import Table

from synthetic_data_kit.utils.config import load_config, get_vllm_config, get_openai_config, get_llm_provider, get_path_config
from synthetic_data_kit.utils.config import load_config, get_vllm_config, get_openai_config, get_llm_provider, get_path_config, get_ollama_config
from synthetic_data_kit.core.context import AppContext
from synthetic_data_kit.server.app import run_server

@@ -338,6 +338,14 @@ def create(
api_base = api_base or api_endpoint_config.get("api_base")
model = model or api_endpoint_config.get("model")
# No server check needed for API endpoint

if provider == "ollama":
# Use Ollama config
ollama_config = get_ollama_config(ctx.config)
api_base = api_base or ollama_config.get("api_base")
model = model or ollama_config.get("model")
# No server check needed for Ollama endpoint

else:
# Use vLLM config
vllm_config = get_vllm_config(ctx.config)
@@ -498,6 +506,14 @@ def curate(
api_base = api_base or api_endpoint_config.get("api_base")
model = model or api_endpoint_config.get("model")
# No server check needed for API endpoint

if provider == "ollama":
# Use Ollama config
ollama_config = get_ollama_config(ctx.config)
api_base = api_base or ollama_config.get("api_base")
model = model or ollama_config.get("model")
# No server check needed for Ollama endpoint

else:
# Use vLLM config
vllm_config = get_vllm_config(ctx.config)
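The import added above pulls in `get_ollama_config` from `synthetic_data_kit.utils.config`, but its definition is not part of this diff. Assuming it follows the same pattern as the other `get_*_config` helpers and the `ollama` block defined in the config files, a minimal version might look like the sketch below — a hypothetical illustration, not the actual implementation:

```python
# Hypothetical sketch of the helper imported in cli.py above; the real definition
# belongs in synthetic_data_kit/utils/config.py and is not shown in this diff.
from typing import Any, Dict

def get_ollama_config(config: Dict[str, Any]) -> Dict[str, Any]:
    """Return the `ollama` section of the loaded config, filling in safe defaults."""
    defaults = {
        "api_base": "http://localhost:11434/v1",
        "api_key": "not-needed",
        "model": "llama3:latest",
        "max_retries": 3,
        "retry_delay": 1.0,
    }
    ollama_config = config.get("ollama", {}) or {}
    return {**defaults, **ollama_config}
```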
13 changes: 11 additions & 2 deletions synthetic_data_kit/config.yaml
@@ -15,7 +15,8 @@ paths:
# LLM Provider configuration
llm:
# Provider selection: "vllm", "api-endpoint", or "ollama"
provider: "api-endpoint"
# provider: "api-endpoint"
provider: "ollama"

# VLLM server configuration
vllm:
@@ -24,7 +25,15 @@
model: "meta-llama/Llama-3.3-70B-Instruct" # Default model to use
max_retries: 3 # Number of retries for API calls
retry_delay: 1.0 # Initial delay between retries (seconds)


# Ollama server configuration
ollama:
api_base: "http://localhost:11434/v1" # Ollama's OpenAI-compatible endpoint
api_key: "not-needed" # Ollama doesn't require an API key
model: "llama3:latest" # Your Ollama model
max_retries: 3 # Number of retries for API calls
retry_delay: 1.0 # Initial delay between retries (seconds)

# API endpoint configuration
api-endpoint:
api_base: "https://api.llama.com/v1" # Optional base URL for API endpoint (null for default API)
6 changes: 3 additions & 3 deletions synthetic_data_kit/core/context.py
@@ -21,16 +21,16 @@ def __init__(self, config_path: Optional[Path] = None):
# Ensure data directories exist
self._ensure_data_dirs()

# Why have separeate folders? Yes ideally you should just be able to ingest an input folder and have everything being ingested and converted BUT
# Managing context window is hard and there are more edge cases which needs to be handled carefully
# Why have separate folders? Yes, ideally you should just be able to ingest an input folder and have everything ingested and converted, BUT
# Managing context window is hard and there are more edge cases that need to be handled carefully
# it's also easier to debug in alpha if we have multiple files.
def _ensure_data_dirs(self):
"""Ensure data directories exist based on configuration"""
# Load config to get proper paths
config = load_config(self.config_path)
paths_config = config.get('paths', {})

# Create input directory - handle new config format where input is a string
# Create the input directory - handle the new config format where input is a string
input_dir = paths_config.get('input', 'data/input')
os.makedirs(input_dir, exist_ok=True)

13 changes: 9 additions & 4 deletions synthetic_data_kit/core/create.py
@@ -33,7 +33,7 @@ def process_file(
model: Optional[str] = None,
content_type: str = "qa",
num_pairs: Optional[int] = None,
verbose: bool = False,
verbose: bool = True,
provider: Optional[str] = None,
chunk_size: Optional[int] = None,
chunk_overlap: Optional[int] = None,
Expand All @@ -42,9 +42,14 @@ def process_file(
"""Process a file to generate content

Args:
provider: LLM provider to use (e.g., "vllm", "api-endpoint", "ollama")
chunk_size: Size of text chunks for processing
chunk_overlap: Overlap between text chunks
rolling_summary: Use a rolling summary for context
verbose: Whether to print extra logging
file_path: Path to the text file to process
output_dir: Directory to save generated content
config_path: Path to configuration file
config_path: Path to a configuration file
api_base: VLLM API base URL
model: Model to use
content_type: Type of content to generate (qa, summary, cot)
Expand All @@ -54,7 +59,7 @@ def process_file(
Returns:
Path to the output file
"""
# Create output directory if it doesn't exist
# Create the output directory if it doesn't exist
# The reason for having this directory logic for now is explained in context.py
os.makedirs(output_dir, exist_ok=True)

Expand Down Expand Up @@ -86,14 +91,14 @@ def process_file(
documents = [{"text": read_json(file_path), "image": None}]

if content_type == "qa":
print("Generating QA pairs...")
generator = QAGenerator(client, config_path)

# Get num_pairs from args or config
if num_pairs is None:
config = client.config
generation_config = get_generation_config(config)
num_pairs = generation_config.get("num_pairs", 25)

# Process document
result = generator.process_documents(
documents,
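As a usage illustration for the signature above, a caller could select the Ollama backend through the new `provider` argument. This is only a sketch: the paths and values are examples, `config_path=None` assumes the default config is picked up, and it presumes the LLM client accepts the `ollama` provider added in this PR:

```python
# Example invocation of process_file with the new provider argument.
# Paths and values are illustrative; a parsed text file is assumed to exist.
from synthetic_data_kit.core.create import process_file

output_path = process_file(
    file_path="data/parsed/Why_lang_models_hallucinate.txt",  # example input
    output_dir="data/generated",
    config_path=None,                      # fall back to the default config
    api_base="http://localhost:11434/v1",  # Ollama's OpenAI-compatible endpoint
    model="llama3:latest",
    content_type="qa",
    num_pairs=10,
    verbose=True,
    provider="ollama",
)
print(f"Generated content written to {output_path}")
```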
3 changes: 2 additions & 1 deletion synthetic_data_kit/core/curate.py
@@ -28,6 +28,7 @@ def curate_qa_pairs(
"""Clean and filter QA pairs based on quality ratings

Args:
provider: LLM provider to use (e.g., vllm, api-endpoint, ollama)
input_path: Path to the input file with QA pairs
output_path: Path to save the cleaned output
threshold: Quality threshold (1-10)
@@ -117,7 +118,7 @@
# This avoids conflicts with other output messages
print(f"Processing {len(batches)} batches of QA pairs...")

# Only use detailed progress bar in verbose mode
# Only use a detailed progress bar in verbose mode
if verbose:
from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn, TimeRemainingColumn

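For the `provider` argument documented above, a call to `curate_qa_pairs` might look like the following. The argument names come from the docstring in this diff; the file paths are examples and the remaining parameters are assumed to keep their defaults:

```python
# Example invocation of curate_qa_pairs using the documented arguments.
# Paths are illustrative; parameters not shown are left at their defaults.
from synthetic_data_kit.core.curate import curate_qa_pairs

curate_qa_pairs(
    input_path="data/generated/Why_lang_models_hallucinate_qa_pairs.json",
    output_path="data/curated/Why_lang_models_hallucinate_curated.json",
    threshold=7.0,        # keep only pairs rated 7 or higher (scale 1-10)
    provider="ollama",    # route rating calls through the Ollama backend
)
```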
2 changes: 1 addition & 1 deletion synthetic_data_kit/core/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def determine_parser(file_path: str, config: Dict[str, Any], multimodal: bool =
# Check if it's a URL
if file_path.startswith(("http://", "https://")):
# YouTube URL
if "youtube.com" in file_path or "youtu.be" in file_path:
if "youtube.com" in file_path or "youtube" in file_path:
return YouTubeParser()
# PDF URL
elif _check_pdf_url(file_path):