4 changes: 3 additions & 1 deletion .gitignore
@@ -8,4 +8,6 @@ __pycache__
data/
*.pdf
.venv-ci/
data/
example_output
!/docs/Why_lang_models_hallucinate.pdf
vllm/
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default.

6 changes: 6 additions & 0 deletions .idea/copilot.data.migration.agent.xml

6 changes: 6 additions & 0 deletions .idea/copilot.data.migration.ask.xml

6 changes: 6 additions & 0 deletions .idea/copilot.data.migration.ask2agent.xml

6 changes: 6 additions & 0 deletions .idea/copilot.data.migration.edit.xml

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

8 changes: 8 additions & 0 deletions .idea/misc.xml

8 changes: 8 additions & 0 deletions .idea/modules.xml

27 changes: 27 additions & 0 deletions .idea/synthetic-data-kit.iml

8 changes: 8 additions & 0 deletions .idea/vcs.xml

12 changes: 11 additions & 1 deletion README.md
@@ -72,7 +72,17 @@ mkdir -p data/{input,parsed,generated,curated,final}
mkdir -p data/{pdf,html,youtube,docx,ppt,txt,output,generated,cleaned,final}
```

- You also need a LLM backend that you will utilize for generating your dataset, if using vLLM:
- You also need an LLM backend to generate your dataset:
- If using Ollama:
```bash
# Download Ollama from https://ollama.com/download
# Pull the Llama 3 model:
ollama pull llama3
# Run the Ollama server
ollama serve
# The server is now running at http://localhost:11434
```
- If using vLLM:

```bash
# Start vLLM server
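For the Ollama route shown just above, a quick way to confirm the backend is reachable before running the kit is to hit Ollama's OpenAI-compatible endpoint directly. This is a minimal sketch, not part of the PR; it assumes the default address `http://localhost:11434/v1` and that `ollama pull llama3` has completed:

```python
# Sanity check for the Ollama backend described in the README above.
# Assumes Ollama is serving at its default address and the llama3 model has been pulled.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:11434/v1",  # Ollama's OpenAI-compatible endpoint
    api_key="not-needed",                  # Ollama ignores the key, but the client requires a value
)

response = client.chat.completions.create(
    model="llama3",
    messages=[{"role": "user", "content": "Reply with the single word: ready"}],
    max_tokens=5,
)
print(response.choices[0].message.content)
```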
55 changes: 54 additions & 1 deletion configs/config.yaml
@@ -14,8 +14,11 @@ paths:

# LLM Provider configuration
llm:
provider: "ollama" #Using api-endpoint for Ollama
# Provider selection: "vllm", "api-endpoint", or "ollama"
provider: "api-endpoint"
# provider: "api-endpoint" #Using api-endpoint for Llama API



# VLLM server configuration
vllm:
@@ -35,6 +38,15 @@ api-endpoint:
retry_delay: 1.0 # Initial delay between retries (seconds)
sleep_time: 0.5 # Small delay in seconds between batches to avoid rate limits

# Ollama server configuration (for Ollama via OpenAI-compatible API)
ollama:
api_base: "http://localhost:11434/v1" # Ollama's OpenAI-compatible endpoint
api_key: "not-needed" # Ollama doesn't require an API key
model: "llama3:latest" # Your Ollama model
max_retries: 3 # Number of retries for API calls
retry_delay: 1.0 # Initial delay between retries (seconds)


# Ingest configuration
ingest:
default_format: "txt" # Default output format for parsed files
@@ -67,6 +79,39 @@ format:
include_metadata: true # Include metadata in output files
pretty_json: true # Use indentation in JSON output

# Provider-specific settings for different use cases
provider_configs:
# For local development and testing (using Ollama via api-endpoint)
local_dev:
provider: "api-endpoint"
api_base: "http://localhost:11434/v1"
api_key: "not-needed"
model: "llama3:latest"
generation:
temperature: 0.8
max_tokens: 2048
batch_size: 5 # Smaller batch for local resources

# For production with Llama API
production:
provider: "api-endpoint"
api_base: "https://api.llama.com/v1"
api_key: "llama_api_key"
model: "Llama-4-Maverick-17B-128E-Instruct-FP8"
generation:
temperature: 0.7
max_tokens: 4096
batch_size: 32

# For high-performance local inference
local_vllm:
provider: "vllm"
model: "meta-llama/Llama-3.3-70B-Instruct"
generation:
temperature: 0.7
max_tokens: 4096
batch_size: 64

# Prompts for different tasks
prompts:
# Summary generation prompt
@@ -173,3 +218,11 @@ prompts:

Original conversations:
{conversations}

# Environment variable mappings (optional)
env_vars:
API_ENDPOINT_KEY: "not-needed" # For api-endpoint provider (Ollama doesn't need a key)
OLLAMA_HOST: "http://localhost:11434" # For reference
VLLM_HOST: "http://localhost:8000" # For vllm provider
# SDK_VERBOSE: 'True'
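To illustrate how the new `ollama` block above maps onto an actual client, the snippet below reads `configs/config.yaml` and builds an OpenAI-compatible client from it. This is illustrative only; it is not the project's config loader, and the fallback values simply mirror the defaults shown in the YAML:

```python
# Illustrative only: consume the `ollama` block from configs/config.yaml shown above.
# Not the project's actual loader; defaults mirror the values in the YAML.
import yaml
from openai import OpenAI

with open("configs/config.yaml") as f:
    config = yaml.safe_load(f)

ollama_cfg = config.get("ollama", {}) or {}
client = OpenAI(
    base_url=ollama_cfg.get("api_base", "http://localhost:11434/v1"),
    api_key=ollama_cfg.get("api_key", "not-needed"),
)
print("Using model:", ollama_cfg.get("model", "llama3:latest"))
```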

Binary file added docs/Why_lang_models_hallucinate.pdf
Binary file not shown.
18 changes: 17 additions & 1 deletion synthetic_data_kit/cli.py
@@ -13,7 +13,7 @@
from rich.console import Console
from rich.table import Table

from synthetic_data_kit.utils.config import load_config, get_vllm_config, get_openai_config, get_llm_provider, get_path_config
from synthetic_data_kit.utils.config import load_config, get_vllm_config, get_openai_config, get_llm_provider, get_path_config, get_ollama_config
from synthetic_data_kit.core.context import AppContext
from synthetic_data_kit.server.app import run_server

@@ -338,6 +338,14 @@ def create(
api_base = api_base or api_endpoint_config.get("api_base")
model = model or api_endpoint_config.get("model")
# No server check needed for API endpoint

if provider == "ollama":
# Use Ollama config
ollama_config = get_ollama_config(ctx.config)
api_base = api_base or ollama_config.get("api_base")
model = model or ollama_config.get("model")
# No server check needed for Ollama endpoint

else:
# Use vLLM config
vllm_config = get_vllm_config(ctx.config)
@@ -498,6 +506,14 @@ def curate(
api_base = api_base or api_endpoint_config.get("api_base")
model = model or api_endpoint_config.get("model")
# No server check needed for API endpoint

if provider == "ollama":
# Use Ollama config
ollama_config = get_ollama_config(ctx.config)
api_base = api_base or ollama_config.get("api_base")
model = model or ollama_config.get("model")
# No server check needed for Ollama endpoint

else:
# Use vLLM config
vllm_config = get_vllm_config(ctx.config)
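The import added above pulls in `get_ollama_config` from `synthetic_data_kit.utils.config`, but its definition is not part of this diff. Assuming it follows the same pattern as the other `get_*_config` helpers and the `ollama` block defined in the config files, a minimal version might look like the sketch below — a hypothetical illustration, not the actual implementation:

```python
# Hypothetical sketch of the helper imported in cli.py above; the real definition
# belongs in synthetic_data_kit/utils/config.py and is not shown in this diff.
from typing import Any, Dict

def get_ollama_config(config: Dict[str, Any]) -> Dict[str, Any]:
    """Return the `ollama` section of the loaded config, filling in safe defaults."""
    defaults = {
        "api_base": "http://localhost:11434/v1",
        "api_key": "not-needed",
        "model": "llama3:latest",
        "max_retries": 3,
        "retry_delay": 1.0,
    }
    ollama_config = config.get("ollama", {}) or {}
    return {**defaults, **ollama_config}
```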
13 changes: 11 additions & 2 deletions synthetic_data_kit/config.yaml
@@ -15,7 +15,8 @@ paths:
# LLM Provider configuration
llm:
# Provider selection: "vllm", "api-endpoint", or "ollama"
provider: "api-endpoint"
# provider: "api-endpoint"
provider: "ollama"

# VLLM server configuration
vllm:
@@ -24,7 +25,15 @@
model: "meta-llama/Llama-3.3-70B-Instruct" # Default model to use
max_retries: 3 # Number of retries for API calls
retry_delay: 1.0 # Initial delay between retries (seconds)


# Ollama server configuration
ollama:
api_base: "http://localhost:11434/v1" # Ollama's OpenAI-compatible endpoint
api_key: "not-needed" # Ollama doesn't require an API key
model: "llama3:latest" # Your Ollama model
max_retries: 3 # Number of retries for API calls
retry_delay: 1.0 # Initial delay between retries (seconds)

# API endpoint configuration
api-endpoint:
api_base: "https://api.llama.com/v1" # Optional base URL for API endpoint (null for default API)
6 changes: 3 additions & 3 deletions synthetic_data_kit/core/context.py
@@ -21,16 +21,16 @@ def __init__(self, config_path: Optional[Path] = None):
# Ensure data directories exist
self._ensure_data_dirs()

# Why have separeate folders? Yes ideally you should just be able to ingest an input folder and have everything being ingested and converted BUT
# Managing context window is hard and there are more edge cases which needs to be handled carefully
# Why have separate folders? Yes, ideally you should just be able to ingest an input folder and have everything ingested and converted, BUT
# Managing context window is hard and there are more edge cases that need to be handled carefully
# it's also easier to debug in alpha if we have multiple files.
def _ensure_data_dirs(self):
"""Ensure data directories exist based on configuration"""
# Load config to get proper paths
config = load_config(self.config_path)
paths_config = config.get('paths', {})

# Create input directory - handle new config format where input is a string
# Create the input directory - handle the new config format where input is a string
input_dir = paths_config.get('input', 'data/input')
os.makedirs(input_dir, exist_ok=True)

13 changes: 9 additions & 4 deletions synthetic_data_kit/core/create.py
@@ -33,7 +33,7 @@ def process_file(
model: Optional[str] = None,
content_type: str = "qa",
num_pairs: Optional[int] = None,
verbose: bool = False,
verbose: bool = True,
provider: Optional[str] = None,
chunk_size: Optional[int] = None,
chunk_overlap: Optional[int] = None,
Expand All @@ -42,9 +42,14 @@ def process_file(
"""Process a file to generate content

Args:
provider: LLM provider to use (e.g., "vllm", "api-endpoint", "ollama")
chunk_size: Size of text chunks for processing
chunk_overlap: Overlap between text chunks
rolling_summary: Use a rolling summary for context
verbose: Whether to print extra logging
file_path: Path to the text file to process
output_dir: Directory to save generated content
config_path: Path to configuration file
config_path: Path to a configuration file
api_base: VLLM API base URL
model: Model to use
content_type: Type of content to generate (qa, summary, cot)
Expand All @@ -54,7 +59,7 @@ def process_file(
Returns:
Path to the output file
"""
# Create output directory if it doesn't exist
# Create the output directory if it doesn't exist
# The reason for having this directory logic for now is explained in context.py
os.makedirs(output_dir, exist_ok=True)

Expand Down Expand Up @@ -86,14 +91,14 @@ def process_file(
documents = [{"text": read_json(file_path), "image": None}]

if content_type == "qa":
print("Generating QA pairs...")
generator = QAGenerator(client, config_path)

# Get num_pairs from args or config
if num_pairs is None:
config = client.config
generation_config = get_generation_config(config)
num_pairs = generation_config.get("num_pairs", 25)

# Process document
result = generator.process_documents(
documents,
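As a usage illustration for the signature above, a caller could select the Ollama backend through the new `provider` argument. This is only a sketch: the paths and values are examples, `config_path=None` assumes the default config is picked up, and it presumes the LLM client accepts the `ollama` provider added in this PR:

```python
# Example invocation of process_file with the new provider argument.
# Paths and values are illustrative; a parsed text file is assumed to exist.
from synthetic_data_kit.core.create import process_file

output_path = process_file(
    file_path="data/parsed/Why_lang_models_hallucinate.txt",  # example input
    output_dir="data/generated",
    config_path=None,                      # fall back to the default config
    api_base="http://localhost:11434/v1",  # Ollama's OpenAI-compatible endpoint
    model="llama3:latest",
    content_type="qa",
    num_pairs=10,
    verbose=True,
    provider="ollama",
)
print(f"Generated content written to {output_path}")
```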
3 changes: 2 additions & 1 deletion synthetic_data_kit/core/curate.py
@@ -28,6 +28,7 @@ def curate_qa_pairs(
"""Clean and filter QA pairs based on quality ratings

Args:
provider: LLM provider to use (e.g., vllm, api-endpoint, ollama)
input_path: Path to the input file with QA pairs
output_path: Path to save the cleaned output
threshold: Quality threshold (1-10)
@@ -117,7 +118,7 @@
# This avoids conflicts with other output messages
print(f"Processing {len(batches)} batches of QA pairs...")

# Only use detailed progress bar in verbose mode
# Only use a detailed progress bar in verbose mode
if verbose:
from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn, TimeRemainingColumn

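For the `provider` argument documented above, a call to `curate_qa_pairs` might look like the following. The argument names come from the docstring in this diff; the file paths are examples and the remaining parameters are assumed to keep their defaults:

```python
# Example invocation of curate_qa_pairs using the documented arguments.
# Paths are illustrative; parameters not shown are left at their defaults.
from synthetic_data_kit.core.curate import curate_qa_pairs

curate_qa_pairs(
    input_path="data/generated/Why_lang_models_hallucinate_qa_pairs.json",
    output_path="data/curated/Why_lang_models_hallucinate_curated.json",
    threshold=7.0,        # keep only pairs rated 7 or higher (scale 1-10)
    provider="ollama",    # route rating calls through the Ollama backend
)
```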
2 changes: 1 addition & 1 deletion synthetic_data_kit/core/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def determine_parser(file_path: str, config: Dict[str, Any], multimodal: bool =
# Check if it's a URL
if file_path.startswith(("http://", "https://")):
# YouTube URL
if "youtube.com" in file_path or "youtu.be" in file_path:
if "youtube.com" in file_path or "youtube" in file_path:
return YouTubeParser()
# PDF URL
elif _check_pdf_url(file_path):