Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 44 additions & 1 deletion scripts/code_hallucination/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

# === Paths ===
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
DATA_DIR = PROJECT_ROOT / "data" / "code_hallucination"
DEFAULT_DATA_DIR = PROJECT_ROOT / "data" / "code_hallucination"
DATA_DIR = Path(os.environ.get("CODE_HALLUCINATION_OUTPUT_DIR", str(DEFAULT_DATA_DIR)))
REPOS_DIR = DATA_DIR / "repos"
SOURCE_CACHE_DIR = DATA_DIR / "source_cache"

Expand All @@ -21,6 +22,37 @@
METADATA_PATH = DATA_DIR / "code_hallucination_metadata.json"
VALIDATION_REPORT_PATH = DATA_DIR / "validation_report.txt"


def set_output_dir(path: str | os.PathLike[str]) -> Path:
"""Redirect all pipeline outputs to a specific directory."""
global DATA_DIR
global REPOS_DIR
global SOURCE_CACHE_DIR
global INSTANCES_PATH
global QUERIES_PATH
global DOCS_PATH
global FORMATS_PATH
global HALLUCINATED_PATH
global DATASET_PATH
global METADATA_PATH
global VALIDATION_REPORT_PATH

DATA_DIR = Path(path)
REPOS_DIR = DATA_DIR / "repos"
SOURCE_CACHE_DIR = DATA_DIR / "source_cache"
INSTANCES_PATH = DATA_DIR / "swebench_instances.json"
QUERIES_PATH = DATA_DIR / "queries.jsonl"
DOCS_PATH = DATA_DIR / "documentation.jsonl"
FORMATS_PATH = DATA_DIR / "formats.jsonl"
HALLUCINATED_PATH = DATA_DIR / "hallucinated_samples.jsonl"
DATASET_PATH = DATA_DIR / "code_hallucination_data.json"
METADATA_PATH = DATA_DIR / "code_hallucination_metadata.json"
VALIDATION_REPORT_PATH = DATA_DIR / "validation_report.txt"

os.environ["CODE_HALLUCINATION_OUTPUT_DIR"] = str(DATA_DIR)
return DATA_DIR


# === LLM API Config ===
# Override via env vars or CLI args
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
Expand All @@ -36,6 +68,7 @@
HALLUCINATION_RATIO = 0.4 # 40% hallucinated, 60% clean
MAX_FILE_CHARS = 12000 # Cap individual source file size
MAX_CONTEXT7_CHARS = 4000 # Documentation fetch limit
MAX_PROMPT_CHARS = 24000 # ~6K tokens, leaves room for answer within 8K model context

# === LLM Config ===
RETRY_DELAY = 2
Expand All @@ -54,3 +87,13 @@
# SWE-bench datasets
SWEBENCH_FULL = "princeton-nlp/SWE-bench"
SWEBENCH_LITE = "princeton-nlp/SWE-bench_Lite"

# Models that require max_completion_tokens instead of max_tokens
_REASONING_MODEL_PREFIXES = ("o1", "o3", "o4", "gpt-5")


def token_limit_kwargs(model: str, max_tokens: int = 4000) -> dict:
    """Return the right token-limit kwarg for the given model.

    Reasoning-family models (o1/o3/o4/gpt-5 prefixes) reject the legacy
    ``max_tokens`` parameter and require ``max_completion_tokens``; for
    those we also pin ``reasoning_effort`` to ``"none"`` so the budget is
    spent on the answer rather than hidden reasoning tokens.

    Args:
        model: Model identifier as sent to the API (e.g. ``"gpt-4o"``).
        max_tokens: Completion-token budget to request.

    Returns:
        A dict with exactly one token-limit kwarg (plus ``reasoning_effort``
        for reasoning models), ready to splat into the API call.
    """
    # str.startswith accepts a tuple of prefixes — no need for any()/genexpr.
    if model.startswith(_REASONING_MODEL_PREFIXES):
        return {"max_completion_tokens": max_tokens, "reasoning_effort": "none"}
    return {"max_tokens": max_tokens}
2 changes: 1 addition & 1 deletion scripts/code_hallucination/context7_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def run(instances: list[dict]):

if processed % 100 == 0:
print(
f" Progress: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
f" Phase 4: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
)

print(
Expand Down
Loading
Loading