Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 44 additions & 1 deletion scripts/code_hallucination/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

# === Paths ===
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
DATA_DIR = PROJECT_ROOT / "data" / "code_hallucination"
DEFAULT_DATA_DIR = PROJECT_ROOT / "data" / "code_hallucination"
DATA_DIR = Path(os.environ.get("CODE_HALLUCINATION_OUTPUT_DIR", str(DEFAULT_DATA_DIR)))
REPOS_DIR = DATA_DIR / "repos"
SOURCE_CACHE_DIR = DATA_DIR / "source_cache"

Expand All @@ -21,6 +22,37 @@
METADATA_PATH = DATA_DIR / "code_hallucination_metadata.json"
VALIDATION_REPORT_PATH = DATA_DIR / "validation_report.txt"


def set_output_dir(path: str | os.PathLike[str]) -> Path:
"""Redirect all pipeline outputs to a specific directory."""
global DATA_DIR
global REPOS_DIR
global SOURCE_CACHE_DIR
global INSTANCES_PATH
global QUERIES_PATH
global DOCS_PATH
global FORMATS_PATH
global HALLUCINATED_PATH
global DATASET_PATH
global METADATA_PATH
global VALIDATION_REPORT_PATH

DATA_DIR = Path(path)
REPOS_DIR = DATA_DIR / "repos"
SOURCE_CACHE_DIR = DATA_DIR / "source_cache"
INSTANCES_PATH = DATA_DIR / "swebench_instances.json"
QUERIES_PATH = DATA_DIR / "queries.jsonl"
DOCS_PATH = DATA_DIR / "documentation.jsonl"
FORMATS_PATH = DATA_DIR / "formats.jsonl"
HALLUCINATED_PATH = DATA_DIR / "hallucinated_samples.jsonl"
DATASET_PATH = DATA_DIR / "code_hallucination_data.json"
METADATA_PATH = DATA_DIR / "code_hallucination_metadata.json"
VALIDATION_REPORT_PATH = DATA_DIR / "validation_report.txt"

os.environ["CODE_HALLUCINATION_OUTPUT_DIR"] = str(DATA_DIR)
return DATA_DIR


# === LLM API Config ===
# Override via env vars or CLI args
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
Expand All @@ -36,6 +68,7 @@
HALLUCINATION_RATIO = 0.4 # 40% hallucinated, 60% clean
MAX_FILE_CHARS = 12000 # Cap individual source file size
MAX_CONTEXT7_CHARS = 4000 # Documentation fetch limit
MAX_PROMPT_CHARS = 24000 # ~6K tokens, leaves room for answer within 8K model context

# === LLM Config ===
RETRY_DELAY = 2
Expand All @@ -54,3 +87,13 @@
# SWE-bench datasets
SWEBENCH_FULL = "princeton-nlp/SWE-bench"
SWEBENCH_LITE = "princeton-nlp/SWE-bench_Lite"

# Models that require max_completion_tokens instead of max_tokens
_REASONING_MODEL_PREFIXES = ("o1", "o3", "o4", "gpt-5")


def token_limit_kwargs(model: str, max_tokens: int = 4000) -> dict:
    """Return the right token-limit kwarg for the given model.

    Reasoning-family models (o1/o3/o4/gpt-5 prefixes) reject the legacy
    ``max_tokens`` parameter and require ``max_completion_tokens``; for
    those we also pin ``reasoning_effort`` to ``"none"`` so the budget is
    spent on the answer rather than hidden reasoning tokens.

    Args:
        model: Model identifier as sent to the API (e.g. ``"gpt-4o"``).
        max_tokens: Completion-token budget to request.

    Returns:
        A dict with exactly one token-limit kwarg (plus ``reasoning_effort``
        for reasoning models), ready to splat into the API call.
    """
    # str.startswith accepts a tuple of prefixes — no need for any()/genexpr.
    if model.startswith(_REASONING_MODEL_PREFIXES):
        return {"max_completion_tokens": max_tokens, "reasoning_effort": "none"}
    return {"max_tokens": max_tokens}
2 changes: 1 addition & 1 deletion scripts/code_hallucination/context7_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def run(instances: list[dict]):

if processed % 100 == 0:
print(
f" Progress: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
f" Phase 4: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
)

print(
Expand Down
Loading
Loading