README.md (+5 -3)

@@ -84,7 +84,7 @@ python -m pipeline \
     --tasks file_property/size_classification
 ```
 
-Results are saved to `./results/{exp_name}/{mcp}__{model}/{task}` (in this example `./results/test-run/filesystem__gpt-5/file_property__size_classification`).
+Results are saved to `./results/{exp_name}/{model}__{mcp}/run-*/...` (e.g., `./results/test-run/gpt-5__filesystem/run-1/...`).
 
 ---
 
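Note on the new layout: the path pieces above compose as sketched below, reusing the `gpt-5` + `filesystem` example from the README. The file names under `run-1` are not specified in this diff, so only the directory is shown.

```python
from pathlib import Path

# Illustration of the documented layout: ./results/{exp_name}/{model}__{mcp}/run-*/...
exp_name, model, mcp = "test-run", "gpt-5", "filesystem"
run_dir = Path("results") / exp_name / f"{model}__{mcp}" / "run-1"
print(run_dir)  # results/test-run/gpt-5__filesystem/run-1
```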
@@ -152,7 +152,7 @@ You can also follow `docs/quickstart.md` for the shortest end-to-end path.
 
 ## Results and metrics
 
-- Results are written to `./results/` (JSON + CSV).
+- Results are organized under `./results/{exp_name}/{model}__{mcp}/run-*/` (JSON + CSV per task).
 - Generate a summary with:
 ```bash
 python -m src.aggregators.aggregate_results --exp-name exp
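To eyeball a single run's per-task outputs before aggregating, a sketch along these lines works; the `*.json` glob and the run directory contents are assumptions about file naming, not a documented interface.

```python
import json
from pathlib import Path

run_dir = Path("results/test-run/gpt-5__filesystem/run-1")
for result_file in sorted(run_dir.rglob("*.json")):  # file names are assumed
    with result_file.open() as f:
        data = json.load(f)
    print(result_file.relative_to(run_dir), "->", list(data)[:5])
```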
@@ -162,7 +162,9 @@ python -m src.aggregators.aggregate_results --exp-name exp
 ---
 
 ## Model and Tasks
-- See `docs/introduction.md` for models supported in MCPMark.
+- **Model support**: MCPMark calls models via LiteLLM; see the [LiteLLM docs](https://docs.litellm.ai/docs/). For Anthropic (Claude) extended thinking mode (enabled via `--reasoning-effort`), we use Anthropic's native API.
+- See `docs/introduction.md` for details and configuration of supported models in MCPMark.
+- To add a new model, edit `src/model_config.py`; first check the supported models and providers in the [LiteLLM docs](https://docs.litellm.ai/docs/).
 - Task design principles are in `docs/datasets/task.md`. Each task ships with an automated `verify.py` for objective, reproducible evaluation; see `docs/task.md` for details.
 
 ---
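Since model calls go through LiteLLM, the call shape is roughly the following. This is a hedged sketch, not MCPMark's actual wrapper code: the model name, prompt, and effort value are placeholders, and `reasoning_effort` is LiteLLM's parameter for provider-specific reasoning/thinking controls.

```python
import litellm

# Sketch of a LiteLLM completion call with a reasoning-effort hint.
response = litellm.completion(
    model="openai/gpt-5",
    messages=[{"role": "user", "content": "Classify files in /data by size."}],
    reasoning_effort="high",  # mapped by LiteLLM to the provider's thinking controls
)
print(response.choices[0].message.content)
```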
pipeline.py (+7 -8)
@@ -63,10 +63,10 @@ def main():
         "--timeout", type=int, default=3600, help="Timeout in seconds for agent execution"
     )
     parser.add_argument(
-        "--stream",
-        action="store_true",
-        default=False,
-        help="Use streaming execution (default: False, uses non-streaming)",
+        "--reasoning-effort",
+        default="default",
+        choices=["default", "minimal", "low", "medium", "high"],
+        help="Reasoning effort level for supported models (default: 'default')",
     )
 
     # Output configuration
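One way the `default` choice can be interpreted downstream is as "do not send an explicit effort value", so models without reasoning support are unaffected. That normalization is not shown in this diff; a hypothetical sketch, e.g. for an invocation like `python -m pipeline --tasks file_property/size_classification --reasoning-effort high`:

```python
from typing import Optional

def normalize_effort(value: str) -> Optional[str]:
    """Hypothetical: treat the "default" sentinel as "no explicit effort"."""
    return None if value == "default" else value

assert normalize_effort("default") is None
assert normalize_effort("high") == "high"
```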
@@ -113,12 +113,11 @@ def main():
             logger.info(f"Starting Run {run_idx}/{args.k}")
             logger.info(f"{'=' * 80}\n")
 
-            # For k-runs, create run-N subdirectory
+            # For k-runs, results/{exp}/{mcp}__{model}/run-N
             run_exp_name = f"run-{run_idx}"
             run_output_dir = args.output_dir / args.exp_name
         else:
-            # For single run (k=1), maintain backward compatibility
-            # Use run-1 subdirectory for consistency
+            # For single run, still use run-1 under service_model
             run_exp_name = "run-1"
             run_output_dir = args.output_dir / args.exp_name
 
@@ -138,7 +137,7 @@
             timeout=args.timeout,
             exp_name=run_exp_name,
             output_dir=run_output_dir,
-            stream=args.stream,
+            reasoning_effort=args.reasoning_effort,
         )
 
         pipeline.run_evaluation(args.tasks)
pyproject.toml (+7 -2)
@@ -14,8 +14,13 @@ dependencies = [
     "python-dotenv>=1.1.1,<2",
     "ruff>=0.12.4,<0.13",
     "psycopg2-binary>=2.9.10,<3",
-    "pyyaml>=6.0.2,<7"
-    , "nest-asyncio>=1.6.0,<2", "pixi", "pipx>=1.7.1,<2", "pgdumplib>=3.1.0,<4"]
+    "pyyaml>=6.0.2,<7",
+    "nest-asyncio>=1.6.0,<2",
+    "pixi",
+    "pipx>=1.7.1,<2",
+    "pgdumplib>=3.1.0,<4",
+    "litellm==1.76.0"
+]
 
 [build-system]
 build-backend = "hatchling.build"
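The exact `litellm==1.76.0` pin (mirrored in `requirements.txt` below) can be sanity-checked at runtime; a minimal sketch using only the standard library:

```python
from importlib.metadata import version

# Fails loudly if the environment drifts from the pinned LiteLLM release.
installed = version("litellm")
assert installed == "1.76.0", f"expected litellm 1.76.0, found {installed}"
```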
requirements.txt (+2 -1)
@@ -8,4 +8,5 @@ matplotlib>=3.7.0
 numpy>=1.23.0
 psycopg2
 pyyaml
-nest_asyncio
\ No newline at end of file
+nest_asyncio
+litellm==1.76.0