Terminal-Bench-server/terminal_bench_service.py at main · open-compass/Terminal-Bench-server · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
"""FastAPI service for running Terminal-Bench tasks."""

import argparse
import asyncio
import logging
import os
from concurrent.futures import ThreadPoolExecutor
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Any

import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel

from agents.terminus_2_agent import Terminus2AgentRunner
from evaluators import terminal_bench as evaluator
from task_loader import load_task
from utils import ImageLRUCache

# Module-level logger. ``logging.basicConfig`` is only called in the
# ``__main__`` block of this file, i.e. when the file is run as a script.
logger = logging.getLogger(__name__)

# Fallbacks used by the /api/tasks handler when the request's ``max_steps`` /
# ``params.cost_limit`` is missing or falsy.
DEFAULT_STEP_LIMIT = 100
DEFAULT_COST_LIMIT = 3.0  # unit is not visible in this file — presumably USD; confirm against the runner

# Process-wide state shared by every request handled in this process.
image_cache = ImageLRUCache()  # passed to each Terminus2AgentRunner the handler creates
# Pool size comes from the THREAD_POOL_MAX_WORKERS environment variable (default 1).
# Agent runs, evaluations and container cleanups are all submitted to this one
# pool, so with the default size they execute one at a time.
thread_pool_max_workers = int(os.getenv("THREAD_POOL_MAX_WORKERS", "1"))
task_executor = ThreadPoolExecutor(max_workers=thread_pool_max_workers)


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the shared thread pool across the application's lifetime.

    Logs the configured pool size on startup and shuts the module-level
    ``task_executor`` down on exit, waiting for submitted work to finish.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    logger.info(f"ThreadPoolExecutor initialized with max_workers={thread_pool_max_workers}")
    try:
        yield
    finally:
        # Run the shutdown even when an exception or cancellation is thrown
        # into the generator at ``yield``; otherwise the pool would be left
        # running with its worker threads still alive.
        task_executor.shutdown(wait=True)


# ASGI application object; ``lifespan`` logs the pool size on startup and shuts
# the shared thread pool down on exit. The ``__main__`` block passes this object
# to uvicorn by import string ("<module stem>:app").
app = FastAPI(title="Terminal-Bench Service", version="1.0.0", lifespan=lifespan)


class TaskRequest(BaseModel):
    """Request body accepted by ``POST /api/tasks``.

    Every field is optional. The handler in this file reads only ``params``,
    ``max_steps`` and ``llm_config``; the remaining fields are accepted but
    not read anywhere in this file.
    """

    # Task payload. The handler reads ``metadata`` (a dict that may carry the
    # task identifier, ``problem_statement`` and ``verifier_timeout``),
    # ``question``, ``cost_limit`` and params-level task identifier aliases.
    params: dict[str, Any] | None = None
    benchmark: str | None = None
    benchmark_type: str | None = None
    agent_type: str | None = None
    # Agent step limit; DEFAULT_STEP_LIMIT is used when missing or falsy.
    max_steps: int | None = None
    # Forwarded to the agent runner as-is; ``request_timeout`` is also read from it.
    llm_config: dict[str, Any] | None = None
    modality: str | None = None


class TaskResponse(BaseModel):
    """Response body returned by ``POST /api/tasks``.

    The route is declared with ``response_model_exclude_none=True``, so fields
    left as ``None`` are omitted from the serialized response.
    """

    # "completed" or "error", as set by the response helpers in this file.
    status: str
    # str(resolved) ("True"/"False") for completed runs; "" for error responses.
    final_answer: str
    # Normalized agent messages, optionally followed by an evaluation record.
    trajectory: list | None = None
    # Error message; only set on error responses.
    error: str | None = None


def _completed_response(final_answer: str, trajectory: list | None = None) -> TaskResponse:
    """Return a ``TaskResponse`` with status "completed" for a finished run."""
    fields = {
        "status": "completed",
        "final_answer": final_answer,
        "trajectory": trajectory,
    }
    return TaskResponse(**fields)


def _error_response(error: str, trajectory: list | None = None) -> TaskResponse:
    """Return a ``TaskResponse`` with status "error" and an empty final answer."""
    fields = {
        "status": "error",
        "final_answer": "",
        "trajectory": trajectory,
        "error": error,
    }
    return TaskResponse(**fields)


def _build_trajectory(messages: list[Any] | None) -> list[dict[str, Any]]:
    """Build a normalized trajectory from agent messages."""
    return [
        {
            "step": i + 1,
            "role": msg.get("role", "unknown"),
            "content": msg.get("content", ""),
        }
        for i, msg in enumerate(messages or [])
        if isinstance(msg, dict)
    ]


def _append_evaluation_step(trajectory: list[dict[str, Any]], output: str) -> None:
    """Append the final evaluation record to the trajectory."""
    trajectory.append(
        {
            "step": len(trajectory) + 1,
            "action": "evaluation",
            "output": output,
        }
    )


def _resolve_task_name(params: dict[str, Any], instance: dict[str, Any]) -> str | None:
    """Resolve a Terminal-Bench task name from legacy and AgentCompass payloads."""
    candidates = [
        ("metadata.task_name", instance.get("task_name")),
        ("metadata.instance_id", instance.get("instance_id")),
        ("metadata.terminal_bench_task_id", instance.get("terminal_bench_task_id")),
        ("metadata.task_id", instance.get("task_id")),
        ("params.task_name", params.get("task_name")),
        ("params.instance_id", params.get("instance_id")),
        ("params.terminal_bench_task_id", params.get("terminal_bench_task_id")),
        ("params.task_id", params.get("task_id")),
    ]

    for source, value in candidates:
        if isinstance(value, str) and value.strip():
            task_name = value.strip()
            if source != "metadata.task_name":
                logger.info("Resolved task_name=%s from %s", task_name, source)
            return task_name
    return None


@app.post("/api/tasks", response_model=TaskResponse, response_model_exclude_none=True)
async def run_terminal_bench_task(request: TaskRequest):
    """Run one Terminal-Bench task with the agent, then verify it in place.

    Steps:
        1. Resolve the task name from ``params.metadata`` or ``params``.
        2. Resolve the problem statement: ``metadata.problem_statement``,
           else the on-disk task definition, else ``params.question``.
        3. Run ``Terminus2AgentRunner`` on the shared thread pool with
           ``keep_container=True`` so the container survives for verification.
        4. Run the evaluator against that container, then always attempt to
           clean the container up.

    Args:
        request: Parsed request body.

    Returns:
        A ``TaskResponse``. Failures are reported as ``status="error"``
        responses rather than raised. On success ``final_answer`` is
        ``str(resolved)`` and the trajectory ends with an evaluation record.
    """
    payload = request.model_dump()
    params = payload.get("params") or {}
    if not isinstance(params, dict):
        params = {}

    instance = params.get("metadata") or {}
    if not isinstance(instance, dict):
        instance = {}

    # task_name maps to the directory name in terminal-bench-2-main.
    # Accept AgentCompass-native identifiers as aliases for compatibility.
    task_name = _resolve_task_name(params, instance)
    if not task_name:
        return _error_response(
            (
                "Missing task identifier. Expected one of "
                "params.metadata.task_name, params.metadata.instance_id, "
                "params.metadata.terminal_bench_task_id, params.metadata.task_id, "
                "or their params-level equivalents."
            )
        )

    # Prefer the authoritative on-disk task definition once task_name is known.
    # AgentCompass currently forwards a truncated params.question preview.
    task = instance.get("problem_statement")
    if not task:
        try:
            task_data = load_task(task_name)
            # Request metadata overrides the on-disk fields of the same name.
            instance = {**task_data, **instance}
            task = task_data["problem_statement"]
        except Exception as e:
            task = params.get("question")
            if task:
                logger.warning(
                    "Falling back to params.question for task %s because load_task failed: %s",
                    task_name,
                    e,
                )
            else:
                return _error_response(f"Missing problem_statement and failed to load from disk: {e}")

    # Assign instead of setdefault: a present-but-empty metadata value would
    # otherwise survive (it also wins the merge above) and the runner would
    # receive an empty problem statement despite ``task`` being resolved.
    instance["problem_statement"] = task
    instance["task_name"] = task_name

    llm_config = payload.get("llm_config") or {}
    step_limit = payload.get("max_steps") or DEFAULT_STEP_LIMIT
    cost_limit = params.get("cost_limit") or DEFAULT_COST_LIMIT
    request_timeout = llm_config.get("request_timeout")

    loop = asyncio.get_running_loop()
    runner = Terminus2AgentRunner(image_cache)

    try:
        # The runner is blocking, so run it on the shared pool to keep the
        # event loop responsive.
        result = await loop.run_in_executor(
            task_executor,
            lambda: runner.run(
                instance=instance,
                llm_config=llm_config,
                step_limit=step_limit,
                cost_limit=cost_limit,
                request_timeout=request_timeout,
                keep_container=True,
            ),
        )
    except Exception as e:
        # NOTE(review): no container id is known on this path, so a container
        # the runner created before failing cannot be cleaned up from here.
        logger.error(f"Agent execution failed: {e}", exc_info=True)
        return _error_response(f"Agent execution failed: {e}")

    # Tolerate a non-dict runner result: it then yields an empty trajectory
    # and no container id, which the evaluation step reports as an error
    # response instead of raising AttributeError here.
    result_is_dict = isinstance(result, dict)
    trajectory = _build_trajectory(result.get("messages") if result_is_dict else None)

    # Evaluate
    eval_result = {}
    container_id = result.get("container_id") if result_is_dict else None
    try:
        timeout = instance.get("verifier_timeout", 900.0)
        if not container_id:
            raise RuntimeError("Missing container_id from agent result; cannot run in-place verification")
        eval_result = await loop.run_in_executor(
            task_executor,
            lambda: evaluator.evaluate(task_name, container_id, timeout=timeout),
        )
    except Exception as e:
        logger.error(f"Evaluation failed: {e}", exc_info=True)
        _append_evaluation_step(trajectory, f"ERROR: {e}")
        return _error_response(f"Evaluation failed: {e}", trajectory)
    finally:
        # Best-effort cleanup: runs on success and on evaluation failure; a
        # cleanup error is logged but never replaces the response.
        if container_id:
            try:
                await loop.run_in_executor(
                    task_executor,
                    lambda: evaluator.cleanup_container(container_id),
                )
            except Exception as e:
                logger.warning(f"Container cleanup failed for {task_name}: {e}")

    if not isinstance(eval_result, dict) or "resolved" not in eval_result:
        logger.error("Invalid terminal-bench evaluation result: %r", eval_result)
        _append_evaluation_step(trajectory, f"ERROR: invalid evaluation result {eval_result!r}")
        return _error_response("Invalid evaluation result: missing resolved", trajectory)

    _append_evaluation_step(trajectory, str(eval_result))

    resolved = bool(eval_result.get("resolved"))
    logger.info(f"Task {task_name}: resolved={resolved}")
    return _completed_response(str(resolved), trajectory)


@app.get("/health")
async def health_check():
    """Return a fixed payload identifying the service as healthy."""
    status_payload = {"status": "healthy", "service": "Terminal-Bench"}
    return status_payload


if __name__ == "__main__":
    # Script entry point: configure logging, parse CLI flags, start uvicorn.
    logging.basicConfig(
        # Level names are matched case-sensitively by ``logging``; upper-case
        # the value so LOG_LEVEL=debug works instead of raising ValueError.
        level=os.getenv("LOG_LEVEL", "INFO").upper(),
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8080)
    parser.add_argument("--workers", type=int, default=1)
    # The keep-alive default can be overridden through TIMEOUT_KEEP_ALIVE.
    parser.add_argument(
        "--timeout-keep-alive",
        type=int,
        default=int(os.getenv("TIMEOUT_KEEP_ALIVE", "5")),
    )
    args = parser.parse_args()

    logger.info(
        "Starting Terminal-Bench service on %s:%s with %d worker(s), timeout_keep_alive=%ss",
        args.host,
        args.port,
        args.workers,
        args.timeout_keep_alive,
    )
    # The app is passed as an import string ("<module stem>:app") rather than
    # as an object.
    # NOTE(review): the logging setup above runs only under this guard, so
    # worker processes that import the module may not have it — confirm.
    uvicorn.run(
        f"{Path(__file__).stem}:app",
        host=args.host,
        port=args.port,
        workers=args.workers,
        timeout_keep_alive=args.timeout_keep_alive,
    )