-
Notifications
You must be signed in to change notification settings - Fork 12
Port mini-swe-agent #21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
60822b6
e97579b
d751e45
8e08ab1
38f4c5b
07ae50a
8cfbb6f
5f8dfe6
6c956b6
d93db8e
1c4b09d
f56c47e
cbeea66
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| from pathlib import Path | ||
|
|
||
| from inspect_ai import Task, task | ||
| from inspect_ai.tool import bash, python | ||
| from inspect_swe import mini_swe_agent | ||
|
|
||
|
|
||
| def _get_challenges_dir() -> Path: | ||
| """Get the path to Terminal Bench 2.0 challenges directory. | ||
|
|
||
| Attempts to find the challenges directory from inspect_evals installation. | ||
| """ | ||
| try: | ||
| import inspect_evals.terminal_bench_2 as tb2_module | ||
|
|
||
| module_dir = Path(tb2_module.__file__).parent | ||
| challenges_dir = module_dir / "challenges" | ||
| if challenges_dir.exists(): | ||
| return challenges_dir | ||
| except ImportError: | ||
| pass | ||
|
|
||
| raise ImportError( | ||
| "Could not find Terminal Bench 2.0 challenges directory. " | ||
| "Please install inspect-evals[terminal_bench_2]:\n" | ||
| " pip install inspect-evals[terminal_bench_2]" | ||
| ) | ||
|
|
||
|
|
||
@task
def terminal_bench_task(
    eval_names: list[str] | None = None,
    variant_names: list[str] | None = None,
    model: str | None = None,
) -> Task:
    """Build a Terminal Bench 2.0 task that is solved by mini-swe-agent.

    The Terminal Bench 2.0 challenges are run with mini-swe-agent in place
    of the default ReAct solver.

    Args:
        eval_names: Challenge names to keep (e.g., ["constraints-scheduling"]).
            When None or empty, no filtering by challenge name is applied.
        variant_names: Variants to keep. Defaults to ["default"].
        model: Model name passed to mini-swe-agent. If None, the default
            inspect model is used.

    Returns:
        Task wired up with the mini-swe-agent solver and the Terminal Bench
        scorer.

    Raises:
        ImportError: If inspect_cyber / inspect_evals are not importable, or
            the challenges directory cannot be located.

    Note:
        This task uses pre-built Docker images from Docker Hub. For local builds,
        use the original inspect_evals/terminal_bench_2 task directly.
    """
    # Optional dependencies are imported lazily so that the module itself can
    # be imported without them installed.
    try:
        from inspect_cyber import create_agentic_eval_dataset
        from inspect_evals.terminal_bench_2.terminal_bench_2 import (
            terminal_bench_2_scorer,
        )
    except ImportError:
        raise ImportError(
            "inspect_cyber and inspect_evals are required for Terminal Bench 2.0. "
            "Please install inspect-evals[terminal_bench_2]:\n"
            " pip install inspect-evals[terminal_bench_2]"
        ) from None

    # Load every challenge found under the installed challenges directory.
    samples = create_agentic_eval_dataset(
        root_dir=_get_challenges_dir().absolute()
    )

    # Narrow down by challenge name only when names were supplied.
    if eval_names:
        samples = samples.filter_by_metadata_field("eval_name", eval_names)

    # Variants are always filtered; the fallback is the "default" variant.
    selected_variants = ["default"] if variant_names is None else variant_names
    samples = samples.filter_by_metadata_field("variant_name", selected_variants)

    # NOTE(review): confirm that Task actually consumes a `tools` argument;
    # it is passed through here unchanged.
    return Task(
        dataset=samples,
        solver=mini_swe_agent(model=model),
        scorer=terminal_bench_2_scorer(),
        tools=[bash(timeout=60), python(timeout=60)],
    )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,6 +41,7 @@ dev = [ | |
| "pytest-dotenv", | ||
| "types-PyYAML", | ||
| "IPython", | ||
| "inspect-evals", | ||
| ] | ||
| doc = ["quarto-cli==1.7.31"] | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,215 @@ | ||
| from textwrap import dedent | ||
| from typing import Literal | ||
|
|
||
| from inspect_ai.agent import ( | ||
| Agent, | ||
| AgentAttempts, | ||
| AgentState, | ||
| agent, | ||
| agent_with, | ||
| sandbox_agent_bridge, | ||
| ) | ||
| from inspect_ai.model import ChatMessageSystem, GenerateFilter, get_model | ||
| from inspect_ai.scorer import score | ||
| from inspect_ai.util import sandbox as sandbox_env | ||
| from inspect_ai.util import store | ||
|
|
||
| from .._util._async import is_callable_coroutine | ||
| from .._util.agentwheel import AgentWheelSource, ensure_agent_wheel_installed | ||
| from .._util.messages import build_user_prompt | ||
| from .._util.trace import trace | ||
|
|
||
| # mini-swe-agent wheel source configuration | ||
| # Pin to v1.x by default - v2.0 has breaking changes (migration guide pending) | ||
| # See: https://mini-swe-agent.com/latest/advanced/v2_migration/ | ||
anaoaktree marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# Describes which wheel to install for mini-swe-agent and which executable it
# provides; passed to ensure_agent_wheel_installed() by the agent below.
MINI_SWE_AGENT_SOURCE = AgentWheelSource(
    agent="mini-swe-agent",
    package="mini-swe-agent",
    binary="mini",  # CLI entrypoint
    # presumably the version used for version="stable" (pinned to v1.x) — confirm
    default_version="1.17.4",
)
|
|
||
|
|
||
@agent
def mini_swe_agent(
    name: str = "mini-swe-agent",
    description: str = dedent("""
        Minimal AI agent that solves software engineering tasks using bash commands.
        100 lines of Python, radically simple, scores >74% on SWE-bench verified.
    """),
    system_prompt: str | None = None,
    attempts: int | AgentAttempts = 1,  # TODO: currently supports single attempt
    model: str | None = None,
    filter: GenerateFilter | None = None,
    retry_refusals: int | None = None,
    cost_limit: float | None = None,
    cwd: str | None = None,
    env: dict[str, str] | None = None,
    user: str | None = None,
    sandbox: str | None = None,
    version: Literal["stable", "sandbox", "latest"] | str = "stable",
) -> Agent:
    """mini-swe-agent agent.

    Agent that uses [mini-swe-agent](https://github.com/SWE-agent/mini-swe-agent)
    running in a sandbox. Mini-swe-agent is a minimal 100-line agent that solves
    GitHub issues using only bash commands.

    The agent can either use a version of mini-swe-agent installed in the sandbox,
    or can download and install it via pip (see docs on `version` option below).

    Use `attempts` to enable additional submissions if initial submission(s)
    are incorrect (by default, no additional attempts are permitted). Note that
    a retry re-runs the CLI with only the incorrect-message text as its
    `--task`, so multi-attempt use is not fully supported yet (see TODOs).

    Use `cost_limit` to set a maximum cost for the agent run (in USD).

    Args:
        name: Agent name (used in multi-agent systems with `as_tool()` and `handoff()`)
        description: Agent description (used in multi-agent systems)
        system_prompt: Additional system prompt to append to default. It is
            joined with any system messages in the conversation and prepended
            to the task text.
        attempts: Configure agent to make multiple attempts.
        model: Model name to use (defaults to main model for task).
        filter: Filter for intercepting bridged model requests.
        retry_refusals: Should refusals be retried? (pass number of times to retry)
        cost_limit: Maximum cost limit for the agent run.
        cwd: Working directory to run mini-swe-agent within.
        env: Environment variables to set for mini-swe-agent. These override
            the variables this agent sets itself.
        user: User to execute mini-swe-agent with.
        sandbox: Optional sandbox environment name.
        version: Version of mini-swe-agent to use. One of:
            - "stable": Download and install the default pinned version.
            - "sandbox": Use version in sandbox (raises RuntimeError if not available)
            - "latest": Download and install latest version from PyPI.
            - "x.x.x": Install and use a specific version.

    Returns:
        Agent wrapping the `execute` coroutine, carrying `name` and `description`.

    Raises:
        RuntimeError: (when the agent executes) if the mini-swe-agent process
            exits unsuccessfully.
        ValueError: (when the agent executes) if `attempts.incorrect_message`
            is a callable that is not async.
    """
    # resolve models
    inspect_model = f"inspect/{model}" if model is not None else "inspect"

    # resolve attempts
    attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts

    async def execute(state: AgentState) -> AgentState:
        # determine port (use new port for each execution of agent on sample);
        # the stored value defaults to 4000, so the first execution uses 4001
        MODEL_PORT = "mini_swe_agent_model_port"
        port = store().get(MODEL_PORT, 4000) + 1
        store().set(MODEL_PORT, port)

        # resolve sandbox once for reuse
        sbox = sandbox_env(sandbox)

        async with sandbox_agent_bridge(
            state,
            model=inspect_model,
            filter=filter,
            retry_refusals=retry_refusals,
            port=port,
        ) as bridge:
            # ensure mini-swe-agent is installed
            mini_binary = await ensure_agent_wheel_installed(
                source=MINI_SWE_AGENT_SOURCE,
                version=version,
                user=user,
                sandbox=sbox,
            )

            # base command options
            cmd = [
                mini_binary,
                "--yolo",  # run without confirmations (like --print for claude)
                "--exit-immediately",  # exit when agent finishes instead of prompting
            ]

            # add cost limit if specified
            if cost_limit is not None:
                cmd.extend(["--cost-limit", str(cost_limit)])

            # build user prompt
            prompt, _ = build_user_prompt(state.messages)

            # add system prompt context if provided: system messages from the
            # conversation come first, then the explicit `system_prompt`
            full_prompt = prompt
            system_messages = [
                m.text for m in state.messages if isinstance(m, ChatMessageSystem)
            ]
            if system_prompt is not None:
                system_messages.append(system_prompt)
            if system_messages:
                # Prepend system context to the task
                system_context = "\n\n".join(system_messages)
                full_prompt = (
                    f"System instructions:\n{system_context}\n\nTask:\n{prompt}"
                )

            # execute the agent
            debug_output: list[str] = []
            agent_prompt = full_prompt
            attempt_count = 0

            # Loop kept for consistency with other agents but currently only a
            # single attempt is supported.
            while True:
                # TODO: build command with task. This only works for single-turn
                # tasks currently. Need to update to support multi-turn (perhaps
                # via file output option)
                # NOTE(review): confirmed as still open during review; whether
                # multi-turn support belongs in the introducing PR or a
                # follow-up was undecided.
                agent_cmd = cmd + ["--task", agent_prompt]

                # run agent; the bash wrapper redirects stdin from /dev/null
                # and then runs the agent command passed as "$@"
                result = await sbox.exec(
                    cmd=["bash", "-c", 'exec 0</dev/null "$@"', "bash"] + agent_cmd,
                    cwd=cwd,
                    env={
                        "MSWEA_CONFIGURED": "true",  # Skip interactive setup wizard
                        "MSWEA_MODEL_NAME": model
                        if model is not None
                        else get_model().name,
                        "OPENAI_API_BASE": f"http://localhost:{bridge.port}/v1",
                        # actual key is handled by inspect, mini-swe needs it to approve setup
                        # (placeholder only, so the agent detects a key)
                        "OPENAI_API_KEY": "sk-none",
                        "ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
                        "ANTHROPIC_API_KEY": "sk-none",
                    }
                    | (env or {}),  # caller-supplied variables take precedence
                    user=user,
                    concurrency=False,
                )

                # track debug output
                debug_output.append(f"[stdout]\n{result.stdout}")
                debug_output.append(f"[stderr]\n{result.stderr}")

                # raise for error
                if not result.success:
                    raise RuntimeError(
                        f"Error executing mini-swe-agent (cwd={cwd or 'default'}):\n"
                        f"stdout: {result.stdout}\n"
                        f"stderr: {result.stderr}"
                    )

                # exit if we are at max_attempts
                attempt_count += 1
                if attempt_count >= attempts.attempts:
                    break

                # score this attempt
                answer_scores = await score(state)

                # break if we score 'correct'
                if attempts.score_value(answer_scores[0].value) == 1.0:
                    break

                # otherwise update prompt with incorrect message and continue
                else:
                    if callable(attempts.incorrect_message):
                        if not is_callable_coroutine(attempts.incorrect_message):
                            raise ValueError(
                                "The incorrect_message function must be async."
                            )
                        agent_prompt = await attempts.incorrect_message(
                            state, answer_scores
                        )
                    else:
                        agent_prompt = attempts.incorrect_message

            # trace debug info (only reached when no attempt raised)
            debug_output.insert(0, "mini-swe-agent Debug Output:")
            trace("\n".join(debug_output))

        # NOTE(review): returned after the bridge context exits, following the
        # inspect_ai bridge example — confirm against the original indentation.
        return bridge.state

    return agent_with(execute, name=name, description=description)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When running this with uv run --no-sync inspect eval examples/terminal_bench_test --limit 1 (no-sync needed to prevent aisitools from trying to install)
I get:
/home/ec2-user/inspect_swe/src/inspect_swe/_util/agentwheel.py:140 in detect_python_version │
│ │
│ 137 │ # Detects Python version in sandbox. │
│ 138 │ result = await sandbox.exec(bash_command("python3 --version"), user=user) │
│ 139 │ if not result.success: │
│ ❱ 140 │ │ raise RuntimeError("Python 3 not found in sandbox (required for agent)") │
│ 141 │ │
│ 142 │ # Parse "Python 3.12.0" -> "312" │
│ 143 │ match = re.search(r"Python (\d+).(\d+)", result.stdout) │
╰──────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Python 3 not found in sandbox (required for agent)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Jay-Bailey The code assumed Python was installed in the sandbox, which was not the case for the particular task that `--limit 1` pulled. I'm now working on changes to use uv in the sandbox instead, to avoid this issue for sandboxes with no Python (should have it done by tomorrow).
In the meantime, running your command with
`--T eval_names='["break-filter-js-from-html"]'` should work (this particular task has Python installed).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can confirm this command now runs, even without the --T.