Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions examples/system_explorer/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,21 @@
from inspect_ai.dataset import json_dataset
from inspect_ai.scorer import model_graded_qa
from inspect_ai.util import SandboxEnvironmentType
from inspect_swe import claude_code, codex_cli
from inspect_swe import claude_code, codex_cli, mini_swe_agent


@task
def system_explorer(
agent: Literal["claude_code", "codex_cli"] = "claude_code",
agent: Literal["claude_code", "codex_cli", "mini_swe_agent"] = "claude_code",
sandbox: SandboxEnvironmentType | None = "docker",
) -> Task:
match agent:
case "claude_code":
solver = claude_code()
case "codex_cli":
solver = codex_cli()
case "mini_swe_agent":
solver = mini_swe_agent()

return Task(
dataset=json_dataset("dataset.json"),
Expand Down
90 changes: 90 additions & 0 deletions examples/terminal_bench_test/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from pathlib import Path

from inspect_ai import Task, task
from inspect_ai.tool import bash, python
from inspect_swe import mini_swe_agent


def _get_challenges_dir() -> Path:
"""Get the path to Terminal Bench 2.0 challenges directory.

Attempts to find the challenges directory from inspect_evals installation.
"""
try:
import inspect_evals.terminal_bench_2 as tb2_module

module_dir = Path(tb2_module.__file__).parent
challenges_dir = module_dir / "challenges"
if challenges_dir.exists():
return challenges_dir
except ImportError:
pass

raise ImportError(
"Could not find Terminal Bench 2.0 challenges directory. "
"Please install inspect-evals[terminal_bench_2]:\n"
" pip install inspect-evals[terminal_bench_2]"
)


@task
def terminal_bench_task(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When running this with uv run --no-sync inspect eval examples/terminal_bench_test --limit 1 (no-sync needed to prevent aisitools from trying to install)

I get:

/home/ec2-user/inspect_swe/src/inspect_swe/_util/agentwheel.py:140 in detect_python_version │
│ │
│ 137 │ # Detects Python version in sandbox. │
│ 138 │ result = await sandbox.exec(bash_command("python3 --version"), user=user) │
│ 139 │ if not result.success: │
│ ❱ 140 │ │ raise RuntimeError("Python 3 not found in sandbox (required for agent)") │
│ 141 │ │
│ 142 │ # Parse "Python 3.12.0" -> "312" │
│ 143 │ match = re.search(r"Python (\d+).(\d+)", result.stdout) │
╰──────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Python 3 not found in sandbox (required for agent)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Jay-Bailey The code was assuming python installed in the sandbox which didn't work for the particular task limit 1 pulled. I'm now working on changes to use uv in the sandbox instead to avoid this issue for sandboxes with no python (should have it done by tomorrow).

In the meantime, if you run your command with --T eval_names='["break-filter-js-from-html"]' it should work (this particular task has python installed).

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can confirm this command now runs, even without the --T.

eval_names: list[str] | None = None,
variant_names: list[str] | None = None,
model: str | None = None,
) -> Task:
"""Terminal Bench 2.0 with mini-swe-agent solver.

Runs Terminal Bench 2.0 challenges using mini-swe-agent instead of
the default ReAct solver.

Args:
eval_names: Filter to specific challenge names (e.g., ["constraints-scheduling"]).
If None, runs all challenges.
variant_names: Filter to specific variants. Defaults to ["default"].
model: Model name to use for mini-swe-agent. If None, uses default inspect model.

Returns:
Task configured with mini-swe-agent solver and Terminal Bench scorer.

Note:
This task uses pre-built Docker images from Docker Hub. For local builds,
use the original inspect_evals/terminal_bench_2 task directly.
"""
# Import dependencies from inspect_evals
try:
from inspect_cyber import create_agentic_eval_dataset
from inspect_evals.terminal_bench_2.terminal_bench_2 import (
terminal_bench_2_scorer,
)
except ImportError:
raise ImportError(
"inspect_cyber and inspect_evals are required for Terminal Bench 2.0. "
"Please install inspect-evals[terminal_bench_2]:\n"
" pip install inspect-evals[terminal_bench_2]"
) from None

# Get challenges directory
challenges_dir = _get_challenges_dir()

# Load dataset
dataset = create_agentic_eval_dataset(root_dir=challenges_dir.absolute())

# Filter by eval_names if specified
if eval_names:
dataset = dataset.filter_by_metadata_field("eval_name", eval_names)

# Filter by variant_names (default to "default" variant)
if variant_names is None:
variant_names = ["default"]
dataset = dataset.filter_by_metadata_field("variant_name", variant_names)

# Create the solver
solver = mini_swe_agent(model=model)

return Task(
dataset=dataset,
solver=solver,
scorer=terminal_bench_2_scorer(),
tools=[bash(timeout=60), python(timeout=60)],
)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ dev = [
"pytest-dotenv",
"types-PyYAML",
"IPython",
"inspect-evals",
]
doc = ["quarto-cli==1.7.31"]

Expand Down
4 changes: 4 additions & 0 deletions src/inspect_swe/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from ._claude_code.claude_code import claude_code
from ._codex_cli.codex_cli import codex_cli
from ._mini_swe_agent.mini_swe_agent import mini_swe_agent
from ._tools.download import AgentBinary, cached_agent_binaries, download_agent_binary
from ._util.agentwheel import download_wheels_tarball
from ._util.centaur import CentaurOptions
from ._util.sandbox import SandboxPlatform

Expand All @@ -13,10 +15,12 @@
__all__ = [
"claude_code",
"codex_cli",
"mini_swe_agent",
"download_agent_binary",
"cached_agent_binaries",
"AgentBinary",
"SandboxPlatform",
"CentaurOptions",
"__version__",
"download_wheels_tarball",
]
Empty file.
215 changes: 215 additions & 0 deletions src/inspect_swe/_mini_swe_agent/mini_swe_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
from textwrap import dedent
from typing import Literal

from inspect_ai.agent import (
Agent,
AgentAttempts,
AgentState,
agent,
agent_with,
sandbox_agent_bridge,
)
from inspect_ai.model import ChatMessageSystem, GenerateFilter, get_model
from inspect_ai.scorer import score
from inspect_ai.util import sandbox as sandbox_env
from inspect_ai.util import store

from .._util._async import is_callable_coroutine
from .._util.agentwheel import AgentWheelSource, ensure_agent_wheel_installed
from .._util.messages import build_user_prompt
from .._util.trace import trace

# Wheel source configuration for mini-swe-agent.
# This object is passed as `source=` to ensure_agent_wheel_installed() when
# the mini_swe_agent agent runs.
# The default version is pinned to the v1.x line because v2.0 has breaking
# changes (migration guide pending).
# See: https://mini-swe-agent.com/latest/advanced/v2_migration/
MINI_SWE_AGENT_SOURCE = AgentWheelSource(
    agent="mini-swe-agent",
    package="mini-swe-agent",
    binary="mini",  # CLI entrypoint
    # NOTE(review): presumably the version installed for version="stable";
    # confirm against ensure_agent_wheel_installed.
    default_version="1.17.4",
)


@agent
def mini_swe_agent(
name: str = "mini-swe-agent",
description: str = dedent("""
Minimal AI agent that solves software engineering tasks using bash commands.
100 lines of Python, radically simple, scores >74% on SWE-bench verified.
"""),
system_prompt: str | None = None,
attempts: int | AgentAttempts = 1, # TODO: currently supports single attempt
model: str | None = None,
filter: GenerateFilter | None = None,
retry_refusals: int | None = None,
cost_limit: float | None = None,
cwd: str | None = None,
env: dict[str, str] | None = None,
user: str | None = None,
sandbox: str | None = None,
version: Literal["stable", "sandbox", "latest"] | str = "stable",
) -> Agent:
"""mini-swe-agent agent.

Agent that uses [mini-swe-agent](https://github.com/SWE-agent/mini-swe-agent)
running in a sandbox. Mini-swe-agent is a minimal 100-line agent that solves
GitHub issues using only bash commands.

The agent can either use a version of mini-swe-agent installed in the sandbox,
or can download and install it via pip (see docs on `version` option below).

Use `attempts` to enable additional submissions if initial submission(s)
are incorrect (by default, no additional attempts are permitted).

Use `cost_limit` to set a maximum cost for the agent run (in USD).

Args:
name: Agent name (used in multi-agent systems with `as_tool()` and `handoff()`)
description: Agent description (used in multi-agent systems)
system_prompt: Additional system prompt to append to default.
attempts: Configure agent to make multiple attempts.
model: Model name to use (defaults to main model for task).
filter: Filter for intercepting bridged model requests.
retry_refusals: Should refusals be retried? (pass number of times to retry)
cost_limit: Maximum cost limit for the agent run.
cwd: Working directory to run mini-swe-agent within.
env: Environment variables to set for mini-swe-agent.
user: User to execute mini-swe-agent with.
sandbox: Optional sandbox environment name.
version: Version of mini-swe-agent to use. One of:
- "stable": Download and install the default pinned version.
- "sandbox": Use version in sandbox (raises RuntimeError if not available)
- "latest": Download and install latest version from PyPI.
- "x.x.x": Install and use a specific version.
"""
# resolve models
inspect_model = f"inspect/{model}" if model is not None else "inspect"

# resolve attempts
attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts

async def execute(state: AgentState) -> AgentState:
# determine port (use new port for each execution of agent on sample)
MODEL_PORT = "mini_swe_agent_model_port"
port = store().get(MODEL_PORT, 4000) + 1
store().set(MODEL_PORT, port)

# resolve sandbox once for reuse
sbox = sandbox_env(sandbox)

async with sandbox_agent_bridge(
state,
model=inspect_model,
filter=filter,
retry_refusals=retry_refusals,
port=port,
) as bridge:
# ensure mini-swe-agent is installed
mini_binary = await ensure_agent_wheel_installed(
source=MINI_SWE_AGENT_SOURCE,
version=version,
user=user,
sandbox=sbox,
)

# base command options
cmd = [
mini_binary,
"--yolo", # run without confirmations (like --print for claude)
"--exit-immediately", # exit when agent finishes instead of prompting
]

# add cost limit if specified
if cost_limit is not None:
cmd.extend(["--cost-limit", str(cost_limit)])

# build user prompt
prompt, _ = build_user_prompt(state.messages)

# add system prompt context if provided
full_prompt = prompt
system_messages = [
m.text for m in state.messages if isinstance(m, ChatMessageSystem)
]
if system_prompt is not None:
system_messages.append(system_prompt)
if system_messages:
# Prepend system context to the task
system_context = "\n\n".join(system_messages)
full_prompt = (
f"System instructions:\n{system_context}\n\nTask:\n{prompt}"
)

# execute the agent
debug_output: list[str] = []
agent_prompt = full_prompt
attempt_count = 0

while True: # Kept for consistency with other agents but currently only single attempt supported
# TODO: build command with task. This only works for single-turn tasks currently. Need to update to support multi-turn (perhaps via file output option)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this still a todo?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Jay-Bailey yes - see Limitations in the PR description. I'm still waiting to know whether this should be part of the scope for this PR or done separately.

agent_cmd = cmd + ["--task", agent_prompt]

# run agent
result = await sbox.exec(
cmd=["bash", "-c", 'exec 0</dev/null "$@"', "bash"] + agent_cmd,
cwd=cwd,
env={
"MSWEA_CONFIGURED": "true", # Skip interactive setup wizard
"MSWEA_MODEL_NAME": model
if model is not None
else get_model().name,
"OPENAI_API_BASE": f"http://localhost:{bridge.port}/v1",
# actual key is handled by inspect, mini-swe needs it to approve setup
"OPENAI_API_KEY": "sk-none",

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this intended as the key?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is just so the agent detects a key, the real one will be passed by inspect.

"ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
"ANTHROPIC_API_KEY": "sk-none",
}
| (env or {}),
user=user,
concurrency=False,
)

# track debug output
debug_output.append(f"[stdout]\n{result.stdout}")
debug_output.append(f"[stderr]\n{result.stderr}")

# raise for error
if not result.success:
raise RuntimeError(
f"Error executing mini-swe-agent (cwd={cwd or 'default'}):\n"
f"stdout: {result.stdout}\n"
f"stderr: {result.stderr}"
)

# exit if we are at max_attempts
attempt_count += 1
if attempt_count >= attempts.attempts:
break

# score this attempt
answer_scores = await score(state)

# break if we score 'correct'
if attempts.score_value(answer_scores[0].value) == 1.0:
break

# otherwise update prompt with incorrect message and continue
else:
if callable(attempts.incorrect_message):
if not is_callable_coroutine(attempts.incorrect_message):
raise ValueError(
"The incorrect_message function must be async."
)
agent_prompt = await attempts.incorrect_message(
state, answer_scores
)
else:
agent_prompt = attempts.incorrect_message

# trace debug info
debug_output.insert(0, "mini-swe-agent Debug Output:")
trace("\n".join(debug_output))

return bridge.state

return agent_with(execute, name=name, description=description)
3 changes: 2 additions & 1 deletion src/inspect_swe/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@

from ._claude_code.claude_code import claude_code
from ._codex_cli.codex_cli import codex_cli
from ._mini_swe_agent.mini_swe_agent import mini_swe_agent

__all__ = ["codex_cli", "claude_code"]
__all__ = ["codex_cli", "claude_code", "mini_swe_agent"]
Loading
Loading