meridianlabs-ai · anaoaktree · Jan 15, 2026 · Jan 16, 2026 · Jan 16, 2026 · Jan 16, 2026
diff --git a/examples/system_explorer/task.py b/examples/system_explorer/task.py
@@ -4,19 +4,21 @@
 from inspect_ai.dataset import json_dataset
 from inspect_ai.scorer import model_graded_qa
 from inspect_ai.util import SandboxEnvironmentType
-from inspect_swe import claude_code, codex_cli
+from inspect_swe import claude_code, codex_cli, mini_swe_agent
 
 
 @task
 def system_explorer(
-    agent: Literal["claude_code", "codex_cli"] = "claude_code",
+    agent: Literal["claude_code", "codex_cli", "mini_swe_agent"] = "claude_code",
     sandbox: SandboxEnvironmentType | None = "docker",
 ) -> Task:
     match agent:
         case "claude_code":
             solver = claude_code()
         case "codex_cli":
             solver = codex_cli()
+        case "mini_swe_agent":
+            solver = mini_swe_agent()
 
     return Task(
         dataset=json_dataset("dataset.json"),

diff --git a/examples/terminal_bench_test/task.py b/examples/terminal_bench_test/task.py
@@ -0,0 +1,90 @@
+from pathlib import Path
+
+from inspect_ai import Task, task
+from inspect_ai.tool import bash, python
+from inspect_swe import mini_swe_agent
+
+
+def _get_challenges_dir() -> Path:
+    """Locate the Terminal Bench 2.0 challenges directory.
+
+    Looks for `challenges` beside the installed inspect_evals.terminal_bench_2.
+    """
+    found: Path | None = None
+    try:
+        import inspect_evals.terminal_bench_2 as tb2
+        candidate = Path(tb2.__file__).parent / "challenges"
+        found = candidate if candidate.exists() else None
+    except ImportError:
+        pass  # fall through to the install hint below
+    if found is not None:
+        return found
+
+    raise ImportError(
+        "Could not find Terminal Bench 2.0 challenges directory. "
+        "Please install inspect-evals[terminal_bench_2]:\n"
+        "  pip install inspect-evals[terminal_bench_2]"
+    )
+
+
+@task
+def terminal_bench_task(
+    eval_names: list[str] | None = None,
+    variant_names: list[str] | None = None,
+    model: str | None = None,
+) -> Task:
+    """Build a Terminal Bench 2.0 task that is solved by mini-swe-agent.
+
+    Challenges and scorer are taken from the inspect_evals installation;
+    mini-swe-agent is used instead of the default ReAct solver.
+
+    Args:
+        eval_names: Challenge names to keep (e.g., ["constraints-scheduling"]).
+            If None (or empty), runs all challenges.
+        variant_names: Variants to keep. Defaults to ["default"].
+        model: Model name to use for mini-swe-agent. If None, uses default inspect model.
+
+    Returns:
+        Task with the filtered dataset, mini-swe-agent solver, and Terminal Bench scorer.
+
+    Note:
+        This task uses pre-built Docker images from Docker Hub. For local builds,
+        use the original inspect_evals/terminal_bench_2 task directly.
+    """
+    # Imported here so a missing optional dependency produces the hint below
+    try:
+        from inspect_cyber import create_agentic_eval_dataset
+        from inspect_evals.terminal_bench_2.terminal_bench_2 import (
+            terminal_bench_2_scorer,
+        )
+    except ImportError:
+        raise ImportError(
+            "inspect_cyber and inspect_evals are required for Terminal Bench 2.0. "
+            "Please install inspect-evals[terminal_bench_2]:\n"
+            "  pip install inspect-evals[terminal_bench_2]"
+        ) from None
+
+    # Build the dataset from the installed challenges directory
+    samples = create_agentic_eval_dataset(
+        root_dir=_get_challenges_dir().absolute(),
+    )
+
+    # Narrow by challenge name only when names were given
+    if eval_names:
+        samples = samples.filter_by_metadata_field("eval_name", eval_names)
+
+    # Variant filtering always applies; None selects the "default" variant
+    selected_variants = ["default"] if variant_names is None else variant_names
+    samples = samples.filter_by_metadata_field("variant_name", selected_variants)
+
+    # mini-swe-agent replaces the stock solver
+    agent_solver = mini_swe_agent(model=model)
+
+    # NOTE(review): confirm Task actually honors a `tools` argument; the
+    # call below is kept exactly as the original wrote it.
+    return Task(
+        dataset=samples,
+        solver=agent_solver,
+        scorer=terminal_bench_2_scorer(),
+        tools=[bash(timeout=60), python(timeout=60)],
+    )
diff --git a/pyproject.toml b/pyproject.toml
@@ -41,6 +41,7 @@ dev = [
     "pytest-dotenv",
     "types-PyYAML",
     "IPython",
+    "inspect-evals",
 ]
 doc = ["quarto-cli==1.7.31"]
 

diff --git a/src/inspect_swe/__init__.py b/src/inspect_swe/__init__.py
@@ -1,6 +1,8 @@
 from ._claude_code.claude_code import claude_code
 from ._codex_cli.codex_cli import codex_cli
+from ._mini_swe_agent.mini_swe_agent import mini_swe_agent
 from ._tools.download import AgentBinary, cached_agent_binaries, download_agent_binary
+from ._util.agentwheel import download_wheels_tarball
 from ._util.centaur import CentaurOptions
 from ._util.sandbox import SandboxPlatform
 
@@ -13,10 +15,12 @@
 __all__ = [
     "claude_code",
     "codex_cli",
+    "mini_swe_agent",
     "download_agent_binary",
     "cached_agent_binaries",
     "AgentBinary",
     "SandboxPlatform",
     "CentaurOptions",
     "__version__",
+    "download_wheels_tarball",
 ]
diff --git a/src/inspect_swe/_mini_swe_agent/__init__.py b/src/inspect_swe/_mini_swe_agent/__init__.py
diff --git a/src/inspect_swe/_mini_swe_agent/mini_swe_agent.py b/src/inspect_swe/_mini_swe_agent/mini_swe_agent.py
@@ -0,0 +1,215 @@
+from textwrap import dedent
+from typing import Literal
+
+from inspect_ai.agent import (
+    Agent,
+    AgentAttempts,
+    AgentState,
+    agent,
+    agent_with,
+    sandbox_agent_bridge,
+)
+from inspect_ai.model import ChatMessageSystem, GenerateFilter, get_model
+from inspect_ai.scorer import score
+from inspect_ai.util import sandbox as sandbox_env
+from inspect_ai.util import store
+
+from .._util._async import is_callable_coroutine
+from .._util.agentwheel import AgentWheelSource, ensure_agent_wheel_installed
+from .._util.messages import build_user_prompt
+from .._util.trace import trace
+
+# Wheel source for mini-swe-agent; passed to ensure_agent_wheel_installed() below.
+# default_version stays on v1.x - v2.0 has breaking changes (migration guide pending)
+# See: https://mini-swe-agent.com/latest/advanced/v2_migration/
+MINI_SWE_AGENT_SOURCE = AgentWheelSource(
+    agent="mini-swe-agent",
+    package="mini-swe-agent",
+    binary="mini",  # CLI entrypoint
+    default_version="1.17.4",  # presumably what version="stable" installs - confirm
+)
+
+
+@agent
+def mini_swe_agent(
+    name: str = "mini-swe-agent",
+    description: str = dedent("""
+       Minimal AI agent that solves software engineering tasks using bash commands.
+       100 lines of Python, radically simple, scores >74% on SWE-bench verified.
+    """),
+    system_prompt: str | None = None,
+    attempts: int | AgentAttempts = 1,  # TODO: currently supports single attempt
+    model: str | None = None,
+    filter: GenerateFilter | None = None,
+    retry_refusals: int | None = None,
+    cost_limit: float | None = None,
+    cwd: str | None = None,
+    env: dict[str, str] | None = None,
+    user: str | None = None,
+    sandbox: str | None = None,
+    version: Literal["stable", "sandbox", "latest"] | str = "stable",
+) -> Agent:
+    """mini-swe-agent agent.
+
+    Agent that uses [mini-swe-agent](https://github.com/SWE-agent/mini-swe-agent)
+    running in a sandbox. Mini-swe-agent is a minimal 100-line agent that solves
+    GitHub issues using only bash commands.
+
+    The agent can either use a version of mini-swe-agent installed in the sandbox,
+    or can download and install it via pip (see docs on `version` option below).
+
+    Use `attempts` to enable additional submissions if initial submission(s) are
+    incorrect (default: none). Use `cost_limit` to cap the run's cost (in USD).
+
+    The returned agent raises RuntimeError (with stdout/stderr) if the CLI fails.
+
+    Args:
+        name: Agent name (used in multi-agent systems with `as_tool()` and `handoff()`)
+        description: Agent description (used in multi-agent systems)
+        system_prompt: Extra system text, prepended to the task after input system messages.
+        attempts: Configure agent to make multiple attempts (each one re-runs the CLI).
+        model: Model name to use (defaults to main model for task).
+        filter: Filter for intercepting bridged model requests.
+        retry_refusals: Should refusals be retried? (pass number of times to retry)
+        cost_limit: Maximum cost limit for the agent run (passed as `--cost-limit`).
+        cwd: Working directory to run mini-swe-agent within.
+        env: Environment variables for mini-swe-agent (override the built-in ones).
+        user: User to execute mini-swe-agent with.
+        sandbox: Optional sandbox environment name.
+        version: Version of mini-swe-agent to use. One of:
+            - "stable": Download and install the default pinned version.
+            - "sandbox": Use version in sandbox (raises RuntimeError if not available)
+            - "latest": Download and install latest version from PyPI.
+            - "x.x.x": Install and use a specific version.
+    """
+    # resolve models (bridge model name is "inspect" or "inspect/<model>")
+    inspect_model = f"inspect/{model}" if model is not None else "inspect"
+
+    # resolve attempts (a bare int is wrapped in AgentAttempts)
+    attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts
+
+    async def execute(state: AgentState) -> AgentState:
+        # determine port (new port for each execution of agent on sample; first is 4001)
+        MODEL_PORT = "mini_swe_agent_model_port"
+        port = store().get(MODEL_PORT, 4000) + 1
+        store().set(MODEL_PORT, port)
+
+        # resolve sandbox once for reuse (install step and exec share it)
+        sbox = sandbox_env(sandbox)
+
+        async with sandbox_agent_bridge(
+            state,
+            model=inspect_model,
+            filter=filter,
+            retry_refusals=retry_refusals,
+            port=port,
+        ) as bridge:
+            # ensure mini-swe-agent is installed; the result is the executable used in cmd
+            mini_binary = await ensure_agent_wheel_installed(
+                source=MINI_SWE_AGENT_SOURCE,
+                version=version,
+                user=user,
+                sandbox=sbox,
+            )
+
+            # base command options (shared by every attempt)
+            cmd = [
+                mini_binary,
+                "--yolo",  # run without confirmations (like --print for claude)
+                "--exit-immediately",  # exit when agent finishes instead of prompting
+            ]
+
+            # add cost limit if specified
+            if cost_limit is not None:
+                cmd.extend(["--cost-limit", str(cost_limit)])
+
+            # build user prompt (second return value is not needed here)
+            prompt, _ = build_user_prompt(state.messages)
+
+            # collect system messages from the input, then the explicit system_prompt
+            full_prompt = prompt
+            system_messages = [
+                m.text for m in state.messages if isinstance(m, ChatMessageSystem)
+            ]
+            if system_prompt is not None:
+                system_messages.append(system_prompt)
+            if system_messages:
+                # Prepend system context to the task (everything travels via --task)
+                system_context = "\n\n".join(system_messages)
+                full_prompt = (
+                    f"System instructions:\n{system_context}\n\nTask:\n{prompt}"
+                )
+
+            # execute the agent; stdout/stderr of each run accumulate in debug_output
+            debug_output: list[str] = []
+            agent_prompt = full_prompt
+            attempt_count = 0
+
+            while True:  # Kept for consistency with other agents but currently only single attempt supported
+                # TODO: build command with task. This only works for single-turn tasks currently. Need to update to support multi-turn (perhaps via file output option)
+                agent_cmd = cmd + ["--task", agent_prompt]
+
+                # run agent via bash with stdin redirected from /dev/null
+                result = await sbox.exec(
+                    cmd=["bash", "-c", 'exec 0</dev/null "$@"', "bash"] + agent_cmd,
+                    cwd=cwd,
+                    env={
+                        "MSWEA_CONFIGURED": "true",  # Skip interactive setup wizard
+                        "MSWEA_MODEL_NAME": model
+                        if model is not None
+                        else get_model().name,
+                        "OPENAI_API_BASE": f"http://localhost:{bridge.port}/v1",
+                        # actual key is handled by inspect, mini-swe needs it to approve setup
+                        "OPENAI_API_KEY": "sk-none",
+                        "ANTHROPIC_BASE_URL": f"http://localhost:{bridge.port}",
+                        "ANTHROPIC_API_KEY": "sk-none",
+                    }
+                    | (env or {}),  # caller-supplied env wins on key conflicts
+                    user=user,
+                    concurrency=False,
+                )
+
+                # track debug output (one stdout and one stderr entry per run)
+                debug_output.append(f"[stdout]\n{result.stdout}")
+                debug_output.append(f"[stderr]\n{result.stderr}")
+
+                # raise for error (note: the trace call below is skipped in this case)
+                if not result.success:
+                    raise RuntimeError(
+                        f"Error executing mini-swe-agent (cwd={cwd or 'default'}):\n"
+                        f"stdout: {result.stdout}\n"
+                        f"stderr: {result.stderr}"
+                    )
+
+                # exit if we are at max_attempts (so the final attempt is never scored here)
+                attempt_count += 1
+                if attempt_count >= attempts.attempts:
+                    break
+
+                # score this attempt
+                answer_scores = await score(state)
+
+                # break if we score 'correct' (only the first score is consulted)
+                if attempts.score_value(answer_scores[0].value) == 1.0:
+                    break
+
+                # otherwise the incorrect message becomes the next run's --task text
+                else:
+                    if callable(attempts.incorrect_message):
+                        if not is_callable_coroutine(attempts.incorrect_message):
+                            raise ValueError(
+                                "The incorrect_message function must be async."
+                            )
+                        agent_prompt = await attempts.incorrect_message(
+                            state, answer_scores
+                        )
+                    else:
+                        agent_prompt = attempts.incorrect_message
+
+            # trace debug info (reached only when no run raised)
+            debug_output.insert(0, "mini-swe-agent Debug Output:")
+            trace("\n".join(debug_output))
+
+        return bridge.state
+
+    return agent_with(execute, name=name, description=description)
diff --git a/src/inspect_swe/_registry.py b/src/inspect_swe/_registry.py
@@ -2,5 +2,6 @@
 
 from ._claude_code.claude_code import claude_code
 from ._codex_cli.codex_cli import codex_cli
+from ._mini_swe_agent.mini_swe_agent import mini_swe_agent
 
-__all__ = ["codex_cli", "claude_code"]
+__all__ = ["codex_cli", "claude_code", "mini_swe_agent"]