Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
00aa735
Adds some tool-related requirements.
nrfulton Nov 17, 2025
7d2125a
Fixes some bugs in requirement checkers.
nrfulton Nov 17, 2025
6fa86ef
Adds a timeout argument to call_func.
nrfulton Nov 18, 2025
523f3f2
Moves Alex's code interpreters into a `tools` module.
nrfulton Nov 20, 2025
180cf12
Fix imports broken by previous commit.
nrfulton Nov 20, 2025
b72ac1b
Changes name of the static analysis environment.
nrfulton Nov 20, 2025
6b5f716
Refactors ExecutionResult.
nrfulton Nov 20, 2025
7512fd4
Fixes bug in the reqlib.tools uses_tool validator.
nrfulton Nov 20, 2025
65dd35b
More work on the code_interpreter tool.
nrfulton Nov 20, 2025
b05e7b2
Fixes the code interpreter example (need tool_calls=True).
nrfulton Nov 20, 2025
986e940
More cleanup for the python interpreter example.
nrfulton Nov 20, 2025
1c2a5ea
Reverts thread pool for tool calling.
nrfulton Nov 20, 2025
3e2a00b
Removes unused imports in base and adds \n
nrfulton Nov 20, 2025
9ec1020
Adds \n and removes unused import in base.py
nrfulton Nov 20, 2025
fdf88d5
Fixes some pre-commit issues.
nrfulton Nov 20, 2025
c63ff78
Fixes some pre-commit issues.
nrfulton Nov 20, 2025
0fd1a57
Fixes some pre-commit errors.
nrfulton Nov 20, 2025
f405560
Ruff formatter pass.
nrfulton Nov 20, 2025
646d127
Handle edge-case in tool requirement checker.
nrfulton Nov 20, 2025
6745e8d
ruff
nrfulton Nov 20, 2025
aaf460e
Fixes final pre-commit error.
nrfulton Nov 20, 2025
9768397
Fixes failing tests caused by changes to ExecutionResult.
nrfulton Nov 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions docs/examples/tools/interpreter_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from mellea.stdlib.tools import code_interpreter, local_code_interpreter
from mellea import start_session, MelleaSession
from mellea.backends.types import ModelOption
from mellea.backends.model_ids import OPENAI_GPT_OSS_20B
from mellea.stdlib.reqlib.tools import uses_tool, tool_arg_validator


def example_1(m: MelleaSession):
    """Call the code interpreter directly, with no LLM in the loop.

    Args:
        m: Session handle; not used by this example.
    """
    # Run a trivial snippet through the tool function and show what comes back.
    interpreter_output = code_interpreter("print(1+1)")
    print(interpreter_output)


# Now let's ask the LLM to make a plot.


def example_2(m: MelleaSession):
    """Ask the model for a plot while only offering the interpreter as a tool.

    The instruction neither requires tool use nor passes `tool_calls`, so the
    printed value is whatever `m.instruct` returns for a plain request.

    Args:
        m: Session used to issue the instruction.
    """
    tool_options = {ModelOption.TOOLS: [local_code_interpreter]}
    response = m.instruct(
        description="Make a plot of y=x^2", model_options=tool_options
    )
    print(response)


# Notice that the model did not actually generate a plot. Let's force tool use:


def example_3(m: MelleaSession):
    """Require a code-interpreter tool call, then execute the generated code.

    The instruction carries a `uses_tool` requirement and enables
    `tool_calls`, after which the proposed code is printed and run.

    Args:
        m: Session used to issue the instruction.
    """
    response = m.instruct(
        description="Use the code interpreter tool to make a plot of y=x^2.",
        requirements=[uses_tool(local_code_interpreter)],
        model_options={ModelOption.TOOLS: [local_code_interpreter]},
        tool_calls=True,
    )

    # Look the tool call up once and reuse it for both inspection and execution.
    interpreter_call = response.tool_calls["local_code_interpreter"]

    code = interpreter_call.args["code"]
    print(f"Going to execute the following code:\n```python\n{code}\n```")

    # Call the tool and show its result.
    print(interpreter_call.call_func())


# Notice that the model did make a plot, but it just "showed" the plot.
# We would actually like this to be written out to a file.


def example_4(m: MelleaSession):
    """Require a tool call whose code saves the plot to /tmp/output.png.

    On top of the `uses_tool` requirement, a `tool_arg_validator` checks the
    `code` argument: it must mention the output path and must not call
    `plt.show()`. The proposed code is then printed and executed.

    Args:
        m: Session used to issue the instruction.
    """

    def _writes_plot_to_file(code_snippet):
        # Same check order as a short-circuiting `and`: path first, then show().
        if "/tmp/output.png" not in code_snippet:
            return False
        return "plt.show()" not in code_snippet

    output_file_requirement = tool_arg_validator(
        "The plot should be written to /tmp/output.png",
        tool_name=local_code_interpreter,
        arg_name="code",
        validation_fn=_writes_plot_to_file,
    )

    response = m.instruct(
        description="Use the code interpreter tool to make a plot of y=x^2.",
        requirements=[uses_tool(local_code_interpreter), output_file_requirement],
        model_options={ModelOption.TOOLS: [local_code_interpreter]},
        tool_calls=True,
    )

    # Look the tool call up once and reuse it for both inspection and execution.
    interpreter_call = response.tool_calls["local_code_interpreter"]

    code = interpreter_call.args["code"]
    print(f"Going to execute the following code:\n```python\n{code}\n```")

    # Call the tool and show its result.
    print(interpreter_call.call_func())


# Alternative session setup, left disabled: selects the "ollama" backend with
# the OPENAI_GPT_OSS_20B model id.
# m = start_session(backend_name="ollama", model_id=OPENAI_GPT_OSS_20B)
# Start a session with no explicit arguments and run only the last example.
m = start_session()
example_4(m)
205 changes: 8 additions & 197 deletions mellea/stdlib/reqlib/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,204 +12,15 @@
from mellea.helpers.fancy_logger import FancyLogger
from mellea.stdlib.base import Context
from mellea.stdlib.requirement import Requirement, ValidationResult
from mellea.stdlib.tools.interpreter import (
ExecutionEnvironment,
LLMSandboxEnvironment,
StaticAnalysisEnvironment,
UnsafeEnvironment,
)

# Module-level logger obtained from FancyLogger; used below for warnings.
logger = FancyLogger.get_logger()

# region execution backends


@dataclass
class ExecutionResult:
    """Result of code execution.

    Only `success` is required; the remaining fields default to "absent".
    """

    # True when the code was accepted or ran cleanly, False otherwise.
    success: bool
    # Human-readable description of a successful outcome, if any.
    message: str | None = None
    # Human-readable description of the failure, if any.
    error: str | None = None
    # True when the code was checked but deliberately not executed.
    skipped: bool = False


class ExecutionEnvironment(ABC):
    """Abstract environment for executing Python code.

    Concrete subclasses implement `execute`; the optional import allow-list
    is stored on the instance for them to consult.
    """

    def __init__(self, allowed_imports: list[str] | None = None) -> None:
        """Initialize with optional import restrictions.

        Args:
            allowed_imports: List of allowed import modules. None means any import is allowed.
        """
        self.allowed_imports = allowed_imports

    @abstractmethod
    def execute(self, code: str, timeout: int) -> ExecutionResult:
        """Execute code and return result.

        Args:
            code: Python source text to process.
            timeout: Time limit passed through to the concrete environment.

        Returns:
            An ExecutionResult describing the outcome.
        """


class SafeEnvironment(ExecutionEnvironment):
    """Environment that statically checks code and never runs it."""

    def execute(self, code: str, timeout: int) -> ExecutionResult:
        """Check syntax and imports of `code` without executing it.

        Args:
            code: Python source text to validate.
            timeout: Accepted for interface compatibility; not used here.

        Returns:
            A failed result for a syntax error or a disallowed import;
            otherwise a successful result flagged as skipped.
        """
        try:
            ast.parse(code)
        except SyntaxError as syntax_error:
            return ExecutionResult(success=False, error=str(syntax_error))

        # NOTE(review): the truthiness test means an empty allow-list disables
        # the import check just like None does - confirm that is intended.
        disallowed = (
            _get_unauthorized_imports(code, self.allowed_imports)
            if self.allowed_imports
            else []
        )
        if disallowed:
            listing = ", ".join(disallowed)
            return ExecutionResult(
                success=False, error=f"Unauthorized imports detected: {listing}"
            )

        return ExecutionResult(
            success=True,
            skipped=True,
            message="Code validated but not executed (safe mode)",
        )


class UnsafeEnvironment(ExecutionEnvironment):
    """Unsafe environment that executes code directly with subprocess."""

    def execute(self, code: str, timeout: int) -> ExecutionResult:
        """Execute code with subprocess after checking imports.

        Args:
            code: Python source text to run.
            timeout: Maximum run time in seconds for the child process.

        Returns:
            A failed result if a disallowed import is found (the code is then
            not run); otherwise the result of running the code.
        """
        if self.allowed_imports:
            unauthorized = _get_unauthorized_imports(code, self.allowed_imports)
            if unauthorized:
                return ExecutionResult(
                    success=False,
                    error=f"Unauthorized imports detected: {', '.join(unauthorized)}",
                )

        return self._execute_subprocess(code, timeout)

    def _execute_subprocess(self, code: str, timeout: int) -> ExecutionResult:
        """Write `code` to a temporary script and run it in a child interpreter.

        Args:
            code: Python source text to run.
            timeout: Maximum run time in seconds for the child process.

        Returns:
            A successful result (including stripped stdout, if any) when the
            child exits with code 0; otherwise a failed result describing the
            non-zero exit, the timeout, or any other error raised on the way.
        """
        temp_path: Path | None = None
        try:
            # Python decodes source files as UTF-8 by default, so the script
            # must be written as UTF-8 regardless of the platform's locale.
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=".py", delete=False, encoding="utf-8"
            ) as f:
                # Record the path before writing so a failed write is still
                # cleaned up by the finally clause below.
                temp_path = Path(f.name)
                f.write(code)

            # Execute code using the same Python interpreter and environment as the current process
            # This ensures the code has access to all installed packages and dependencies
            result = subprocess.run(
                [sys.executable, str(temp_path)],
                capture_output=True,
                text=True,
                timeout=timeout,
            )

            if result.returncode == 0:
                message = "Code executed successfully"
                if result.stdout.strip():
                    message += f"\nOutput: {result.stdout.strip()}"
                return ExecutionResult(success=True, message=message)
            else:
                return ExecutionResult(
                    success=False,
                    error=f"Execution failed with error: {result.stderr[:200]}",
                )
        except subprocess.TimeoutExpired:
            return ExecutionResult(
                success=False, error=f"Execution timed out after {timeout} seconds"
            )
        except Exception as e:
            return ExecutionResult(success=False, error=f"Execution error: {e!s}")
        finally:
            if temp_path is not None:
                try:
                    temp_path.unlink(missing_ok=True)
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not mask
                    # the execution result being returned.
                    pass


class LLMSandboxEnvironment(ExecutionEnvironment):
    """Environment that runs code through an llm-sandbox session."""

    def execute(self, code: str, timeout: int) -> ExecutionResult:
        """Run `code` in an llm-sandbox session after checking imports.

        Args:
            code: Python source text to run.
            timeout: Time limit forwarded to the sandbox session's `run`.

        Returns:
            A failed result when a disallowed import is found, llm-sandbox is
            not importable, the sandbox exits non-zero, or any exception is
            raised; otherwise a successful result including stripped stdout.
        """
        if self.allowed_imports:
            blocked = _get_unauthorized_imports(code, self.allowed_imports)
            if blocked:
                return ExecutionResult(
                    success=False,
                    error=f"Unauthorized imports detected: {', '.join(blocked)}",
                )

        # Imported lazily so the dependency is only needed when this
        # environment is actually used.
        try:
            from llm_sandbox import SandboxSession
        except ImportError:
            return ExecutionResult(
                success=False,
                error="llm-sandbox not installed. Install with: uv add 'llm-sandbox[docker]'",
            )

        try:
            with SandboxSession(
                lang="python", verbose=False, keep_template=False
            ) as session:
                result = session.run(code, timeout=timeout)

                if result.exit_code != 0:
                    stderr_text = result.stderr
                    if stderr_text:
                        failure = f"Sandbox execution failed: {stderr_text[:200]}"
                    else:
                        # Log unknown error details for debugging
                        logger.warning(
                            f"Sandbox execution failed without stderr. Exit code: {result.exit_code}, "
                            f"Available attributes: {[attr for attr in dir(result) if not attr.startswith('_')]}"
                        )
                        failure = f"Sandbox execution failed with exit code {result.exit_code} (no error details available)"
                    return ExecutionResult(success=False, error=failure)

                message = "Code executed successfully in sandbox"
                # The result object may not expose stdout at all.
                stdout_text = getattr(result, "stdout", None)
                if stdout_text and stdout_text.strip():
                    message += f"\nOutput: {stdout_text.strip()}"
                return ExecutionResult(success=True, message=message)

        except Exception as e:
            return ExecutionResult(
                success=False, error=f"Sandbox execution error: {e!s}"
            )


def _get_unauthorized_imports(code: str, allowed_imports: list[str]) -> list[str]:
"""Get list of unauthorized imports used in code."""
unauthorized: list[str] = []
try:
tree = ast.parse(code)
except SyntaxError:
return unauthorized

for node in ast.walk(tree):
if isinstance(node, ast.Import):
for alias in node.names:
base_module = alias.name.split(".")[0]
if (
base_module not in allowed_imports
and base_module not in unauthorized
):
unauthorized.append(base_module)
elif isinstance(node, ast.ImportFrom):
if node.module:
base_module = node.module.split(".")[0]
if (
base_module not in allowed_imports
and base_module not in unauthorized
):
unauthorized.append(base_module)
return unauthorized


def _check_allowed_imports(code: str, allowed_imports: list[str]) -> bool:
    """Return True when `code` imports nothing outside `allowed_imports`.

    Args:
        code: Python source text to inspect.
        allowed_imports: Top-level module names that may be imported.
    """
    # An empty list of offenders means every import is permitted.
    return not _get_unauthorized_imports(code, allowed_imports)


# endregion

# region code extraction

Expand Down Expand Up @@ -328,11 +139,11 @@ def _python_executes_without_error(
elif allow_unsafe:
environment = UnsafeEnvironment(allowed_imports=allowed_imports)
else:
environment = SafeEnvironment(allowed_imports=allowed_imports)
environment = StaticAnalysisEnvironment(allowed_imports=allowed_imports)

result = environment.execute(code, timeout)
return ValidationResult(
result=result.success, reason=result.message or result.error
result=result.success, reason=result.to_validationresult_reason()
)


Expand Down
Loading