From 93ce413c9bd34c4e4e54d345dfcb0a2a64781681 Mon Sep 17 00:00:00 2001 From: Eric Phillips Date: Fri, 20 Feb 2026 20:00:52 -0700 Subject: [PATCH] added sandbox and bash tool --- agent/loop.py | 55 +++++++++++-- agent/tools.py | 27 ++++++ main.py | 8 +- sandbox/__init__.py | 0 sandbox/session.py | 47 +++++++++++ test_sandbox_tool.py | 54 ++++++++++++ tests/conftest.py | 26 ++++++ tests/test_loop.py | 2 +- tests/test_sandbox.py | 187 ++++++++++++++++++++++++++++++++++++++++++ tools/bash.py | 24 ++++++ 10 files changed, 419 insertions(+), 11 deletions(-) create mode 100644 agent/tools.py create mode 100644 sandbox/__init__.py create mode 100644 sandbox/session.py create mode 100644 test_sandbox_tool.py create mode 100644 tests/test_sandbox.py create mode 100644 tools/bash.py diff --git a/agent/loop.py b/agent/loop.py index f34399a..4404eeb 100644 --- a/agent/loop.py +++ b/agent/loop.py @@ -1,15 +1,16 @@ import asyncio +from http.client import responses from anthropic import AsyncAnthropic from agent.config import settings from agent.history import ConversationHistory +from agent.tools import TOOL_SCHEMAS, dispatch_tool client = AsyncAnthropic(api_key=settings.anthropic_api_key) -history = ConversationHistory() -async def run_turn(user_message: str, history: list[dict] = None) -> str: +async def run_turn(user_message: str, history: list[dict] = None, sandbox=None) -> str: if history is None: history = [] @@ -17,21 +18,57 @@ async def run_turn(user_message: str, history: list[dict] = None) -> str: # add the new user message to history messages = history + [{"role": "user", "content": user_message}] - message = await client.messages.create( + response = await client.messages.create( model=settings.model, max_tokens=settings.max_tokens, + tools=TOOL_SCHEMAS, messages=messages, ) - return message + while response.stop_reason == "tool_use": + tool_results = [] + for block in response.content: + if block.type == "tool_use": + result = await dispatch_tool( + tool_name=block.name, tool_input=block.input, sandbox=sandbox + ) + tool_results.append( + {"type": "tool_result", "tool_use_id": block.id, "content": result} + ) + + messages = messages + [ + {"role": "assistant", "content": response.content}, + {"role": "user", "content": tool_results}, + ] + + response = await client.messages.create( + model=settings.model, + max_tokens=settings.max_tokens, + tools=TOOL_SCHEMAS, + messages=messages, + ) + + return next(block.text for block in response.content if hasattr(block, "text")) -async def run_session(): +async def run_session(sandbox=None): + """simple CLI session - temporary until TUI is built""" + history = ConversationHistory() + + print("Codeing agent ready. Type /quit to quit.") + while True: - user_input = input("You: ") + user_input = input("You: ").strip() + # __ UI commands_______ + if not user_input: + continue + if user_input == "/quit": + print("Goodbye") + break + history.add_message("user", user_input) - response = await run_turn(user_input, history.get_all()) - history.add_message("assistant", response.content[0].text) + response = await run_turn(user_input, history.get_all(), sandbox) + history.add_message("assistant", response) - print(f"Assistant: {response.content[0].text}") + print(f"\nAssistant: {response}") diff --git a/agent/tools.py b/agent/tools.py new file mode 100644 index 0000000..5885259 --- /dev/null +++ b/agent/tools.py @@ -0,0 +1,27 @@ +from tools.bash import bash + +TOOL_SCHEMAS = [ + { + "name": "bash", + "description": "Execute a bash command in the isolated sandbox environment. Use this to run shell commands, install packages, run scripts, etc.", + "input_schema": { + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "The bash command to execute (e.g., 'ls -la', 'python script.py', 'pip install requests')", + } + }, + "required": ["command"], + }, + } +] + + +async def dispatch_tool(tool_name: str, tool_input: dict, sandbox) -> str: + """Route tool calls to implementations.""" + + if tool_name == "bash": + return await bash(command=tool_input["command"], sandbox=sandbox) + + return f"Unknown tool: {tool_name}" diff --git a/main.py b/main.py index d066ef2..11ad1dc 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,16 @@ import asyncio from agent.loop import run_session +from sandbox.session import PodmanSandbox async def run_tui(): - await run_session() + print("Starting sandbox....") + async with PodmanSandbox() as sandbox: + print("Sandbox ready") + await run_session(sandbox) + + print("Sandbox destroyed") def main(): diff --git a/sandbox/__init__.py b/sandbox/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sandbox/session.py b/sandbox/session.py new file mode 100644 index 0000000..200b367 --- /dev/null +++ b/sandbox/session.py @@ -0,0 +1,47 @@ +from pathlib import Path + +import podman + +from agent.config import settings + + +class PodmanSandbox: + def __init__(self): + # connect to podman socket (rootless) + self.client = podman.PodmanClient() + self.container = None + + async def __aenter__(self): + self.container = self.client.containers.run( + "python:3.14", + command=["sleep", "60h"], + detach=True, + runtime="krun", + network_mode="none", + mem_limit="512m", + volumes={ + str(Path(settings.safedir).absolute()): { + "bind": "/workspace", + "mode": "rw", + } + }, + working_dir="/workspace", + remove=True, + ) + return self + + async def run(self, command: str) -> str: + """Execute command in microVM/""" + exit_code, output = self.container.exec_run( + ["/bin/sh", "-c", command], workdir="/workspace" + ) + return output.decode() + + async def __aexit__(self, *args): + if self.container: + try: + self.container.stop() + except Exception as e: + # log but don't raise, best effort cleanup + print(f"Warning: Container cleanup failed: {e}") + # possibly use logging.warning if we add loggin later diff --git a/test_sandbox_tool.py b/test_sandbox_tool.py new file mode 100644 index 0000000..e113fa1 --- /dev/null +++ b/test_sandbox_tool.py @@ -0,0 +1,54 @@ +import asyncio + +from sandbox.session import PodmanSandbox +from tools.bash import bash + + +async def main(): + print("Creating sandbox...") + async with PodmanSandbox() as sb: + print("✓ Sandbox created\n") + + # Test 1: Basic command + print("Test 1: pwd") + result = await bash("pwd", sb) + print(f"Result: {result}\n") + + # Test 2: List workspace + print("Test 2: ls -la") + result = await bash("ls -la", sb) + print(f"Result: {result}\n") + + # Test 3: Python version + print("Test 3: python --version") + result = await bash("python --version", sb) + print(f"Result: {result}\n") + + # Test 4: Write a file + print("Test 4: Write test file") + result = await bash("echo 'Hello from sandbox' > test.txt", sb) + print(f"Result: {result}") + + # Test 5: Read it back + print("Test 5: Read test file") + result = await bash("cat test.txt", sb) + print(f"Result: {result}\n") + + # Test 6: Check it exists on host + print("Test 6: Verify file on host") + import os + + from agent.config import settings + + host_file = f"{settings.safedir}/test.txt" + if os.path.exists(host_file): + with open(host_file) as f: + print(f"✓ File exists on host: {f.read()}") + else: + print("✗ File NOT found on host") + + print("\n✓ Sandbox destroyed") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/conftest.py b/tests/conftest.py index e9aaec2..3a6dce6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -54,3 +54,29 @@ def sample_history(): {"role": "assistance", "content": "Hi there!"}, {"role": "user", "content": "What's 2+2?"}, ] + + +@pytest.fixture +def mock_sandbox(): + """Mock sandbox for unit tests.""" + sandbox = MagicMock() + sandbox.run = AsyncMock(return_value="mock output\n") + return sandbox + + +@pytest.fixture +def crashed_sandbox(): + """Mock sandbox that simulates a crash.""" + sandbox = MagicMock() + sandbox.run = AsyncMock(side_effect=RuntimeError("Container crashed unexpectedly")) + return sandbox + + +@pytest.fixture +def mock_podman_client(): + """Mock Podman client for unit tests.""" + with patch("sandbox.session.podman.PodmanClient") as mock: + mock_container = MagicMock() + mock_container.exec_run.return_value = (0, b"mock output\n") + mock.return_value.containers.run.return_value = mock_container + yield mock, mock_container diff --git a/tests/test_loop.py b/tests/test_loop.py index 47d7424..b461001 100644 --- a/tests/test_loop.py +++ b/tests/test_loop.py @@ -19,7 +19,7 @@ async def test_run_turn_basic(mock_anthropic_client): mock_anthropic_client.messages.create.assert_called_once() # verify message returned - assert result.content[0].text == "42" + assert result == "42" # verify call has correct parameters call_args = mock_anthropic_client.messages.create.call_args diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py new file mode 100644 index 0000000..891607b --- /dev/null +++ b/tests/test_sandbox.py @@ -0,0 +1,187 @@ +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +# ─── Unit Tests ─────────────────────────────────────────────── + + +@pytest.mark.unit +def test_sandbox_initializes(): + """Test PodmanSandbox creates client on init.""" + with patch("sandbox.session.podman.PodmanClient") as mock_client: + from sandbox.session import PodmanSandbox + + sb = PodmanSandbox() + + # Client should be created + mock_client.assert_called_once() + assert sb.container is None # Not started yet + + +@pytest.mark.unit +async def test_sandbox_starts_container(): + """Test that __aenter__ starts a container.""" + with patch("sandbox.session.podman.PodmanClient") as mock_client: + # Mock the container + mock_container = MagicMock() + mock_client.return_value.containers.run.return_value = mock_container + + from sandbox.session import PodmanSandbox + + sb = PodmanSandbox() + await sb.__aenter__() + + # Container should be running + mock_client.return_value.containers.run.assert_called_once() + assert sb.container is mock_container + + +@pytest.mark.unit +async def test_sandbox_stops_container_on_exit(): + """Test that __aexit__ stops the container.""" + with patch("sandbox.session.podman.PodmanClient") as mock_client: + mock_container = MagicMock() + mock_client.return_value.containers.run.return_value = mock_container + + from sandbox.session import PodmanSandbox + + sb = PodmanSandbox() + await sb.__aenter__() + await sb.__aexit__(None, None, None) + + # Container should be stopped + mock_container.stop.assert_called_once() + + +@pytest.mark.unit +async def test_sandbox_run_executes_command(): + """Test that run() passes command to container.""" + with patch("sandbox.session.podman.PodmanClient") as mock_client: + # Mock exec_run return value + mock_container = MagicMock() + mock_container.exec_run.return_value = (0, b"hello from sandbox\n") + mock_client.return_value.containers.run.return_value = mock_container + + from sandbox.session import PodmanSandbox + + sb = PodmanSandbox() + await sb.__aenter__() + + result = await sb.run("echo 'hello from sandbox'") + + # Verify exec_run was called with shell wrapper + mock_container.exec_run.assert_called_once_with( + ["/bin/sh", "-c", "echo 'hello from sandbox'"], workdir="/workspace" + ) + assert result == "hello from sandbox\n" + + +@pytest.mark.unit +async def test_tool_call_fails_if_sandbox_crashes(): + """Test that tool calls fail gracefully when sandbox is unavailable.""" + from tools.bash import bash + + # Simulate crashed sandbox (container is None) + mock_sandbox = MagicMock() + mock_sandbox.run = AsyncMock(side_effect=RuntimeError("Container crashed")) + + result = await bash("ls -la", mock_sandbox) + + # Should return error message, not raise exception + assert "error" in result.lower() + + +@pytest.mark.unit +async def test_tool_call_with_no_sandbox(): + """Test tool call handles None sandbox gracefully.""" + from tools.bash import bash + + result = await bash("ls -la", sandbox=None) + + # Should return error, not crash + assert "error" in result.lower() + + +@pytest.mark.unit +async def test_sandbox_cleanup_on_crash(): + """Test __aexit__ handles container stop failure gracefully.""" + with patch("sandbox.session.podman.PodmanClient") as mock_client: + mock_container = MagicMock() + # Simulate container.stop() failing + mock_container.stop.side_effect = RuntimeError("Container already dead") + mock_client.return_value.containers.run.return_value = mock_container + + from sandbox.session import PodmanSandbox + + sb = PodmanSandbox() + await sb.__aenter__() + + # Should not raise even if stop() fails + try: + await sb.__aexit__(None, None, None) + except RuntimeError: + pytest.fail("__aexit__ should handle container stop failure gracefully") + + +# ─── Integration Tests ──────────────────────────────────────── + + +@pytest.mark.integration +async def test_real_sandbox_starts(): + """Integration: Actually start a real sandbox.""" + from sandbox.session import PodmanSandbox + + async with PodmanSandbox() as sb: + assert sb.container is not None + + # Verify container is actually running + result = await sb.run("echo 'sandbox started'") + assert "sandbox started" in result + + +@pytest.mark.integration +async def test_real_bash_tool_executes(): + """Integration: Actually run a command in sandbox.""" + from sandbox.session import PodmanSandbox + from tools.bash import bash + + async with PodmanSandbox() as sb: + result = await bash("echo 'tool works'", sb) + assert "tool works" in result + + +@pytest.mark.integration +async def test_real_sandbox_workspace_mounted(): + """Integration: Verify workspace is mounted correctly.""" + import os + + from agent.config import settings + from sandbox.session import PodmanSandbox + from tools.bash import bash + + async with PodmanSandbox() as sb: + # Write file in sandbox + await bash("echo 'mount test' > /workspace/mount_test.txt", sb) + + # Verify file exists on host + host_file = f"{settings.safedir}/mount_test.txt" + assert os.path.exists(host_file) + + with open(host_file) as f: + content = f.read() + assert "mount test" in content + + # Cleanup + os.remove(host_file) + + +@pytest.mark.integration +async def test_real_sandbox_no_network(): + """Integration: Verify sandbox has no network access.""" + from sandbox.session import PodmanSandbox + from tools.bash import bash + + async with PodmanSandbox() as sb: + # Try to reach the internet (should fail) + result = await bash("ping -c 1 -W 8.8.8.8 2>&1 || echo 'no network'", sb) + assert "no network" in result diff --git a/tools/bash.py b/tools/bash.py new file mode 100644 index 0000000..2bdcc2a --- /dev/null +++ b/tools/bash.py @@ -0,0 +1,24 @@ +async def bash(command: str, sandbox=None) -> str: + """ + Execute a bash command in the sandbox. + + Args: + command: Shell command to run + sandbox: PodmanSandbox instance + + Returns: + Command output (stdout + stderr) + """ + + if sandbox is None: + return "Error: Sandbox not available" + + try: + result = await sandbox.run(command) + return result + + except RuntimeError as e: + return f"Error: sandbox execution failed: {e}" + + except Exception as e: + return f"Error: Unexpected failure: {e}"