[agent, browsing] Support viewing pdf and png/jpg via browser (#7457)

Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
2026-03-22 13:47:19 +08:00 · 2025-03-28 00:07:33 -07:00
parent 23505576a6
commit ac8b5e7934
8 changed files with 366 additions and 32 deletions
--- a/tests/runtime/test_browsing.py
+++ b/tests/runtime/test_browsing.py
@@ -1,9 +1,12 @@
 """Browsing-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""

+import os
+
 from conftest import _close_test_runtime, _load_runtime

 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import (
+    BrowseInteractiveAction,
    BrowseURLAction,
    CmdRunAction,
 )
@@ -60,3 +63,112 @@ def test_simple_browse(temp_dir, runtime_cls, run_as_openhands):
    assert obs.exit_code == 0

    _close_test_runtime(runtime)
+
+
+def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands):
+    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Create a PDF file using reportlab in the host environment
+        from reportlab.lib.pagesizes import letter
+        from reportlab.pdfgen import canvas
+
+        pdf_path = os.path.join(temp_dir, 'test_document.pdf')
+        pdf_content = 'This is test content for PDF reading test'
+
+        c = canvas.Canvas(pdf_path, pagesize=letter)
+        # Draw three lines of text so the page is not empty
+        c.drawString(100, 750, pdf_content)
+        c.drawString(100, 700, 'Additional line for PDF structure')
+        c.drawString(100, 650, 'Third line to ensure valid PDF')
+        # Turn off page compression, then write the PDF to pdf_path
+        c.setPageCompression(0)  # Disable compression for simpler structure
+        c.save()
+
+        # Copy the PDF to the sandbox
+        sandbox_dir = config.workspace_mount_path_in_sandbox
+        runtime.copy_to(pdf_path, sandbox_dir)
+
+        # Verify the file exists in the sandbox
+        action_cmd = CmdRunAction(command='ls -alh')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert 'test_document.pdf' in obs.content
+
+        # Read the server url from /tmp/oh-server-url via the runtime
+        action_cmd = CmdRunAction(command='cat /tmp/oh-server-url')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+        server_url = obs.content.strip()
+
+        # Browse to the PDF file through the server's /view endpoint
+        pdf_url = f'{server_url}/view?path=/workspace/test_document.pdf'
+        action_browse = BrowseInteractiveAction(browser_actions=f'goto("{pdf_url}")')
+        logger.info(action_browse, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_browse)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # Verify the browser observation: success marker and 'Canvas' in its text
+        assert isinstance(obs, BrowserOutputObservation)
+        observation_text = str(obs)
+        assert '[Action executed successfully.]' in observation_text
+        assert 'Canvas' in observation_text
+
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
+    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Create a PNG file using PIL in the host environment
+        from PIL import Image, ImageDraw
+
+        png_path = os.path.join(temp_dir, 'test_image.png')
+        # Create a 400x200 white RGB image and draw black text on it
+        img = Image.new('RGB', (400, 200), color=(255, 255, 255))
+        d = ImageDraw.Draw(img)
+        text = 'This is a test PNG image'
+        d.text((20, 80), text, fill=(0, 0, 0))
+        img.save(png_path)
+
+        # Copy the PNG to the sandbox
+        sandbox_dir = config.workspace_mount_path_in_sandbox
+        runtime.copy_to(png_path, sandbox_dir)
+
+        # Verify the file exists in the sandbox
+        action_cmd = CmdRunAction(command='ls -alh')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert 'test_image.png' in obs.content
+
+        # Read the server url from /tmp/oh-server-url via the runtime
+        action_cmd = CmdRunAction(command='cat /tmp/oh-server-url')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+        server_url = obs.content.strip()
+
+        # Browse to the PNG file through the server's /view endpoint
+        png_url = f'{server_url}/view?path=/workspace/test_image.png'
+        action_browse = BrowseInteractiveAction(browser_actions=f'goto("{png_url}")')
+        logger.info(action_browse, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_browse)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # Verify the browser observation: success marker and the file name in its text
+        assert isinstance(obs, BrowserOutputObservation)
+        observation_text = str(obs)
+        assert '[Action executed successfully.]' in observation_text
+        assert 'File Viewer - test_image.png' in observation_text
+
+    finally:
+        _close_test_runtime(runtime)