mirror of
https://github.com/OpenHands/OpenHands.git
synced 2026-03-22 13:47:19 +08:00
[agent, browsing] Support viewing pdf and png/jpg via browser (#7457)
Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
This commit is contained in:
@@ -1,9 +1,12 @@
|
||||
"""Browsing-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""
|
||||
|
||||
import os
|
||||
|
||||
from conftest import _close_test_runtime, _load_runtime
|
||||
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.events.action import (
|
||||
BrowseInteractiveAction,
|
||||
BrowseURLAction,
|
||||
CmdRunAction,
|
||||
)
|
||||
@@ -60,3 +63,112 @@ def test_simple_browse(temp_dir, runtime_cls, run_as_openhands):
|
||||
assert obs.exit_code == 0
|
||||
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands):
    """Open a generated PDF through the sandbox's ``/view`` endpoint in the browser.

    The test builds a small PDF on the host with reportlab, copies it into the
    sandbox workspace, reads the server URL from ``/tmp/oh-server-url`` and
    navigates the browser to ``<server_url>/view?path=<file in sandbox>``.

    Args:
        temp_dir: Host directory (pytest fixture) where the PDF is created.
        runtime_cls: Runtime class under test, forwarded to ``_load_runtime``.
        run_as_openhands: Forwarded to ``_load_runtime`` unchanged.
    """
    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        # posixpath (not os.path) is used for the in-sandbox path so the
        # separator is always "/" regardless of the host OS running the test.
        import posixpath

        # Create a PDF file using reportlab in the host environment
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas

        pdf_name = 'test_document.pdf'
        pdf_path = os.path.join(temp_dir, pdf_name)
        pdf_content = 'This is test content for PDF reading test'

        c = canvas.Canvas(pdf_path, pagesize=letter)
        # Add more content to make the PDF more robust
        c.drawString(100, 750, pdf_content)
        c.drawString(100, 700, 'Additional line for PDF structure')
        c.drawString(100, 650, 'Third line to ensure valid PDF')
        # Disable page compression to keep the generated file structure simple
        c.setPageCompression(0)
        c.save()

        # Copy the PDF to the sandbox
        sandbox_dir = config.workspace_mount_path_in_sandbox
        runtime.copy_to(pdf_path, sandbox_dir)

        # Verify the file exists in the sandbox
        action_cmd = CmdRunAction(command='ls -alh')
        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action_cmd)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert isinstance(obs, CmdOutputObservation)
        assert obs.exit_code == 0
        assert pdf_name in obs.content

        # Get server url
        action_cmd = CmdRunAction(command='cat /tmp/oh-server-url')
        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action_cmd)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        # Fail with a clear assertion (not an AttributeError) if the command
        # did not produce a command-output observation.
        assert isinstance(obs, CmdOutputObservation)
        assert obs.exit_code == 0
        server_url = obs.content.strip()

        # Browse to the PDF file. The path is derived from the directory the
        # file was actually copied to instead of a hard-coded '/workspace'.
        pdf_path_in_sandbox = posixpath.join(sandbox_dir, pdf_name)
        pdf_url = f'{server_url}/view?path={pdf_path_in_sandbox}'
        action_browse = BrowseInteractiveAction(browser_actions=f'goto("{pdf_url}")')
        logger.info(action_browse, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action_browse)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        # Verify the browser observation
        assert isinstance(obs, BrowserOutputObservation)
        observation_text = str(obs)
        assert '[Action executed successfully.]' in observation_text
        # NOTE(review): presumably the viewer page renders the PDF into a
        # canvas element, which is why 'Canvas' is expected here -- confirm.
        assert 'Canvas' in observation_text
    finally:
        _close_test_runtime(runtime)
|
||||
|
||||
|
||||
def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
    """Open a generated PNG through the sandbox's ``/view`` endpoint in the browser.

    The test draws a small image with PIL on the host, copies it into the
    sandbox workspace, reads the server URL from ``/tmp/oh-server-url`` and
    navigates the browser to ``<server_url>/view?path=<file in sandbox>``.
    The resulting observation is expected to mention
    ``File Viewer - test_image.png``.

    Args:
        temp_dir: Host directory (pytest fixture) where the PNG is created.
        runtime_cls: Runtime class under test, forwarded to ``_load_runtime``.
        run_as_openhands: Forwarded to ``_load_runtime`` unchanged.
    """
    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        # posixpath (not os.path) is used for the in-sandbox path so the
        # separator is always "/" regardless of the host OS running the test.
        import posixpath

        # Create a PNG file using PIL in the host environment
        from PIL import Image, ImageDraw

        png_name = 'test_image.png'
        png_path = os.path.join(temp_dir, png_name)
        # Create a simple image with text
        img = Image.new('RGB', (400, 200), color=(255, 255, 255))
        d = ImageDraw.Draw(img)
        text = 'This is a test PNG image'
        d.text((20, 80), text, fill=(0, 0, 0))
        img.save(png_path)

        # Copy the PNG to the sandbox
        sandbox_dir = config.workspace_mount_path_in_sandbox
        runtime.copy_to(png_path, sandbox_dir)

        # Verify the file exists in the sandbox
        action_cmd = CmdRunAction(command='ls -alh')
        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action_cmd)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert isinstance(obs, CmdOutputObservation)
        assert obs.exit_code == 0
        assert png_name in obs.content

        # Get server url
        action_cmd = CmdRunAction(command='cat /tmp/oh-server-url')
        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action_cmd)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        # Fail with a clear assertion (not an AttributeError) if the command
        # did not produce a command-output observation.
        assert isinstance(obs, CmdOutputObservation)
        assert obs.exit_code == 0
        server_url = obs.content.strip()

        # Browse to the PNG file. The path is derived from the directory the
        # file was actually copied to instead of a hard-coded '/workspace'.
        png_path_in_sandbox = posixpath.join(sandbox_dir, png_name)
        png_url = f'{server_url}/view?path={png_path_in_sandbox}'
        action_browse = BrowseInteractiveAction(browser_actions=f'goto("{png_url}")')
        logger.info(action_browse, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action_browse)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        # Verify the browser observation
        assert isinstance(obs, BrowserOutputObservation)
        observation_text = str(obs)
        assert '[Action executed successfully.]' in observation_text
        assert 'File Viewer - test_image.png' in observation_text
    finally:
        _close_test_runtime(runtime)
|
||||
|
||||
Reference in New Issue
Block a user