[OH-Versa] Add remaining browsing & GAIA eval improvement (#9015)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
This commit is contained in:
Ryan H. Tran 2025-06-25 12:36:15 +07:00 committed by GitHub
parent 76914e3c26
commit dfa54673d2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 383 additions and 29 deletions

View File

@ -3,13 +3,20 @@ import copy
import functools
import os
import re
import shutil
import zipfile
import huggingface_hub
import pandas as pd
from datasets import load_dataset
from PIL import Image
from pydantic import SecretStr
from evaluation.benchmarks.gaia.scorer import question_scorer
from evaluation.benchmarks.gaia.utils import (
image_to_jpg_base64_url,
image_to_png_base64_url,
)
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
@ -97,27 +104,44 @@ def initialize_runtime(
if instance['file_name'] != '':
# if this question comes with a file, we need to save it to the workspace
assert metadata.data_split is not None
extension_name = instance['file_name'].split('.')[-1]
src_file = os.path.join(
DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
)
assert os.path.exists(src_file)
dest_file = os.path.join('/workspace', instance['file_name'])
runtime.copy_to(src_file, dest_file)
if extension_name == 'zip':
temp_dir = os.path.join(
DATASET_CACHE_DIR, '2023', metadata.data_split, 'tmp_file'
)
os.makedirs(temp_dir, exist_ok=True)
with zipfile.ZipFile(src_file, 'r') as zip_ref:
zip_ref.extractall(temp_dir)
for root, dirs, files in os.walk(temp_dir):
for file in files:
dest_file = '/workspace'
runtime.copy_to(os.path.join(root, file), dest_file)
shutil.rmtree(temp_dir)
elif extension_name not in ['jpg', 'png']:
dest_file = '/workspace'
runtime.copy_to(src_file, dest_file)
# rename to file.extension_name
extension_name = instance['file_name'].split('.')[-1]
action = CmdRunAction(
command=f'mv /workspace/{instance["file_name"]} /workspace/file.{extension_name}'
)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
assert obs.exit_code == 0
# rename to file.extension_name
action = CmdRunAction(
command=f'mv /workspace/{instance["file_name"]} /workspace/file.{extension_name}'
)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
assert obs.exit_code == 0
action = CmdRunAction(command='cd /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
assert obs.exit_code == 0
action = CmdRunAction(
command='apt-get update && apt-get install -y ffmpeg && apt-get install -y ffprobe'
)
runtime.run_action(action)
logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
@ -151,8 +175,31 @@ Here is the task:
task_question=instance['Question'],
)
logger.info(f'Instruction: {instruction}')
image_urls = []
if dest_file:
instruction += f'\n\nThe mentioned file is provided in the workspace at: {dest_file.split("/")[-1]}'
if extension_name not in ['jpg', 'png', 'zip']:
instruction += f'To solve this task you will have to use the attached file provided in the workspace at location: {dest_file}\n\n'
elif extension_name == 'zip':
filenames = []
src_file = os.path.join(
DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
)
with zipfile.ZipFile(src_file, 'r') as zip_ref:
filenames = zip_ref.namelist()
filenames = [f'/workspace/{file}' for file in filenames]
filenames = ', '.join(filenames)
instruction += f'To solve this task you will have to use the attached files provided in the workspace at locations: {filenames}\n\n'
else: # Image files: jpg, png
src_file = os.path.join(
DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
)
instruction += 'Image: To solve this task you will have to use the image shown below.\n\n'
image = Image.open(src_file)
if extension_name == 'jpg':
image_urls.append(image_to_jpg_base64_url(image))
else:
image_urls.append(image_to_png_base64_url(image))
instruction += """IMPORTANT: When seeking information from a website, REFRAIN from arbitrary URL navigation. You should utilize the designated search engine tool with precise keywords to obtain relevant URLs or use the specific website's search interface. DO NOT navigate directly to specific URLs as they may not exist.\n\nFor example: if you want to search for a research paper on Arxiv, either use the search engine tool with specific keywords or navigate to arxiv.org and then use its interface.\n"""
instruction += 'IMPORTANT: You should NEVER ask for Human Help.\n'
@ -174,7 +221,9 @@ Here is the task:
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=instruction),
initial_user_action=MessageAction(
content=instruction, image_urls=image_urls
),
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
metadata.agent_class

View File

@ -0,0 +1,43 @@
import base64
import io
import numpy as np
from PIL import Image
def image_to_png_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = True
):
    """Encode an image as base64 PNG text, optionally as a ``data:`` URL.

    Args:
        image: Either a numpy array (wrapped via ``Image.fromarray``) or an
            already-constructed PIL image.
        add_data_prefix: When true, the result is prefixed with
            ``data:image/png;base64,``; otherwise only the base64 payload
            is returned.

    Returns:
        The base64 text of the PNG-encoded image, with or without the prefix.
    """
    pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image

    # RGBA / LA images are converted to plain RGB before being written out.
    if pil_image.mode == 'RGBA' or pil_image.mode == 'LA':
        pil_image = pil_image.convert('RGB')

    with io.BytesIO() as png_buffer:
        pil_image.save(png_buffer, format='PNG')
        encoded = base64.b64encode(png_buffer.getvalue()).decode()

    if not add_data_prefix:
        return encoded
    return 'data:image/png;base64,' + encoded
def image_to_jpg_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = True
) -> str:
    """Encode an image as base64 JPEG text, optionally as a ``data:`` URL.

    Args:
        image: Either a numpy array (wrapped via ``Image.fromarray``) or an
            already-constructed PIL image.
        add_data_prefix: When true, the result is prefixed with
            ``data:image/jpeg;base64,``; otherwise only the base64 payload
            is returned.

    Returns:
        The base64 text of the JPEG-encoded image, with or without the prefix.
    """
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # JPEG cannot store alpha or palette data. Previously only RGBA/LA were
    # converted, so other modes (e.g. 'P', 'PA', 'I', 'F') failed in save()
    # with "cannot write mode ... as JPEG". Convert everything the JPEG writer
    # does not accept to RGB; RGBA/LA inputs behave exactly as before.
    # NOTE(review): the writable-mode list follows Pillow's JPEG plugin —
    # confirm against the Pillow version pinned by the project.
    jpeg_writable_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
    if image.mode not in jpeg_writable_modes:
        image = image.convert('RGB')

    buffered = io.BytesIO()
    image.save(buffered, format='JPEG')
    image_base64 = base64.b64encode(buffered.getvalue()).decode()
    return (
        f'data:image/jpeg;base64,{image_base64}'
        if add_data_prefix
        else image_base64
    )

View File

@ -109,7 +109,7 @@ def codeact_user_response(
) -> str:
encaps_str = (
(
'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
'Your final answer MUST be encapsulated within <solution> and </solution>.\n'
'For example: The answer to the question is <solution> 42 </solution>.\n'
)
if encapsulate_solution
@ -117,7 +117,7 @@ def codeact_user_response(
)
msg = (
'Please continue working on the task on whatever approach you think is suitable.\n'
'If you think you have solved the task, please first send your answer to user through message and then finish the interaction.\n'
'When you think you have solved the question, please use the finish tool and include your final answer in the message parameter of the finish tool.\n'
f'{encaps_str}'
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
)

View File

@ -52,3 +52,6 @@ class ObservationType(str, Enum):
MCP = 'mcp'
"""Result of a MCP Server operation"""
DOWNLOAD = 'download'
"""Result of downloading/opening a file via the browser"""

View File

@ -16,6 +16,7 @@ from openhands.events.observation.empty import (
NullObservation,
)
from openhands.events.observation.error import ErrorObservation
from openhands.events.observation.file_download import FileDownloadObservation
from openhands.events.observation.files import (
FileEditObservation,
FileReadObservation,
@ -46,4 +47,5 @@ __all__ = [
'RecallObservation',
'RecallType',
'MCPObservation',
'FileDownloadObservation',
]

View File

@ -32,6 +32,7 @@ class BrowserOutputObservation(Observation):
last_browser_action: str = ''
last_browser_action_error: str = ''
focused_element_bid: str = ''
filter_visible_only: bool = False
@property
def message(self) -> str:

View File

@ -0,0 +1,21 @@
from dataclasses import dataclass
from openhands.core.schema import ObservationType
from openhands.events.observation.observation import Observation
@dataclass
class FileDownloadObservation(Observation):
    """Observation reporting the location of a downloaded file."""

    # Location at which the downloaded file was saved.
    file_path: str
    # Observation type tag; defaults to the DOWNLOAD observation type.
    observation: str = ObservationType.DOWNLOAD

    @property
    def message(self) -> str:
        """Return a one-line summary naming the download location."""
        location = self.file_path
        return f'Downloaded the file at location: {location}'

    def __str__(self) -> str:
        """Return a two-line rendering: a header, then the file location."""
        header = '**FileDownloadObservation**\n'
        return header + f'Location of downloaded file: {self.file_path}\n'

View File

@ -20,6 +20,7 @@ from openhands.events.observation.empty import (
NullObservation,
)
from openhands.events.observation.error import ErrorObservation
from openhands.events.observation.file_download import FileDownloadObservation
from openhands.events.observation.files import (
FileEditObservation,
FileReadObservation,
@ -47,6 +48,7 @@ observations = (
AgentThinkObservation,
RecallObservation,
MCPObservation,
FileDownloadObservation,
)
OBSERVATION_TYPE_TO_CLASS = {

View File

@ -28,6 +28,7 @@ from openhands.events.observation import (
AgentThinkObservation,
BrowserOutputObservation,
CmdOutputObservation,
FileDownloadObservation,
FileEditObservation,
FileReadObservation,
IPythonRunCellObservation,
@ -288,7 +289,12 @@ class ConversationMemory:
role = 'user' if action.source == 'user' else 'assistant'
content = [TextContent(text=action.content or '')]
if vision_is_active and action.image_urls:
content.append(ImageContent(image_urls=action.image_urls))
if role == 'user':
for idx, url in enumerate(action.image_urls):
content.append(TextContent(text=f'Image {idx + 1}:'))
content.append(ImageContent(image_urls=[url]))
else:
content.append(ImageContent(image_urls=action.image_urls))
if role not in ('user', 'system', 'assistant', 'tool'):
raise ValueError(f'Invalid role: {role}')
return [
@ -339,6 +345,7 @@ class ConversationMemory:
- AgentDelegateObservation: Formats results from delegated agent tasks
- ErrorObservation: Formats error messages from failed actions
- UserRejectObservation: Formats user rejection messages
- FileDownloadObservation: Formats the result of a browsing action that opened/downloaded a file
In function calling mode, observations with tool_call_metadata are stored in
tool_call_id_to_message for later processing instead of being returned immediately.
@ -429,7 +436,7 @@ class ConversationMemory:
and enable_som_visual_browsing
and vision_is_active
):
text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. However, the Accessibility tree contains information from the entire webpage.)\n'
# Determine which image to use and validate it
image_url = None
@ -492,6 +499,9 @@ class ConversationMemory:
elif isinstance(obs, AgentCondensationObservation):
text = truncate_content(obs.content, max_message_chars)
message = Message(role='user', content=[TextContent(text=text)])
elif isinstance(obs, FileDownloadObservation):
text = truncate_content(obs.content, max_message_chars)
message = Message(role='user', content=[TextContent(text=text)])
elif (
isinstance(obs, RecallObservation)
and self.agent_config.enable_prompt_extensions

View File

@ -20,6 +20,7 @@ from contextlib import asynccontextmanager
from pathlib import Path
from zipfile import ZipFile
import puremagic
from binaryornot.check import is_binary
from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
from fastapi.exceptions import RequestValidationError
@ -51,6 +52,7 @@ from openhands.events.event import FileEditSource, FileReadSource
from openhands.events.observation import (
CmdOutputObservation,
ErrorObservation,
FileDownloadObservation,
FileEditObservation,
FileReadObservation,
FileWriteObservation,
@ -193,6 +195,8 @@ class ActionExecutor:
self.start_time = time.time()
self.last_execution_time = self.start_time
self._initialized = False
self.downloaded_files: list[str] = []
self.downloads_directory = '/workspace/.downloads'
self.max_memory_gb: int | None = None
if _override_max_memory_gb := os.environ.get('RUNTIME_MAX_MEMORY_GB', None):
@ -603,7 +607,45 @@ class ActionExecutor:
'Browser functionality is not supported on Windows.'
)
await self._ensure_browser_ready()
return await browse(action, self.browser, self.initial_cwd)
browser_observation = await browse(action, self.browser, self.initial_cwd)
if not browser_observation.error:
return browser_observation
else:
curr_files = os.listdir(self.downloads_directory)
new_download = False
for file in curr_files:
if file not in self.downloaded_files:
new_download = True
self.downloaded_files.append(file)
break # FIXME: assuming only one file will be downloaded for simplicity
if not new_download:
return browser_observation
else:
# A new file is downloaded in self.downloads_directory, shift file to /workspace
src_path = os.path.join(
self.downloads_directory, self.downloaded_files[-1]
)
# Guess extension of file using puremagic and add it to tgt_path file name
file_ext = ''
try:
guesses = puremagic.magic_file(src_path)
if len(guesses) > 0:
ext = guesses[0].extension.strip()
if len(ext) > 0:
file_ext = ext
except Exception as _:
pass
tgt_path = os.path.join(
'/workspace', f'file_{len(self.downloaded_files)}{file_ext}'
)
shutil.copy(src_path, tgt_path)
file_download_obs = FileDownloadObservation(
content=f'Execution of the previous action {action.browser_actions} resulted in a file download. The downloaded file is saved at location: {tgt_path}',
file_path=tgt_path,
)
return file_download_obs
def close(self):
self.memory_monitor.stop_monitoring()

View File

@ -94,6 +94,9 @@ class BrowserEnv:
headless=True,
disable_env_checker=True,
tags_to_mark='all',
timeout=100000,
pw_context_kwargs={'accept_downloads': True},
pw_chromium_kwargs={'downloads_path': '/workspace/.downloads/'},
)
obs, info = env.reset()
@ -105,6 +108,7 @@ class BrowserEnv:
if self.eval_mode:
self.eval_goal = obs['goal']
if 'goal_object' in obs:
obs['goal_object'] = list(obs['goal_object'])
if len(obs['goal_object']) > 0:
self.eval_goal = obs['goal_object'][0]['text']
for message in obs['goal_object']:
@ -182,7 +186,7 @@ class BrowserEnv:
pass
return
def step(self, action_str: str, timeout: float = 100) -> dict:
def step(self, action_str: str, timeout: float = 120) -> dict:
"""Execute an action in the browser environment and return the observation."""
unique_request_id = str(uuid.uuid4())
self.agent_side.send((unique_request_id, {'action': action_str}))

View File

@ -59,13 +59,22 @@ def get_agent_obs_text(obs: BrowserOutputObservation) -> str:
cur_axtree_txt = get_axtree_str(
obs.axtree_object,
obs.extra_element_properties,
filter_visible_only=False,
)
text += (
f'============== BEGIN accessibility tree ==============\n'
f'{cur_axtree_txt}\n'
f'============== END accessibility tree ==============\n'
filter_visible_only=obs.filter_visible_only,
)
if not obs.filter_visible_only:
text += (
f'Accessibility tree of the COMPLETE webpage:\nNote: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.\n'
f'============== BEGIN accessibility tree ==============\n'
f'{cur_axtree_txt}\n'
f'============== END accessibility tree ==============\n'
)
else:
text += (
f'Accessibility tree of the VISIBLE portion of the webpage (accessibility tree of complete webpage is too large and you may need to scroll to view remaining portion of the webpage):\nNote: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.\n'
f'============== BEGIN accessibility tree ==============\n'
f'{cur_axtree_txt}\n'
f'============== END accessibility tree ==============\n'
)
except Exception as e:
text += f'\n[Error encountered when processing the accessibility tree: {e}]'
return text

View File

@ -15,7 +15,7 @@ ENV POETRY_VIRTUALENVS_PATH=/openhands/poetry \
# Install base system dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget curl ca-certificates sudo apt-utils git jq tmux build-essential ripgrep \
wget curl ca-certificates sudo apt-utils git jq tmux build-essential ripgrep ffmpeg \
{%- if 'ubuntu' in base_image and (base_image.endswith(':latest') or base_image.endswith(':24.04')) -%}
libgl1 \
{%- else %}

View File

@ -59,7 +59,9 @@ def test_view_directory(temp_dir, runtime_cls, run_as_openhands):
obs.content
== f"""Here's the files and directories up to 2 levels deep in {config.workspace_mount_path_in_sandbox}, excluding hidden items:
{config.workspace_mount_path_in_sandbox}/
{config.workspace_mount_path_in_sandbox}/test.txt"""
{config.workspace_mount_path_in_sandbox}/test.txt
1 hidden files/directories in this directory are excluded. You can use 'ls -la /workspace' to see them.""" # The hidden dir is the /workspace/.downloads
)
finally:

View File

@ -14,6 +14,7 @@ from openhands.events.action import (
from openhands.events.observation import (
BrowserOutputObservation,
CmdOutputObservation,
FileDownloadObservation,
)
# ============================================================================================================================
@ -215,3 +216,168 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
assert '.png' in obs.content
finally:
_close_test_runtime(runtime)
@pytest.mark.skipif(
os.environ.get('TEST_RUNTIME') == 'cli',
reason='CLIRuntime does not support browsing actions',
)
def test_download_file(temp_dir, runtime_cls, run_as_openhands):
"""Test downloading a file using the browser.

Copies a small PDF and an HTML page into the sandbox workspace, starts
``python3 -m http.server`` on port 8000 through the runtime, and then
navigates the browser straight to the PDF URL. The test asserts that this
navigation yields a FileDownloadObservation mentioning ``file_1.pdf`` and
that ``file_1.pdf`` is listed by ``ls -la /workspace`` afterwards.
"""
runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
try:
# Minimal PDF content for testing
pdf_content = b"""%PDF-1.4
1 0 obj
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
/Type /Pages
/Kids [3 0 R]
/Count 1
>>
endobj
3 0 obj
/Type /Page
/Parent 2 0 R
/MediaBox [0 0 612 792]
>>
endobj
xref
0 4
0000000000 65535 f
0000000010 00000 n
0000000053 00000 n
0000000125 00000 n
trailer
/Size 4
/Root 1 0 R
>>
startxref
212
%%EOF"""
test_file_name = 'test_download.pdf'
test_file_path = os.path.join(temp_dir, test_file_name)
with open(test_file_path, 'wb') as f:
f.write(pdf_content)
# Copy the file to the sandbox
sandbox_dir = config.workspace_mount_path_in_sandbox
runtime.copy_to(test_file_path, sandbox_dir)
# Create a simple HTML page with a download link
html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>Download Test</title>
</head>
<body>
<h1>Download Test Page</h1>
<p>Click the link below to download the test file:</p>
<a href="/{test_file_name}" download="{test_file_name}" id="download-link">Download Test File</a>
</body>
</html>
"""
html_file_path = os.path.join(temp_dir, 'download_test.html')
with open(html_file_path, 'w') as f:
f.write(html_content)
# Copy the HTML file to the sandbox
runtime.copy_to(html_file_path, sandbox_dir)
# Verify the files exist in the sandbox
action_cmd = CmdRunAction(command='ls -alh')
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_cmd)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
assert test_file_name in obs.content
assert 'download_test.html' in obs.content
# Ensure downloads directory exists
action_cmd = CmdRunAction(command='mkdir -p /workspace/.downloads')
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_cmd)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0
# Start HTTP server
# The trailing `&` backgrounds the server; its output is redirected to server.log.
action_cmd = CmdRunAction(
command='python3 -m http.server 8000 > server.log 2>&1 &'
)
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_cmd)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
# Wait for server to start
action_cmd = CmdRunAction(command='sleep 2')
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_cmd)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
# Browse to the HTML page
action_browse = BrowseURLAction(url='http://localhost:8000/download_test.html')
logger.info(action_browse, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_browse)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
# Verify the browser observation
assert isinstance(obs, BrowserOutputObservation)
assert 'http://localhost:8000/download_test.html' in obs.url
assert not obs.error
assert 'Download Test Page' in obs.content
# Go to the PDF file url directly - this should trigger download
file_url = f'http://localhost:8000/{test_file_name}'
action_browse = BrowseInteractiveAction(
browser_actions=f'goto("{file_url}")',
)
logger.info(action_browse, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_browse)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
# Verify the browser observation after navigating to PDF file
# Expected name of the saved copy; checked both in the observation text and in /workspace below.
downloaded_file_name = 'file_1.pdf'
assert isinstance(obs, FileDownloadObservation)
assert 'Location of downloaded file:' in str(obs)
assert downloaded_file_name in str(obs) # File is renamed
# Wait for download to complete
action_cmd = CmdRunAction(command='sleep 3')
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_cmd)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
# Check if the file was downloaded
action_cmd = CmdRunAction(command='ls -la /workspace')
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_cmd)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert isinstance(obs, CmdOutputObservation)
assert obs.exit_code == 0
assert downloaded_file_name in obs.content
# Clean up
# `|| true` keeps the cleanup command from reporting failure when no server process matches.
action_cmd = CmdRunAction(command='pkill -f "python3 -m http.server" || true')
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_cmd)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
action_cmd = CmdRunAction(command='rm -f server.log')
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_cmd)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
finally:
# Always release the runtime, even when an assertion above fails.
_close_test_runtime(runtime)

View File

@ -160,7 +160,7 @@ async def test_fetch_mcp_via_stdio(temp_dir, runtime_cls, run_as_openhands):
assert result_json['content'][0]['type'] == 'text'
assert (
result_json['content'][0]['text']
== 'Contents of http://localhost:8000/:\n---\n\n* <server.log>\n\n---'
== 'Contents of http://localhost:8000/:\n---\n\n* <.downloads/>\n* <server.log>\n\n---'
)
runtime.close()
@ -269,7 +269,7 @@ async def test_both_stdio_and_sse_mcp(
assert result_json['content'][0]['type'] == 'text'
assert (
result_json['content'][0]['text']
== 'Contents of http://localhost:8000/:\n---\n\n* <server.log>\n\n---'
== 'Contents of http://localhost:8000/:\n---\n\n* <.downloads/>\n* <server.log>\n\n---'
)
finally:
if runtime:
@ -354,7 +354,7 @@ async def test_microagent_and_one_stdio_mcp_in_config(
assert result_json['content'][0]['type'] == 'text'
assert (
result_json['content'][0]['text']
== 'Contents of http://localhost:8000/:\n---\n\n* <server.log>\n\n---'
== 'Contents of http://localhost:8000/:\n---\n\n* <.downloads/>\n* <server.log>\n\n---'
)
finally:
if runtime: