Move get_agent_obs_text function to browser utils and add return_axtree option (#9019)

Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
Xingyao Wang 2025-06-11 00:32:38 -04:00 committed by GitHub
parent fd921a4f88
commit 9097f487a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 145 additions and 85 deletions

View File

@ -208,7 +208,7 @@ Note:
# for visualwebarena, webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
# initialize and retrieve the first observation by issuing a noop action
# For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
return BrowseInteractiveAction(browser_actions='noop(1000)')
return BrowseInteractiveAction(browser_actions='noop(1000)', return_axtree=True)
for event in state.view:
if isinstance(event, BrowseInteractiveAction):

View File

@ -12,6 +12,7 @@ class BrowseURLAction(Action):
action: str = ActionType.BROWSE
runnable: ClassVar[bool] = True
security_risk: ActionSecurityRisk | None = None
return_axtree: bool = False
@property
def message(self) -> str:
@ -33,6 +34,7 @@ class BrowseInteractiveAction(Action):
action: str = ActionType.BROWSE_INTERACTIVE
runnable: ClassVar[bool] = True
security_risk: ActionSecurityRisk | None = None
return_axtree: bool = False
@property
def message(self) -> str:

View File

@ -1,9 +1,7 @@
from dataclasses import dataclass, field
from typing import Any
from browsergym.utils.obs import flatten_axtree_to_str
from openhands.core.schema import ActionType, ObservationType
from openhands.core.schema import ObservationType
from openhands.events.observation.observation import Observation
@ -53,69 +51,5 @@ class BrowserOutputObservation(Observation):
if self.screenshot_path:
ret += f'Screenshot saved to: {self.screenshot_path}\n'
ret += '--- Agent Observation ---\n'
ret += self.get_agent_obs_text()
ret += self.content
return ret
def get_agent_obs_text(self) -> str:
    """Build the concise text representation that is shown to the agent.

    For an interactive browse action the text holds the current URL, the
    focused element bid, the screenshot path (when set), either an error
    block or a success marker, and the accessibility tree. For a plain
    browse action it holds the current URL, an optional error block, and
    the page content wrapped in BEGIN/END markers.

    Returns:
        The assembled text.

    Raises:
        ValueError: If ``trigger_by_action`` is neither
            ``ActionType.BROWSE_INTERACTIVE`` nor ``ActionType.BROWSE``.
    """
    trigger = self.trigger_by_action

    if trigger == ActionType.BROWSE_INTERACTIVE:
        pieces = [
            f'[Current URL: {self.url}]\n',
            f'[Focused element bid: {self.focused_element_bid}]\n',
        ]
        # Mention the screenshot location only when one is available.
        if self.screenshot_path:
            pieces.append(f'[Screenshot saved to: {self.screenshot_path}]\n')
        pieces.append('\n')

        if self.error:
            pieces.append(
                '================ BEGIN error message ===============\n'
                'The following error occurred when executing the last action:\n'
                f'{self.last_browser_action_error}\n'
                '================ END error message ===============\n'
            )
        else:
            pieces.append('[Action executed successfully.]\n')

        try:
            # Visible-only filtering is intentionally off: the agent gets the
            # full content of the web page for simplicity.
            # FIXME: handle the case when the web page is too large
            axtree_text = self.get_axtree_str(filter_visible_only=False)
            pieces.append(
                f'============== BEGIN accessibility tree ==============\n'
                f'{axtree_text}\n'
                f'============== END accessibility tree ==============\n'
            )
        except Exception as exc:
            # Best effort: report the failure inline instead of raising.
            pieces.append(
                f'\n[Error encountered when processing the accessibility tree: {exc}]'
            )
        return ''.join(pieces)

    if trigger == ActionType.BROWSE:
        pieces = [f'[Current URL: {self.url}]\n']
        if self.error:
            pieces.append(
                '================ BEGIN error message ===============\n'
                'The following error occurred when trying to visit the URL:\n'
                f'{self.last_browser_action_error}\n'
                '================ END error message ===============\n'
            )
        pieces.append('============== BEGIN webpage content ==============\n')
        pieces.append(self.content)
        pieces.append('\n============== END webpage content ==============\n')
        return ''.join(pieces)

    raise ValueError(f'Invalid trigger_by_action: {self.trigger_by_action}')
def get_axtree_str(self, filter_visible_only: bool = False) -> str:
    """Flatten this observation's accessibility tree into a string.

    Args:
        filter_visible_only: Forwarded unchanged to
            ``flatten_axtree_to_str``.

    Returns:
        ``str()`` of the value produced by ``flatten_axtree_to_str``.
    """
    tree = self.axtree_object
    element_properties = self.extra_element_properties
    flattened = flatten_axtree_to_str(
        tree,
        extra_properties=element_properties,
        with_clickable=True,
        skip_generic=False,
        filter_visible_only=filter_visible_only,
    )
    return str(flattened)

View File

@ -391,7 +391,7 @@ class ConversationMemory:
role='user', content=[TextContent(text=obs.content)]
) # Content is already truncated by openhands-aci
elif isinstance(obs, BrowserOutputObservation):
text = obs.get_agent_obs_text()
text = obs.content
if (
obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
and enable_som_visual_browsing

View File

@ -2,7 +2,9 @@ import base64
import datetime
import os
from pathlib import Path
from typing import Any
from browsergym.utils.obs import flatten_axtree_to_str
from PIL import Image
from openhands.core.exceptions import BrowserUnavailableException
@ -14,6 +16,78 @@ from openhands.runtime.browser.browser_env import BrowserEnv
from openhands.utils.async_utils import call_sync_from_async
def get_axtree_str(
    axtree_object: dict[str, Any],
    extra_element_properties: dict[str, Any],
    filter_visible_only: bool = False,
) -> str:
    """Flatten an accessibility tree into a string.

    Args:
        axtree_object: Accessibility tree handed to
            ``flatten_axtree_to_str`` as its positional argument.
        extra_element_properties: Forwarded as ``extra_properties``.
        filter_visible_only: Forwarded unchanged.

    Returns:
        ``str()`` of the value produced by ``flatten_axtree_to_str``.
    """
    # Clickable markers are always requested and generic nodes are kept.
    flatten_options = {
        'extra_properties': extra_element_properties,
        'with_clickable': True,
        'skip_generic': False,
        'filter_visible_only': filter_visible_only,
    }
    return str(flatten_axtree_to_str(axtree_object, **flatten_options))
def get_agent_obs_text(obs: BrowserOutputObservation) -> str:
    """Build the concise text representation that is shown to the agent.

    Args:
        obs: Browser observation to summarise. Its ``trigger_by_action``
            selects the layout.

    Returns:
        For ``ActionType.BROWSE_INTERACTIVE``: the current URL, the focused
        element bid, the screenshot path (when set), an error block or a
        success marker, and the accessibility tree. For
        ``ActionType.BROWSE``: the current URL, an optional error block, and
        the page content wrapped in BEGIN/END markers.

    Raises:
        ValueError: If ``obs.trigger_by_action`` is neither of the two
            browse action types.
    """
    action_type = obs.trigger_by_action

    if action_type == ActionType.BROWSE_INTERACTIVE:
        header = (
            f'[Current URL: {obs.url}]\n'
            f'[Focused element bid: {obs.focused_element_bid}]\n'
        )
        # Mention the screenshot location only when one is available.
        screenshot_note = (
            f'[Screenshot saved to: {obs.screenshot_path}]\n'
            if obs.screenshot_path
            else ''
        )

        if obs.error:
            status = (
                '================ BEGIN error message ===============\n'
                'The following error occurred when executing the last action:\n'
                f'{obs.last_browser_action_error}\n'
                '================ END error message ===============\n'
            )
        else:
            status = '[Action executed successfully.]\n'

        try:
            # Visible-only filtering is intentionally off: the agent gets the
            # full content of the web page for simplicity.
            # FIXME: handle the case when the web page is too large
            tree_text = get_axtree_str(
                obs.axtree_object,
                obs.extra_element_properties,
                filter_visible_only=False,
            )
            tree_section = (
                f'============== BEGIN accessibility tree ==============\n'
                f'{tree_text}\n'
                f'============== END accessibility tree ==============\n'
            )
        except Exception as err:
            # Best effort: report the failure inline instead of raising.
            tree_section = (
                f'\n[Error encountered when processing the accessibility tree: {err}]'
            )

        return header + screenshot_note + '\n' + status + tree_section

    if action_type == ActionType.BROWSE:
        page_text = f'[Current URL: {obs.url}]\n'
        if obs.error:
            page_text += (
                '================ BEGIN error message ===============\n'
                'The following error occurred when trying to visit the URL:\n'
                f'{obs.last_browser_action_error}\n'
                '================ END error message ===============\n'
            )
        return (
            page_text
            + '============== BEGIN webpage content ==============\n'
            + obs.content
            + '\n============== END webpage content ==============\n'
        )

    raise ValueError(f'Invalid trigger_by_action: {obs.trigger_by_action}')
async def browse(
action: BrowseURLAction | BrowseInteractiveAction,
browser: BrowserEnv | None,
@ -78,7 +152,8 @@ async def browse(
image = png_base64_url_to_image(obs.get('screenshot'))
image.save(screenshot_path, format='PNG', optimize=True)
return BrowserOutputObservation(
# Create the observation with all data
observation = BrowserOutputObservation(
content=obs['text_content'], # text content of the page
url=obs.get('url', ''), # URL of the page
screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png
@ -103,13 +178,37 @@ async def browse(
error=True if obs.get('last_action_error', '') else False, # error flag
trigger_by_action=action.action,
)
# Process the content first using the axtree_object
observation.content = get_agent_obs_text(observation)
# If return_axtree is False, remove the axtree_object to save space
if not action.return_axtree:
observation.dom_object = {}
observation.axtree_object = {}
observation.extra_element_properties = {}
return observation
except Exception as e:
return BrowserOutputObservation(
content=str(e),
error_message = str(e)
error_url = asked_url if action.action == ActionType.BROWSE else ''
# Create error observation
observation = BrowserOutputObservation(
content=error_message,
screenshot='',
screenshot_path=None,
error=True,
last_browser_action_error=str(e),
url=asked_url if action.action == ActionType.BROWSE else '',
last_browser_action_error=error_message,
url=error_url,
trigger_by_action=action.action,
)
# Process the content using get_agent_obs_text regardless of return_axtree value
try:
observation.content = get_agent_obs_text(observation)
except Exception:
# If get_agent_obs_text fails, keep the original error message
pass
return observation

View File

@ -50,6 +50,7 @@ def parse_action(trace: list[TraceElement], action: Action) -> list[TraceElement
event_dict = event_to_dict(action)
args = event_dict.get('args', {})
thought = args.pop('thought', None)
function = Function(name=action.action, arguments=args)
if thought is not None:
inv_trace.append(Message(role='assistant', content=thought))

View File

@ -43,7 +43,9 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
)
# Test browse
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
action = BrowseInteractiveAction(
browser_actions=BROWSER_EVAL_GET_GOAL_ACTION, return_axtree=False
)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@ -54,7 +56,7 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
assert 'from the list and click Submit' in obs.content
# Make sure the browser can produce observation in eval env
action = BrowseInteractiveAction(browser_actions='noop()')
action = BrowseInteractiveAction(browser_actions='noop()', return_axtree=False)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@ -64,7 +66,9 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
)
# Make sure the rewards are working
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
action = BrowseInteractiveAction(
browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION, return_axtree=False
)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})

View File

@ -45,7 +45,7 @@ def test_simple_browse(temp_dir, runtime_cls, run_as_openhands):
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0
action_browse = BrowseURLAction(url='http://localhost:8000')
action_browse = BrowseURLAction(url='http://localhost:8000', return_axtree=False)
logger.info(action_browse, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_browse)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@ -116,7 +116,9 @@ def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands):
# Browse to the PDF file
pdf_url = f'{server_url}/view?path=/workspace/test_document.pdf'
action_browse = BrowseInteractiveAction(browser_actions=f'goto("{pdf_url}")')
action_browse = BrowseInteractiveAction(
browser_actions=f'goto("{pdf_url}")', return_axtree=False
)
logger.info(action_browse, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_browse)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@ -185,7 +187,9 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
# Browse to the PNG file
png_url = f'{server_url}/view?path=/workspace/test_image.png'
action_browse = BrowseInteractiveAction(browser_actions=f'goto("{png_url}")')
action_browse = BrowseInteractiveAction(
browser_actions=f'goto("{png_url}")', return_axtree=False
)
logger.info(action_browse, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action_browse)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})

View File

@ -108,7 +108,11 @@ def test_cmd_run_action_serialization_deserialization():
def test_browse_url_action_serialization_deserialization():
original_action_dict = {
'action': 'browse',
'args': {'thought': '', 'url': 'https://www.example.com'},
'args': {
'thought': '',
'url': 'https://www.example.com',
'return_axtree': False,
},
}
serialization_deserialization(original_action_dict, BrowseURLAction)
@ -120,6 +124,7 @@ def test_browse_interactive_action_serialization_deserialization():
'thought': '',
'browser_actions': 'goto("https://www.example.com")',
'browsergym_send_msg_to_user': '',
'return_axtree': False,
},
}
serialization_deserialization(original_action_dict, BrowseInteractiveAction)

View File

@ -80,3 +80,4 @@ def test_parse_action(
assert action.browser_actions == expected_browser_actions
assert action.thought == expected_thought
assert action.browsergym_send_msg_to_user == expected_msg_content
assert action.return_axtree is False # Default value should be False

View File

@ -457,11 +457,13 @@ def test_process_events_with_file_read_observation(conversation_memory):
def test_process_events_with_browser_output_observation(conversation_memory):
formatted_content = '[Current URL: http://example.com]\n\n============== BEGIN webpage content ==============\nPage loaded\n============== END webpage content =============='
obs = BrowserOutputObservation(
url='http://example.com',
trigger_by_action='browse',
screenshot='',
content='Page loaded',
content=formatted_content,
error=False,
)

View File

@ -178,6 +178,7 @@ def test_browser_valid():
assert len(actions) == 1
assert isinstance(actions[0], BrowseInteractiveAction)
assert actions[0].browser_actions == "click('button-1')"
assert actions[0].return_axtree is False # Default value should be False
def test_browser_missing_code():

View File

@ -413,6 +413,7 @@ async def test_unsafe_bash_command(temp_dir: str):
browser_actions='goto("http://localhost:3000")',
thought='browsing to localhost',
browsergym_send_msg_to_user='browsergym',
return_axtree=False,
),
[
Message(
@ -430,6 +431,7 @@ async def test_unsafe_bash_command(temp_dir: str):
arguments={
'browser_actions': 'goto("http://localhost:3000")',
'browsergym_send_msg_to_user': 'browsergym',
'return_axtree': False,
},
),
),
@ -437,7 +439,9 @@ async def test_unsafe_bash_command(temp_dir: str):
),
( # Test BrowseURLAction
BrowseURLAction(
url='http://localhost:3000', thought='browsing to localhost'
url='http://localhost:3000',
thought='browsing to localhost',
return_axtree=False,
),
[
Message(
@ -452,7 +456,10 @@ async def test_unsafe_bash_command(temp_dir: str):
type='function',
function=Function(
name=ActionType.BROWSE,
arguments={'url': 'http://localhost:3000'},
arguments={
'url': 'http://localhost:3000',
'return_axtree': False,
},
),
),
],