mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
Move get_agent_obs_text function to browser utils and add return_all option (#9019)
Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
parent
fd921a4f88
commit
9097f487a6
@ -208,7 +208,7 @@ Note:
|
||||
# for visualwebarena, webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
|
||||
# initialize and retrieve the first observation by issuing a noop action
|
||||
# For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
|
||||
return BrowseInteractiveAction(browser_actions='noop(1000)')
|
||||
return BrowseInteractiveAction(browser_actions='noop(1000)', return_axtree=True)
|
||||
|
||||
for event in state.view:
|
||||
if isinstance(event, BrowseInteractiveAction):
|
||||
|
||||
@ -12,6 +12,7 @@ class BrowseURLAction(Action):
|
||||
action: str = ActionType.BROWSE
|
||||
runnable: ClassVar[bool] = True
|
||||
security_risk: ActionSecurityRisk | None = None
|
||||
return_axtree: bool = False
|
||||
|
||||
@property
|
||||
def message(self) -> str:
|
||||
@ -33,6 +34,7 @@ class BrowseInteractiveAction(Action):
|
||||
action: str = ActionType.BROWSE_INTERACTIVE
|
||||
runnable: ClassVar[bool] = True
|
||||
security_risk: ActionSecurityRisk | None = None
|
||||
return_axtree: bool = False
|
||||
|
||||
@property
|
||||
def message(self) -> str:
|
||||
|
||||
@ -1,9 +1,7 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from browsergym.utils.obs import flatten_axtree_to_str
|
||||
|
||||
from openhands.core.schema import ActionType, ObservationType
|
||||
from openhands.core.schema import ObservationType
|
||||
from openhands.events.observation.observation import Observation
|
||||
|
||||
|
||||
@ -53,69 +51,5 @@ class BrowserOutputObservation(Observation):
|
||||
if self.screenshot_path:
|
||||
ret += f'Screenshot saved to: {self.screenshot_path}\n'
|
||||
ret += '--- Agent Observation ---\n'
|
||||
ret += self.get_agent_obs_text()
|
||||
ret += self.content
|
||||
return ret
|
||||
|
||||
def get_agent_obs_text(self) -> str:
    """Build the text representation of this observation for the agent.

    The layout depends on ``self.trigger_by_action``:

    * ``ActionType.BROWSE_INTERACTIVE``: URL, focused element bid, optional
      screenshot path, then either the last action error or a success
      marker, followed by the flattened accessibility tree.
    * ``ActionType.BROWSE``: URL, an optional error section, then the page
      content stored in ``self.content``.

    Raises:
        ValueError: if ``trigger_by_action`` is neither of the two above.
    """
    trigger = self.trigger_by_action

    if trigger == ActionType.BROWSE_INTERACTIVE:
        parts = [
            f'[Current URL: {self.url}]\n',
            f'[Focused element bid: {self.focused_element_bid}]\n',
        ]
        # Mention the screenshot location only when one was saved.
        if self.screenshot_path:
            parts.append(f'[Screenshot saved to: {self.screenshot_path}]\n')
        parts.append('\n')

        if self.error:
            parts.append(
                '================ BEGIN error message ===============\n'
                'The following error occurred when executing the last action:\n'
                f'{self.last_browser_action_error}\n'
                '================ END error message ===============\n'
            )
        else:
            parts.append('[Action executed successfully.]\n')

        try:
            # The tree is not restricted to visible elements so the agent
            # sees the full page content.
            # FIXME: handle the case when the web page is too large
            axtree_text = self.get_axtree_str(filter_visible_only=False)
            parts.append(
                '============== BEGIN accessibility tree ==============\n'
                f'{axtree_text}\n'
                '============== END accessibility tree ==============\n'
            )
        except Exception as e:
            # Report the failure inline instead of dropping the observation.
            parts.append(
                f'\n[Error encountered when processing the accessibility tree: {e}]'
            )
        return ''.join(parts)

    if trigger == ActionType.BROWSE:
        parts = [f'[Current URL: {self.url}]\n']
        if self.error:
            parts.append(
                '================ BEGIN error message ===============\n'
                'The following error occurred when trying to visit the URL:\n'
                f'{self.last_browser_action_error}\n'
                '================ END error message ===============\n'
            )
        # The page content section is emitted whether or not an error occurred.
        parts.append('============== BEGIN webpage content ==============\n')
        parts.append(self.content)
        parts.append('\n============== END webpage content ==============\n')
        return ''.join(parts)

    raise ValueError(f'Invalid trigger_by_action: {trigger}')
|
||||
|
||||
def get_axtree_str(self, filter_visible_only: bool = False) -> str:
    """Return this observation's accessibility tree flattened to a string.

    Delegates to ``flatten_axtree_to_str`` with ``with_clickable=True`` and
    ``skip_generic=False``, passing ``self.extra_element_properties`` as the
    extra properties.

    Args:
        filter_visible_only: forwarded unchanged to ``flatten_axtree_to_str``.
    """
    flatten_kwargs = {
        'extra_properties': self.extra_element_properties,
        'with_clickable': True,
        'skip_generic': False,
        'filter_visible_only': filter_visible_only,
    }
    # Coerce to ``str`` so the declared return type holds.
    return str(flatten_axtree_to_str(self.axtree_object, **flatten_kwargs))
|
||||
|
||||
@ -391,7 +391,7 @@ class ConversationMemory:
|
||||
role='user', content=[TextContent(text=obs.content)]
|
||||
) # Content is already truncated by openhands-aci
|
||||
elif isinstance(obs, BrowserOutputObservation):
|
||||
text = obs.get_agent_obs_text()
|
||||
text = obs.content
|
||||
if (
|
||||
obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
|
||||
and enable_som_visual_browsing
|
||||
|
||||
@ -2,7 +2,9 @@ import base64
|
||||
import datetime
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from browsergym.utils.obs import flatten_axtree_to_str
|
||||
from PIL import Image
|
||||
|
||||
from openhands.core.exceptions import BrowserUnavailableException
|
||||
@ -14,6 +16,78 @@ from openhands.runtime.browser.browser_env import BrowserEnv
|
||||
from openhands.utils.async_utils import call_sync_from_async
|
||||
|
||||
|
||||
def get_axtree_str(
    axtree_object: dict[str, Any],
    extra_element_properties: dict[str, Any],
    filter_visible_only: bool = False,
) -> str:
    """Flatten an accessibility tree into a string.

    Delegates to ``flatten_axtree_to_str`` with ``with_clickable=True`` and
    ``skip_generic=False``.

    Args:
        axtree_object: the accessibility tree to flatten.
        extra_element_properties: passed as ``extra_properties``.
        filter_visible_only: forwarded unchanged to ``flatten_axtree_to_str``.
    """
    flatten_kwargs = {
        'extra_properties': extra_element_properties,
        'with_clickable': True,
        'skip_generic': False,
        'filter_visible_only': filter_visible_only,
    }
    # Coerce to ``str`` so the declared return type holds.
    return str(flatten_axtree_to_str(axtree_object, **flatten_kwargs))
|
||||
|
||||
|
||||
def get_agent_obs_text(obs: BrowserOutputObservation) -> str:
    """Build the text representation of a browser observation for the agent.

    The layout depends on ``obs.trigger_by_action``:

    * ``ActionType.BROWSE_INTERACTIVE``: URL, focused element bid, optional
      screenshot path, then either the last action error or a success
      marker, followed by the flattened accessibility tree.
    * ``ActionType.BROWSE``: URL, an optional error section, then the page
      content stored in ``obs.content``.

    Raises:
        ValueError: if ``trigger_by_action`` is neither of the two above.
    """
    trigger = obs.trigger_by_action

    if trigger == ActionType.BROWSE_INTERACTIVE:
        parts = [
            f'[Current URL: {obs.url}]\n',
            f'[Focused element bid: {obs.focused_element_bid}]\n',
        ]
        # Mention the screenshot location only when one was saved.
        if obs.screenshot_path:
            parts.append(f'[Screenshot saved to: {obs.screenshot_path}]\n')
        parts.append('\n')

        if obs.error:
            parts.append(
                '================ BEGIN error message ===============\n'
                'The following error occurred when executing the last action:\n'
                f'{obs.last_browser_action_error}\n'
                '================ END error message ===============\n'
            )
        else:
            parts.append('[Action executed successfully.]\n')

        try:
            # The tree is not restricted to visible elements so the agent
            # sees the full page content.
            # FIXME: handle the case when the web page is too large
            axtree_text = get_axtree_str(
                obs.axtree_object,
                obs.extra_element_properties,
                filter_visible_only=False,
            )
            parts.append(
                '============== BEGIN accessibility tree ==============\n'
                f'{axtree_text}\n'
                '============== END accessibility tree ==============\n'
            )
        except Exception as e:
            # Report the failure inline instead of dropping the observation.
            parts.append(
                f'\n[Error encountered when processing the accessibility tree: {e}]'
            )
        return ''.join(parts)

    if trigger == ActionType.BROWSE:
        parts = [f'[Current URL: {obs.url}]\n']
        if obs.error:
            parts.append(
                '================ BEGIN error message ===============\n'
                'The following error occurred when trying to visit the URL:\n'
                f'{obs.last_browser_action_error}\n'
                '================ END error message ===============\n'
            )
        # The page content section is emitted whether or not an error occurred.
        parts.append('============== BEGIN webpage content ==============\n')
        parts.append(obs.content)
        parts.append('\n============== END webpage content ==============\n')
        return ''.join(parts)

    raise ValueError(f'Invalid trigger_by_action: {trigger}')
|
||||
|
||||
|
||||
async def browse(
|
||||
action: BrowseURLAction | BrowseInteractiveAction,
|
||||
browser: BrowserEnv | None,
|
||||
@ -78,7 +152,8 @@ async def browse(
|
||||
image = png_base64_url_to_image(obs.get('screenshot'))
|
||||
image.save(screenshot_path, format='PNG', optimize=True)
|
||||
|
||||
return BrowserOutputObservation(
|
||||
# Create the observation with all data
|
||||
observation = BrowserOutputObservation(
|
||||
content=obs['text_content'], # text content of the page
|
||||
url=obs.get('url', ''), # URL of the page
|
||||
screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png
|
||||
@ -103,13 +178,37 @@ async def browse(
|
||||
error=True if obs.get('last_action_error', '') else False, # error flag
|
||||
trigger_by_action=action.action,
|
||||
)
|
||||
|
||||
# Process the content first using the axtree_object
|
||||
observation.content = get_agent_obs_text(observation)
|
||||
|
||||
# If return_axtree is False, clear dom_object, axtree_object and extra_element_properties to save space
|
||||
if not action.return_axtree:
|
||||
observation.dom_object = {}
|
||||
observation.axtree_object = {}
|
||||
observation.extra_element_properties = {}
|
||||
|
||||
return observation
|
||||
except Exception as e:
|
||||
return BrowserOutputObservation(
|
||||
content=str(e),
|
||||
error_message = str(e)
|
||||
error_url = asked_url if action.action == ActionType.BROWSE else ''
|
||||
|
||||
# Create error observation
|
||||
observation = BrowserOutputObservation(
|
||||
content=error_message,
|
||||
screenshot='',
|
||||
screenshot_path=None,
|
||||
error=True,
|
||||
last_browser_action_error=str(e),
|
||||
url=asked_url if action.action == ActionType.BROWSE else '',
|
||||
last_browser_action_error=error_message,
|
||||
url=error_url,
|
||||
trigger_by_action=action.action,
|
||||
)
|
||||
|
||||
# Process the content using get_agent_obs_text regardless of return_axtree value
|
||||
try:
|
||||
observation.content = get_agent_obs_text(observation)
|
||||
except Exception:
|
||||
# If get_agent_obs_text fails, keep the original error message
|
||||
pass
|
||||
|
||||
return observation
|
||||
|
||||
@ -50,6 +50,7 @@ def parse_action(trace: list[TraceElement], action: Action) -> list[TraceElement
|
||||
event_dict = event_to_dict(action)
|
||||
args = event_dict.get('args', {})
|
||||
thought = args.pop('thought', None)
|
||||
|
||||
function = Function(name=action.action, arguments=args)
|
||||
if thought is not None:
|
||||
inv_trace.append(Message(role='assistant', content=thought))
|
||||
|
||||
@ -43,7 +43,9 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
|
||||
)
|
||||
|
||||
# Test browse
|
||||
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
|
||||
action = BrowseInteractiveAction(
|
||||
browser_actions=BROWSER_EVAL_GET_GOAL_ACTION, return_axtree=False
|
||||
)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
@ -54,7 +56,7 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
|
||||
assert 'from the list and click Submit' in obs.content
|
||||
|
||||
# Make sure the browser can produce observation in eval env
|
||||
action = BrowseInteractiveAction(browser_actions='noop()')
|
||||
action = BrowseInteractiveAction(browser_actions='noop()', return_axtree=False)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
@ -64,7 +66,9 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
|
||||
)
|
||||
|
||||
# Make sure the rewards are working
|
||||
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
|
||||
action = BrowseInteractiveAction(
|
||||
browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION, return_axtree=False
|
||||
)
|
||||
logger.info(action, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
@ -45,7 +45,7 @@ def test_simple_browse(temp_dir, runtime_cls, run_as_openhands):
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert obs.exit_code == 0
|
||||
|
||||
action_browse = BrowseURLAction(url='http://localhost:8000')
|
||||
action_browse = BrowseURLAction(url='http://localhost:8000', return_axtree=False)
|
||||
logger.info(action_browse, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action_browse)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
@ -116,7 +116,9 @@ def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands):
|
||||
|
||||
# Browse to the PDF file
|
||||
pdf_url = f'{server_url}/view?path=/workspace/test_document.pdf'
|
||||
action_browse = BrowseInteractiveAction(browser_actions=f'goto("{pdf_url}")')
|
||||
action_browse = BrowseInteractiveAction(
|
||||
browser_actions=f'goto("{pdf_url}")', return_axtree=False
|
||||
)
|
||||
logger.info(action_browse, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action_browse)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
@ -185,7 +187,9 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
|
||||
|
||||
# Browse to the PNG file
|
||||
png_url = f'{server_url}/view?path=/workspace/test_image.png'
|
||||
action_browse = BrowseInteractiveAction(browser_actions=f'goto("{png_url}")')
|
||||
action_browse = BrowseInteractiveAction(
|
||||
browser_actions=f'goto("{png_url}")', return_axtree=False
|
||||
)
|
||||
logger.info(action_browse, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action_browse)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
@ -108,7 +108,11 @@ def test_cmd_run_action_serialization_deserialization():
|
||||
def test_browse_url_action_serialization_deserialization():
    """Run the shared serialization/deserialization check on a browse action dict.

    The action dict carries ``thought``, ``url`` and ``return_axtree`` in its
    ``args`` and is checked against ``BrowseURLAction``.
    """
    # 'args' must appear exactly once: a repeated key in a dict literal is
    # silently overridden by the last occurrence, which would leave a dead,
    # misleading entry without ``return_axtree``.
    original_action_dict = {
        'action': 'browse',
        'args': {
            'thought': '',
            'url': 'https://www.example.com',
            'return_axtree': False,
        },
    }
    serialization_deserialization(original_action_dict, BrowseURLAction)
|
||||
|
||||
@ -120,6 +124,7 @@ def test_browse_interactive_action_serialization_deserialization():
|
||||
'thought': '',
|
||||
'browser_actions': 'goto("https://www.example.com")',
|
||||
'browsergym_send_msg_to_user': '',
|
||||
'return_axtree': False,
|
||||
},
|
||||
}
|
||||
serialization_deserialization(original_action_dict, BrowseInteractiveAction)
|
||||
|
||||
@ -80,3 +80,4 @@ def test_parse_action(
|
||||
assert action.browser_actions == expected_browser_actions
|
||||
assert action.thought == expected_thought
|
||||
assert action.browsergym_send_msg_to_user == expected_msg_content
|
||||
assert action.return_axtree is False # Default value should be False
|
||||
|
||||
@ -457,11 +457,13 @@ def test_process_events_with_file_read_observation(conversation_memory):
|
||||
|
||||
|
||||
def test_process_events_with_browser_output_observation(conversation_memory):
|
||||
formatted_content = '[Current URL: http://example.com]\n\n============== BEGIN webpage content ==============\nPage loaded\n============== END webpage content =============='
|
||||
|
||||
obs = BrowserOutputObservation(
|
||||
url='http://example.com',
|
||||
trigger_by_action='browse',
|
||||
screenshot='',
|
||||
content='Page loaded',
|
||||
content=formatted_content,
|
||||
error=False,
|
||||
)
|
||||
|
||||
|
||||
@ -178,6 +178,7 @@ def test_browser_valid():
|
||||
assert len(actions) == 1
|
||||
assert isinstance(actions[0], BrowseInteractiveAction)
|
||||
assert actions[0].browser_actions == "click('button-1')"
|
||||
assert actions[0].return_axtree is False # Default value should be False
|
||||
|
||||
|
||||
def test_browser_missing_code():
|
||||
|
||||
@ -413,6 +413,7 @@ async def test_unsafe_bash_command(temp_dir: str):
|
||||
browser_actions='goto("http://localhost:3000")',
|
||||
thought='browsing to localhost',
|
||||
browsergym_send_msg_to_user='browsergym',
|
||||
return_axtree=False,
|
||||
),
|
||||
[
|
||||
Message(
|
||||
@ -430,6 +431,7 @@ async def test_unsafe_bash_command(temp_dir: str):
|
||||
arguments={
|
||||
'browser_actions': 'goto("http://localhost:3000")',
|
||||
'browsergym_send_msg_to_user': 'browsergym',
|
||||
'return_axtree': False,
|
||||
},
|
||||
),
|
||||
),
|
||||
@ -437,7 +439,9 @@ async def test_unsafe_bash_command(temp_dir: str):
|
||||
),
|
||||
( # Test BrowseURLAction
|
||||
BrowseURLAction(
|
||||
url='http://localhost:3000', thought='browsing to localhost'
|
||||
url='http://localhost:3000',
|
||||
thought='browsing to localhost',
|
||||
return_axtree=False,
|
||||
),
|
||||
[
|
||||
Message(
|
||||
@ -452,7 +456,10 @@ async def test_unsafe_bash_command(temp_dir: str):
|
||||
type='function',
|
||||
function=Function(
|
||||
name=ActionType.BROWSE,
|
||||
arguments={'url': 'http://localhost:3000'},
|
||||
arguments={
|
||||
'url': 'http://localhost:3000',
|
||||
'return_axtree': False,
|
||||
},
|
||||
),
|
||||
),
|
||||
],
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user