diff --git a/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py b/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py index d0c26b0bb4..171ec677ba 100644 --- a/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py +++ b/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py @@ -208,7 +208,7 @@ Note: # for visualwebarena, webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env # initialize and retrieve the first observation by issuing an noop OP # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites - return BrowseInteractiveAction(browser_actions='noop(1000)') + return BrowseInteractiveAction(browser_actions='noop(1000)', return_axtree=True) for event in state.view: if isinstance(event, BrowseInteractiveAction): diff --git a/openhands/events/action/browse.py b/openhands/events/action/browse.py index c68c27ac0f..556b693a0e 100644 --- a/openhands/events/action/browse.py +++ b/openhands/events/action/browse.py @@ -12,6 +12,7 @@ class BrowseURLAction(Action): action: str = ActionType.BROWSE runnable: ClassVar[bool] = True security_risk: ActionSecurityRisk | None = None + return_axtree: bool = False @property def message(self) -> str: @@ -33,6 +34,7 @@ class BrowseInteractiveAction(Action): action: str = ActionType.BROWSE_INTERACTIVE runnable: ClassVar[bool] = True security_risk: ActionSecurityRisk | None = None + return_axtree: bool = False @property def message(self) -> str: diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py index 4474cfcb66..dcecd86123 100644 --- a/openhands/events/observation/browse.py +++ b/openhands/events/observation/browse.py @@ -1,9 +1,7 @@ from dataclasses import dataclass, field from typing import Any -from browsergym.utils.obs import flatten_axtree_to_str - -from openhands.core.schema import ActionType, ObservationType +from openhands.core.schema import ObservationType from openhands.events.observation.observation import Observation @@ -53,69 +51,5 @@ class BrowserOutputObservation(Observation): if self.screenshot_path: ret += f'Screenshot saved to: {self.screenshot_path}\n' ret += '--- Agent Observation ---\n' - ret += self.get_agent_obs_text() + ret += self.content return ret - - def get_agent_obs_text(self) -> str: - """Get a concise text that will be shown to the agent.""" - if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE: - text = f'[Current URL: {self.url}]\n' - text += f'[Focused element bid: {self.focused_element_bid}]\n' - - # Add screenshot path information if available - if self.screenshot_path: - text += f'[Screenshot saved to: {self.screenshot_path}]\n' - - text += '\n' - - if self.error: - text += ( - '================ BEGIN error message ===============\n' - 'The following error occurred when executing the last action:\n' - f'{self.last_browser_action_error}\n' - '================ END error message ===============\n' - ) - else: - text += '[Action executed successfully.]\n' - try: - # We do not filter visible only here because we want to show the full content - # of the web page to the agent for simplicity. - # FIXME: handle the case when the web page is too large - cur_axtree_txt = self.get_axtree_str(filter_visible_only=False) - text += ( - f'============== BEGIN accessibility tree ==============\n' - f'{cur_axtree_txt}\n' - f'============== END accessibility tree ==============\n' - ) - except Exception as e: - text += ( - f'\n[Error encountered when processing the accessibility tree: {e}]' - ) - return text - - elif self.trigger_by_action == ActionType.BROWSE: - text = f'[Current URL: {self.url}]\n' - - if self.error: - text += ( - '================ BEGIN error message ===============\n' - 'The following error occurred when trying to visit the URL:\n' - f'{self.last_browser_action_error}\n' - '================ END error message ===============\n' - ) - text += '============== BEGIN webpage content ==============\n' - text += self.content - text += '\n============== END webpage content ==============\n' - return text - else: - raise ValueError(f'Invalid trigger_by_action: {self.trigger_by_action}') - - def get_axtree_str(self, filter_visible_only: bool = False) -> str: - cur_axtree_txt = flatten_axtree_to_str( - self.axtree_object, - extra_properties=self.extra_element_properties, - with_clickable=True, - skip_generic=False, - filter_visible_only=filter_visible_only, - ) - return str(cur_axtree_txt) diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py index c0de1877b1..78873efc30 100644 --- a/openhands/memory/conversation_memory.py +++ b/openhands/memory/conversation_memory.py @@ -391,7 +391,7 @@ class ConversationMemory: role='user', content=[TextContent(text=obs.content)] ) # Content is already truncated by openhands-aci elif isinstance(obs, BrowserOutputObservation): - text = obs.get_agent_obs_text() + text = obs.content if ( obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE and enable_som_visual_browsing diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index 1df1112a61..cb8be5509d 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -2,7 +2,9 @@ import base64 import datetime import os from pathlib import Path +from typing import Any +from browsergym.utils.obs import flatten_axtree_to_str from PIL import Image from openhands.core.exceptions import BrowserUnavailableException @@ -14,6 +16,78 @@ from openhands.runtime.browser.browser_env import BrowserEnv from openhands.utils.async_utils import call_sync_from_async +def get_axtree_str( + axtree_object: dict[str, Any], + extra_element_properties: dict[str, Any], + filter_visible_only: bool = False, +) -> str: + cur_axtree_txt = flatten_axtree_to_str( + axtree_object, + extra_properties=extra_element_properties, + with_clickable=True, + skip_generic=False, + filter_visible_only=filter_visible_only, + ) + return str(cur_axtree_txt) + + +def get_agent_obs_text(obs: BrowserOutputObservation) -> str: + """Get a concise text that will be shown to the agent.""" + if obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE: + text = f'[Current URL: {obs.url}]\n' + text += f'[Focused element bid: {obs.focused_element_bid}]\n' + + # Add screenshot path information if available + if obs.screenshot_path: + text += f'[Screenshot saved to: {obs.screenshot_path}]\n' + + text += '\n' + + if obs.error: + text += ( + '================ BEGIN error message ===============\n' + 'The following error occurred when executing the last action:\n' + f'{obs.last_browser_action_error}\n' + '================ END error message ===============\n' + ) + else: + text += '[Action executed successfully.]\n' + try: + # We do not filter visible only here because we want to show the full content + # of the web page to the agent for simplicity. + # FIXME: handle the case when the web page is too large + cur_axtree_txt = get_axtree_str( + obs.axtree_object, + obs.extra_element_properties, + filter_visible_only=False, + ) + text += ( + f'============== BEGIN accessibility tree ==============\n' + f'{cur_axtree_txt}\n' + f'============== END accessibility tree ==============\n' + ) + except Exception as e: + text += f'\n[Error encountered when processing the accessibility tree: {e}]' + return text + + elif obs.trigger_by_action == ActionType.BROWSE: + text = f'[Current URL: {obs.url}]\n' + + if obs.error: + text += ( + '================ BEGIN error message ===============\n' + 'The following error occurred when trying to visit the URL:\n' + f'{obs.last_browser_action_error}\n' + '================ END error message ===============\n' + ) + text += '============== BEGIN webpage content ==============\n' + text += obs.content + text += '\n============== END webpage content ==============\n' + return text + else: + raise ValueError(f'Invalid trigger_by_action: {obs.trigger_by_action}') + + async def browse( action: BrowseURLAction | BrowseInteractiveAction, browser: BrowserEnv | None, @@ -78,7 +152,8 @@ async def browse( image = png_base64_url_to_image(obs.get('screenshot')) image.save(screenshot_path, format='PNG', optimize=True) - return BrowserOutputObservation( + # Create the observation with all data + observation = BrowserOutputObservation( content=obs['text_content'], # text content of the page url=obs.get('url', ''), # URL of the page screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png @@ -103,13 +178,37 @@ async def browse( error=True if obs.get('last_action_error', '') else False, # error flag trigger_by_action=action.action, ) + + # Process the content first using the axtree_object + observation.content = get_agent_obs_text(observation) + + # If return_axtree is False, remove the axtree_object to save space + if not action.return_axtree: + observation.dom_object = {} + observation.axtree_object = {} + observation.extra_element_properties = {} + + return observation except Exception as e: - return BrowserOutputObservation( - content=str(e), + error_message = str(e) + error_url = asked_url if action.action == ActionType.BROWSE else '' + + # Create error observation + observation = BrowserOutputObservation( + content=error_message, screenshot='', screenshot_path=None, error=True, - last_browser_action_error=str(e), - url=asked_url if action.action == ActionType.BROWSE else '', + last_browser_action_error=error_message, + url=error_url, trigger_by_action=action.action, ) + + # Process the content using get_agent_obs_text regardless of return_axtree value + try: + observation.content = get_agent_obs_text(observation) + except Exception: + # If get_agent_obs_text fails, keep the original error message + pass + + return observation diff --git a/openhands/security/invariant/parser.py b/openhands/security/invariant/parser.py index ba64583edd..01875e099b 100644 --- a/openhands/security/invariant/parser.py +++ b/openhands/security/invariant/parser.py @@ -50,6 +50,7 @@ def parse_action(trace: list[TraceElement], action: Action) -> list[TraceElement event_dict = event_to_dict(action) args = event_dict.get('args', {}) thought = args.pop('thought', None) + function = Function(name=action.action, arguments=args) if thought is not None: inv_trace.append(Message(role='assistant', content=thought)) diff --git a/tests/runtime/test_browsergym_envs.py b/tests/runtime/test_browsergym_envs.py index c3806e484e..ad31d84647 100644 --- a/tests/runtime/test_browsergym_envs.py +++ b/tests/runtime/test_browsergym_envs.py @@ -43,7 +43,9 @@ def test_browsergym_eval_env(runtime_cls, temp_dir): ) # Test browse - action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION) + action = BrowseInteractiveAction( + browser_actions=BROWSER_EVAL_GET_GOAL_ACTION, return_axtree=False + ) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -54,7 +56,7 @@ def test_browsergym_eval_env(runtime_cls, temp_dir): assert 'from the list and click Submit' in obs.content # Make sure the browser can produce observation in eval env - action = BrowseInteractiveAction(browser_actions='noop()') + action = BrowseInteractiveAction(browser_actions='noop()', return_axtree=False) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -64,7 +66,9 @@ def test_browsergym_eval_env(runtime_cls, temp_dir): ) # Make sure the rewards are working - action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION) + action = BrowseInteractiveAction( + browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION, return_axtree=False + ) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/tests/runtime/test_browsing.py b/tests/runtime/test_browsing.py index 4cf0dcf14d..df42f5dff4 100644 --- a/tests/runtime/test_browsing.py +++ b/tests/runtime/test_browsing.py @@ -45,7 +45,7 @@ def test_simple_browse(temp_dir, runtime_cls, run_as_openhands): logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 - action_browse = BrowseURLAction(url='http://localhost:8000') + action_browse = BrowseURLAction(url='http://localhost:8000', return_axtree=False) logger.info(action_browse, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action_browse) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -116,7 +116,9 @@ def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands): # Browse to the PDF file pdf_url = f'{server_url}/view?path=/workspace/test_document.pdf' - action_browse = BrowseInteractiveAction(browser_actions=f'goto("{pdf_url}")') + action_browse = BrowseInteractiveAction( + browser_actions=f'goto("{pdf_url}")', return_axtree=False + ) logger.info(action_browse, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action_browse) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -185,7 +187,9 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands): # Browse to the PNG file png_url = f'{server_url}/view?path=/workspace/test_image.png' - action_browse = BrowseInteractiveAction(browser_actions=f'goto("{png_url}")') + action_browse = BrowseInteractiveAction( + browser_actions=f'goto("{png_url}")', return_axtree=False + ) logger.info(action_browse, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action_browse) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/tests/unit/test_action_serialization.py b/tests/unit/test_action_serialization.py index 2b7daaa87f..05459ae850 100644 --- a/tests/unit/test_action_serialization.py +++ b/tests/unit/test_action_serialization.py @@ -108,7 +108,11 @@ def test_cmd_run_action_serialization_deserialization(): def test_browse_url_action_serialization_deserialization(): original_action_dict = { 'action': 'browse', - 'args': {'thought': '', 'url': 'https://www.example.com'}, + 'args': { + 'thought': '', + 'url': 'https://www.example.com', + 'return_axtree': False, + }, } serialization_deserialization(original_action_dict, BrowseURLAction) @@ -120,6 +124,7 @@ def test_browse_interactive_action_serialization_deserialization(): 'thought': '', 'browser_actions': 'goto("https://www.example.com")', 'browsergym_send_msg_to_user': '', + 'return_axtree': False, }, } serialization_deserialization(original_action_dict, BrowseInteractiveAction) diff --git a/tests/unit/test_browsing_agent_parser.py b/tests/unit/test_browsing_agent_parser.py index 351b8e8eaf..6392f90c0a 100644 --- a/tests/unit/test_browsing_agent_parser.py +++ b/tests/unit/test_browsing_agent_parser.py @@ -80,3 +80,4 @@ def test_parse_action( assert action.browser_actions == expected_browser_actions assert action.thought == expected_thought assert action.browsergym_send_msg_to_user == expected_msg_content + assert action.return_axtree is False # Default value should be False diff --git a/tests/unit/test_conversation_memory.py b/tests/unit/test_conversation_memory.py index 37a7c5a975..7ae0122f38 100644 --- a/tests/unit/test_conversation_memory.py +++ b/tests/unit/test_conversation_memory.py @@ -457,11 +457,13 @@ def test_process_events_with_file_read_observation(conversation_memory): def test_process_events_with_browser_output_observation(conversation_memory): + formatted_content = '[Current URL: http://example.com]\n\n============== BEGIN webpage content ==============\nPage loaded\n============== END webpage content ==============' + obs = BrowserOutputObservation( url='http://example.com', trigger_by_action='browse', screenshot='', - content='Page loaded', + content=formatted_content, error=False, ) diff --git a/tests/unit/test_function_calling.py b/tests/unit/test_function_calling.py index 2da8da9858..5865db8090 100644 --- a/tests/unit/test_function_calling.py +++ b/tests/unit/test_function_calling.py @@ -178,6 +178,7 @@ def test_browser_valid(): assert len(actions) == 1 assert isinstance(actions[0], BrowseInteractiveAction) assert actions[0].browser_actions == "click('button-1')" + assert actions[0].return_axtree is False # Default value should be False def test_browser_missing_code(): diff --git a/tests/unit/test_security.py b/tests/unit/test_security.py index 058a0b8303..44f9d6adc5 100644 --- a/tests/unit/test_security.py +++ b/tests/unit/test_security.py @@ -413,6 +413,7 @@ async def test_unsafe_bash_command(temp_dir: str): browser_actions='goto("http://localhost:3000")', thought='browsing to localhost', browsergym_send_msg_to_user='browsergym', + return_axtree=False, ), [ Message( @@ -430,6 +431,7 @@ async def test_unsafe_bash_command(temp_dir: str): arguments={ 'browser_actions': 'goto("http://localhost:3000")', 'browsergym_send_msg_to_user': 'browsergym', + 'return_axtree': False, }, ), ), @@ -437,7 +439,9 @@ async def test_unsafe_bash_command(temp_dir: str): ), ( # Test BrowseURLAction BrowseURLAction( - url='http://localhost:3000', thought='browsing to localhost' + url='http://localhost:3000', + thought='browsing to localhost', + return_axtree=False, ), [ Message( @@ -452,7 +456,10 @@ async def test_unsafe_bash_command(temp_dir: str): type='function', function=Function( name=ActionType.BROWSE, - arguments={'url': 'http://localhost:3000'}, + arguments={ + 'url': 'http://localhost:3000', + 'return_axtree': False, + }, ), ), ],