Move get_agent_obs_text function to browser utils and add return_all option (#9019)

Co-authored-by: openhands <openhands@all-hands.dev>
2025-12-26 05:48:36 +08:00 · 2025-06-11 00:32:38 -04:00 · 2025-06-11 00:32:38 -04:00 · 9097f487a6
commit 9097f487a6
parent fd921a4f88
13 changed files with 145 additions and 85 deletions
--- a/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py
+++ b/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py
@ -208,7 +208,7 @@ Note:
            # for visualwebarena, webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
            # initialize and retrieve the first observation by issuing an noop OP
            # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
-            return BrowseInteractiveAction(browser_actions='noop(1000)')
+            return BrowseInteractiveAction(browser_actions='noop(1000)', return_axtree=True)

        for event in state.view:
            if isinstance(event, BrowseInteractiveAction):
--- a/openhands/events/action/browse.py
+++ b/openhands/events/action/browse.py
@ -12,6 +12,7 @@ class BrowseURLAction(Action):
    action: str = ActionType.BROWSE
    runnable: ClassVar[bool] = True
    security_risk: ActionSecurityRisk | None = None
+    return_axtree: bool = False

    @property
    def message(self) -> str:
@ -33,6 +34,7 @@ class BrowseInteractiveAction(Action):
    action: str = ActionType.BROWSE_INTERACTIVE
    runnable: ClassVar[bool] = True
    security_risk: ActionSecurityRisk | None = None
+    return_axtree: bool = False

    @property
    def message(self) -> str:
--- a/openhands/events/observation/browse.py
+++ b/openhands/events/observation/browse.py
@ -1,9 +1,7 @@
 from dataclasses import dataclass, field
 from typing import Any

-from browsergym.utils.obs import flatten_axtree_to_str
-
-from openhands.core.schema import ActionType, ObservationType
+from openhands.core.schema import ObservationType
 from openhands.events.observation.observation import Observation


@ -53,69 +51,5 @@ class BrowserOutputObservation(Observation):
        if self.screenshot_path:
            ret += f'Screenshot saved to: {self.screenshot_path}\n'
        ret += '--- Agent Observation ---\n'
-        ret += self.get_agent_obs_text()
+        ret += self.content
        return ret
-
-    def get_agent_obs_text(self) -> str:
-        """Get a concise text that will be shown to the agent."""
-        if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE:
-            text = f'[Current URL: {self.url}]\n'
-            text += f'[Focused element bid: {self.focused_element_bid}]\n'
-
-            # Add screenshot path information if available
-            if self.screenshot_path:
-                text += f'[Screenshot saved to: {self.screenshot_path}]\n'
-
-            text += '\n'
-
-            if self.error:
-                text += (
-                    '================ BEGIN error message ===============\n'
-                    'The following error occurred when executing the last action:\n'
-                    f'{self.last_browser_action_error}\n'
-                    '================ END error message ===============\n'
-                )
-            else:
-                text += '[Action executed successfully.]\n'
-            try:
-                # We do not filter visible only here because we want to show the full content
-                # of the web page to the agent for simplicity.
-                # FIXME: handle the case when the web page is too large
-                cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
-                text += (
-                    f'============== BEGIN accessibility tree ==============\n'
-                    f'{cur_axtree_txt}\n'
-                    f'============== END accessibility tree ==============\n'
-                )
-            except Exception as e:
-                text += (
-                    f'\n[Error encountered when processing the accessibility tree: {e}]'
-                )
-            return text
-
-        elif self.trigger_by_action == ActionType.BROWSE:
-            text = f'[Current URL: {self.url}]\n'
-
-            if self.error:
-                text += (
-                    '================ BEGIN error message ===============\n'
-                    'The following error occurred when trying to visit the URL:\n'
-                    f'{self.last_browser_action_error}\n'
-                    '================ END error message ===============\n'
-                )
-            text += '============== BEGIN webpage content ==============\n'
-            text += self.content
-            text += '\n============== END webpage content ==============\n'
-            return text
-        else:
-            raise ValueError(f'Invalid trigger_by_action: {self.trigger_by_action}')
-
-    def get_axtree_str(self, filter_visible_only: bool = False) -> str:
-        cur_axtree_txt = flatten_axtree_to_str(
-            self.axtree_object,
-            extra_properties=self.extra_element_properties,
-            with_clickable=True,
-            skip_generic=False,
-            filter_visible_only=filter_visible_only,
-        )
-        return str(cur_axtree_txt)
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@ -391,7 +391,7 @@ class ConversationMemory:
                role='user', content=[TextContent(text=obs.content)]
            )  # Content is already truncated by openhands-aci
        elif isinstance(obs, BrowserOutputObservation):
-            text = obs.get_agent_obs_text()
+            text = obs.content
            if (
                obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
                and enable_som_visual_browsing
--- a/openhands/runtime/browser/utils.py
+++ b/openhands/runtime/browser/utils.py
@ -2,7 +2,9 @@ import base64
 import datetime
 import os
 from pathlib import Path
+from typing import Any

+from browsergym.utils.obs import flatten_axtree_to_str
 from PIL import Image

 from openhands.core.exceptions import BrowserUnavailableException
@ -14,6 +16,78 @@ from openhands.runtime.browser.browser_env import BrowserEnv
 from openhands.utils.async_utils import call_sync_from_async


+def get_axtree_str(
+    axtree_object: dict[str, Any],
+    extra_element_properties: dict[str, Any],
+    filter_visible_only: bool = False,
+) -> str:  # Render an accessibility tree as text via browsergym's flatten_axtree_to_str.
+    cur_axtree_txt = flatten_axtree_to_str(
+        axtree_object,
+        extra_properties=extra_element_properties,
+        with_clickable=True,  # presumably marks clickable elements in the output; confirm in browsergym docs
+        skip_generic=False,  # presumably keeps generic nodes rather than dropping them; confirm in browsergym docs
+        filter_visible_only=filter_visible_only,  # forwarded unchanged from the caller
+    )
+    return str(cur_axtree_txt)  # coerce so the declared `str` return type holds whatever the helper returns
+
+
+def get_agent_obs_text(obs: BrowserOutputObservation) -> str:
+    """Build the text shown to the agent for a browser observation; raises ValueError on an unknown trigger_by_action."""
+    if obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE:  # interactive: header, status, then accessibility tree
+        text = f'[Current URL: {obs.url}]\n'
+        text += f'[Focused element bid: {obs.focused_element_bid}]\n'
+
+        # Add screenshot path information if available
+        if obs.screenshot_path:
+            text += f'[Screenshot saved to: {obs.screenshot_path}]\n'
+
+        text += '\n'
+
+        if obs.error:  # error block and success line are mutually exclusive
+            text += (
+                '================ BEGIN error message ===============\n'
+                'The following error occurred when executing the last action:\n'
+                f'{obs.last_browser_action_error}\n'
+                '================ END error message ===============\n'
+            )
+        else:
+            text += '[Action executed successfully.]\n'
+        try:
+            # We do not filter visible only here because we want to show the full content
+            # of the web page to the agent for simplicity.
+            # FIXME: handle the case when the web page is too large
+            cur_axtree_txt = get_axtree_str(
+                obs.axtree_object,
+                obs.extra_element_properties,
+                filter_visible_only=False,
+            )
+            text += (
+                f'============== BEGIN accessibility tree ==============\n'
+                f'{cur_axtree_txt}\n'
+                f'============== END accessibility tree ==============\n'
+            )
+        except Exception as e:  # best-effort: report the failure inline instead of raising
+            text += f'\n[Error encountered when processing the accessibility tree: {e}]'
+        return text  # note: obs.content is not included in the interactive branch
+
+    elif obs.trigger_by_action == ActionType.BROWSE:  # URL visit: header, optional error block, then page content
+        text = f'[Current URL: {obs.url}]\n'
+
+        if obs.error:  # unlike the interactive branch, no success line is emitted when there is no error
+            text += (
+                '================ BEGIN error message ===============\n'
+                'The following error occurred when trying to visit the URL:\n'
+                f'{obs.last_browser_action_error}\n'
+                '================ END error message ===============\n'
+            )
+        text += '============== BEGIN webpage content ==============\n'
+        text += obs.content  # whatever the caller stored in content is emitted verbatim
+        text += '\n============== END webpage content ==============\n'
+        return text
+    else:  # any other trigger is treated as a caller error
+        raise ValueError(f'Invalid trigger_by_action: {obs.trigger_by_action}')
+
+
 async def browse(
    action: BrowseURLAction | BrowseInteractiveAction,
    browser: BrowserEnv | None,
@ -78,7 +152,8 @@ async def browse(
                image = png_base64_url_to_image(obs.get('screenshot'))
                image.save(screenshot_path, format='PNG', optimize=True)

-        return BrowserOutputObservation(
+        # Create the observation with all data
+        observation = BrowserOutputObservation(
            content=obs['text_content'],  # text content of the page
            url=obs.get('url', ''),  # URL of the page
            screenshot=obs.get('screenshot', None),  # base64-encoded screenshot, png
@ -103,13 +178,37 @@ async def browse(
            error=True if obs.get('last_action_error', '') else False,  # error flag
            trigger_by_action=action.action,
        )
+
+        # Process the content first using the axtree_object
+        observation.content = get_agent_obs_text(observation)
+
+        # If return_axtree is False, remove the axtree_object to save space
+        if not action.return_axtree:
+            observation.dom_object = {}
+            observation.axtree_object = {}
+            observation.extra_element_properties = {}
+
+        return observation
    except Exception as e:
-        return BrowserOutputObservation(
-            content=str(e),
+        error_message = str(e)
+        error_url = asked_url if action.action == ActionType.BROWSE else ''
+
+        # Create error observation
+        observation = BrowserOutputObservation(
+            content=error_message,
            screenshot='',
            screenshot_path=None,
            error=True,
-            last_browser_action_error=str(e),
-            url=asked_url if action.action == ActionType.BROWSE else '',
+            last_browser_action_error=error_message,
+            url=error_url,
            trigger_by_action=action.action,
        )
+
+        # Process the content using get_agent_obs_text regardless of return_axtree value
+        try:
+            observation.content = get_agent_obs_text(observation)
+        except Exception:
+            # If get_agent_obs_text fails, keep the original error message
+            pass
+
+        return observation
--- a/openhands/security/invariant/parser.py
+++ b/openhands/security/invariant/parser.py
@ -50,6 +50,7 @@ def parse_action(trace: list[TraceElement], action: Action) -> list[TraceElement
        event_dict = event_to_dict(action)
        args = event_dict.get('args', {})
        thought = args.pop('thought', None)
+
        function = Function(name=action.action, arguments=args)
        if thought is not None:
            inv_trace.append(Message(role='assistant', content=thought))
--- a/tests/runtime/test_browsergym_envs.py
+++ b/tests/runtime/test_browsergym_envs.py
@ -43,7 +43,9 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
    )

    # Test browse
-    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
+    action = BrowseInteractiveAction(
+        browser_actions=BROWSER_EVAL_GET_GOAL_ACTION, return_axtree=False
+    )
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@ -54,7 +56,7 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
    assert 'from the list and click Submit' in obs.content

    # Make sure the browser can produce observation in eval env
-    action = BrowseInteractiveAction(browser_actions='noop()')
+    action = BrowseInteractiveAction(browser_actions='noop()', return_axtree=False)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@ -64,7 +66,9 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
    )

    # Make sure the rewards are working
-    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
+    action = BrowseInteractiveAction(
+        browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION, return_axtree=False
+    )
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
--- a/tests/runtime/test_browsing.py
+++ b/tests/runtime/test_browsing.py
@ -45,7 +45,7 @@ def test_simple_browse(temp_dir, runtime_cls, run_as_openhands):
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

-    action_browse = BrowseURLAction(url='http://localhost:8000')
+    action_browse = BrowseURLAction(url='http://localhost:8000', return_axtree=False)
    logger.info(action_browse, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action_browse)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@ -116,7 +116,9 @@ def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands):

        # Browse to the PDF file
        pdf_url = f'{server_url}/view?path=/workspace/test_document.pdf'
-        action_browse = BrowseInteractiveAction(browser_actions=f'goto("{pdf_url}")')
+        action_browse = BrowseInteractiveAction(
+            browser_actions=f'goto("{pdf_url}")', return_axtree=False
+        )
        logger.info(action_browse, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action_browse)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@ -185,7 +187,9 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):

        # Browse to the PNG file
        png_url = f'{server_url}/view?path=/workspace/test_image.png'
-        action_browse = BrowseInteractiveAction(browser_actions=f'goto("{png_url}")')
+        action_browse = BrowseInteractiveAction(
+            browser_actions=f'goto("{png_url}")', return_axtree=False
+        )
        logger.info(action_browse, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action_browse)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
--- a/tests/unit/test_action_serialization.py
+++ b/tests/unit/test_action_serialization.py
@ -108,7 +108,11 @@ def test_cmd_run_action_serialization_deserialization():
 def test_browse_url_action_serialization_deserialization():
    original_action_dict = {
        'action': 'browse',
-        'args': {'thought': '', 'url': 'https://www.example.com'},
+        'args': {
+            'thought': '',
+            'url': 'https://www.example.com',
+            'return_axtree': False,
+        },
    }
    serialization_deserialization(original_action_dict, BrowseURLAction)

@ -120,6 +124,7 @@ def test_browse_interactive_action_serialization_deserialization():
            'thought': '',
            'browser_actions': 'goto("https://www.example.com")',
            'browsergym_send_msg_to_user': '',
+            'return_axtree': False,
        },
    }
    serialization_deserialization(original_action_dict, BrowseInteractiveAction)
--- a/tests/unit/test_browsing_agent_parser.py
+++ b/tests/unit/test_browsing_agent_parser.py
@ -80,3 +80,4 @@ def test_parse_action(
    assert action.browser_actions == expected_browser_actions
    assert action.thought == expected_thought
    assert action.browsergym_send_msg_to_user == expected_msg_content
+    assert action.return_axtree is False  # Default value should be False
--- a/tests/unit/test_conversation_memory.py
+++ b/tests/unit/test_conversation_memory.py
@ -457,11 +457,13 @@ def test_process_events_with_file_read_observation(conversation_memory):


 def test_process_events_with_browser_output_observation(conversation_memory):
+    formatted_content = '[Current URL: http://example.com]\n\n============== BEGIN webpage content ==============\nPage loaded\n============== END webpage content =============='
+
    obs = BrowserOutputObservation(
        url='http://example.com',
        trigger_by_action='browse',
        screenshot='',
-        content='Page loaded',
+        content=formatted_content,
        error=False,
    )

--- a/tests/unit/test_function_calling.py
+++ b/tests/unit/test_function_calling.py
@ -178,6 +178,7 @@ def test_browser_valid():
    assert len(actions) == 1
    assert isinstance(actions[0], BrowseInteractiveAction)
    assert actions[0].browser_actions == "click('button-1')"
+    assert actions[0].return_axtree is False  # Default value should be False


 def test_browser_missing_code():
--- a/tests/unit/test_security.py
+++ b/tests/unit/test_security.py
@ -413,6 +413,7 @@ async def test_unsafe_bash_command(temp_dir: str):
                browser_actions='goto("http://localhost:3000")',
                thought='browsing to localhost',
                browsergym_send_msg_to_user='browsergym',
+                return_axtree=False,
            ),
            [
                Message(
@ -430,6 +431,7 @@ async def test_unsafe_bash_command(temp_dir: str):
                        arguments={
                            'browser_actions': 'goto("http://localhost:3000")',
                            'browsergym_send_msg_to_user': 'browsergym',
+                            'return_axtree': False,
                        },
                    ),
                ),
@ -437,7 +439,9 @@ async def test_unsafe_bash_command(temp_dir: str):
        ),
        (  # Test BrowseURLAction
            BrowseURLAction(
-                url='http://localhost:3000', thought='browsing to localhost'
+                url='http://localhost:3000',
+                thought='browsing to localhost',
+                return_axtree=False,
            ),
            [
                Message(
@ -452,7 +456,10 @@ async def test_unsafe_bash_command(temp_dir: str):
                    type='function',
                    function=Function(
                        name=ActionType.BROWSE,
-                        arguments={'url': 'http://localhost:3000'},
+                        arguments={
+                            'url': 'http://localhost:3000',
+                            'return_axtree': False,
+                        },
                    ),
                ),
            ],