Fix visual browsing (#7278)

Co-authored-by: openhands <openhands@all-hands.dev>
2025-12-26 05:48:36 +08:00 · 2025-03-16 16:50:25 +01:00 · 2025-03-16 16:50:25 +01:00 · 3f6c8a2338
commit 3f6c8a2338
parent dd09d46ccb
2 changed files with 7 additions and 2 deletions
--- a/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py
+++ b/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py
@@ -202,6 +202,7 @@ Note:
        tabs = ''
        last_obs = None
        last_action = None
+        set_of_marks = None  # Initialize set_of_marks to None

        if len(state.history) == 1:
            # for visualwebarena, webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
@@ -217,6 +218,9 @@ Note:
                # agent has responded, task finished.
                return AgentFinishAction(outputs={'content': event.content})
            elif isinstance(event, Observation):
+                # Only process BrowserOutputObservation and skip other observation types
+                if not isinstance(event, BrowserOutputObservation):
+                    continue
                last_obs = event

        if len(prev_actions) >= 1:  # ignore noop()
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -52,7 +52,6 @@ class ConversationMemory:
        initial_messages: list[Message],
        max_message_chars: int | None = None,
        vision_is_active: bool = False,
-        enable_som_visual_browsing: bool = False,
    ) -> list[Message]:
        """Process state history into a list of messages for the LLM.

@@ -64,11 +63,13 @@ class ConversationMemory:
            max_message_chars: The maximum number of characters in the content of an event included
                in the prompt to the LLM. Larger observations are truncated.
            vision_is_active: Whether vision is active in the LLM. If True, image URLs will be included.
-            enable_som_visual_browsing: Whether to enable visual browsing for the SOM model.
        """

        events = condensed_history

+        # log visual browsing status
+        logger.debug(f'Visual browsing: {self.agent_config.enable_som_visual_browsing}')
+
        # Process special events first (system prompts, etc.)
        messages = initial_messages