From 3f6c8a2338600e4bce41f485a1a6646e59b0cbe8 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sun, 16 Mar 2025 16:50:25 +0100 Subject: [PATCH] Fix visual browsing (#7278) Co-authored-by: openhands --- .../agenthub/visualbrowsing_agent/visualbrowsing_agent.py | 4 ++++ openhands/memory/conversation_memory.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py b/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py index 0a154190d7..f27ea74e9a 100644 --- a/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py +++ b/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py @@ -202,6 +202,7 @@ Note: tabs = '' last_obs = None last_action = None + set_of_marks = None # Initialize set_of_marks to None if len(state.history) == 1: # for visualwebarena, webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env @@ -217,6 +218,9 @@ Note: # agent has responded, task finished. return AgentFinishAction(outputs={'content': event.content}) elif isinstance(event, Observation): + # Only process BrowserOutputObservation and skip other observation types + if not isinstance(event, BrowserOutputObservation): + continue last_obs = event if len(prev_actions) >= 1: # ignore noop() diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py index b308fef142..3c43116e50 100644 --- a/openhands/memory/conversation_memory.py +++ b/openhands/memory/conversation_memory.py @@ -52,7 +52,6 @@ class ConversationMemory: initial_messages: list[Message], max_message_chars: int | None = None, vision_is_active: bool = False, - enable_som_visual_browsing: bool = False, ) -> list[Message]: """Process state history into a list of messages for the LLM. @@ -64,11 +63,13 @@ class ConversationMemory: max_message_chars: The maximum number of characters in the content of an event included in the prompt to the LLM. Larger observations are truncated. vision_is_active: Whether vision is active in the LLM. If True, image URLs will be included. - enable_som_visual_browsing: Whether to enable visual browsing for the SOM model. """ events = condensed_history + # log visual browsing status + logger.debug(f'Visual browsing: {self.agent_config.enable_som_visual_browsing}') + # Process special events first (system prompts, etc.) messages = initial_messages