Fix visual browsing (#7278)

Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
Engel Nyst 2025-03-16 16:50:25 +01:00 committed by GitHub
parent dd09d46ccb
commit 3f6c8a2338
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 7 additions and 2 deletions

View File

@@ -202,6 +202,7 @@ Note:
tabs = ''
last_obs = None
last_action = None
set_of_marks = None # Initialize set_of_marks to None
if len(state.history) == 1:
# for visualwebarena, webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
@@ -217,6 +218,9 @@ Note:
# agent has responded, task finished.
return AgentFinishAction(outputs={'content': event.content})
elif isinstance(event, Observation):
# Only process BrowserOutputObservation and skip other observation types
if not isinstance(event, BrowserOutputObservation):
continue
last_obs = event
if len(prev_actions) >= 1: # ignore noop()

View File

@@ -52,7 +52,6 @@ class ConversationMemory:
initial_messages: list[Message],
max_message_chars: int | None = None,
vision_is_active: bool = False,
enable_som_visual_browsing: bool = False,
) -> list[Message]:
"""Process state history into a list of messages for the LLM.
@@ -64,11 +63,13 @@ class ConversationMemory:
max_message_chars: The maximum number of characters in the content of an event included
in the prompt to the LLM. Larger observations are truncated.
vision_is_active: Whether vision is active in the LLM. If True, image URLs will be included.
enable_som_visual_browsing: Whether to enable visual browsing for the SOM model.
"""
events = condensed_history
# log visual browsing status
logger.debug(f'Visual browsing: {self.agent_config.enable_som_visual_browsing}')
# Process special events first (system prompts, etc.)
messages = initial_messages