From a593d9bc6dda7b888448fc1934927cc3a8ba8d1e Mon Sep 17 00:00:00 2001 From: Aditya Bharat Soni Date: Sat, 1 Feb 2025 15:56:11 -0500 Subject: [PATCH] Visual browsing in CodeAct using set-of-marks annotated webpage screenshots (#6464) --- .../agenthub/codeact_agent/codeact_agent.py | 29 ++++++++++++++++--- openhands/core/config/agent_config.py | 1 + openhands/core/message.py | 6 +++- openhands/llm/llm.py | 19 ++++++++++++ 4 files changed, 50 insertions(+), 5 deletions(-) diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index d2b5b35a73..9c075a66f3 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -11,6 +11,7 @@ from openhands.controller.state.state import State from openhands.core.config import AgentConfig from openhands.core.logger import openhands_logger as logger from openhands.core.message import ImageContent, Message, TextContent +from openhands.core.schema import ActionType from openhands.events.action import ( Action, AgentDelegateAction, @@ -304,10 +305,30 @@ class CodeActAgent(Agent): ) # Content is already truncated by openhands-aci elif isinstance(obs, BrowserOutputObservation): text = obs.get_agent_obs_text() - message = Message( - role='user', - content=[TextContent(text=text)], - ) + if ( + obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE + and obs.set_of_marks is not None + and len(obs.set_of_marks) > 0 + and self.config.enable_som_visual_browsing + and self.llm.vision_is_active() + and ( + self.mock_function_calling + or self.llm.is_visual_browser_tool_active() + ) + ): + text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n' + message = Message( + role='user', + content=[ + TextContent(text=text), + ImageContent(image_urls=[obs.set_of_marks]), + ], + ) + else: + message = Message( + role='user', + content=[TextContent(text=text)], + ) elif isinstance(obs, AgentDelegateObservation): text = truncate_content( obs.outputs['content'] if 'content' in obs.outputs else '', diff --git a/openhands/core/config/agent_config.py b/openhands/core/config/agent_config.py index 67fa4e9d8a..ccd7e855b8 100644 --- a/openhands/core/config/agent_config.py +++ b/openhands/core/config/agent_config.py @@ -21,6 +21,7 @@ class AgentConfig(BaseModel): """ codeact_enable_browsing: bool = Field(default=True) + enable_som_visual_browsing: bool = Field(default=False) codeact_enable_llm_editor: bool = Field(default=False) codeact_enable_jupyter: bool = Field(default=True) micro_agent_name: str | None = Field(default=None) diff --git a/openhands/core/message.py b/openhands/core/message.py index 5da0cc6be3..ea4f0106ab 100644 --- a/openhands/core/message.py +++ b/openhands/core/message.py @@ -101,7 +101,11 @@ class Message(BaseModel): # See discussion here for details: https://github.com/BerriAI/litellm/issues/6422#issuecomment-2438765472 if self.role == 'tool' and item.cache_prompt: role_tool_with_prompt_caching = True - d.pop('cache_control') + if isinstance(d, dict): + d.pop('cache_control') + elif isinstance(d, list): + for d_item in d: + d_item.pop('cache_control') if isinstance(item, TextContent): content.append(d) elif isinstance(item, ImageContent) and self.vision_enabled: diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index 5b656387ec..940a96a474 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -73,6 +73,16 @@ FUNCTION_CALLING_SUPPORTED_MODELS = [ 'o1-2024-12-17', ] +# visual browsing tool supported models +# This flag is needed since gpt-4o and gpt-4o-mini do not allow passing image_urls with role='tool' +VISUAL_BROWSING_TOOL_SUPPORTED_MODELS = [ + 'claude-3-5-sonnet', + 'claude-3-5-sonnet-20240620', + 'claude-3-5-sonnet-20241022', + 'o1-2024-12-17', +] + + REASONING_EFFORT_SUPPORTED_MODELS = [ 'o1-2024-12-17', ] @@ -466,6 +476,15 @@ class LLM(RetryMixin, DebugMixin): """ return self._function_calling_active + def is_visual_browser_tool_active(self) -> bool: + return ( + self.config.model in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS + or self.config.model.split('/')[-1] in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS + or any( + m in self.config.model for m in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS + ) + ) + def _post_completion(self, response: ModelResponse) -> float: """Post-process the completion response.