Visual browsing in CodeAct using set-of-marks annotated webpage screenshots (#6464)

Aditya Bharat Soni 2025-02-01 15:56:11 -05:00 committed by GitHub
parent eb8d1600c3
commit a593d9bc6d
4 changed files with 50 additions and 5 deletions

View File

@@ -11,6 +11,7 @@ from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
+from openhands.core.schema import ActionType
 from openhands.events.action import (
     Action,
     AgentDelegateAction,
@@ -304,10 +305,30 @@ class CodeActAgent(Agent):
             ) # Content is already truncated by openhands-aci
         elif isinstance(obs, BrowserOutputObservation):
             text = obs.get_agent_obs_text()
-            message = Message(
-                role='user',
-                content=[TextContent(text=text)],
-            )
+            if (
+                obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
+                and obs.set_of_marks is not None
+                and len(obs.set_of_marks) > 0
+                and self.config.enable_som_visual_browsing
+                and self.llm.vision_is_active()
+                and (
+                    self.mock_function_calling
+                    or self.llm.is_visual_browser_tool_active()
+                )
+            ):
+                text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
+                message = Message(
+                    role='user',
+                    content=[
+                        TextContent(text=text),
+                        ImageContent(image_urls=[obs.set_of_marks]),
+                    ],
+                )
+            else:
+                message = Message(
+                    role='user',
+                    content=[TextContent(text=text)],
+                )
         elif isinstance(obs, AgentDelegateObservation):
             text = truncate_content(
                 obs.outputs['content'] if 'content' in obs.outputs else '',
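Taken together, the new branch attaches the set-of-marks screenshot only when every gate passes: the observation was triggered by a browse_interactive action, a set-of-marks image was actually captured, the agent config opts in, the model accepts images at all, and (unless function calling is mocked) the model also accepts images attached to tool output. A minimal standalone sketch of that gate, not part of the commit; the helper name and duck-typed parameters are illustrative:

    from openhands.core.schema import ActionType

    def should_attach_som_screenshot(obs, config, llm, mock_function_calling: bool) -> bool:
        # obs: BrowserOutputObservation, config: AgentConfig, llm: the agent's LLM instance.
        return (
            obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE  # interactive browsing only
            and obs.set_of_marks is not None
            and len(obs.set_of_marks) > 0                  # a screenshot was actually captured
            and config.enable_som_visual_browsing          # feature flag, off by default
            and llm.vision_is_active()                     # model accepts images at all
            and (mock_function_calling or llm.is_visual_browser_tool_active())
        )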

View File

@@ -21,6 +21,7 @@ class AgentConfig(BaseModel):
     """

     codeact_enable_browsing: bool = Field(default=True)
+    enable_som_visual_browsing: bool = Field(default=False)
     codeact_enable_llm_editor: bool = Field(default=False)
     codeact_enable_jupyter: bool = Field(default=True)
     micro_agent_name: str | None = Field(default=None)
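The new field defaults to False, so set-of-marks visual browsing is opt-in. A minimal sketch of enabling it programmatically, using the AgentConfig import path shown in the first hunk (how a deployment actually builds this config, for example from a TOML file, is outside this diff):

    from openhands.core.config import AgentConfig

    # Opt in to set-of-marks (SoM) visual browsing; every other field keeps its default.
    agent_config = AgentConfig(enable_som_visual_browsing=True)
    assert agent_config.codeact_enable_browsing  # browsing itself is already enabled by default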

View File

@@ -101,7 +101,11 @@ class Message(BaseModel):
             # See discussion here for details: https://github.com/BerriAI/litellm/issues/6422#issuecomment-2438765472
             if self.role == 'tool' and item.cache_prompt:
                 role_tool_with_prompt_caching = True
-                d.pop('cache_control')
+                if isinstance(d, dict):
+                    d.pop('cache_control')
+                elif isinstance(d, list):
+                    for d_item in d:
+                        d_item.pop('cache_control')
             if isinstance(item, TextContent):
                 content.append(d)
             elif isinstance(item, ImageContent) and self.vision_enabled:
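The added isinstance checks are needed because a serialized TextContent is a single dict, while ImageContent (which can now appear in tool-role messages via the screenshot above) appears to serialize to a list of dicts, one per image URL, so stripping cache_control has to handle both shapes. A rough illustration, assuming the usual OpenAI-style content keys (the literal dicts below are examples, not taken from the codebase):

    # Serialized TextContent: a single dict.
    text_part = {'type': 'text', 'text': 'tool output ...', 'cache_control': {'type': 'ephemeral'}}

    # Serialized ImageContent: a list of dicts, one per image URL.
    image_parts = [
        {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,...'}, 'cache_control': {'type': 'ephemeral'}},
    ]

    for d in (text_part, image_parts):
        if isinstance(d, dict):
            d.pop('cache_control', None)  # default avoids a KeyError when the key is absent
        elif isinstance(d, list):
            for d_item in d:
                d_item.pop('cache_control', None)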

View File

@@ -73,6 +73,16 @@ FUNCTION_CALLING_SUPPORTED_MODELS = [
     'o1-2024-12-17',
 ]
+
+# visual browsing tool supported models
+# This flag is needed since gpt-4o and gpt-4o-mini do not allow passing image_urls with role='tool'
+VISUAL_BROWSING_TOOL_SUPPORTED_MODELS = [
+    'claude-3-5-sonnet',
+    'claude-3-5-sonnet-20240620',
+    'claude-3-5-sonnet-20241022',
+    'o1-2024-12-17',
+]

 REASONING_EFFORT_SUPPORTED_MODELS = [
     'o1-2024-12-17',
 ]
@@ -466,6 +476,15 @@ class LLM(RetryMixin, DebugMixin):
         """
         return self._function_calling_active

+    def is_visual_browser_tool_active(self) -> bool:
+        return (
+            self.config.model in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+            or self.config.model.split('/')[-1] in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+            or any(
+                m in self.config.model for m in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+            )
+        )
+
     def _post_completion(self, response: ModelResponse) -> float:
         """Post-process the completion response.