Visual browsing in CodeAct using set-of-marks annotated webpage screenshots (#6464)

Aditya Bharat Soni 2025-02-01 15:56:11 -05:00 committed by GitHub
parent eb8d1600c3
commit a593d9bc6d
4 changed files with 50 additions and 5 deletions

View File

@@ -11,6 +11,7 @@ from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
+from openhands.core.schema import ActionType
 from openhands.events.action import (
     Action,
     AgentDelegateAction,
@@ -304,10 +305,30 @@ class CodeActAgent(Agent):
             ) # Content is already truncated by openhands-aci
         elif isinstance(obs, BrowserOutputObservation):
             text = obs.get_agent_obs_text()
-            message = Message(
-                role='user',
-                content=[TextContent(text=text)],
-            )
+            if (
+                obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
+                and obs.set_of_marks is not None
+                and len(obs.set_of_marks) > 0
+                and self.config.enable_som_visual_browsing
+                and self.llm.vision_is_active()
+                and (
+                    self.mock_function_calling
+                    or self.llm.is_visual_browser_tool_active()
+                )
+            ):
+                text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
+                message = Message(
+                    role='user',
+                    content=[
+                        TextContent(text=text),
+                        ImageContent(image_urls=[obs.set_of_marks]),
+                    ],
+                )
+            else:
+                message = Message(
+                    role='user',
+                    content=[TextContent(text=text)],
+                )
         elif isinstance(obs, AgentDelegateObservation):
             text = truncate_content(
                 obs.outputs['content'] if 'content' in obs.outputs else '',
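Taken together, the new branch attaches the set-of-marks screenshot only when every gate passes: the observation was triggered by a browse_interactive action, a set-of-marks image was actually captured, the agent config opts in, the model accepts images at all, and (unless function calling is mocked) the model also accepts images attached to tool output. A minimal standalone sketch of that gate, not part of the commit; the helper name and duck-typed parameters are illustrative:

    from openhands.core.schema import ActionType

    def should_attach_som_screenshot(obs, config, llm, mock_function_calling: bool) -> bool:
        # obs: BrowserOutputObservation, config: AgentConfig, llm: the agent's LLM instance.
        return (
            obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE  # interactive browsing only
            and obs.set_of_marks is not None
            and len(obs.set_of_marks) > 0                  # a screenshot was actually captured
            and config.enable_som_visual_browsing          # feature flag, off by default
            and llm.vision_is_active()                     # model accepts images at all
            and (mock_function_calling or llm.is_visual_browser_tool_active())
        )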

View File

@@ -21,6 +21,7 @@ class AgentConfig(BaseModel):
     """

     codeact_enable_browsing: bool = Field(default=True)
+    enable_som_visual_browsing: bool = Field(default=False)
     codeact_enable_llm_editor: bool = Field(default=False)
     codeact_enable_jupyter: bool = Field(default=True)
     micro_agent_name: str | None = Field(default=None)
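The new field defaults to False, so set-of-marks visual browsing is opt-in. A minimal sketch of enabling it programmatically, using the AgentConfig import path shown in the first hunk (how a deployment actually builds this config, for example from a TOML file, is outside this diff):

    from openhands.core.config import AgentConfig

    # Opt in to set-of-marks (SoM) visual browsing; every other field keeps its default.
    agent_config = AgentConfig(enable_som_visual_browsing=True)
    assert agent_config.codeact_enable_browsing  # browsing itself is already enabled by default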

View File

@@ -101,7 +101,11 @@ class Message(BaseModel):
             # See discussion here for details: https://github.com/BerriAI/litellm/issues/6422#issuecomment-2438765472
             if self.role == 'tool' and item.cache_prompt:
                 role_tool_with_prompt_caching = True
-                d.pop('cache_control')
+                if isinstance(d, dict):
+                    d.pop('cache_control')
+                elif isinstance(d, list):
+                    for d_item in d:
+                        d_item.pop('cache_control')
             if isinstance(item, TextContent):
                 content.append(d)
             elif isinstance(item, ImageContent) and self.vision_enabled:
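The added isinstance checks are needed because a serialized TextContent is a single dict, while ImageContent (which can now appear in tool-role messages via the screenshot above) appears to serialize to a list of dicts, one per image URL, so stripping cache_control has to handle both shapes. A rough illustration, assuming the usual OpenAI-style content keys (the literal dicts below are examples, not taken from the codebase):

    # Serialized TextContent: a single dict.
    text_part = {'type': 'text', 'text': 'tool output ...', 'cache_control': {'type': 'ephemeral'}}

    # Serialized ImageContent: a list of dicts, one per image URL.
    image_parts = [
        {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,...'}, 'cache_control': {'type': 'ephemeral'}},
    ]

    for d in (text_part, image_parts):
        if isinstance(d, dict):
            d.pop('cache_control', None)  # default avoids a KeyError when the key is absent
        elif isinstance(d, list):
            for d_item in d:
                d_item.pop('cache_control', None)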

View File

@@ -73,6 +73,16 @@ FUNCTION_CALLING_SUPPORTED_MODELS = [
     'o1-2024-12-17',
 ]
+
+# visual browsing tool supported models
+# This flag is needed since gpt-4o and gpt-4o-mini do not allow passing image_urls with role='tool'
+VISUAL_BROWSING_TOOL_SUPPORTED_MODELS = [
+    'claude-3-5-sonnet',
+    'claude-3-5-sonnet-20240620',
+    'claude-3-5-sonnet-20241022',
+    'o1-2024-12-17',
+]

 REASONING_EFFORT_SUPPORTED_MODELS = [
     'o1-2024-12-17',
 ]
@@ -466,6 +476,15 @@ class LLM(RetryMixin, DebugMixin):
         """
         return self._function_calling_active

+    def is_visual_browser_tool_active(self) -> bool:
+        return (
+            self.config.model in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+            or self.config.model.split('/')[-1] in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+            or any(
+                m in self.config.model for m in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+            )
+        )
+
     def _post_completion(self, response: ModelResponse) -> float:
         """Post-process the completion response.