mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
Visual browsing in CodeAct using set-of-marks annotated webpage screenshots (#6464)
This commit is contained in:
parent
eb8d1600c3
commit
a593d9bc6d
@ -11,6 +11,7 @@ from openhands.controller.state.state import State
|
||||
from openhands.core.config import AgentConfig
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.core.message import ImageContent, Message, TextContent
|
||||
from openhands.core.schema import ActionType
|
||||
from openhands.events.action import (
|
||||
Action,
|
||||
AgentDelegateAction,
|
||||
@ -304,10 +305,30 @@ class CodeActAgent(Agent):
|
||||
) # Content is already truncated by openhands-aci
|
||||
elif isinstance(obs, BrowserOutputObservation):
|
||||
text = obs.get_agent_obs_text()
|
||||
message = Message(
|
||||
role='user',
|
||||
content=[TextContent(text=text)],
|
||||
)
|
||||
if (
|
||||
obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
|
||||
and obs.set_of_marks is not None
|
||||
and len(obs.set_of_marks) > 0
|
||||
and self.config.enable_som_visual_browsing
|
||||
and self.llm.vision_is_active()
|
||||
and (
|
||||
self.mock_function_calling
|
||||
or self.llm.is_visual_browser_tool_active()
|
||||
)
|
||||
):
|
||||
text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
|
||||
message = Message(
|
||||
role='user',
|
||||
content=[
|
||||
TextContent(text=text),
|
||||
ImageContent(image_urls=[obs.set_of_marks]),
|
||||
],
|
||||
)
|
||||
else:
|
||||
message = Message(
|
||||
role='user',
|
||||
content=[TextContent(text=text)],
|
||||
)
|
||||
elif isinstance(obs, AgentDelegateObservation):
|
||||
text = truncate_content(
|
||||
obs.outputs['content'] if 'content' in obs.outputs else '',
|
||||
|
||||
@ -21,6 +21,7 @@ class AgentConfig(BaseModel):
|
||||
"""
|
||||
|
||||
codeact_enable_browsing: bool = Field(default=True)
|
||||
enable_som_visual_browsing: bool = Field(default=False)
|
||||
codeact_enable_llm_editor: bool = Field(default=False)
|
||||
codeact_enable_jupyter: bool = Field(default=True)
|
||||
micro_agent_name: str | None = Field(default=None)
|
||||
|
||||
@ -101,7 +101,11 @@ class Message(BaseModel):
|
||||
# See discussion here for details: https://github.com/BerriAI/litellm/issues/6422#issuecomment-2438765472
|
||||
if self.role == 'tool' and item.cache_prompt:
|
||||
role_tool_with_prompt_caching = True
|
||||
d.pop('cache_control')
|
||||
if isinstance(d, dict):
|
||||
d.pop('cache_control')
|
||||
elif isinstance(d, list):
|
||||
for d_item in d:
|
||||
d_item.pop('cache_control')
|
||||
if isinstance(item, TextContent):
|
||||
content.append(d)
|
||||
elif isinstance(item, ImageContent) and self.vision_enabled:
|
||||
|
||||
@ -73,6 +73,16 @@ FUNCTION_CALLING_SUPPORTED_MODELS = [
|
||||
'o1-2024-12-17',
|
||||
]
|
||||
|
||||
# visual browsing tool supported models
|
||||
# This allow-list is needed because gpt-4o and gpt-4o-mini do not accept image_urls in messages with role='tool'
|
||||
VISUAL_BROWSING_TOOL_SUPPORTED_MODELS = [
|
||||
'claude-3-5-sonnet',
|
||||
'claude-3-5-sonnet-20240620',
|
||||
'claude-3-5-sonnet-20241022',
|
||||
'o1-2024-12-17',
|
||||
]
|
||||
|
||||
|
||||
REASONING_EFFORT_SUPPORTED_MODELS = [
|
||||
'o1-2024-12-17',
|
||||
]
|
||||
@ -466,6 +476,15 @@ class LLM(RetryMixin, DebugMixin):
|
||||
"""
|
||||
return self._function_calling_active
|
||||
|
||||
def is_visual_browser_tool_active(self) -> bool:
    """Report whether the configured model is listed for the visual browsing tool.

    The name in ``self.config.model`` qualifies when any of the following
    holds, checked in this order:

    1. it equals an entry of ``VISUAL_BROWSING_TOOL_SUPPORTED_MODELS``;
    2. the text after its last ``/`` equals an entry;
    3. some entry occurs anywhere inside it as a substring.

    Returns:
        True if one of the checks matches, False otherwise.
    """
    model_name = self.config.model

    # Exact match on the full model string.
    if model_name in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS:
        return True

    # Match on the final '/'-separated segment of the model string.
    if model_name.split('/')[-1] in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS:
        return True

    # Fall back to substring containment against every listed name.
    for supported in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS:
        if supported in model_name:
            return True
    return False
|
||||
|
||||
def _post_completion(self, response: ModelResponse) -> float:
|
||||
"""Post-process the completion response.
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user