mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
[Feat] A competitive Web Browsing agent (#1856)
* initial attempt at a browsing only agent * add browsing agent * update * implement agent * update * fix comments * remove unnecessary things from memory extras * update image processing --------- Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>
This commit is contained in:
parent
d76c425b76
commit
1fe290adf9
@ -10,6 +10,7 @@ load_dotenv()
|
||||
|
||||
from . import ( # noqa: E402
|
||||
SWE_agent,
|
||||
browsing_agent,
|
||||
codeact_agent,
|
||||
delegator_agent,
|
||||
dummy_agent,
|
||||
@ -24,6 +25,7 @@ __all__ = [
|
||||
'SWE_agent',
|
||||
'delegator_agent',
|
||||
'dummy_agent',
|
||||
'browsing_agent',
|
||||
]
|
||||
|
||||
for agent in all_microagents.values():
|
||||
|
||||
16
agenthub/browsing_agent/README.md
Normal file
16
agenthub/browsing_agent/README.md
Normal file
@ -0,0 +1,16 @@
|
||||
# Browsing Agent Framework
|
||||
|
||||
This folder implements the basic BrowserGym [demo agent](https://github.com/ServiceNow/BrowserGym/tree/main/demo_agent) that enables full-featured web browsing.
|
||||
|
||||
|
||||
## Test run
|
||||
|
||||
Note that for browsing tasks, GPT-4 is usually a requirement to get reasonable results, due to the complexity of the web page structures.
|
||||
|
||||
```
|
||||
poetry run python ./opendevin/core/main.py \
|
||||
-i 5 \
|
||||
-t "tell me the usa's president using google search" \
|
||||
-c BrowsingAgent \
|
||||
-m gpt-4o-2024-05-13
|
||||
```
|
||||
5
agenthub/browsing_agent/__init__.py
Normal file
5
agenthub/browsing_agent/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
from opendevin.controller.agent import Agent
|
||||
|
||||
from .browsing_agent import BrowsingAgent
|
||||
|
||||
# Register the agent class with the Agent registry under the name 'BrowsingAgent'.
Agent.register('BrowsingAgent', BrowsingAgent)
|
||||
155
agenthub/browsing_agent/browsing_agent.py
Normal file
155
agenthub/browsing_agent/browsing_agent.py
Normal file
@ -0,0 +1,155 @@
|
||||
import ast
|
||||
|
||||
from browsergym.core.action.highlevel import HighLevelActionSet
|
||||
from browsergym.utils.obs import flatten_axtree_to_str
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.events.action import (
|
||||
Action,
|
||||
BrowseInteractiveAction,
|
||||
MessageAction,
|
||||
)
|
||||
from opendevin.events.observation import BrowserOutputObservation
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.plugins import (
|
||||
PluginRequirement,
|
||||
)
|
||||
|
||||
|
||||
def _extract_user_message(action_str: str):
    """Return the first literal argument of a ``send_msg_to_user(...)`` call.

    Returns None when ``action_str`` is not valid Python or contains no such
    call with a literal first argument.
    """
    try:
        tree = ast.parse(action_str)
    except SyntaxError:
        return None
    for node in ast.walk(tree):
        if (
            isinstance(node, ast.Call)
            and isinstance(node.func, ast.Name)
            and node.func.id == 'send_msg_to_user'
            and node.args
            and isinstance(node.args[0], ast.Constant)
        ):
            return node.args[0].value
    return None


def parse_response(response: str) -> Action:
    """Turn a raw LLM reply into an action.

    The reply is expected to contain free-form reasoning followed by the
    browser action(s) wrapped in a triple-backtick fence.

    Parameters:
    - response (str): the text content returned by the LLM

    Returns:
    - MessageAction when the reply has no fenced action, or when the fenced
      action contains a ``send_msg_to_user(...)`` call with a literal message
    - BrowseInteractiveAction carrying the fenced action string and the
      reasoning that preceded it otherwise
    """
    thought, fence, remainder = response.partition('```')
    if not fence:
        # No fenced action: hand the raw reply to the user instead of failing
        # with an IndexError on the missing second segment.
        return MessageAction(response)

    thought = thought.strip()
    action_str = remainder.split('```')[0].strip()

    if 'send_msg_to_user(' in action_str:
        # The call may be one of several actions, so search the whole AST
        # rather than assuming it is the first statement.
        message = _extract_user_message(action_str)
        if message is not None:
            return MessageAction(message)

    return BrowseInteractiveAction(browser_actions=action_str, thought=thought)
|
||||
|
||||
|
||||
class BrowsingAgent(Agent):
    """
    An agent that interacts with the browser.
    """

    VERSION = '1.0'

    sandbox_plugins: list[PluginRequirement] = []

    def __init__(
        self,
        llm: LLM,
    ) -> None:
        """
        Initializes a new instance of the BrowsingAgent class.

        Parameters:
        - llm (LLM): The llm to be used by this agent
        """
        super().__init__(llm)
        self.action_space = HighLevelActionSet(
            # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
            subsets=[
                'chat',
                'bid',
                'nav',
            ],  # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
            strict=False,  # less strict on the parsing of the actions
            multiaction=True,  # enable the agent to take multiple actions at once
        )

        self.reset()

    def reset(self) -> None:
        """
        Resets the Browsing Agent.
        """
        super().reset()
        self.cost_accumulator = 0

    def step(self, state: State) -> Action:
        """
        Performs one step using the Browsing Agent.
        This includes gathering information on previous steps and prompting the model to make a browsing command to execute.

        Parameters:
        - state (State): used to get updated info

        Returns:
        - BrowseInteractiveAction(browsergym_command) - BrowserGym commands to run
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        """
        goal = state.get_current_user_intent()
        messages = []

        system_msg = f"""\
# Instructions
Review the current state of the page and all other information to find the best
possible next action to accomplish your goal. Your answer will be interpreted
and executed by a program, make sure to follow the formatting instructions.

# Goal:
{goal}

# Action Space
{self.action_space.describe(with_long_description=False, with_examples=True)}
"""
        messages.append({'role': 'system', 'content': system_msg})

        # Collect every browsing command issued so far and remember the most
        # recent observation, whatever action produced it.
        browse_commands = []
        last_obs = None
        for prev_action, obs in state.history:
            if isinstance(prev_action, BrowseInteractiveAction):
                browse_commands.append(prev_action.browser_actions)
            last_obs = obs
        prev_actions = ''.join(f'{command}\n' for command in browse_commands)

        cur_axtree_txt = ''
        error_prefix = ''
        if isinstance(last_obs, BrowserOutputObservation):
            if last_obs.error:
                # add error recovery prompt prefix
                error_prefix = f'Last action failed:\n{last_obs.last_browser_action}\nTry again with the current state of the page.\n'
            cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)

        prompt = f"""\
{error_prefix}

# Current Accessibility Tree:
{cur_axtree_txt}

# Previous Actions
{prev_actions}

Here is an example with chain of thought of a valid action when clicking on a button:
"
In order to accomplish my goal I need to click on the button with bid 12
```click("12")```
"
"""
        messages.append({'role': 'user', 'content': prompt})
        response = self.llm.completion(
            messages=messages,
            temperature=0.0,
        )
        self.log_cost(response)
        action_resp = response['choices'][0]['message']['content']
        logger.info(prompt)
        logger.info(action_resp)
        return parse_response(action_resp)

    def search_memory(self, query: str) -> list[str]:
        """Memory search is not supported by this agent; always raises NotImplementedError."""
        raise NotImplementedError('Implement this abstract method')

    def log_cost(self, response):
        """Add the cost of one completion to the running total and log both values.

        Cost computation is best-effort: if the LLM wrapper cannot price the
        response, the step is counted as costing 0.
        """
        # TODO: refactor to unified cost tracking
        try:
            cur_cost = self.llm.completion_cost(response)
        except Exception:
            cur_cost = 0
        self.cost_accumulator += cur_cost
        logger.info(
            'Cost: %.2f USD | Accumulated Cost: %.2f USD',
            cur_cost,
            self.cost_accumulator,
        )
|
||||
785
agenthub/browsing_agent/prompt.py
Normal file
785
agenthub/browsing_agent/prompt.py
Normal file
@ -0,0 +1,785 @@
|
||||
import abc
|
||||
import difflib
|
||||
import logging
|
||||
import platform
|
||||
from copy import deepcopy
|
||||
from dataclasses import asdict, dataclass
|
||||
from textwrap import dedent
|
||||
from typing import Literal, Union
|
||||
from warnings import warn
|
||||
|
||||
from browsergym.core.action.base import AbstractActionSet
|
||||
from browsergym.core.action.highlevel import HighLevelActionSet
|
||||
from browsergym.core.action.python import PythonActionSet
|
||||
|
||||
from opendevin.runtime.browser.browser_env import BrowserEnv
|
||||
|
||||
from .utils import (
|
||||
ParseError,
|
||||
parse_html_tags_raise,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class Flags:
    """Switches that control which elements appear in the prompt and how actions are parsed."""

    use_html: bool = True
    use_ax_tree: bool = False
    drop_ax_tree_first: bool = True  # This flag is no longer active TODO delete
    use_thinking: bool = False
    use_error_logs: bool = False
    use_past_error_logs: bool = False
    use_history: bool = False
    use_action_history: bool = False
    use_memory: bool = False
    use_diff: bool = False
    html_type: str = 'pruned_html'
    use_concrete_example: bool = True
    use_abstract_example: bool = False
    multi_actions: bool = False
    action_space: Literal[
        'python', 'bid', 'coord', 'bid+coord', 'bid+nav', 'coord+nav', 'bid+coord+nav'
    ] = 'bid'
    is_strict: bool = False
    # This flag will be automatically disabled `if not chat_model_args.has_vision()`
    use_screenshot: bool = True
    enable_chat: bool = False
    max_prompt_tokens: int = 100_000
    extract_visible_tag: bool = False
    extract_coords: Literal['False', 'center', 'box'] = 'False'
    extract_visible_elements_only: bool = False
    demo_mode: Literal['off', 'default', 'only_visible_elements'] = 'off'

    def copy(self):
        """Return a deep copy of these flags."""
        return deepcopy(self)

    def asdict(self):
        """Helper for JSON serializable requirement."""
        return asdict(self)

    @classmethod
    def from_dict(cls, flags_dict):
        """Helper for JSON serializable requirement.

        Accepts either an existing Flags instance (returned unchanged) or a
        dict of field values. Raises ValueError for any other type.
        """
        if isinstance(flags_dict, Flags):
            return flags_dict

        if not isinstance(flags_dict, dict):
            raise ValueError(
                f'Unregcognized type for flags_dict of type {type(flags_dict)}.'
            )
        # Build through `cls` so subclasses get an instance of their own type.
        return cls(**flags_dict)
|
||||
|
||||
|
||||
class PromptElement:
    """Base class for all prompt elements. Prompt elements can be hidden.

    Prompt elements are used to build the prompt. Use flags to control which
    prompt elements are visible. We use class attributes as a convenient way
    to implement static prompts, but feel free to override them with instance
    attributes or @property decorator."""

    _prompt = ''
    _abstract_ex = ''
    _concrete_ex = ''

    def __init__(self, visible: bool = True) -> None:
        """Prompt element that can be hidden.

        Parameters
        ----------
        visible : bool, optional
            Whether the prompt element should be visible, by default True. Can
            be a callable that returns a bool. This is useful when a specific
            flag changes during a shrink iteration.
        """
        self._visible = visible

    @property
    def prompt(self):
        """Avoid overriding this method. Override _prompt instead."""
        return self._hide(self._prompt)

    @property
    def abstract_ex(self):
        """Useful when this prompt element is requesting an answer from the llm.
        Provide an abstract example of the answer here. See Memory for an
        example.

        Avoid overriding this method. Override _abstract_ex instead
        """
        return self._hide(self._abstract_ex)

    @property
    def concrete_ex(self):
        """Useful when this prompt element is requesting an answer from the llm.
        Provide a concrete example of the answer here. See Memory for an
        example.

        Avoid overriding this method. Override _concrete_ex instead
        """
        return self._hide(self._concrete_ex)

    @property
    def is_visible(self):
        """Handle the case where visible is a callable."""
        visible = self._visible
        if callable(visible):
            visible = visible()
        return visible

    def _hide(self, value):
        """Return value if visible is True, else return empty string."""
        if self.is_visible:
            return value
        else:
            return ''

    def _parse_answer(self, text_answer) -> dict:
        """Extract this element's part of the LLM answer.

        The base element requests nothing from the LLM, so there is nothing
        to parse. Elements that do request an answer override this method.
        The previous body called itself and recursed without bound whenever
        the element was visible.
        """
        return {}
|
||||
|
||||
|
||||
class Shrinkable(PromptElement, abc.ABC):
    """A prompt element whose rendered size can be reduced step by step."""

    @abc.abstractmethod
    def shrink(self) -> None:
        """Reduce the size of this prompt element by one step.

        Implementations must forward the call to every shrinkable element
        they contain, and may add a shrinking strategy of their own.
        This method can be called repeatedly to shrink the prompt
        progressively until it fits the size limit. The default maximum
        number of shrink iterations is 20.
        """
|
||||
|
||||
|
||||
class Truncater(Shrinkable):
    """Prompt element that drops trailing lines to fit the LLM context length.

    Ideally `shrink()` never has to do anything. Subclass this for elements
    holding long observations, such as the AXTree or the HTML.
    """

    def __init__(self, visible, shrink_speed=0.3, start_truncate_iteration=10):
        super().__init__(visible=visible)
        # fraction of the remaining lines removed by each truncating call
        self.shrink_speed = shrink_speed
        # number of shrink() calls to sit out before truncation starts
        self.start_truncate_iteration = start_truncate_iteration
        self.shrink_calls = 0
        self.deleted_lines = 0

    def shrink(self) -> None:
        truncate_now = (
            self.is_visible and self.shrink_calls >= self.start_truncate_iteration
        )
        if truncate_now:
            all_lines = self._prompt.splitlines()
            keep_count = int(len(all_lines) * (1 - self.shrink_speed))
            self.deleted_lines += len(all_lines) - keep_count
            kept_text = '\n'.join(all_lines[:keep_count])
            self._prompt = f'{kept_text}\n... Deleted {self.deleted_lines} lines to reduce prompt size.'

        self.shrink_calls += 1
|
||||
|
||||
|
||||
def _prompt_char_count(prompt) -> int:
    """Count the characters of a prompt given as a string or a list of content parts."""
    if isinstance(prompt, str):
        return len(prompt)
    if isinstance(prompt, list):
        # Only text parts count; image parts carry no prompt characters.
        return len('\n'.join([p['text'] for p in prompt if p['type'] == 'text']))
    raise ValueError(f'Unrecognized type for prompt: {type(prompt)}')


def fit_tokens(
    shrinkable: 'Shrinkable',
    max_prompt_chars=None,
    max_iterations=20,
):
    """Shrink a prompt element until it fits max_prompt_chars.

    Parameters
    ----------
    shrinkable : Shrinkable
        The prompt element to shrink.
    max_prompt_chars : int, optional
        The maximum number of chars allowed. When None, no shrinking is done.
    max_iterations : int, optional
        The maximum number of shrink iterations, by default 20.

    Returns
    -------
    str or list : the prompt after shrinking, in whatever form
        `shrinkable.prompt` produces.

    Raises
    ------
    ValueError
        If `shrinkable.prompt` is neither a str nor a list.
    """

    if max_prompt_chars is None:
        return shrinkable.prompt

    for _ in range(max_iterations):
        prompt = shrinkable.prompt
        if _prompt_char_count(prompt) <= max_prompt_chars:
            return prompt
        shrinkable.shrink()

    # Re-read the prompt so the effect of the last shrink() is not discarded.
    # This also covers max_iterations == 0, where the loop never runs.
    prompt = shrinkable.prompt
    n_chars = _prompt_char_count(prompt)
    if n_chars > max_prompt_chars:
        logging.info(
            dedent(
                f"""\
                After {max_iterations} shrink iterations, the prompt is still
                {n_chars} chars (greater than {max_prompt_chars}). Returning the prompt as is."""
            )
        )
    return prompt
|
||||
|
||||
|
||||
class HTML(Truncater):
    """Truncatable prompt section holding the page HTML."""

    def __init__(self, html, visible: bool = True, prefix='') -> None:
        # HTML starts being truncated from the 5th shrink call onwards.
        super().__init__(visible=visible, start_truncate_iteration=5)
        heading = f'{prefix}HTML:'
        self._prompt = f'\n{heading}\n{html}\n'
|
||||
|
||||
|
||||
class AXTree(Truncater):
    """Truncatable prompt section holding the accessibility tree text."""

    def __init__(
        self, ax_tree, visible: bool = True, coord_type=None, prefix=''
    ) -> None:
        super().__init__(visible=visible, start_truncate_iteration=10)
        # Optional note explaining the coordinates embedded in the tree.
        coord_note = ''
        if coord_type == 'center':
            coord_note = (
                'Note: center coordinates are provided in parenthesis and are\n'
                'relative to the top left corner of the page.\n\n'
            )
        elif coord_type == 'box':
            coord_note = (
                'Note: bounding box of each object are provided in parenthesis and are\n'
                'relative to the top left corner of the page.\n\n'
            )
        heading = f'{prefix}AXTree:'
        self._prompt = f'\n{heading}\n{coord_note}{ax_tree}\n'
|
||||
|
||||
|
||||
class Error(PromptElement):
    """Prompt section reporting the error raised by the previous action."""

    def __init__(self, error, visible: bool = True, prefix='') -> None:
        super().__init__(visible=visible)
        heading = f'{prefix}Error from previous action:'
        self._prompt = f'\n{heading}\n{error}\n'
|
||||
|
||||
|
||||
class Observation(Shrinkable):
    """Prompt section for the current step's observation.

    Bundles the HTML, the accessibility tree and the last action error.
    """

    def __init__(self, obs, flags: Flags) -> None:
        super().__init__()
        self.flags = flags
        self.obs = obs
        section_prefix = '## '
        self.html = HTML(
            obs[flags.html_type], visible=flags.use_html, prefix=section_prefix
        )
        self.ax_tree = AXTree(
            obs['axtree_txt'],
            visible=flags.use_ax_tree,
            coord_type=flags.extract_coords,
            prefix=section_prefix,
        )
        # The error section is shown only when enabled and an error exists.
        last_error = obs['last_action_error']
        self.error = Error(
            last_error,
            visible=flags.use_error_logs and last_error,
            prefix=section_prefix,
        )

    def shrink(self):
        """Shrink the accessibility tree first, then the HTML."""
        self.ax_tree.shrink()
        self.html.shrink()

    @property
    def _prompt(self) -> str:  # type: ignore
        body = f'{self.html.prompt}{self.ax_tree.prompt}{self.error.prompt}'
        return f'\n# Observation of current step:\n{body}\n\n'

    def add_screenshot(self, prompt):
        """Append the screenshot as an image part when screenshots are enabled.

        A string prompt is first wrapped into a single text part; the prompt
        is returned unchanged when `use_screenshot` is off.
        """
        if not self.flags.use_screenshot:
            return prompt

        if isinstance(prompt, str):
            prompt = [{'type': 'text', 'text': prompt}]
        img_url = BrowserEnv.image_to_jpg_base64_url(
            self.obs['screenshot'], add_data_prefix=True
        )
        prompt.append({'type': 'image_url', 'image_url': img_url})
        return prompt
|
||||
|
||||
|
||||
class MacNote(PromptElement):
    """Keyboard-shortcut hint that is visible only when running on macOS."""

    def __init__(self) -> None:
        on_mac = platform.system() == 'Darwin'
        super().__init__(visible=on_mac)
        self._prompt = '\nNote: you are on mac so you should use Meta instead of Control for Control+C etc.\n'
|
||||
|
||||
|
||||
class BeCautious(PromptElement):
    """Static reminder asking the model to verify effects before submitting."""

    def __init__(self, visible: bool = True) -> None:
        super().__init__(visible=visible)
        self._prompt = (
            '\nBe very cautious. Avoid submitting anything before verifying the effect of your\n'
            'actions. Take the time to explore the effect of safe actions first. For example\n'
            "you can fill a few elements of a form, but don't click submit before verifying\n"
            'that everything was filled correctly.\n'
        )
|
||||
|
||||
|
||||
class GoalInstructions(PromptElement):
    """Instruction header used in goal mode; embeds the task goal."""

    def __init__(self, goal, visible: bool = True) -> None:
        super().__init__(visible)
        preamble = (
            '# Instructions\n'
            'Review the current state of the page and all other information to find the best\n'
            'possible next action to accomplish your goal. Your answer will be interpreted\n'
            'and executed by a program, make sure to follow the formatting instructions.\n'
        )
        self._prompt = f'{preamble}\n## Goal:\n{goal}\n'
|
||||
|
||||
|
||||
class ChatInstructions(PromptElement):
    """Instruction header used in chat mode; lists the chat messages so far."""

    def __init__(self, chat_messages, visible: bool = True) -> None:
        super().__init__(visible)
        header = (
            '# Instructions\n'
            '\n'
            'You are a UI Assistant, your goal is to help the user perform tasks using a web browser. You can\n'
            'communicate with the user via a chat, in which the user gives you instructions and in which you\n'
            'can send back messages. You have access to a web browser that both you and the user can see,\n'
            'and with which only you can interact via specific commands.\n'
            '\n'
            'Review the instructions from the user, the current state of the page and all other information\n'
            'to find the best possible next action to accomplish your goal. Your answer will be interpreted\n'
            'and executed by a program, make sure to follow the formatting instructions.\n'
            '\n'
            '## Chat messages:\n'
            '\n'
        )
        # One bullet per message, tagged with the role of its author.
        bullets = [f"- [{msg['role']}] {msg['message']}" for msg in chat_messages]
        self._prompt = header + '\n'.join(bullets)
|
||||
|
||||
|
||||
class SystemPrompt(PromptElement):
    """Static system message describing the agent's role."""

    _prompt = (
        'You are an agent trying to solve a web task based on the content of the page and\n'
        'a user instructions. You can interact with the page and explore. Each time you\n'
        'submit an action it will be sent to the browser and you will receive a new page.'
    )
|
||||
|
||||
|
||||
class MainPrompt(Shrinkable):
    """Top-level prompt assembling instructions, observation, history and answer format."""

    def __init__(
        self,
        obs_history,
        actions,
        memories,
        thoughts,
        flags: Flags,
    ) -> None:
        super().__init__()
        self.flags = flags
        self.history = History(obs_history, actions, memories, thoughts, flags)

        latest_obs = obs_history[-1]
        if self.flags.enable_chat:
            self.instructions: Union[ChatInstructions, GoalInstructions] = (
                ChatInstructions(latest_obs['chat_messages'])
            )
        else:
            # Goal mode shows only the goal; warn when the chat holds more
            # than one user message, since those would be ignored.
            if 'chat_messages' in latest_obs:
                user_turns = sum(
                    [msg['role'] == 'user' for msg in latest_obs['chat_messages']]
                )
                if user_turns > 1:
                    logging.warning(
                        'Agent is in goal mode, but multiple user messages are present in the chat. Consider switching to `enable_chat=True`.'
                    )
            self.instructions = GoalInstructions(latest_obs['goal'])

        self.obs = Observation(latest_obs, self.flags)
        self.action_space = ActionSpace(self.flags)

        self.think = Think(visible=flags.use_thinking)
        self.memory = Memory(visible=flags.use_memory)

    @property
    def _prompt(self) -> str:  # type: ignore
        prompt = (
            f'{self.instructions.prompt}'
            f'{self.obs.prompt}'
            f'{self.history.prompt}'
            f'{self.action_space.prompt}'
            f'{self.think.prompt}'
            f'{self.memory.prompt}'
        )

        if self.flags.use_abstract_example:
            abstract_intro = (
                '\n# Abstract Example\n'
                '\n'
                'Here is an abstract version of the answer with description of the content of\n'
                'each tag. Make sure you follow this structure, but replace the content with your\n'
                'answer:\n'
            )
            prompt += (
                f'{abstract_intro}'
                f'{self.think.abstract_ex}'
                f'{self.memory.abstract_ex}'
                f'{self.action_space.abstract_ex}'
            )

        if self.flags.use_concrete_example:
            concrete_intro = (
                '\n# Concrete Example\n'
                '\n'
                'Here is a concrete example of how to format your answer.\n'
                'Make sure to follow the template with proper tags:\n'
            )
            prompt += (
                f'{concrete_intro}'
                f'{self.think.concrete_ex}'
                f'{self.memory.concrete_ex}'
                f'{self.action_space.concrete_ex}'
            )
        return self.obs.add_screenshot(prompt)

    def shrink(self):
        """Shrink the history first, then the current observation."""
        self.history.shrink()
        self.obs.shrink()

    def _parse_answer(self, text_answer):
        """Merge the parsed think, memory and action parts into one dict."""
        merged = {}
        for element in (self.think, self.memory, self.action_space):
            merged.update(element._parse_answer(text_answer))
        return merged
|
||||
|
||||
|
||||
class ActionSpace(PromptElement):
    """Prompt section describing the available actions and the `<action>` answer format."""

    def __init__(self, flags: Flags) -> None:
        super().__init__()
        self.flags = flags
        self.action_space = _get_action_space(flags)

        self._prompt = (
            f'# Action space:\n{self.action_space.describe()}{MacNote().prompt}\n'
        )
        self._abstract_ex = f"""
<action>
{self.action_space.example_action(abstract=True)}
</action>
"""
        self._concrete_ex = f"""
<action>
{self.action_space.example_action(abstract=False)}
</action>
"""

    def _parse_answer(self, text_answer):
        """Extract the `action` tag from the answer and validate it.

        Raises ParseError when the action cannot be mapped to python code by
        the action set. The original exception is chained as the cause.
        """
        ans_dict = parse_html_tags_raise(
            text_answer, keys=['action'], merge_multiple=True
        )

        try:
            # just check if action can be mapped to python code but keep action as is
            # the environment will be responsible for mapping it to python
            self.action_space.to_python_code(ans_dict['action'])
        except Exception as e:
            raise ParseError(
                f'Error while parsing action\n: {e}\n'
                'Make sure your answer is restricted to the allowed actions.'
            ) from e

        return ans_dict
|
||||
|
||||
|
||||
def _get_action_space(flags: Flags) -> AbstractActionSet:
    """Build the action set selected by `flags.action_space`.

    'python' yields a PythonActionSet and warns when combined with
    multi_actions or a demo mode. Every other known name yields a
    HighLevelActionSet over the matching subsets. Unknown names raise
    NotImplementedError.
    """
    requested = flags.action_space

    if requested == 'python':
        python_set = PythonActionSet(strict=flags.is_strict)
        if flags.multi_actions:
            warn(
                f'Flag action_space={repr(flags.action_space)} incompatible with multi_actions={repr(flags.multi_actions)}.'
            )
        if flags.demo_mode != 'off':
            warn(
                f'Flag action_space={repr(flags.action_space)} incompatible with demo_mode={repr(flags.demo_mode)}.'
            )
        return python_set

    # Name -> subsets table, compared with `==` like the literal patterns of
    # a match statement.
    subsets_by_name = (
        ('bid', ['chat', 'bid']),
        ('coord', ['chat', 'coord']),
        ('bid+coord', ['chat', 'bid', 'coord']),
        ('bid+nav', ['chat', 'bid', 'nav']),
        ('coord+nav', ['chat', 'coord', 'nav']),
        ('bid+coord+nav', ['chat', 'bid', 'coord', 'nav']),
    )
    for name, subsets in subsets_by_name:
        if requested == name:
            return HighLevelActionSet(
                subsets=subsets,
                multiaction=flags.multi_actions,
                strict=flags.is_strict,
                demo_mode=flags.demo_mode,
            )

    raise NotImplementedError(f'Unknown action_space {repr(flags.action_space)}')
|
||||
|
||||
|
||||
class Memory(PromptElement):
    """Answer element asking the model for an optional `<memory>` note."""

    _prompt = ''  # provided in the abstract and concrete examples

    _abstract_ex = (
        '\n<memory>\n'
        'Write down anything you need to remember for next steps. You will be presented\n'
        'with the list of previous memories and past actions.\n'
        '</memory>\n'
    )

    _concrete_ex = (
        '\n<memory>\n'
        'I clicked on bid 32 to activate tab 2. The accessibility tree should mention\n'
        'focusable for elements of the form at next step.\n'
        '</memory>\n'
    )

    def _parse_answer(self, text_answer):
        """Extract the optional `memory` tag from the answer."""
        parsed = parse_html_tags_raise(
            text_answer, optional_keys=['memory'], merge_multiple=True
        )
        return parsed
|
||||
|
||||
|
||||
class Think(PromptElement):
    """Answer element asking the model for an optional `<think>` section."""

    _prompt = ''

    _abstract_ex = (
        '\n<think>\n'
        'Think step by step. If you need to make calculations such as coordinates, write them here. Describe the effect\n'
        'that your previous action had on the current content of the page.\n'
        '</think>\n'
    )
    _concrete_ex = (
        '\n<think>\n'
        "My memory says that I filled the first name and last name, but I can't see any\n"
        'content in the form. I need to explore different ways to fill the form. Perhaps\n'
        'the form is not visible yet or some fields are disabled. I need to replan.\n'
        '</think>\n'
    )

    def _parse_answer(self, text_answer):
        """Extract the optional `think` tag from the answer."""
        parsed = parse_html_tags_raise(
            text_answer, optional_keys=['think'], merge_multiple=True
        )
        return parsed
|
||||
|
||||
|
||||
def diff(previous, new):
    """Summarise the line-level difference between two texts.

    Parameters:
    - previous: the earlier text, or None / empty when there is none
    - new: the current text

    Returns:
    - (header, diff_lines): a one-line summary and the list of added ('+ ')
      and removed ('- ') lines in the order `difflib.ndiff` emits them.
      The list is empty when the texts are identical or `previous` is empty.
    """

    if previous == new:
        return 'Identical', []

    # Test for None before anything else; len(None) would raise TypeError.
    if previous is None or len(previous) == 0:
        return 'previous is empty', []

    diff_gen = difflib.ndiff(previous.splitlines(), new.splitlines())

    diff_lines = []
    plus_count = 0
    minus_count = 0
    for line in diff_gen:
        # ndiff marks every line with a two-character code. Match the code
        # itself so an unchanged line whose content starts with '+' or '-'
        # is not counted as a change.
        if line.startswith('+ '):
            diff_lines.append(line)
            plus_count += 1
        elif line.startswith('- '):
            diff_lines.append(line)
            minus_count += 1

    header = f'{plus_count} lines added and {minus_count} lines removed:'

    return header, diff_lines
|
||||
|
||||
|
||||
class Diff(Shrinkable):
    """Prompt section showing a capped line diff between two texts.

    Parameters:
    - previous, new: the texts to compare
    - prefix: text placed before the diff header
    - max_line_diff: maximum number of changed lines shown
    - shrink_speed: how many lines `max_line_diff` loses per shrink() call
    - visible: bool or callable controlling visibility
    """

    def __init__(
        self, previous, new, prefix='', max_line_diff=20, shrink_speed=2, visible=True
    ) -> None:
        super().__init__(visible=visible)
        self.max_line_diff = max_line_diff
        self.header, self.diff_lines = diff(previous, new)
        self.shrink_speed = shrink_speed
        self.prefix = prefix

    def shrink(self):
        """Lower the number of changed lines shown, never going below one."""
        self.max_line_diff -= self.shrink_speed
        self.max_line_diff = max(1, self.max_line_diff)

    @property
    def _prompt(self) -> str:  # type: ignore
        diff_str = '\n'.join(self.diff_lines[: self.max_line_diff])
        if len(self.diff_lines) > self.max_line_diff:
            hidden_count = len(self.diff_lines) - self.max_line_diff
            # Report how many changes were cut ("not shown"); the previous
            # wording said "now shown", which stated the opposite.
            diff_str = f'{diff_str}\nDiff truncated, {hidden_count} changes not shown.'
        return f'{self.prefix}{self.header}\n{diff_str}\n'
|
||||
|
||||
|
||||
class HistoryStep(Shrinkable):
    """Prompt section for one past step: action, error, diffs and memory."""

    def __init__(
        self, previous_obs, current_obs, action, memory, flags: Flags, shrink_speed=1
    ) -> None:
        super().__init__()
        html_key = flags.html_type
        # Visibility is passed as a callable so the flags are read at render time.
        self.html_diff = Diff(
            previous_obs[html_key],
            current_obs[html_key],
            prefix='\n### HTML diff:\n',
            shrink_speed=shrink_speed,
            visible=lambda: flags.use_html and flags.use_diff,
        )
        self.ax_tree_diff = Diff(
            previous_obs['axtree_txt'],
            current_obs['axtree_txt'],
            prefix='\n### Accessibility tree diff:\n',
            shrink_speed=shrink_speed,
            visible=lambda: flags.use_ax_tree and flags.use_diff,
        )
        step_error = current_obs['last_action_error']
        show_error = (
            flags.use_error_logs and step_error and flags.use_past_error_logs
        )
        self.error = Error(step_error, visible=show_error, prefix='### ')
        self.shrink_speed = shrink_speed
        self.action = action
        self.memory = memory
        self.flags = flags

    def shrink(self):
        """Shrink both diffs of this step."""
        super().shrink()
        for section in (self.html_diff, self.ax_tree_diff):
            section.shrink()

    @property
    def _prompt(self) -> str:  # type: ignore
        pieces = []

        if self.flags.use_action_history:
            pieces.append(f'\n### Action:\n{self.action}\n')

        pieces.append(
            f'{self.error.prompt}{self.html_diff.prompt}{self.ax_tree_diff.prompt}'
        )

        if self.flags.use_memory and self.memory is not None:
            pieces.append(f'\n### Memory:\n{self.memory}\n')

        return ''.join(pieces)
|
||||
|
||||
|
||||
class History(Shrinkable):
    """Prompt section listing every past step of the interaction."""

    def __init__(
        self, history_obs, actions, memories, thoughts, flags: Flags, shrink_speed=1
    ) -> None:
        super().__init__(visible=flags.use_history)
        # There is one more observation than there are actions and memories.
        assert len(history_obs) == len(actions) + 1
        assert len(history_obs) == len(memories) + 1

        self.shrink_speed = shrink_speed
        # Pair each observation with its successor and with the action and
        # memory recorded between the two.
        self.history_steps: list[HistoryStep] = [
            HistoryStep(before, after, action, memory, flags)
            for before, after, action, memory in zip(
                history_obs, history_obs[1:], actions, memories
            )
        ]

    def shrink(self):
        """Shrink individual steps"""
        # TODO set the shrink speed of older steps to be higher
        super().shrink()
        for past_step in self.history_steps:
            past_step.shrink()

    @property
    def _prompt(self):
        sections = ['# History of interaction with the task:\n']
        for index, past_step in enumerate(self.history_steps):
            sections.extend((f'## step {index}', past_step.prompt))
        return '\n'.join(sections) + '\n'
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: print the MainPrompt built from a tiny
    # three-observation history with every text feature switched on.
    html_template = """
<html>
<body>
<div>
Hello World.
Step {}.
</div>
</body>
</html>
"""

    # One error string per observation; only the last step reports an error.
    step_errors = ('', '', 'Hey, there is an error now')
    OBS_HISTORY = [
        {
            'goal': 'do this and that',
            'pruned_html': html_template.format(step_number),
            'axtree_txt': '[1] Click me',
            'last_action_error': step_error,
        }
        for step_number, step_error in enumerate(step_errors, start=1)
    ]
    ACTIONS = ["click('41')", "click('42')"]
    MEMORIES = ['memory A', 'memory B']
    THOUGHTS = ['thought A', 'thought B']

    flag_settings = {
        'use_html': True,
        'use_ax_tree': True,
        'use_thinking': True,
        'use_error_logs': True,
        'use_past_error_logs': True,
        'use_history': True,
        'use_action_history': True,
        'use_memory': True,
        'use_diff': True,
        'html_type': 'pruned_html',
        'use_concrete_example': True,
        'use_abstract_example': True,
        'use_screenshot': False,
        'multi_actions': True,
    }
    flags = Flags(**flag_settings)

    main_prompt = MainPrompt(
        obs_history=OBS_HISTORY,
        actions=ACTIONS,
        memories=MEMORIES,
        thoughts=THOUGHTS,
        flags=flags,
    )
    print(main_prompt.prompt)
|
||||
160
agenthub/browsing_agent/utils.py
Normal file
160
agenthub/browsing_agent/utils.py
Normal file
@ -0,0 +1,160 @@
|
||||
import collections
|
||||
import re
|
||||
from warnings import warn
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
def yaml_parser(message):
    """Parse ``message`` as YAML and report the outcome for a retry loop.

    Returns a ``(value, valid, retry_message)`` tuple. On success ``value`` is
    the loaded document, ``valid`` is True and ``retry_message`` is empty. On
    a ``yaml.YAMLError`` the error text is emitted as a warning and the tuple
    is an empty dict, False, and a message asking for valid YAML.
    """
    # saves gpt-3.5 from some yaml parsing errors: a value that starts on the
    # line after "key:" is pulled back onto the key's line.
    normalized = re.sub(r':\s*\n(?=\S|\n)', ': ', message)

    try:
        parsed = yaml.safe_load(normalized)
    except yaml.YAMLError as error:
        warn(str(error))
        return (
            {},
            False,
            "Your response is not a valid yaml. Please try again and be careful to the format. Don't add any apology or comment, just the answer.",
        )
    return parsed, True, ''
|
||||
|
||||
|
||||
def _compress_chunks(text, identifier, skip_list, split_regex='\n\n+'):
    """Compress a string by replacing redundant chunks by identifiers.

    Parameters
    ----------
    text : str
        The text to compress.
    identifier : str
        Prefix of the generated keys; keys look like ``f'{identifier}-{n}'``
        with ``n`` counting up from 0 in order of first appearance.
    skip_list : container of str
        Chunks that must never be turned into a definition.
    split_regex : str
        Regular expression that separates chunks.

    Returns
    -------
    dict
        Mapping from generated key to the chunk text it stands for.
    str
        The stripped chunks joined with a single newline, with every
        occurrence of a defined chunk replaced by its key.

    Notes
    -----
    Only chunks that occur more than once and are longer than 10 characters
    get a definition.
    """
    chunks = [chunk.strip() for chunk in re.split(split_regex, text)]
    occurrences = collections.Counter(chunks)

    # Store items that occur more than once in a dictionary. The next index
    # is simply the number of definitions so far, which avoids a counter
    # variable named after the builtin ``id``.
    def_dict = {}
    for chunk, count in occurrences.items():
        if count > 1 and chunk not in skip_list and len(chunk) > 10:
            def_dict[f'{identifier}-{len(def_dict)}'] = chunk

    # Replace redundant items with their identifiers in the text
    compressed_text = '\n'.join(chunks)
    for key, value in def_dict.items():
        compressed_text = compressed_text.replace(value, key)

    return def_dict, compressed_text
|
||||
|
||||
|
||||
def compress_string(text):
    """Compress a string by replacing redundant paragraphs and lines with identifiers.

    Runs two passes of ``_compress_chunks`` (paragraphs, then lines) and
    returns a ``<definitions>`` section listing every generated identifier,
    followed by a newline and the compressed text.
    """
    # Pass 1: paragraph-level compression.
    paragraph_defs, body = _compress_chunks(
        text, identifier='§', skip_list=[], split_regex='\n\n+'
    )

    # Pass 2: line-level compression, skipping any paragraph identifiers.
    line_defs, body = _compress_chunks(
        body, '¶', list(paragraph_defs.keys()), split_regex='\n+'
    )
    all_defs = {**paragraph_defs, **line_defs}

    # Definitions section: one "key:" line followed by the text it replaces.
    entries = [f'{key}:\n{value}' for key, value in all_defs.items()]
    definitions = '\n'.join(['<definitions>', *entries, '</definitions>'])

    return definitions + '\n' + body
|
||||
|
||||
|
||||
def extract_html_tags(text, keys):
    """Extract the content within HTML tags for a list of keys.

    Parameters
    ----------
    text : str
        The input string containing the HTML tags.
    keys : list of str
        The HTML tags to extract the content from.

    Returns
    -------
    dict
        A dictionary mapping each key to a list of subset in `text` that match the key.
        Keys with no match are absent from the dictionary.

    Notes
    -----
    Matching is case-sensitive: neither `text` nor `keys` is lowercased.
    Each key is matched literally (regex metacharacters in a key are escaped),
    tag content may span several lines, and every match is stripped of
    surrounding whitespace.

    """
    content_dict = {}
    for key in keys:
        # Escape the key so characters such as '.' or '+' cannot act as regex
        # operators and make the pattern match unrelated tags.
        tag = re.escape(str(key))
        matches = re.findall(f'<{tag}>(.*?)</{tag}>', text, re.DOTALL)
        if matches:
            content_dict[key] = [match.strip() for match in matches]
    return content_dict
|
||||
|
||||
|
||||
class ParseError(Exception):
    """Raised when a response cannot be parsed into the expected tags."""
|
||||
|
||||
|
||||
def parse_html_tags_raise(text, keys=(), optional_keys=(), merge_multiple=False):
    """A version of parse_html_tags that raises an exception if the parsing is not successful.

    Returns the content dictionary produced by ``parse_html_tags``; raises
    ``ParseError`` carrying the retry message when the parse is not valid.
    """
    parsed, is_valid, problems = parse_html_tags(
        text, keys, optional_keys, merge_multiple=merge_multiple
    )
    if is_valid:
        return parsed
    raise ParseError(problems)
|
||||
|
||||
|
||||
def parse_html_tags(text, keys=(), optional_keys=(), merge_multiple=False):
    """Satisfy the parse api, extracts 1 match per key and validates that all keys are present

    Parameters
    ----------
    text : str
        The input string containing the HTML tags.
    keys : list of str
        The HTML tags to extract the content from.
    optional_keys : list of str
        The HTML tags to extract the content from, but are optional.
    merge_multiple : bool
        When a key is found several times: if True, join all of its matches
        with newlines; if False, keep the first match and report an error.

    Returns
    -------
    dict
        A dictionary mapping each key to subset of `text` that match the key.
    bool
        Whether the parsing was successful.
    str
        A message to be displayed to the agent if the parsing was not successful.
    """
    # De-duplicate while keeping order. Without this, a key listed in both
    # `keys` and `optional_keys` is processed twice and the second pass would
    # index into the already-collapsed string instead of the list of matches.
    all_keys = tuple(dict.fromkeys(tuple(keys) + tuple(optional_keys)))
    content_dict = extract_html_tags(text, all_keys)
    retry_messages = []

    for key in all_keys:
        if key not in content_dict:
            if key not in optional_keys:
                retry_messages.append(f'Missing the key <{key}> in the answer.')
            continue

        found = content_dict[key]
        # Collapse the list of matches to a single value: the first match by
        # default, or all matches joined when merging is requested.
        content_dict[key] = found[0]
        if len(found) <= 1:
            continue
        if merge_multiple:
            # merge the multiple instances
            content_dict[key] = '\n'.join(found)
        else:
            retry_messages.append(
                f'Found multiple instances of the key {key}. You should have only one of them.'
            )

    valid = not retry_messages
    retry_message = '\n'.join(retry_messages)
    return content_dict, valid, retry_message
|
||||
@ -112,6 +112,7 @@ class AgentController:
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
logger.error(f'Error while running the agent: {e}')
|
||||
logger.error(traceback.format_exc())
|
||||
await self.report_error(
|
||||
'There was an unexpected error while running the agent', exception=e
|
||||
)
|
||||
|
||||
@ -22,6 +22,7 @@ class BrowserOutputObservation(Observation):
|
||||
dom_object: dict = field(default_factory=dict, repr=False) # don't show in repr
|
||||
axtree_object: dict = field(default_factory=dict, repr=False) # don't show in repr
|
||||
last_browser_action: str = ''
|
||||
last_browser_action_error: str = ''
|
||||
focused_element_bid: str = ''
|
||||
|
||||
@property
|
||||
|
||||
@ -27,6 +27,7 @@ DELETE_FROM_MEMORY_EXTRAS = {
|
||||
'open_pages_urls',
|
||||
'active_page_index',
|
||||
'last_browser_action',
|
||||
'last_browser_action_error',
|
||||
'focused_element_bid',
|
||||
}
|
||||
|
||||
|
||||
@ -37,6 +37,8 @@ class BrowserEnv:
|
||||
)
|
||||
logger.info('Starting browser env...')
|
||||
self.process.start()
|
||||
if not self.check_alive():
|
||||
raise BrowserException('Failed to start browser environment.')
|
||||
atexit.register(self.close)
|
||||
|
||||
def browser_process(self):
|
||||
@ -58,6 +60,9 @@ class BrowserEnv:
|
||||
logger.info('SHUTDOWN recv, shutting down browser env...')
|
||||
env.close()
|
||||
return
|
||||
elif unique_request_id == 'IS_ALIVE':
|
||||
self.browser_side.send(('ALIVE', None))
|
||||
continue
|
||||
action = action_data['action']
|
||||
obs, reward, terminated, truncated, info = env.step(action)
|
||||
# add text content of the page
|
||||
@ -86,10 +91,15 @@ class BrowserEnv:
|
||||
if self.agent_side.poll(timeout=0.01):
|
||||
response_id, obs = self.agent_side.recv()
|
||||
if response_id == unique_request_id:
|
||||
if obs['last_action_error']:
|
||||
raise BrowserException(obs['last_action_error'])
|
||||
return obs
|
||||
|
||||
def check_alive(self, timeout: float = 10) -> bool:
    """Ping the browser process and report whether it answered.

    Sends ``('IS_ALIVE', None)`` on ``self.agent_side`` and waits for a reply,
    passing ``timeout`` to ``self.agent_side.poll``.

    Returns True only when a reply arrives and its id is ``'ALIVE'``. Every
    other outcome (no reply within the timeout, or a different id) returns
    False explicitly instead of falling through to an implicit ``None``.
    """
    self.agent_side.send(('IS_ALIVE', None))
    if not self.agent_side.poll(timeout=timeout):
        return False
    response_id, _ = self.agent_side.recv()
    return response_id == 'ALIVE'
|
||||
|
||||
def close(self):
|
||||
if not self.process.is_alive():
|
||||
logger.info('BrowserEnv already closed, no need to close again')
|
||||
@ -112,7 +122,9 @@ class BrowserEnv:
|
||||
logger.error('Encountered an error when closing browser env', exc_info=True)
|
||||
|
||||
@staticmethod
|
||||
def image_to_png_base64_url(image: np.ndarray | Image.Image):
|
||||
def image_to_png_base64_url(
|
||||
image: np.ndarray | Image.Image, add_data_prefix: bool = False
|
||||
):
|
||||
"""Convert a numpy array to a base64 encoded png image url."""
|
||||
|
||||
if isinstance(image, np.ndarray):
|
||||
@ -123,4 +135,28 @@ class BrowserEnv:
|
||||
image.save(buffered, format='PNG')
|
||||
|
||||
image_base64 = base64.b64encode(buffered.getvalue()).decode()
|
||||
return f'{image_base64}'
|
||||
return (
|
||||
f'data:image/png;base64,{image_base64}'
|
||||
if add_data_prefix
|
||||
else f'{image_base64}'
|
||||
)
|
||||
|
||||
@staticmethod
def image_to_jpg_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = False
):
    """Convert a numpy array to a base64 encoded jpeg image url.

    A numpy array is first wrapped with ``Image.fromarray``; any other input
    is used as given. When ``add_data_prefix`` is True the result is prefixed
    with ``data:image/jpeg;base64,``; otherwise only the base64 text is
    returned.
    """
    pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image
    # 'RGBA' and 'LA' images are converted to RGB before the JPEG save.
    if pil_image.mode in ('RGBA', 'LA'):
        pil_image = pil_image.convert('RGB')

    jpeg_buffer = io.BytesIO()
    pil_image.save(jpeg_buffer, format='JPEG')
    encoded = base64.b64encode(jpeg_buffer.getvalue()).decode()

    if add_data_prefix:
        return f'data:image/jpeg;base64,{encoded}'
    return f'{encoded}'
|
||||
|
||||
@ -2,9 +2,10 @@ import os
|
||||
|
||||
from opendevin.core.schema import ActionType
|
||||
from opendevin.events.observation import BrowserOutputObservation
|
||||
from opendevin.runtime.browser.browser_env import BrowserEnv
|
||||
|
||||
|
||||
async def browse(action, browser) -> BrowserOutputObservation: # type: ignore
|
||||
async def browse(action, browser: BrowserEnv) -> BrowserOutputObservation: # type: ignore
|
||||
if action.action == ActionType.BROWSE:
|
||||
# legacy BrowseURLAction
|
||||
asked_url = action.url
|
||||
@ -30,11 +31,16 @@ async def browse(action, browser) -> BrowserOutputObservation: # type: ignore
|
||||
focused_element_bid=obs['focused_element_bid'], # focused element bid
|
||||
screenshot=obs['screenshot'], # base64-encoded screenshot, png
|
||||
url=obs['url'], # URL of the page
|
||||
error=True if obs['last_action_error'] else False, # error flag
|
||||
last_browser_action_error=obs[
|
||||
'last_action_error'
|
||||
], # last browser env action error
|
||||
)
|
||||
except Exception as e:
|
||||
return BrowserOutputObservation(
|
||||
content=str(e),
|
||||
screenshot='',
|
||||
error=True,
|
||||
last_browser_action_error=str(e),
|
||||
url=asked_url if action.action == ActionType.BROWSE else '',
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user