[feat] WebArena benchmark, MiniWoB++ benchmark and related arch changes (#2170)
* add webarena, and revamp messaging for webarena eval
* add changes for browsergym
* update infer script
* fix unit tests
* update
* add multiple run for miniwob
* update instruction, remove personal path
* update
* add code for getting final reward, fix integration, add results
* add avg cost calculation
This commit is contained in:
parent 99c6333e1a
commit 48151bdbb0
1 .gitignore (vendored)
@ -209,3 +209,4 @@ evaluation/outputs
evaluation/evaluation_outputs
test_results*
/_test_files_tmp/
evaluation/webarena/scripts/webarena_env.sh
@ -1,4 +1,5 @@
|
||||
import ast
|
||||
import os
|
||||
|
||||
from browsergym.core.action.highlevel import HighLevelActionSet
|
||||
from browsergym.utils.obs import flatten_axtree_to_str
|
||||
@ -12,6 +13,7 @@ from opendevin.events.action import (
|
||||
BrowseInteractiveAction,
|
||||
MessageAction,
|
||||
)
|
||||
from opendevin.events.event import EventSource
|
||||
from opendevin.events.observation import BrowserOutputObservation
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.plugins import (
|
||||
@ -19,21 +21,12 @@ from opendevin.runtime.plugins import (
|
||||
)
|
||||
from opendevin.runtime.tools import RuntimeTool
|
||||
|
||||
|
||||
def parse_response(response: str) -> Action:
|
||||
if '```' not in response:
|
||||
# unexpected response format, message back to user
|
||||
return MessageAction(response)
|
||||
thought = response.split('```')[0].strip()
|
||||
action_str = response.split('```')[1].strip()
|
||||
# handle send message to user function call in BrowserGym
|
||||
for sub_action in action_str.split('\n'):
|
||||
if 'send_msg_to_user(' in sub_action:
|
||||
tree = ast.parse(sub_action)
|
||||
args = tree.body[0].value.args # type: ignore
|
||||
return MessageAction(args[0].value)
|
||||
|
||||
return BrowseInteractiveAction(browser_actions=action_str, thought=thought)
|
||||
USE_NAV = (
|
||||
os.environ.get('USE_NAV', 'true') == 'true'
|
||||
) # only disable NAV actions when running webarena and miniwob benchmarks
|
||||
USE_CONCISE_ANSWER = (
|
||||
os.environ.get('USE_CONCISE_ANSWER', 'false') == 'true'
|
||||
) # only return concise answer when running webarena and miniwob benchmarks
|
||||
|
||||
|
||||
class BrowsingAgent(Agent):
|
||||
@ -56,13 +49,13 @@ class BrowsingAgent(Agent):
|
||||
- llm (LLM): The llm to be used by this agent
|
||||
"""
|
||||
super().__init__(llm)
|
||||
# define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
|
||||
# see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
|
||||
action_subsets = ['chat', 'bid']
|
||||
if USE_NAV:
|
||||
action_subsets.append('nav')
|
||||
self.action_space = HighLevelActionSet(
|
||||
# see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
|
||||
subsets=[
|
||||
'chat',
|
||||
'bid',
|
||||
'nav',
|
||||
], # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
|
||||
subsets=action_subsets,
|
||||
strict=False, # less strict on the parsing of the actions
|
||||
multiaction=True,  # enable the agent to take multiple actions at once
|
||||
)
|
||||
@ -75,6 +68,32 @@ class BrowsingAgent(Agent):
|
||||
"""
|
||||
super().reset()
|
||||
self.cost_accumulator = 0
|
||||
self.error_accumulator = 0
|
||||
|
||||
def parse_response(self, response: str) -> Action:
|
||||
if '```' not in response:
|
||||
# unexpected response format, message back to user
|
||||
action_str = f'send_msg_to_user("""{response}""")'
|
||||
return BrowseInteractiveAction(
|
||||
browser_actions=action_str,
|
||||
thought=response,
|
||||
browsergym_send_msg_to_user=response,
|
||||
)
|
||||
thought = response.split('```')[0].strip()
|
||||
action_str = response.split('```')[1].strip()
|
||||
# handle send message to user function call in BrowserGym
|
||||
msg_content = ''
|
||||
for sub_action in action_str.split('\n'):
|
||||
if 'send_msg_to_user(' in sub_action:
|
||||
tree = ast.parse(sub_action)
|
||||
args = tree.body[0].value.args # type: ignore
|
||||
msg_content = args[0].value
|
||||
|
||||
return BrowseInteractiveAction(
|
||||
browser_actions=action_str,
|
||||
thought=thought,
|
||||
browsergym_send_msg_to_user=msg_content,
|
||||
)
|
||||
|
||||
def step(self, state: State) -> Action:
|
||||
"""
|
||||
@ -91,26 +110,57 @@ class BrowsingAgent(Agent):
|
||||
"""
|
||||
goal = state.get_current_user_intent()
|
||||
messages = []
|
||||
prev_actions = ''
|
||||
prev_actions = []
|
||||
cur_axtree_txt = ''
|
||||
error_prefix = ''
|
||||
last_obs = None
|
||||
last_action = None
|
||||
if len(state.history) == 1:
|
||||
# initialize and retrieve the first observation by issuing a noop action
|
||||
# TODO: need more elegant way of doing this
|
||||
return BrowseInteractiveAction(browser_actions='noop()')
|
||||
for prev_action, obs in state.history:
|
||||
if isinstance(prev_action, BrowseInteractiveAction):
|
||||
prev_actions += f'{prev_action.browser_actions}\n'
|
||||
prev_actions.append(prev_action.browser_actions)
|
||||
last_obs = obs
|
||||
last_action = prev_action
|
||||
elif (
|
||||
isinstance(prev_action, MessageAction) and prev_action.source != 'user'
|
||||
isinstance(prev_action, MessageAction)
|
||||
and prev_action.source == EventSource.AGENT
|
||||
):
|
||||
# agent has responded, task finish.
|
||||
return AgentFinishAction()
|
||||
|
||||
prev_action_str = '\n'.join(prev_actions[1:])
|
||||
# if the final BrowserInteractiveAction exec BrowserGym's send_msg_to_user,
|
||||
# we should also send a message back to the user in OpenDevin and call it a day
|
||||
if (
|
||||
isinstance(last_action, BrowseInteractiveAction)
|
||||
and last_action.browsergym_send_msg_to_user
|
||||
):
|
||||
return MessageAction(last_action.browsergym_send_msg_to_user)
|
||||
|
||||
if isinstance(last_obs, BrowserOutputObservation):
|
||||
if last_obs.error:
|
||||
# add error recovery prompt prefix
|
||||
error_prefix = f'IMPORTANT! Last action is incorrect:\n{last_obs.last_browser_action}\nThink again with the current observation of the page.\n'
|
||||
cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)
|
||||
try:
|
||||
cur_axtree_txt = flatten_axtree_to_str(
|
||||
last_obs.axtree_object,
|
||||
extra_properties=last_obs.extra_element_properties,
|
||||
with_clickable=True,
|
||||
filter_visible_only=True,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
'Error when trying to process the accessibility tree: %s', e
|
||||
)
|
||||
return MessageAction('Error encountered when browsing.')
|
||||
|
||||
if error_prefix:
|
||||
self.error_accumulator += 1
|
||||
if self.error_accumulator > 5:
|
||||
return MessageAction('Too many errors encountered. Task failed.')
|
||||
system_msg = f"""\
|
||||
# Instructions
|
||||
Review the current state of the page and all other information to find the best
|
||||
@ -133,7 +183,7 @@ and executed by a program, make sure to follow the formatting instructions.
|
||||
{cur_axtree_txt}
|
||||
|
||||
# Previous Actions
|
||||
{prev_actions}
|
||||
{prev_action_str}
|
||||
|
||||
Here is an example with chain of thought of a valid action when clicking on a button:
|
||||
"
|
||||
@ -141,16 +191,31 @@ In order to accomplish my goal I need to click on the button with bid 12
|
||||
```click("12")```
|
||||
"
|
||||
""".strip()
|
||||
|
||||
if USE_CONCISE_ANSWER:
|
||||
concise_instruction = """\
|
||||
|
||||
Here is another example with chain of thought of a valid action when providing a concise answer to user:
|
||||
"
|
||||
In order to accomplish my goal I need to send the information asked back to the user. This page lists the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I will send a message back to the user with the answer.
|
||||
```send_msg_to_user("$279.49")```
|
||||
"
|
||||
"""
|
||||
prompt += concise_instruction
|
||||
messages.append({'role': 'user', 'content': prompt})
|
||||
response = self.llm.completion(
|
||||
messages=messages,
|
||||
temperature=0.0,
|
||||
stop=[')```', ')\n```'],
|
||||
)
|
||||
self.log_cost(response)
|
||||
action_resp = response['choices'][0]['message']['content']
|
||||
action_resp = response['choices'][0]['message']['content'].strip()
|
||||
if not action_resp.endswith('```'):
|
||||
action_resp = action_resp + ')```'
|
||||
|
||||
logger.info(prompt)
|
||||
logger.info(action_resp)
|
||||
return parse_response(action_resp)
|
||||
return self.parse_response(action_resp)
|
||||
|
||||
def search_memory(self, query: str) -> list[str]:
|
||||
raise NotImplementedError('Implement this abstract method')
|
||||
|
||||
81 evaluation/miniwob/README.md (new file)
@ -0,0 +1,81 @@
# MiniWoB++ Evaluation with OpenDevin Browsing Agents

This folder contains the evaluation harness for the [MiniWoB++](https://miniwob.farama.org/) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym), for easy evaluation of how well a browsing-capable agent performs on synthetic web browsing tasks.

## Setup OpenDevin Environment

Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.

## Configure OpenDevin and your LLM

Create a `config.toml` file if it does not exist at the root of the workspace.

Add the following configurations:

```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
sandbox_type = "ssh"
ssh_hostname = "localhost"
sandbox_timeout = 120

# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0

[eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```

## Setup MiniWoB++ Environment and Environment Variables

MiniWoB++ requires a static copy of its task websites to be accessible via a URL reachable from the machine running the OpenDevin agents.

- Clone MiniWoB++ (use a specific frozen commit for reproducibility)

```sh
git clone git@github.com:Farama-Foundation/miniwob-plusplus.git
git -C "./miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838
```

- Set the MiniWoB URL (change `PATH_TO_MINIWOB_CLONED_REPO` below to the absolute path of your `miniwob-plusplus` folder) in `evaluation/miniwob/scripts/run_infer.sh`

```sh
export MINIWOB_URL="file://<PATH_TO_MINIWOB_CLONED_REPO>/miniwob/html/miniwob/"
```

## Test if your environment works

Open the MiniWoB URL above in a browser and check that the task pages load correctly.
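If you prefer a quick check from the terminal, the minimal sketch below only verifies that the static task pages exist at the path used in `MINIWOB_URL` (the placeholder path is the one from the clone step above):

```sh
# Hypothetical sanity check: the MiniWoB task HTML files should be present locally
ls "<PATH_TO_MINIWOB_CLONED_REPO>/miniwob/html/miniwob/" | head
```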
## Run Evaluation

```sh
bash evaluation/miniwob/scripts/run_infer.sh
```
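The script also accepts positional arguments for the LLM config entry, agent class, an evaluation note, and an instance limit (see `evaluation/miniwob/scripts/run_infer.sh`). A hypothetical invocation, assuming the `eval_gpt4_1106_preview` entry from the `config.toml` above:

```sh
# <llm-config> <agent> <note> <eval-limit>
bash evaluation/miniwob/scripts/run_infer.sh eval_gpt4_1106_preview BrowsingAgent test 5
```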
Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`

To calculate the average reward, run:

```sh
poetry run python evaluation/miniwob/get_avg_reward.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl
```

## Submit your evaluation results

You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

## BrowsingAgent V1.0 result

Tested on BrowsingAgent V1.0

MiniWoB++, 125 tasks (3 runs due to random task initialization), max 10 steps

- GPT4o: 0.384, 0.416, 0.424, avg: 0.408
- GPT3.5: 0.288, 0.256, 0.272, avg: 0.272
0 evaluation/miniwob/__init__.py (new file)
33 evaluation/miniwob/get_avg_reward.py (new file)
@ -0,0 +1,33 @@
import argparse
import json

import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
import gymnasium as gym

parser = argparse.ArgumentParser(description='Calculate average reward.')
parser.add_argument('output_path', type=str, help='path to output.jsonl')

args = parser.parse_args()

if __name__ == '__main__':
    env_ids = [
        id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
    ]
    total_num = len(env_ids)
    print('Total number of tasks: ', total_num)
    total_reward = 0
    total_cost = 0
    actual_num = 0
    with open(args.output_path, 'r') as f:
        for line in f:
            data = json.loads(line)
            actual_num += 1
            total_cost += data['metrics']['accumulated_cost']
            total_reward += data['test_result']

    avg_reward = total_reward / total_num
    print('Avg Reward: ', avg_reward)

    avg_cost = total_cost / actual_num
    print('Avg Cost: ', avg_cost)
    print('Actual number of tasks finished: ', actual_num)
214 evaluation/miniwob/run_infer.py (new file)
@ -0,0 +1,214 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
|
||||
import gymnasium as gym
|
||||
from tqdm import tqdm
|
||||
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import args, config, get_llm_config_arg
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
from opendevin.runtime.tools import RuntimeTool
|
||||
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
|
||||
|
||||
def process_instance(
|
||||
env_id: str,
|
||||
metadata: dict,
|
||||
eval_output_dir: str,
|
||||
docker_sandbox: DockerSSHBox,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Set up the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(eval_output_dir, 'logs', f'instance_{env_id}.log')
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
# add back the console handler to print ONE line
|
||||
logger.addHandler(get_console_handler())
|
||||
logger.info(
|
||||
f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
file_handler = logging.FileHandler(log_file)
|
||||
file_handler.setFormatter(
|
||||
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||
)
|
||||
logger.addHandler(file_handler)
|
||||
else:
|
||||
logger.info(f'Starting evaluation for instance {env_id}.')
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
runtime_tools_config = {
|
||||
RuntimeTool.BROWSER: {
|
||||
'browsergym_eval': env_id,
|
||||
'browsergym_eval_save_dir': eval_output_dir,
|
||||
}
|
||||
}
|
||||
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
'PLACEHOLDER_GOAL',
|
||||
runtime_tools_config=runtime_tools_config,
|
||||
sandbox=docker_sandbox,
|
||||
)
|
||||
)
|
||||
|
||||
# ======= Attempt to evaluate the agent's environment impact =======
|
||||
|
||||
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
||||
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
|
||||
|
||||
if state is None:
|
||||
raise ValueError('State should not be None.')
|
||||
|
||||
metrics = state.metrics.get() if state.metrics else None
|
||||
browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
|
||||
# read goal
|
||||
with open(
|
||||
os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
|
||||
) as f:
|
||||
instruction = f.read()
|
||||
# read reward
|
||||
with open(
|
||||
os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
|
||||
) as f:
|
||||
rewards = json.load(f)
|
||||
reward = max(rewards)
|
||||
|
||||
# Save the output
|
||||
output = {
|
||||
'instance_id': env_id,
|
||||
'instruction': instruction,
|
||||
'metadata': metadata,
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
],
|
||||
'metrics': metrics,
|
||||
'error': state.error if state and state.error else None,
|
||||
'test_result': reward,
|
||||
}
|
||||
|
||||
return output
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
env_ids = [
|
||||
id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
|
||||
]
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
args.eval_output_dir,
|
||||
'miniwob',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
env_ids = env_ids[:eval_n_limit]
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_instance_ids.add(data['instance_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_env_ids = []
|
||||
for idx in env_ids:
|
||||
if idx in finished_instance_ids:
|
||||
logger.info(f'Skipping instance {idx} as it is already finished.')
|
||||
continue
|
||||
new_env_ids.append(idx)
|
||||
|
||||
env_ids = new_env_ids
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
|
||||
docker_sandbox = DockerSSHBox()
|
||||
for env_id in tqdm(env_ids):
|
||||
try:
|
||||
output = process_instance(
|
||||
env_id=env_id,
|
||||
metadata=metadata,
|
||||
eval_output_dir=eval_output_dir,
|
||||
docker_sandbox=docker_sandbox,
|
||||
reset_logger=False,
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
except Exception as e:
|
||||
logger.error(f'Error processing instance {env_id}: {e}')
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
|
||||
44 evaluation/miniwob/scripts/run_infer.sh (new executable file)
@ -0,0 +1,44 @@
#!/bin/bash

# configure miniwob website, change URL to yours
export MINIWOB_URL="file:///home/fangzhex/miniwob-plusplus/miniwob/html/miniwob/"

# configure browsing agent
export USE_NAV="false"
export USE_CONCISE_ANSWER="true"


MODEL_CONFIG=$1
AGENT=$2
NOTE=$3
EVAL_LIMIT=$4

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default BrowsingAgent"
  AGENT="BrowsingAgent"
fi

# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="${AGENT_VERSION}_${NOTE}"

COMMAND="poetry run python evaluation/miniwob/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
  --max-chars 10000000 \
  --eval-note $EVAL_NOTE"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND
91 evaluation/webarena/README.md (new file)
@ -0,0 +1,91 @@
# WebArena Evaluation with OpenDevin Browsing Agents

This folder contains the evaluation harness for the [WebArena](https://github.com/web-arena-x/webarena) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym), for easy evaluation of how well a browsing-capable agent performs on realistic web browsing tasks.

## Setup OpenDevin Environment

Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.

## Configure OpenDevin and your LLM

Create a `config.toml` file if it does not exist at the root of the workspace.

Add the following configurations:

```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
sandbox_type = "ssh"
ssh_hostname = "localhost"
sandbox_timeout = 120

# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0

[eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```

## Setup WebArena Environment

WebArena requires you to host websites with pre-populated content that are accessible via URL from the machine running the OpenDevin agents.
Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
Take note of the base URL of the machine where the environment is installed.

## Setup Environment Variables of WebArena Websites

Create a script `webarena_env.sh` under `evaluation/webarena/scripts` with the following:

```bash
export BASE_URL=<YOUR_SERVER_URL_HERE>
export SHOPPING="$BASE_URL:7770/"
export SHOPPING_ADMIN="$BASE_URL:7780/admin"
export REDDIT="$BASE_URL:9999"
export GITLAB="$BASE_URL:8023"
export WIKIPEDIA="$BASE_URL:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing"
export MAP="$BASE_URL:3000"
export HOMEPAGE="$BASE_URL:4399"
export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs
```

## Test if your environment works

Open the WebArena website URLs above in a browser and check that they load correctly.
If you cannot access a website, make sure the firewall allows public access to the aforementioned ports on your server.
Check the network security policy if you are using an AWS machine.
Follow the WebArena environment setup guide carefully, and make sure the URL fields are populated with the correct base URL of your server.
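For a scriptable check, the minimal sketch below probes each site with `curl` (it assumes you have already filled in and saved `webarena_env.sh` as described above):

```sh
# Hypothetical reachability check for the WebArena sites defined in webarena_env.sh
source evaluation/webarena/scripts/webarena_env.sh
for url in "$SHOPPING" "$SHOPPING_ADMIN" "$REDDIT" "$GITLAB" "$MAP" "$HOMEPAGE"; do
  curl -sSf -o /dev/null --max-time 10 "$url" && echo "OK   $url" || echo "FAIL $url"
done
```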
## Run Evaluation

```sh
bash evaluation/webarena/scripts/run_infer.sh
```
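The script accepts positional arguments for the LLM config entry, agent class, and an instance limit (see `evaluation/webarena/scripts/run_infer.sh`). A hypothetical invocation, assuming the `eval_gpt4_1106_preview` entry from the `config.toml` above:

```sh
# <llm-config> <agent> <eval-limit>
bash evaluation/webarena/scripts/run_infer.sh eval_gpt4_1106_preview BrowsingAgent 10
```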
Results will be in `evaluation/evaluation_outputs/outputs/webarena/`

To calculate the success rate, run:

```sh
poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
```

## Submit your evaluation results

You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

## BrowsingAgent V1.0 result

Tested on BrowsingAgent V1.0

WebArena, 812 tasks (high cost, single run since tasks are fixed), max 15 steps

- GPT4o: 0.1478
- GPT3.5: 0.0517
0 evaluation/webarena/__init__.py (new file)
33 evaluation/webarena/get_success_rate.py (new file)
@ -0,0 +1,33 @@
import argparse
import json

import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
import gymnasium as gym

parser = argparse.ArgumentParser(description='Calculate average reward.')
parser.add_argument('output_path', type=str, help='path to output.jsonl')

args = parser.parse_args()

if __name__ == '__main__':
    env_ids = [
        id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
    ]
    total_num = len(env_ids)
    print('Total number of tasks: ', total_num)
    total_reward = 0
    total_cost = 0
    actual_num = 0
    with open(args.output_path, 'r') as f:
        for line in f:
            data = json.loads(line)
            actual_num += 1
            total_cost += data['metrics']['accumulated_cost']
            total_reward += data['test_result']

    avg_reward = total_reward / total_num
    print('Success Rate: ', avg_reward)

    avg_cost = total_cost / actual_num
    print('Avg Cost: ', avg_cost)
    print('Actual number of tasks finished: ', actual_num)
214 evaluation/webarena/run_infer.py (new file)
@ -0,0 +1,214 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
|
||||
import gymnasium as gym
|
||||
from tqdm import tqdm
|
||||
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import args, config, get_llm_config_arg
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
from opendevin.runtime.tools import RuntimeTool
|
||||
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
|
||||
|
||||
def process_instance(
|
||||
env_id: str,
|
||||
metadata: dict,
|
||||
eval_output_dir: str,
|
||||
docker_sandbox: DockerSSHBox,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Set up the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(eval_output_dir, 'logs', f'instance_{env_id}.log')
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
# add back the console handler to print ONE line
|
||||
logger.addHandler(get_console_handler())
|
||||
logger.info(
|
||||
f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
file_handler = logging.FileHandler(log_file)
|
||||
file_handler.setFormatter(
|
||||
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||
)
|
||||
logger.addHandler(file_handler)
|
||||
else:
|
||||
logger.info(f'Starting evaluation for instance {env_id}.')
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
runtime_tools_config = {
|
||||
RuntimeTool.BROWSER: {
|
||||
'browsergym_eval': env_id,
|
||||
'browsergym_eval_save_dir': eval_output_dir,
|
||||
}
|
||||
}
|
||||
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
'PLACEHOLDER_GOAL',
|
||||
runtime_tools_config=runtime_tools_config,
|
||||
sandbox=docker_sandbox,
|
||||
)
|
||||
)
|
||||
|
||||
# ======= Attempt to evaluate the agent's environment impact =======
|
||||
|
||||
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
||||
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
|
||||
|
||||
if state is None:
|
||||
raise ValueError('State should not be None.')
|
||||
|
||||
metrics = state.metrics.get() if state.metrics else None
|
||||
browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
|
||||
# read goal
|
||||
with open(
|
||||
os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
|
||||
) as f:
|
||||
instruction = f.read()
|
||||
# read reward
|
||||
with open(
|
||||
os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
|
||||
) as f:
|
||||
rewards = json.load(f)
|
||||
reward = max(rewards)
|
||||
|
||||
# Save the output
|
||||
output = {
|
||||
'instance_id': env_id,
|
||||
'instruction': instruction,
|
||||
'metadata': metadata,
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
],
|
||||
'metrics': metrics,
|
||||
'error': state.error if state and state.error else None,
|
||||
'test_result': reward,
|
||||
}
|
||||
|
||||
return output
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
env_ids = [
|
||||
id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
|
||||
]
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
args.eval_output_dir,
|
||||
'webarena',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
env_ids = env_ids[:eval_n_limit]
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_instance_ids.add(data['instance_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_env_ids = []
|
||||
for idx in env_ids:
|
||||
if idx in finished_instance_ids:
|
||||
logger.info(f'Skipping instance {idx} as it is already finished.')
|
||||
continue
|
||||
new_env_ids.append(idx)
|
||||
|
||||
env_ids = new_env_ids
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
|
||||
docker_sandbox = DockerSSHBox()
|
||||
for env_id in tqdm(env_ids):
|
||||
try:
|
||||
output = process_instance(
|
||||
env_id=env_id,
|
||||
metadata=metadata,
|
||||
eval_output_dir=eval_output_dir,
|
||||
docker_sandbox=docker_sandbox,
|
||||
reset_logger=False,
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
except Exception as e:
|
||||
logger.error(f'Error processing instance {env_id}: {e}')
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
|
||||
42 evaluation/webarena/scripts/run_infer.sh (new executable file)
@ -0,0 +1,42 @@
#!/bin/bash

# configure webarena websites and environment
source evaluation/webarena/scripts/webarena_env.sh

# configure browsing agent
export USE_NAV="false"
export USE_CONCISE_ANSWER="true"

MODEL_CONFIG=$1
AGENT=$2
EVAL_LIMIT=$3

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default BrowsingAgent"
  AGENT="BrowsingAgent"
fi

# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="$AGENT_VERSION"

COMMAND="poetry run python evaluation/webarena/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 15 \
  --max-chars 10000000 \
  --eval-note $EVAL_NOTE"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND
@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from typing import Callable, Optional, Type
|
||||
|
||||
@ -34,6 +35,7 @@ async def main(
|
||||
exit_on_message: bool = False,
|
||||
fake_user_response_fn: Optional[Callable[[Optional[State]], str]] = None,
|
||||
sandbox: Optional[Sandbox] = None,
|
||||
runtime_tools_config: Optional[dict] = None,
|
||||
) -> Optional[State]:
|
||||
"""Main coroutine to run the agent controller with task input flexibility.
|
||||
It's only used when you launch opendevin backend directly via cmdline.
|
||||
@ -92,7 +94,21 @@ async def main(
|
||||
)
|
||||
runtime = ServerRuntime(event_stream=event_stream, sandbox=sandbox)
|
||||
runtime.init_sandbox_plugins(controller.agent.sandbox_plugins)
|
||||
runtime.init_runtime_tools(controller.agent.runtime_tools, is_async=False)
|
||||
runtime.init_runtime_tools(
|
||||
controller.agent.runtime_tools,
|
||||
is_async=False,
|
||||
runtime_tools_config=runtime_tools_config,
|
||||
)
|
||||
|
||||
# browser eval specific
|
||||
# TODO: move to a better place
|
||||
if runtime.browser and runtime.browser.eval_dir:
|
||||
logger.info(f'Evaluation directory: {runtime.browser.eval_dir}')
|
||||
with open(
|
||||
os.path.join(runtime.browser.eval_dir, 'goal.txt'), 'r', encoding='utf-8'
|
||||
) as f:
|
||||
task = f.read()
|
||||
logger.info(f'Dynamic Eval task: {task}')
|
||||
|
||||
await event_stream.add_event(MessageAction(content=task), EventSource.USER)
|
||||
|
||||
|
||||
@ -29,6 +29,7 @@ class BrowseURLAction(Action):
|
||||
class BrowseInteractiveAction(Action):
|
||||
browser_actions: str
|
||||
thought: str = ''
|
||||
browsergym_send_msg_to_user: str = ''
|
||||
action: str = ActionType.BROWSE_INTERACTIVE
|
||||
runnable: ClassVar[bool] = True
|
||||
|
||||
|
||||
@ -21,6 +21,9 @@ class BrowserOutputObservation(Observation):
|
||||
active_page_index: int = -1
|
||||
dom_object: dict = field(default_factory=dict, repr=False) # don't show in repr
|
||||
axtree_object: dict = field(default_factory=dict, repr=False) # don't show in repr
|
||||
extra_element_properties: dict = field(
|
||||
default_factory=dict, repr=False
|
||||
) # don't show in repr
|
||||
last_browser_action: str = ''
|
||||
last_browser_action_error: str = ''
|
||||
focused_element_bid: str = ''
|
||||
|
||||
@ -20,6 +20,7 @@ DELETE_FROM_MEMORY_EXTRAS = {
|
||||
'last_browser_action',
|
||||
'last_browser_action_error',
|
||||
'focused_element_bid',
|
||||
'extra_element_properties',
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
import atexit
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import multiprocessing
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
@ -18,15 +20,27 @@ from opendevin.core.logger import opendevin_logger as logger
|
||||
|
||||
|
||||
class BrowserEnv:
|
||||
def __init__(self, is_async: bool = True):
|
||||
self.html_text_converter = html2text.HTML2Text()
|
||||
# ignore links and images
|
||||
self.html_text_converter.ignore_links = False
|
||||
self.html_text_converter.ignore_images = True
|
||||
# use alt text for images
|
||||
self.html_text_converter.images_to_alt = True
|
||||
# disable auto text wrapping
|
||||
self.html_text_converter.body_width = 0
|
||||
def __init__(
|
||||
self,
|
||||
is_async: bool = True,
|
||||
browsergym_eval: str = '',
|
||||
browsergym_eval_save_dir: str = '',
|
||||
):
|
||||
self.html_text_converter = self.get_html_text_converter()
|
||||
self.eval_mode = False
|
||||
self.eval_dir = ''
|
||||
# EVAL only: browsergym_eval and browsergym_eval_save_dir must be provided for evaluation
|
||||
self.browsergym_eval = browsergym_eval
|
||||
self.browsergym_eval_save_dir = browsergym_eval_save_dir
|
||||
if self.browsergym_eval:
|
||||
assert (
|
||||
self.browsergym_eval_save_dir
|
||||
), 'browsergym_eval_save_dir must be provided for evaluation.'
|
||||
self.eval_mode = True
|
||||
self.eval_dir = os.path.join(
|
||||
self.browsergym_eval_save_dir, self.browsergym_eval.split('/')[1]
|
||||
)
|
||||
os.makedirs(self.eval_dir, exist_ok=True)
|
||||
# Initialize browser environment process
|
||||
multiprocessing.set_start_method('spawn', force=True)
|
||||
self.browser_side, self.agent_side = multiprocessing.Pipe()
|
||||
@ -39,6 +53,17 @@ class BrowserEnv:
|
||||
self.init_browser()
|
||||
atexit.register(self.close)
|
||||
|
||||
def get_html_text_converter(self):
|
||||
html_text_converter = html2text.HTML2Text()
|
||||
# ignore links and images
|
||||
html_text_converter.ignore_links = False
|
||||
html_text_converter.ignore_images = True
|
||||
# use alt text for images
|
||||
html_text_converter.images_to_alt = True
|
||||
# disable auto text wrapping
|
||||
html_text_converter.body_width = 0
|
||||
return html_text_converter
|
||||
|
||||
def init_browser(self):
|
||||
logger.info('Starting browser env...')
|
||||
self.process.start()
|
||||
@ -47,14 +72,26 @@ class BrowserEnv:
|
||||
raise BrowserInitException('Failed to start browser environment.')
|
||||
|
||||
def browser_process(self):
|
||||
env = gym.make(
|
||||
'browsergym/openended',
|
||||
task_kwargs={'start_url': 'about:blank'},
|
||||
wait_for_user_message=False,
|
||||
headless=True,
|
||||
disable_env_checker=True,
|
||||
)
|
||||
if self.eval_mode:
|
||||
logger.info('Creating browser env for evaluation purpose.')
|
||||
env = gym.make(self.browsergym_eval)
|
||||
else:
|
||||
env = gym.make(
|
||||
'browsergym/openended',
|
||||
task_kwargs={'start_url': 'about:blank', 'goal': 'PLACEHOLDER_GOAL'},
|
||||
wait_for_user_message=False,
|
||||
headless=True,
|
||||
disable_env_checker=True,
|
||||
)
|
||||
obs, info = env.reset()
|
||||
# EVAL only: save the goal into file for evaluation
|
||||
if self.eval_mode:
|
||||
rewards = [] # store rewards if in eval mode
|
||||
logger.info(obs['goal'])
|
||||
with open(
|
||||
os.path.join(self.eval_dir, 'goal.txt'), 'w', encoding='utf-8'
|
||||
) as f:
|
||||
f.write(obs['goal'])
|
||||
logger.info('Browser env started.')
|
||||
while True:
|
||||
try:
|
||||
@ -70,6 +107,15 @@ class BrowserEnv:
|
||||
continue
|
||||
action = action_data['action']
|
||||
obs, reward, terminated, truncated, info = env.step(action)
|
||||
# EVAL only: save the rewards into file for evaluation
|
||||
if self.eval_mode:
|
||||
rewards.append(reward)
|
||||
with open(
|
||||
os.path.join(self.eval_dir, 'rewards.json'),
|
||||
'w',
|
||||
encoding='utf-8',
|
||||
) as f:
|
||||
f.write(json.dumps(rewards))
|
||||
# add text content of the page
|
||||
html_str = flatten_dom_to_str(obs['dom_object'])
|
||||
obs['text_content'] = self.html_text_converter.handle(html_str)
|
||||
@ -86,7 +132,7 @@ class BrowserEnv:
|
||||
pass
|
||||
return
|
||||
|
||||
def step(self, action_str: str, timeout: float = 10) -> dict:
|
||||
def step(self, action_str: str, timeout: float = 30) -> dict:
|
||||
unique_request_id = str(uuid.uuid4())
|
||||
self.agent_side.send((unique_request_id, {'action': action_str}))
|
||||
start_time = time.time()
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import asyncio
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Optional
|
||||
|
||||
from opendevin.core.config import config
|
||||
from opendevin.core.exceptions import BrowserInitException
|
||||
@ -91,12 +92,18 @@ class Runtime:
|
||||
self.sandbox.init_plugins(plugins)
|
||||
|
||||
def init_runtime_tools(
|
||||
self, runtime_tools: list[RuntimeTool], is_async: bool = True
|
||||
self,
|
||||
runtime_tools: list[RuntimeTool],
|
||||
runtime_tools_config: Optional[dict[RuntimeTool, Any]] = None,
|
||||
is_async: bool = True,
|
||||
) -> None:
|
||||
# if browser in runtime_tools, init it
|
||||
if RuntimeTool.BROWSER in runtime_tools:
|
||||
if runtime_tools_config is None:
|
||||
runtime_tools_config = {}
|
||||
browser_env_config = runtime_tools_config.get(RuntimeTool.BROWSER, {})
|
||||
try:
|
||||
self.browser = BrowserEnv(is_async)
|
||||
self.browser = BrowserEnv(is_async=is_async, **browser_env_config)
|
||||
except BrowserInitException:
|
||||
logger.warn(
|
||||
'Failed to start browser environment, web browsing functionality will not work'
|
||||
|
||||
@ -30,6 +30,9 @@ async def browse(action, browser: BrowserEnv | None) -> BrowserOutputObservation
|
||||
active_page_index=obs['active_page_index'], # index of the active page
|
||||
dom_object=obs['dom_object'], # DOM object
|
||||
axtree_object=obs['axtree_object'], # accessibility tree object
|
||||
extra_element_properties=obs[
|
||||
'extra_element_properties'
|
||||
], # extra element properties
|
||||
last_browser_action=obs['last_action'], # last browser env action performed
|
||||
focused_element_bid=obs['focused_element_bid'], # focused element bid
|
||||
screenshot=obs['screenshot'], # base64-encoded screenshot, png
|
||||
|
||||
@ -114,7 +114,7 @@ Don't execute multiple actions at once if you need feedback from the page.
|
||||
----------
|
||||
|
||||
# Current Accessibility Tree:
|
||||
|
||||
RootWebArea '', focused
|
||||
|
||||
# Previous Actions
|
||||
|
||||
|
||||
@ -118,12 +118,11 @@ RootWebArea 'The Ultimate Answer', focused
|
||||
[8] heading 'The Ultimate Answer'
|
||||
[9] paragraph ''
|
||||
StaticText 'Click the button to reveal the answer to life, the universe, and everything.'
|
||||
[10] button 'Click me'
|
||||
[10] button 'Click me', clickable
|
||||
|
||||
# Previous Actions
|
||||
goto('http://localhost:8000')
|
||||
|
||||
|
||||
Here is an example with chain of thought of a valid action when clicking on a button:
|
||||
"
|
||||
In order to accomplish my goal I need to click on the button with bid 12
|
||||
|
||||
@ -118,14 +118,13 @@ RootWebArea 'The Ultimate Answer', focused
|
||||
[8] heading 'The Ultimate Answer'
|
||||
[9] paragraph ''
|
||||
StaticText 'Click the button to reveal the answer to life, the universe, and everything.'
|
||||
[10] button 'Click me', focused
|
||||
[10] button 'Click me', clickable, focused
|
||||
StaticText 'The answer is OpenDevin is all you need!'
|
||||
|
||||
# Previous Actions
|
||||
goto('http://localhost:8000')
|
||||
click("10")
|
||||
|
||||
|
||||
Here is an example with chain of thought of a valid action when clicking on a button:
|
||||
"
|
||||
In order to accomplish my goal I need to click on the button with bid 12
|
||||
|
||||
@ -112,7 +112,11 @@ def test_browse_url_action_serialization_deserialization():
|
||||
def test_browse_interactive_action_serialization_deserialization():
|
||||
original_action_dict = {
|
||||
'action': 'browse_interactive',
|
||||
'args': {'thought': '', 'browser_actions': 'goto("https://www.example.com")'},
|
||||
'args': {
|
||||
'thought': '',
|
||||
'browser_actions': 'goto("https://www.example.com")',
|
||||
'browsergym_send_msg_to_user': '',
|
||||
},
|
||||
}
|
||||
serialization_deserialization(original_action_dict, BrowseInteractiveAction)
|
||||
|
||||
|
||||