[feat] WebArena benchmark, MiniWoB++ benchmark and related arch changes (#2170)

* add webarena, and revamp messaging for webarena eval

* add changes for browsergym

* update infer script

* fix unit tests

* update

* add multiple run for miniwob

* update instruction, remove personal path

* update

* add code for getting final reward, fix integration, add results

* add avg cost calculation
Frank Xu 2024-06-05 21:01:20 -04:00 committed by GitHub
parent 99c6333e1a
commit 48151bdbb0
23 changed files with 951 additions and 54 deletions

.gitignore
View File

@ -209,3 +209,4 @@ evaluation/outputs
evaluation/evaluation_outputs
test_results*
/_test_files_tmp/
evaluation/webarena/scripts/webarena_env.sh

View File

@ -1,4 +1,5 @@
import ast
import os
from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.utils.obs import flatten_axtree_to_str
@ -12,6 +13,7 @@ from opendevin.events.action import (
BrowseInteractiveAction,
MessageAction,
)
from opendevin.events.event import EventSource
from opendevin.events.observation import BrowserOutputObservation
from opendevin.llm.llm import LLM
from opendevin.runtime.plugins import (
@ -19,21 +21,12 @@ from opendevin.runtime.plugins import (
)
from opendevin.runtime.tools import RuntimeTool
def parse_response(response: str) -> Action:
if '```' not in response:
# unexpected response format, message back to user
return MessageAction(response)
thought = response.split('```')[0].strip()
action_str = response.split('```')[1].strip()
# handle send message to user function call in BrowserGym
for sub_action in action_str.split('\n'):
if 'send_msg_to_user(' in sub_action:
tree = ast.parse(sub_action)
args = tree.body[0].value.args # type: ignore
return MessageAction(args[0].value)
return BrowseInteractiveAction(browser_actions=action_str, thought=thought)
USE_NAV = (
os.environ.get('USE_NAV', 'true') == 'true'
) # only disable NAV actions when running webarena and miniwob benchmarks
USE_CONCISE_ANSWER = (
os.environ.get('USE_CONCISE_ANSWER', 'false') == 'true'
) # only return concise answer when running webarena and miniwob benchmarks
class BrowsingAgent(Agent):
@ -56,13 +49,13 @@ class BrowsingAgent(Agent):
- llm (LLM): The llm to be used by this agent
"""
super().__init__(llm)
# define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
# see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
action_subsets = ['chat', 'bid']
if USE_NAV:
action_subsets.append('nav')
self.action_space = HighLevelActionSet(
# see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
subsets=[
'chat',
'bid',
'nav',
], # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
subsets=action_subsets,
strict=False, # less strict on the parsing of the actions
multiaction=True, # enable the agent to take multiple actions at once
)
@ -75,6 +68,32 @@ class BrowsingAgent(Agent):
"""
super().reset()
self.cost_accumulator = 0
self.error_accumulator = 0
def parse_response(self, response: str) -> Action:
if '```' not in response:
# unexpected response format, message back to user
action_str = f'send_msg_to_user("""{response}""")'
return BrowseInteractiveAction(
browser_actions=action_str,
thought=response,
browsergym_send_msg_to_user=response,
)
thought = response.split('```')[0].strip()
action_str = response.split('```')[1].strip()
# handle send message to user function call in BrowserGym
msg_content = ''
for sub_action in action_str.split('\n'):
if 'send_msg_to_user(' in sub_action:
tree = ast.parse(sub_action)
args = tree.body[0].value.args # type: ignore
msg_content = args[0].value
return BrowseInteractiveAction(
browser_actions=action_str,
thought=thought,
browsergym_send_msg_to_user=msg_content,
)
def step(self, state: State) -> Action:
"""
@ -91,26 +110,57 @@ class BrowsingAgent(Agent):
"""
goal = state.get_current_user_intent()
messages = []
prev_actions = ''
prev_actions = []
cur_axtree_txt = ''
error_prefix = ''
last_obs = None
last_action = None
if len(state.history) == 1:
# initialize and retrieve the first observation by issuing a noop action
# TODO: find a more elegant way of doing this
return BrowseInteractiveAction(browser_actions='noop()')
for prev_action, obs in state.history:
if isinstance(prev_action, BrowseInteractiveAction):
prev_actions += f'{prev_action.browser_actions}\n'
prev_actions.append(prev_action.browser_actions)
last_obs = obs
last_action = prev_action
elif (
isinstance(prev_action, MessageAction) and prev_action.source != 'user'
isinstance(prev_action, MessageAction)
and prev_action.source == EventSource.AGENT
):
# agent has responded, the task is finished.
return AgentFinishAction()
prev_action_str = '\n'.join(prev_actions[1:])
# if the final BrowseInteractiveAction executed BrowserGym's send_msg_to_user,
# we should also send a message back to the user in OpenDevin and call it a day
if (
isinstance(last_action, BrowseInteractiveAction)
and last_action.browsergym_send_msg_to_user
):
return MessageAction(last_action.browsergym_send_msg_to_user)
if isinstance(last_obs, BrowserOutputObservation):
if last_obs.error:
# add error recovery prompt prefix
error_prefix = f'IMPORTANT! Last action is incorrect:\n{last_obs.last_browser_action}\nThink again with the current observation of the page.\n'
cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)
try:
cur_axtree_txt = flatten_axtree_to_str(
last_obs.axtree_object,
extra_properties=last_obs.extra_element_properties,
with_clickable=True,
filter_visible_only=True,
)
except Exception as e:
logger.error(
'Error when trying to process the accessibility tree: %s', e
)
return MessageAction('Error encountered when browsing.')
if error_prefix:
self.error_accumulator += 1
if self.error_accumulator > 5:
return MessageAction('Too many errors encountered. Task failed.')
system_msg = f"""\
# Instructions
Review the current state of the page and all other information to find the best
@ -133,7 +183,7 @@ and executed by a program, make sure to follow the formatting instructions.
{cur_axtree_txt}
# Previous Actions
{prev_actions}
{prev_action_str}
Here is an example with chain of thought of a valid action when clicking on a button:
"
@ -141,16 +191,31 @@ In order to accomplish my goal I need to click on the button with bid 12
```click("12")```
"
""".strip()
if USE_CONCISE_ANSWER:
concise_instruction = """\
Here is another example with chain of thought of a valid action when providing a concise answer to the user:
"
In order to accomplish my goal I need to send the information asked for back to the user. This page lists the information of the HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I will send a message back to the user with the answer.
```send_msg_to_user("$279.49")```
"
"""
prompt += concise_instruction
messages.append({'role': 'user', 'content': prompt})
response = self.llm.completion(
messages=messages,
temperature=0.0,
stop=[')```', ')\n```'],
)
self.log_cost(response)
action_resp = response['choices'][0]['message']['content']
action_resp = response['choices'][0]['message']['content'].strip()
if not action_resp.endswith('```'):
action_resp = action_resp + ')```'
logger.info(prompt)
logger.info(action_resp)
return parse_response(action_resp)
return self.parse_response(action_resp)
def search_memory(self, query: str) -> list[str]:
raise NotImplementedError('Implement this abstract method')
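To make the new in-class `parse_response` concrete, here is a small self-contained sketch of the same parsing logic applied to a hypothetical LLM reply (the response text and price are illustrative, not taken from the diff):

```python
import ast

FENCE = chr(96) * 3  # a literal ``` , built indirectly so it can live inside this fenced example
response = (
    'The product page shows the price, so I will report it back to the user.\n'
    f'{FENCE}send_msg_to_user("$279.49"){FENCE}'
)

thought = response.split(FENCE)[0].strip()     # chain-of-thought before the action fence
action_str = response.split(FENCE)[1].strip()  # BrowserGym action string inside the fence

msg_content = ''
for sub_action in action_str.split('\n'):
    if 'send_msg_to_user(' in sub_action:
        # parse the call with ast to pull out the literal message argument
        tree = ast.parse(sub_action)
        msg_content = tree.body[0].value.args[0].value

print(thought)      # The product page shows the price, so I will report it back to the user.
print(action_str)   # send_msg_to_user("$279.49")
print(msg_content)  # $279.49
```

In the actual agent, these three values are then packed into `BrowseInteractiveAction(browser_actions=action_str, thought=thought, browsergym_send_msg_to_user=msg_content)`.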

View File

@ -0,0 +1,81 @@
# MiniWoB++ Evaluation with OpenDevin Browsing Agents
This folder contains the evaluation harness for the [MiniWoB++](https://miniwob.farama.org/) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym), which makes it easy to evaluate how well a browsing-capable agent performs on synthetic web browsing tasks.
## Setup OpenDevin Environment
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
sandbox_type = "ssh"
ssh_hostname = "localhost"
sandbox_timeout = 120
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
## Setup MiniWoB++ Environment and Environment Variables
MiniWoB++ requires a static copy of its website to be served at a URL that is accessible from the machine running the OpenDevin agents.
- Clone miniwob (use a specific frozen commit for reproducibility)
```sh
git clone git@github.com:Farama-Foundation/miniwob-plusplus.git
git -C "./miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838
```
- Set the MiniWoB URL (replace `PATH_TO_MINIWOB_CLONED_REPO` with the absolute path to your `miniwob-plusplus` folder) in `evaluation/miniwob/scripts/run_infer.sh`
```sh
export MINIWOB_URL="file://<PATH_TO_MINIWOB_CLONED_REPO>/miniwob/html/miniwob/"
```
## Test if your environment works
Open the MiniWoB URL above in a browser and check that the tasks load correctly.
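You can also do a quick end-to-end check from Python (a hedged sketch: replace the path placeholder with your clone, the task id follows the `browsergym/miniwob.<task>` naming used by the scripts in this folder, and Playwright browsers must be installed as part of the OpenDevin setup):

```python
import os

# Point MINIWOB_URL at your local clone before the miniwob tasks are set up;
# use the same value as in evaluation/miniwob/scripts/run_infer.sh.
os.environ.setdefault('MINIWOB_URL', 'file:///ABS/PATH/TO/miniwob-plusplus/miniwob/html/miniwob/')

import browsergym.miniwob  # noqa: E402,F401  registers miniwob tasks as gym environments
import gymnasium as gym    # noqa: E402

env = gym.make('browsergym/miniwob.click-test')  # any registered miniwob task id works
obs, info = env.reset()                          # launches a browser via Playwright (headless by default)
print(obs['goal'])                               # the task instruction, e.g. "Click the button."
env.close()
```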
## Run Evaluation
```sh
bash evaluation/miniwob/scripts/run_infer.sh
```
Results will be written to `evaluation/evaluation_outputs/outputs/miniwob/`.
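Each line of `output.jsonl` is one JSON record per task, with the fields written by `run_infer.py` (`instance_id`, `instruction`, `metadata`, `history`, `metrics`, `error`, `test_result`). A quick way to peek at one record (a sketch, using the same placeholder path as the command below):

```python
import json

# default output location; SOME_AGENT/EXP_NAME are placeholders as in the command below
path = 'evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl'
with open(path) as f:
    record = json.loads(f.readline())            # one JSON record per finished task

print(record['instance_id'])                     # e.g. browsergym/miniwob.click-test
print(record['test_result'])                     # best BrowserGym reward seen for this task
print(record['metrics']['accumulated_cost'])     # accumulated LLM cost for this task
```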
To calculate the average reward, run:
```sh
poetry run python evaluation/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl
```
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
## BrowsingAgent V1.0 result
Tested on BrowsingAgent V1.0.
MiniWoB++, 125 tasks (3 runs each, since tasks are randomly initialized), max 10 steps per task:
- GPT4o: 0.384, 0.416, 0.424, avg: 0.408
- GPT3.5: 0.288, 0.256, 0.272, avg: 0.272

View File

View File

@ -0,0 +1,33 @@
import argparse
import json
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
import gymnasium as gym
parser = argparse.ArgumentParser(description='Calculate average reward.')
parser.add_argument('output_path', type=str, help='path to output.jsonl')
args = parser.parse_args()
if __name__ == '__main__':
env_ids = [
id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
]
total_num = len(env_ids)
print('Total number of tasks: ', total_num)
total_reward = 0
total_cost = 0
actual_num = 0
with open(args.output_path, 'r') as f:
for line in f:
data = json.loads(line)
actual_num += 1
total_cost += data['metrics']['accumulated_cost']
total_reward += data['test_result']
avg_reward = total_reward / total_num  # unfinished tasks count as zero reward
print('Avg Reward: ', avg_reward)
avg_cost = total_cost / actual_num
print('Avg Cost: ', avg_cost)
print('Actual number of tasks finished: ', actual_num)

View File

@ -0,0 +1,214 @@
import asyncio
import json
import logging
import os
import pathlib
import subprocess
import time
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
import gymnasium as gym
from tqdm import tqdm
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.serialization.event import event_to_dict
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.tools import RuntimeTool
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
def process_instance(
env_id: str,
metadata: dict,
eval_output_dir: str,
docker_sandbox: DockerSSHBox,
reset_logger: bool = True,
):
# Set up the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(eval_output_dir, 'logs', f'instance_{env_id}.log')
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
else:
logger.info(f'Starting evaluation for instance {env_id}.')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime_tools_config = {
RuntimeTool.BROWSER: {
'browsergym_eval': env_id,
'browsergym_eval_save_dir': eval_output_dir,
}
}
state: State = asyncio.run(
main(
'PLACEHOLDER_GOAL',
runtime_tools_config=runtime_tools_config,
sandbox=docker_sandbox,
)
)
# ======= Attempt to evaluate the agent's environment impact =======
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
# read goal
with open(
os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
) as f:
instruction = f.read()
# read reward
with open(
os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
) as f:
rewards = json.load(f)
reward = max(rewards)
# Save the output
output = {
'instance_id': env_id,
'instruction': instruction,
'metadata': metadata,
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'metrics': metrics,
'error': state.error if state and state.error else None,
'test_result': reward,
}
return output
if __name__ == '__main__':
env_ids = [
id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
]
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
# for details of how to set `llm_config`
if args.llm_config:
specified_llm_config = get_llm_config_arg(args.llm_config)
if specified_llm_config:
config.llm = specified_llm_config
logger.info(f'Config for evaluation: {config}')
# TEST METADATA
agent_class = args.agent_cls
assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
model_name = config.llm.model.split('/')[-1]
max_iterations = args.max_iterations
eval_note = ''
if args.eval_note is not None:
eval_note += '_N_' + args.eval_note
eval_output_dir = os.path.join(
args.eval_output_dir,
'miniwob',
agent_class,
model_name + '_maxiter_' + str(max_iterations) + eval_note,
)
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
parents=True, exist_ok=True
)
logger.info(f'Using evaluation output directory: {eval_output_dir}')
metadata = {
'agent_class': agent_class,
'model_name': model_name,
'max_iterations': max_iterations,
'eval_output_dir': eval_output_dir,
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
# get the commit id of the current repo for reproducibility
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
.decode('utf-8')
.strip(),
}
logger.info(f'Metadata: {metadata}')
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
json.dump(metadata, f)
# LIMIT EVALUATION
eval_n_limit = args.eval_n_limit
if eval_n_limit:
env_ids = env_ids[:eval_n_limit]
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
# OUTPUT FILE
output_file = os.path.join(eval_output_dir, 'output.jsonl')
logger.info(f'Writing evaluation output to {output_file}')
finished_instance_ids = set()
if os.path.exists(output_file):
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
finished_instance_ids.add(data['instance_id'])
logger.warning(
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
)
output_fp = open(output_file, 'a')
logger.info(
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
)
# =============================================
# filter out finished instances
new_env_ids = []
for idx in env_ids:
if idx in finished_instance_ids:
logger.info(f'Skipping instance {idx} as it is already finished.')
continue
new_env_ids.append(idx)
env_ids = new_env_ids
logger.info(
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
)
# =============================================
docker_sandbox = DockerSSHBox()
for env_id in tqdm(env_ids):
try:
output = process_instance(
env_id=env_id,
metadata=metadata,
eval_output_dir=eval_output_dir,
docker_sandbox=docker_sandbox,
reset_logger=False,
)
output_fp.write(json.dumps(output) + '\n')
output_fp.flush()
except Exception as e:
logger.error(f'Error processing instance {env_id}: {e}')
output_fp.close()
logger.info('Evaluation finished.')

View File

@ -0,0 +1,44 @@
#!/bin/bash
# configure miniwob website, change URL to yours
export MINIWOB_URL="file:///home/fangzhex/miniwob-plusplus/miniwob/html/miniwob/"
# configure browsing agent
export USE_NAV="false"
export USE_CONCISE_ANSWER="true"
MODEL_CONFIG=$1
AGENT=$2
NOTE=$3
EVAL_LIMIT=$4
if [ -z "$AGENT" ]; then
echo "Agent not specified, use default BrowsingAgent"
AGENT="BrowsingAgent"
fi
# IMPORTANT: Because the agent prompt changes fairly often in the rapidly evolving OpenDevin codebase,
# we track the agent version in each evaluation run to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
COMMAND="poetry run python evaluation/miniwob/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--max-chars 10000000 \
--eval-note $EVAL_NOTE"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND

View File

@ -0,0 +1,91 @@
# WebArena Evaluation with OpenDevin Browsing Agents
This folder contains the evaluation harness for the [WebArena](https://github.com/web-arena-x/webarena) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym), which makes it easy to evaluate how well a browsing-capable agent performs on realistic web browsing tasks.
## Setup OpenDevin Environment
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
sandbox_type = "ssh"
ssh_hostname = "localhost"
sandbox_timeout = 120
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
## Setup WebArena Environment
WebArena requires you to host websites with pre-populated content that are accessible via URL from the machine running the OpenDevin agents.
Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
Take note of the base URL of the machine where the environment is installed.
## Setup Environment Variables of WebArena Websites
Create a script `webarena_env.sh` under `evaluation/webarena/scripts` with the following:
```bash
export BASE_URL=<YOUR_SERVER_URL_HERE>
export SHOPPING="$BASE_URL:7770/"
export SHOPPING_ADMIN="$BASE_URL:7780/admin"
export REDDIT="$BASE_URL:9999"
export GITLAB="$BASE_URL:8023"
export WIKIPEDIA="$BASE_URL:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing"
export MAP="$BASE_URL:3000"
export HOMEPAGE="$BASE_URL:4399"
export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs
```
## Test if your environment works
Open the WebArena website URLs above in a browser and check that they load correctly.
If a website does not load, make sure the firewall allows public access to the aforementioned ports on your server, and check the network security policy if you are using an AWS machine.
Follow the WebArena environment setup guide carefully, and make sure the URL fields are populated with the correct base URL of your server.
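To check reachability from the evaluation machine itself, a small Python probe can help (a hedged sketch: it assumes `webarena_env.sh` has been sourced so the variables below are in the environment, and some services may legitimately answer with redirects or error codes, so treat the output as a hint rather than a strict pass/fail):

```python
import os
import urllib.error
import urllib.request

# assumes webarena_env.sh has been sourced in the current shell environment
for name in ['SHOPPING', 'SHOPPING_ADMIN', 'REDDIT', 'GITLAB', 'WIKIPEDIA', 'MAP', 'HOMEPAGE']:
    url = os.environ[name]
    try:
        status = urllib.request.urlopen(url, timeout=10).status
        print(f'{name}: {url} -> HTTP {status}')
    except (urllib.error.URLError, OSError) as e:
        print(f'{name}: {url} -> not reachable ({e})')
```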
## Run Evaluation
```sh
bash evaluation/webarena/scripts/run_infer.sh
```
Results will be written to `evaluation/evaluation_outputs/outputs/webarena/`.
To calculate the success rate, run:
```sh
poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
```
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
## BrowsingAgent V1.0 result
Tested on BrowsingAgent V1.0.
WebArena, 812 tasks (single run due to high cost; tasks are fixed), max 15 steps per task:
- GPT4o: 0.1478
- GPT3.5: 0.0517

View File

View File

@ -0,0 +1,33 @@
import argparse
import json
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
import gymnasium as gym
parser = argparse.ArgumentParser(description='Calculate success rate.')
parser.add_argument('output_path', type=str, help='path to output.jsonl')
args = parser.parse_args()
if __name__ == '__main__':
env_ids = [
id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
]
total_num = len(env_ids)
print('Total number of tasks: ', total_num)
total_reward = 0
total_cost = 0
actual_num = 0
with open(args.output_path, 'r') as f:
for line in f:
data = json.loads(line)
actual_num += 1
total_cost += data['metrics']['accumulated_cost']
total_reward += data['test_result']
avg_reward = total_reward / total_num  # unfinished tasks count as zero reward
print('Success Rate: ', avg_reward)
avg_cost = total_cost / actual_num
print('Avg Cost: ', avg_cost)
print('Actual number of tasks finished: ', actual_num)

View File

@ -0,0 +1,214 @@
import asyncio
import json
import logging
import os
import pathlib
import subprocess
import time
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
import gymnasium as gym
from tqdm import tqdm
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.serialization.event import event_to_dict
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.tools import RuntimeTool
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
def process_instance(
env_id: str,
metadata: dict,
eval_output_dir: str,
docker_sandbox: DockerSSHBox,
reset_logger: bool = True,
):
# Set up the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(eval_output_dir, 'logs', f'instance_{env_id}.log')
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
else:
logger.info(f'Starting evaluation for instance {env_id}.')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime_tools_config = {
RuntimeTool.BROWSER: {
'browsergym_eval': env_id,
'browsergym_eval_save_dir': eval_output_dir,
}
}
state: State = asyncio.run(
main(
'PLACEHOLDER_GOAL',
runtime_tools_config=runtime_tools_config,
sandbox=docker_sandbox,
)
)
# ======= Attempt to evaluate the agent's environment impact =======
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
# read goal
with open(
os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
) as f:
instruction = f.read()
# read reward
with open(
os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
) as f:
rewards = json.load(f)
reward = max(rewards)
# Save the output
output = {
'instance_id': env_id,
'instruction': instruction,
'metadata': metadata,
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'metrics': metrics,
'error': state.error if state and state.error else None,
'test_result': reward,
}
return output
if __name__ == '__main__':
env_ids = [
id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
]
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
# for details of how to set `llm_config`
if args.llm_config:
specified_llm_config = get_llm_config_arg(args.llm_config)
if specified_llm_config:
config.llm = specified_llm_config
logger.info(f'Config for evaluation: {config}')
# TEST METADATA
agent_class = args.agent_cls
assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
model_name = config.llm.model.split('/')[-1]
max_iterations = args.max_iterations
eval_note = ''
if args.eval_note is not None:
eval_note += '_N_' + args.eval_note
eval_output_dir = os.path.join(
args.eval_output_dir,
'webarena',
agent_class,
model_name + '_maxiter_' + str(max_iterations) + eval_note,
)
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
parents=True, exist_ok=True
)
logger.info(f'Using evaluation output directory: {eval_output_dir}')
metadata = {
'agent_class': agent_class,
'model_name': model_name,
'max_iterations': max_iterations,
'eval_output_dir': eval_output_dir,
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
# get the commit id of the current repo for reproducibility
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
.decode('utf-8')
.strip(),
}
logger.info(f'Metadata: {metadata}')
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
json.dump(metadata, f)
# LIMIT EVALUATION
eval_n_limit = args.eval_n_limit
if eval_n_limit:
env_ids = env_ids[:eval_n_limit]
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
# OUTPUT FILE
output_file = os.path.join(eval_output_dir, 'output.jsonl')
logger.info(f'Writing evaluation output to {output_file}')
finished_instance_ids = set()
if os.path.exists(output_file):
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
finished_instance_ids.add(data['instance_id'])
logger.warning(
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
)
output_fp = open(output_file, 'a')
logger.info(
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
)
# =============================================
# filter out finished instances
new_env_ids = []
for idx in env_ids:
if idx in finished_instance_ids:
logger.info(f'Skipping instance {idx} as it is already finished.')
continue
new_env_ids.append(idx)
env_ids = new_env_ids
logger.info(
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
)
# =============================================
docker_sandbox = DockerSSHBox()
for env_id in tqdm(env_ids):
try:
output = process_instance(
env_id=env_id,
metadata=metadata,
eval_output_dir=eval_output_dir,
docker_sandbox=docker_sandbox,
reset_logger=False,
)
output_fp.write(json.dumps(output) + '\n')
output_fp.flush()
except Exception as e:
logger.error(f'Error processing instance {env_id}: {e}')
output_fp.close()
logger.info('Evaluation finished.')

View File

@ -0,0 +1,42 @@
#!/bin/bash
# configure webarena websites and environment
source evaluation/webarena/scripts/webarena_env.sh
# configure browsing agent
export USE_NAV="false"
export USE_CONCISE_ANSWER="true"
MODEL_CONFIG=$1
AGENT=$2
EVAL_LIMIT=$3
if [ -z "$AGENT" ]; then
echo "Agent not specified, use default BrowsingAgent"
AGENT="BrowsingAgent"
fi
# IMPORTANT: Because the agent prompt changes fairly often in the rapidly evolving OpenDevin codebase,
# we track the agent version in each evaluation run to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="$AGENT_VERSION"
COMMAND="poetry run python evaluation/webarena/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 15 \
--max-chars 10000000 \
--eval-note $EVAL_NOTE"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND

View File

@ -1,4 +1,5 @@
import asyncio
import os
import sys
from typing import Callable, Optional, Type
@ -34,6 +35,7 @@ async def main(
exit_on_message: bool = False,
fake_user_response_fn: Optional[Callable[[Optional[State]], str]] = None,
sandbox: Optional[Sandbox] = None,
runtime_tools_config: Optional[dict] = None,
) -> Optional[State]:
"""Main coroutine to run the agent controller with task input flexibility.
It's only used when you launch the opendevin backend directly from the command line.
@ -92,7 +94,21 @@ async def main(
)
runtime = ServerRuntime(event_stream=event_stream, sandbox=sandbox)
runtime.init_sandbox_plugins(controller.agent.sandbox_plugins)
runtime.init_runtime_tools(controller.agent.runtime_tools, is_async=False)
runtime.init_runtime_tools(
controller.agent.runtime_tools,
is_async=False,
runtime_tools_config=runtime_tools_config,
)
# browser eval specific
# TODO: move to a better place
if runtime.browser and runtime.browser.eval_dir:
logger.info(f'Evaluation directory: {runtime.browser.eval_dir}')
with open(
os.path.join(runtime.browser.eval_dir, 'goal.txt'), 'r', encoding='utf-8'
) as f:
task = f.read()
logger.info(f'Dynamic Eval task: {task}')
await event_stream.add_event(MessageAction(content=task), EventSource.USER)

View File

@ -29,6 +29,7 @@ class BrowseURLAction(Action):
class BrowseInteractiveAction(Action):
browser_actions: str
thought: str = ''
browsergym_send_msg_to_user: str = ''
action: str = ActionType.BROWSE_INTERACTIVE
runnable: ClassVar[bool] = True
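For reference, the new field rides on the action itself; a minimal sketch (the message text is illustrative) of constructing such an action, mirroring what `BrowsingAgent.parse_response` now does:

```python
from opendevin.events.action import BrowseInteractiveAction

action = BrowseInteractiveAction(
    browser_actions='send_msg_to_user("$279.49")',  # BrowserGym action string to execute
    thought='I will report the price back to the user.',
    browsergym_send_msg_to_user='$279.49',          # message surfaced back to the OpenDevin user
)
```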

View File

@ -21,6 +21,9 @@ class BrowserOutputObservation(Observation):
active_page_index: int = -1
dom_object: dict = field(default_factory=dict, repr=False) # don't show in repr
axtree_object: dict = field(default_factory=dict, repr=False) # don't show in repr
extra_element_properties: dict = field(
default_factory=dict, repr=False
) # don't show in repr
last_browser_action: str = ''
last_browser_action_error: str = ''
focused_element_bid: str = ''

View File

@ -20,6 +20,7 @@ DELETE_FROM_MEMORY_EXTRAS = {
'last_browser_action',
'last_browser_action_error',
'focused_element_bid',
'extra_element_properties',
}

View File

@ -1,7 +1,9 @@
import atexit
import base64
import io
import json
import multiprocessing
import os
import threading
import time
import uuid
@ -18,15 +20,27 @@ from opendevin.core.logger import opendevin_logger as logger
class BrowserEnv:
def __init__(self, is_async: bool = True):
self.html_text_converter = html2text.HTML2Text()
# ignore links and images
self.html_text_converter.ignore_links = False
self.html_text_converter.ignore_images = True
# use alt text for images
self.html_text_converter.images_to_alt = True
# disable auto text wrapping
self.html_text_converter.body_width = 0
def __init__(
self,
is_async: bool = True,
browsergym_eval: str = '',
browsergym_eval_save_dir: str = '',
):
self.html_text_converter = self.get_html_text_converter()
self.eval_mode = False
self.eval_dir = ''
# EVAL only: browsergym_eval and browsergym_eval_save_dir must be provided for evaluation
self.browsergym_eval = browsergym_eval
self.browsergym_eval_save_dir = browsergym_eval_save_dir
if self.browsergym_eval:
assert (
self.browsergym_eval_save_dir
), 'browsergym_eval_save_dir must be provided for evaluation.'
self.eval_mode = True
self.eval_dir = os.path.join(
self.browsergym_eval_save_dir, self.browsergym_eval.split('/')[1]
)
os.makedirs(self.eval_dir, exist_ok=True)
# Initialize browser environment process
multiprocessing.set_start_method('spawn', force=True)
self.browser_side, self.agent_side = multiprocessing.Pipe()
@ -39,6 +53,17 @@ class BrowserEnv:
self.init_browser()
atexit.register(self.close)
def get_html_text_converter(self):
html_text_converter = html2text.HTML2Text()
# ignore links and images
html_text_converter.ignore_links = False
html_text_converter.ignore_images = True
# use alt text for images
html_text_converter.images_to_alt = True
# disable auto text wrapping
html_text_converter.body_width = 0
return html_text_converter
def init_browser(self):
logger.info('Starting browser env...')
self.process.start()
@ -47,14 +72,26 @@ class BrowserEnv:
raise BrowserInitException('Failed to start browser environment.')
def browser_process(self):
env = gym.make(
'browsergym/openended',
task_kwargs={'start_url': 'about:blank'},
wait_for_user_message=False,
headless=True,
disable_env_checker=True,
)
if self.eval_mode:
logger.info('Creating browser env for evaluation purpose.')
env = gym.make(self.browsergym_eval)
else:
env = gym.make(
'browsergym/openended',
task_kwargs={'start_url': 'about:blank', 'goal': 'PLACEHOLDER_GOAL'},
wait_for_user_message=False,
headless=True,
disable_env_checker=True,
)
obs, info = env.reset()
# EVAL only: save the goal into file for evaluation
if self.eval_mode:
rewards = [] # store rewards if in eval mode
logger.info(obs['goal'])
with open(
os.path.join(self.eval_dir, 'goal.txt'), 'w', encoding='utf-8'
) as f:
f.write(obs['goal'])
logger.info('Browser env started.')
while True:
try:
@ -70,6 +107,15 @@ class BrowserEnv:
continue
action = action_data['action']
obs, reward, terminated, truncated, info = env.step(action)
# EVAL only: save the rewards into file for evaluation
if self.eval_mode:
rewards.append(reward)
with open(
os.path.join(self.eval_dir, 'rewards.json'),
'w',
encoding='utf-8',
) as f:
f.write(json.dumps(rewards))
# add text content of the page
html_str = flatten_dom_to_str(obs['dom_object'])
obs['text_content'] = self.html_text_converter.handle(html_str)
@ -86,7 +132,7 @@ class BrowserEnv:
pass
return
def step(self, action_str: str, timeout: float = 10) -> dict:
def step(self, action_str: str, timeout: float = 30) -> dict:
unique_request_id = str(uuid.uuid4())
self.agent_side.send((unique_request_id, {'action': action_str}))
start_time = time.time()

View File

@ -1,5 +1,6 @@
import asyncio
from abc import abstractmethod
from typing import Any, Optional
from opendevin.core.config import config
from opendevin.core.exceptions import BrowserInitException
@ -91,12 +92,18 @@ class Runtime:
self.sandbox.init_plugins(plugins)
def init_runtime_tools(
self, runtime_tools: list[RuntimeTool], is_async: bool = True
self,
runtime_tools: list[RuntimeTool],
runtime_tools_config: Optional[dict[RuntimeTool, Any]] = None,
is_async: bool = True,
) -> None:
# if browser in runtime_tools, init it
if RuntimeTool.BROWSER in runtime_tools:
if runtime_tools_config is None:
runtime_tools_config = {}
browser_env_config = runtime_tools_config.get(RuntimeTool.BROWSER, {})
try:
self.browser = BrowserEnv(is_async)
self.browser = BrowserEnv(is_async=is_async, **browser_env_config)
except BrowserInitException:
logger.warn(
'Failed to start browser environment, web browsing functionality will not work'
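Tying the runtime change to the evaluation scripts: `run_infer.py` builds a per-tool config dict and passes it through `main(...)`, and the runtime unpacks the browser entry into the `BrowserEnv` constructor (a sketch assembled from the snippets in this diff; the task id and save directory are illustrative):

```python
from opendevin.runtime.tools import RuntimeTool

# built by evaluation/miniwob/run_infer.py (and the webarena equivalent)
runtime_tools_config = {
    RuntimeTool.BROWSER: {
        'browsergym_eval': 'browsergym/miniwob.click-test',  # task to evaluate
        'browsergym_eval_save_dir': 'evaluation/evaluation_outputs/outputs/miniwob',  # base dir; goal.txt / rewards.json go in a per-task subfolder
    }
}

# inside Runtime.init_runtime_tools:
#   browser_env_config = runtime_tools_config.get(RuntimeTool.BROWSER, {})
#   self.browser = BrowserEnv(is_async=is_async, **browser_env_config)
```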

View File

@ -30,6 +30,9 @@ async def browse(action, browser: BrowserEnv | None) -> BrowserOutputObservation
active_page_index=obs['active_page_index'], # index of the active page
dom_object=obs['dom_object'], # DOM object
axtree_object=obs['axtree_object'], # accessibility tree object
extra_element_properties=obs[
'extra_element_properties'
], # extra element properties
last_browser_action=obs['last_action'], # last browser env action performed
focused_element_bid=obs['focused_element_bid'], # focused element bid
screenshot=obs['screenshot'], # base64-encoded screenshot, png

View File

@ -114,7 +114,7 @@ Don't execute multiple actions at once if you need feedback from the page.
----------
# Current Accessibility Tree:
RootWebArea '', focused
# Previous Actions

View File

@ -118,12 +118,11 @@ RootWebArea 'The Ultimate Answer', focused
[8] heading 'The Ultimate Answer'
[9] paragraph ''
StaticText 'Click the button to reveal the answer to life, the universe, and everything.'
[10] button 'Click me'
[10] button 'Click me', clickable
# Previous Actions
goto('http://localhost:8000')
Here is an example with chain of thought of a valid action when clicking on a button:
"
In order to accomplish my goal I need to click on the button with bid 12

View File

@ -118,14 +118,13 @@ RootWebArea 'The Ultimate Answer', focused
[8] heading 'The Ultimate Answer'
[9] paragraph ''
StaticText 'Click the button to reveal the answer to life, the universe, and everything.'
[10] button 'Click me', focused
[10] button 'Click me', clickable, focused
StaticText 'The answer is OpenDevin is all you need!'
# Previous Actions
goto('http://localhost:8000')
click("10")
Here is an example with chain of thought of a valid action when clicking on a button:
"
In order to accomplish my goal I need to click on the button with bid 12

View File

@ -112,7 +112,11 @@ def test_browse_url_action_serialization_deserialization():
def test_browse_interactive_action_serialization_deserialization():
original_action_dict = {
'action': 'browse_interactive',
'args': {'thought': '', 'browser_actions': 'goto("https://www.example.com")'},
'args': {
'thought': '',
'browser_actions': 'goto("https://www.example.com")',
'browsergym_send_msg_to_user': '',
},
}
serialization_deserialization(original_action_dict, BrowseInteractiveAction)