[feat] WebArena benchmark, MiniWoB++ benchmark and related arch changes (#2170)

* add webarena, and revamp messaging for webarena eval

* add changes for browsergym

* update infer script

* fix unit tests

* update

* add multiple run for miniwob

* update instruction, remove personal path

* update

* add code for getting final reward, fix integration, add results

* add avg cost calculation
Frank Xu 2024-06-05 21:01:20 -04:00 committed by GitHub
parent 99c6333e1a
commit 48151bdbb0
23 changed files with 951 additions and 54 deletions

.gitignore
View File

@ -209,3 +209,4 @@ evaluation/outputs
evaluation/evaluation_outputs
test_results*
/_test_files_tmp/
evaluation/webarena/scripts/webarena_env.sh

View File

@ -1,4 +1,5 @@
import ast
import os
from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.utils.obs import flatten_axtree_to_str
@ -12,6 +13,7 @@ from opendevin.events.action import (
BrowseInteractiveAction,
MessageAction,
)
from opendevin.events.event import EventSource
from opendevin.events.observation import BrowserOutputObservation
from opendevin.llm.llm import LLM
from opendevin.runtime.plugins import (
@ -19,21 +21,12 @@ from opendevin.runtime.plugins import (
)
from opendevin.runtime.tools import RuntimeTool
def parse_response(response: str) -> Action:
if '```' not in response:
# unexpected response format, message back to user
return MessageAction(response)
thought = response.split('```')[0].strip()
action_str = response.split('```')[1].strip()
# handle send message to user function call in BrowserGym
for sub_action in action_str.split('\n'):
if 'send_msg_to_user(' in sub_action:
tree = ast.parse(sub_action)
args = tree.body[0].value.args # type: ignore
return MessageAction(args[0].value)
return BrowseInteractiveAction(browser_actions=action_str, thought=thought)
USE_NAV = (
os.environ.get('USE_NAV', 'true') == 'true'
) # only disable NAV actions when running webarena and miniwob benchmarks
USE_CONCISE_ANSWER = (
os.environ.get('USE_CONCISE_ANSWER', 'false') == 'true'
) # only return concise answer when running webarena and miniwob benchmarks
class BrowsingAgent(Agent):
@ -56,13 +49,13 @@ class BrowsingAgent(Agent):
- llm (LLM): The llm to be used by this agent
"""
super().__init__(llm)
# define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
# see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
action_subsets = ['chat', 'bid']
if USE_NAV:
action_subsets.append('nav')
self.action_space = HighLevelActionSet(
# see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
subsets=[
'chat',
'bid',
'nav',
], # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
subsets=action_subsets,
strict=False, # less strict on the parsing of the actions
multiaction=True, # enable the agent to take multiple actions at once
)
@ -75,6 +68,32 @@ class BrowsingAgent(Agent):
"""
super().reset()
self.cost_accumulator = 0
self.error_accumulator = 0
def parse_response(self, response: str) -> Action:
if '```' not in response:
# unexpected response format, message back to user
action_str = f'send_msg_to_user("""{response}""")'
return BrowseInteractiveAction(
browser_actions=action_str,
thought=response,
browsergym_send_msg_to_user=response,
)
thought = response.split('```')[0].strip()
action_str = response.split('```')[1].strip()
# handle send message to user function call in BrowserGym
msg_content = ''
for sub_action in action_str.split('\n'):
if 'send_msg_to_user(' in sub_action:
tree = ast.parse(sub_action)
args = tree.body[0].value.args # type: ignore
msg_content = args[0].value
return BrowseInteractiveAction(
browser_actions=action_str,
thought=thought,
browsergym_send_msg_to_user=msg_content,
)
def step(self, state: State) -> Action:
"""
@ -91,26 +110,57 @@ class BrowsingAgent(Agent):
"""
goal = state.get_current_user_intent()
messages = []
prev_actions = ''
prev_actions = []
cur_axtree_txt = ''
error_prefix = ''
last_obs = None
last_action = None
if len(state.history) == 1:
# initialize and retrieve the first observation by issuing a noop action
# TODO: find a more elegant way of doing this
return BrowseInteractiveAction(browser_actions='noop()')
for prev_action, obs in state.history:
if isinstance(prev_action, BrowseInteractiveAction):
prev_actions += f'{prev_action.browser_actions}\n'
prev_actions.append(prev_action.browser_actions)
last_obs = obs
last_action = prev_action
elif (
isinstance(prev_action, MessageAction) and prev_action.source != 'user'
isinstance(prev_action, MessageAction)
and prev_action.source == EventSource.AGENT
):
# agent has responded, the task is finished.
return AgentFinishAction()
prev_action_str = '\n'.join(prev_actions[1:])
# if the final BrowseInteractiveAction executed BrowserGym's send_msg_to_user,
# we should also send a message back to the user in OpenDevin and call it a day
if (
isinstance(last_action, BrowseInteractiveAction)
and last_action.browsergym_send_msg_to_user
):
return MessageAction(last_action.browsergym_send_msg_to_user)
if isinstance(last_obs, BrowserOutputObservation):
if last_obs.error:
# add error recovery prompt prefix
error_prefix = f'IMPORTANT! Last action is incorrect:\n{last_obs.last_browser_action}\nThink again with the current observation of the page.\n'
cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)
try:
cur_axtree_txt = flatten_axtree_to_str(
last_obs.axtree_object,
extra_properties=last_obs.extra_element_properties,
with_clickable=True,
filter_visible_only=True,
)
except Exception as e:
logger.error(
'Error when trying to process the accessibility tree: %s', e
)
return MessageAction('Error encountered when browsing.')
if error_prefix:
self.error_accumulator += 1
if self.error_accumulator > 5:
return MessageAction('Too many errors encountered. Task failed.')
system_msg = f"""\
# Instructions
Review the current state of the page and all other information to find the best
@ -133,7 +183,7 @@ and executed by a program, make sure to follow the formatting instructions.
{cur_axtree_txt}
# Previous Actions
{prev_actions}
{prev_action_str}
Here is an example with chain of thought of a valid action when clicking on a button:
"
@ -141,16 +191,31 @@ In order to accomplish my goal I need to click on the button with bid 12
```click("12")```
"
""".strip()
if USE_CONCISE_ANSWER:
concise_instruction = """\
Here is another example with chain of thought of a valid action when providing a concise answer to the user:
"
In order to accomplish my goal I need to send the information asked for back to the user. This page lists the information of the HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I will send a message back to the user with the answer.
```send_msg_to_user("$279.49")```
"
"""
prompt += concise_instruction
messages.append({'role': 'user', 'content': prompt})
response = self.llm.completion(
messages=messages,
temperature=0.0,
stop=[')```', ')\n```'],
)
self.log_cost(response)
action_resp = response['choices'][0]['message']['content']
action_resp = response['choices'][0]['message']['content'].strip()
if not action_resp.endswith('```'):
action_resp = action_resp + ')```'
logger.info(prompt)
logger.info(action_resp)
return parse_response(action_resp)
return self.parse_response(action_resp)
def search_memory(self, query: str) -> list[str]:
raise NotImplementedError('Implement this abstract method')
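To make the new in-class `parse_response` concrete, here is a small self-contained sketch of the same parsing logic applied to a hypothetical LLM reply (the response text and price are illustrative, not taken from the diff):

```python
import ast

FENCE = chr(96) * 3  # a literal ``` , built indirectly so it can live inside this fenced example
response = (
    'The product page shows the price, so I will report it back to the user.\n'
    f'{FENCE}send_msg_to_user("$279.49"){FENCE}'
)

thought = response.split(FENCE)[0].strip()     # chain-of-thought before the action fence
action_str = response.split(FENCE)[1].strip()  # BrowserGym action string inside the fence

msg_content = ''
for sub_action in action_str.split('\n'):
    if 'send_msg_to_user(' in sub_action:
        # parse the call with ast to pull out the literal message argument
        tree = ast.parse(sub_action)
        msg_content = tree.body[0].value.args[0].value

print(thought)      # The product page shows the price, so I will report it back to the user.
print(action_str)   # send_msg_to_user("$279.49")
print(msg_content)  # $279.49
```

In the actual agent, these three values are then packed into `BrowseInteractiveAction(browser_actions=action_str, thought=thought, browsergym_send_msg_to_user=msg_content)`.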

View File

@ -0,0 +1,81 @@
# MiniWoB++ Evaluation with OpenDevin Browsing Agents
This folder contains the evaluation harness for the [MiniWoB++](https://miniwob.farama.org/) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym), which makes it easy to evaluate how well a browsing-capable agent performs on synthetic web browsing tasks.
## Setup OpenDevin Environment
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
sandbox_type = "ssh"
ssh_hostname = "localhost"
sandbox_timeout = 120
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
## Setup MiniWoB++ Environment and Environment Variables
MiniWoB++ requires a static copy of its website to be served at a URL that is accessible from the machine running the OpenDevin agents.
- Clone miniwob (use a specific frozen commit for reproducibility)
```sh
git clone git@github.com:Farama-Foundation/miniwob-plusplus.git
git -C "./miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838
```
- Set the MiniWoB URL (replace `PATH_TO_MINIWOB_CLONED_REPO` with the absolute path to your `miniwob-plusplus` folder) in `evaluation/miniwob/scripts/run_infer.sh`
```sh
export MINIWOB_URL="file://<PATH_TO_MINIWOB_CLONED_REPO>/miniwob/html/miniwob/"
```
## Test if your environment works
Open the MiniWoB URL above in a browser and check that the tasks load correctly.
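You can also do a quick end-to-end check from Python (a hedged sketch: replace the path placeholder with your clone, the task id follows the `browsergym/miniwob.<task>` naming used by the scripts in this folder, and Playwright browsers must be installed as part of the OpenDevin setup):

```python
import os

# Point MINIWOB_URL at your local clone before the miniwob tasks are set up;
# use the same value as in evaluation/miniwob/scripts/run_infer.sh.
os.environ.setdefault('MINIWOB_URL', 'file:///ABS/PATH/TO/miniwob-plusplus/miniwob/html/miniwob/')

import browsergym.miniwob  # noqa: E402,F401  registers miniwob tasks as gym environments
import gymnasium as gym    # noqa: E402

env = gym.make('browsergym/miniwob.click-test')  # any registered miniwob task id works
obs, info = env.reset()                          # launches a browser via Playwright (headless by default)
print(obs['goal'])                               # the task instruction, e.g. "Click the button."
env.close()
```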
## Run Evaluation
```sh
bash evaluation/miniwob/scripts/run_infer.sh
```
Results will be written to `evaluation/evaluation_outputs/outputs/miniwob/`.
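Each line of `output.jsonl` is one JSON record per task, with the fields written by `run_infer.py` (`instance_id`, `instruction`, `metadata`, `history`, `metrics`, `error`, `test_result`). A quick way to peek at one record (a sketch, using the same placeholder path as the command below):

```python
import json

# default output location; SOME_AGENT/EXP_NAME are placeholders as in the command below
path = 'evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl'
with open(path) as f:
    record = json.loads(f.readline())            # one JSON record per finished task

print(record['instance_id'])                     # e.g. browsergym/miniwob.click-test
print(record['test_result'])                     # best BrowserGym reward seen for this task
print(record['metrics']['accumulated_cost'])     # accumulated LLM cost for this task
```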
To calculate the average reward, run:
```sh
poetry run python evaluation/miniwob/get_success_rate.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl
```
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
## BrowsingAgent V1.0 result
Tested on BrowsingAgent V1.0.
MiniWoB++, 125 tasks (3 runs each, since tasks are randomly initialized), max 10 steps per task:
- GPT4o: 0.384, 0.416, 0.424, avg: 0.408
- GPT3.5: 0.288, 0.256, 0.272, avg: 0.272

View File

View File

@ -0,0 +1,33 @@
import argparse
import json
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
import gymnasium as gym
parser = argparse.ArgumentParser(description='Calculate average reward.')
parser.add_argument('output_path', type=str, help='path to output.jsonl')
args = parser.parse_args()
if __name__ == '__main__':
env_ids = [
id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
]
total_num = len(env_ids)
print('Total number of tasks: ', total_num)
total_reward = 0
total_cost = 0
actual_num = 0
with open(args.output_path, 'r') as f:
for line in f:
data = json.loads(line)
actual_num += 1
total_cost += data['metrics']['accumulated_cost']
total_reward += data['test_result']
avg_reward = total_reward / total_num  # unfinished tasks count as zero reward
print('Avg Reward: ', avg_reward)
avg_cost = total_cost / actual_num
print('Avg Cost: ', avg_cost)
print('Actual number of tasks finished: ', actual_num)

View File

@ -0,0 +1,214 @@
import asyncio
import json
import logging
import os
import pathlib
import subprocess
import time
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
import gymnasium as gym
from tqdm import tqdm
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.serialization.event import event_to_dict
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.tools import RuntimeTool
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
def process_instance(
env_id: str,
metadata: dict,
eval_output_dir: str,
docker_sandbox: DockerSSHBox,
reset_logger: bool = True,
):
# Set up the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(eval_output_dir, 'logs', f'instance_{env_id}.log')
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
else:
logger.info(f'Starting evaluation for instance {env_id}.')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime_tools_config = {
RuntimeTool.BROWSER: {
'browsergym_eval': env_id,
'browsergym_eval_save_dir': eval_output_dir,
}
}
state: State = asyncio.run(
main(
'PLACEHOLDER_GOAL',
runtime_tools_config=runtime_tools_config,
sandbox=docker_sandbox,
)
)
# ======= Attempt to evaluate the agent's environment impact =======
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
# read goal
with open(
os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
) as f:
instruction = f.read()
# read reward
with open(
os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
) as f:
rewards = json.load(f)
reward = max(rewards)
# Save the output
output = {
'instance_id': env_id,
'instruction': instruction,
'metadata': metadata,
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'metrics': metrics,
'error': state.error if state and state.error else None,
'test_result': reward,
}
return output
if __name__ == '__main__':
env_ids = [
id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
]
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
# for details of how to set `llm_config`
if args.llm_config:
specified_llm_config = get_llm_config_arg(args.llm_config)
if specified_llm_config:
config.llm = specified_llm_config
logger.info(f'Config for evaluation: {config}')
# TEST METADATA
agent_class = args.agent_cls
assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
model_name = config.llm.model.split('/')[-1]
max_iterations = args.max_iterations
eval_note = ''
if args.eval_note is not None:
eval_note += '_N_' + args.eval_note
eval_output_dir = os.path.join(
args.eval_output_dir,
'miniwob',
agent_class,
model_name + '_maxiter_' + str(max_iterations) + eval_note,
)
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
parents=True, exist_ok=True
)
logger.info(f'Using evaluation output directory: {eval_output_dir}')
metadata = {
'agent_class': agent_class,
'model_name': model_name,
'max_iterations': max_iterations,
'eval_output_dir': eval_output_dir,
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
# get the commit id of the current repo for reproducibility
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
.decode('utf-8')
.strip(),
}
logger.info(f'Metadata: {metadata}')
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
json.dump(metadata, f)
# LIMIT EVALUATION
eval_n_limit = args.eval_n_limit
if eval_n_limit:
env_ids = env_ids[:eval_n_limit]
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
# OUTPUT FILE
output_file = os.path.join(eval_output_dir, 'output.jsonl')
logger.info(f'Writing evaluation output to {output_file}')
finished_instance_ids = set()
if os.path.exists(output_file):
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
finished_instance_ids.add(data['instance_id'])
logger.warning(
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
)
output_fp = open(output_file, 'a')
logger.info(
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
)
# =============================================
# filter out finished instances
new_env_ids = []
for idx in env_ids:
if idx in finished_instance_ids:
logger.info(f'Skipping instance {idx} as it is already finished.')
continue
new_env_ids.append(idx)
env_ids = new_env_ids
logger.info(
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
)
# =============================================
docker_sandbox = DockerSSHBox()
for env_id in tqdm(env_ids):
try:
output = process_instance(
env_id=env_id,
metadata=metadata,
eval_output_dir=eval_output_dir,
docker_sandbox=docker_sandbox,
reset_logger=False,
)
output_fp.write(json.dumps(output) + '\n')
output_fp.flush()
except Exception as e:
logger.error(f'Error processing instance {env_id}: {e}')
output_fp.close()
logger.info('Evaluation finished.')

View File

@ -0,0 +1,44 @@
#!/bin/bash
# configure miniwob website, change URL to yours
export MINIWOB_URL="file:///home/fangzhex/miniwob-plusplus/miniwob/html/miniwob/"
# configure browsing agent
export USE_NAV="false"
export USE_CONCISE_ANSWER="true"
MODEL_CONFIG=$1
AGENT=$2
NOTE=$3
EVAL_LIMIT=$4
if [ -z "$AGENT" ]; then
echo "Agent not specified, use default BrowsingAgent"
AGENT="BrowsingAgent"
fi
# IMPORTANT: Because the agent prompt changes fairly often in the rapidly evolving OpenDevin codebase,
# we track the agent version in each evaluation run to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
COMMAND="poetry run python evaluation/miniwob/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--max-chars 10000000 \
--eval-note $EVAL_NOTE"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND

View File

@ -0,0 +1,91 @@
# WebArena Evaluation with OpenDevin Browsing Agents
This folder contains the evaluation harness for the [WebArena](https://github.com/web-arena-x/webarena) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym), which makes it easy to evaluate how well a browsing-capable agent performs on realistic web browsing tasks.
## Setup OpenDevin Environment
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:
```toml
[core]
max_iterations = 100
cache_dir = "/tmp/cache"
sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
sandbox_type = "ssh"
ssh_hostname = "localhost"
sandbox_timeout = 120
# TODO: Change these to the model you want to evaluate
[eval_gpt4_1106_preview]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0
[eval_some_openai_compatible_model]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
## Setup WebArena Environment
WebArena requires you to host websites with pre-populated content that are accessible via URL from the machine running the OpenDevin agents.
Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
Take note of the base URL of the machine where the environment is installed.
## Setup Environment Variables of WebArena Websites
Create a script `webarena_env.sh` under `evaluation/webarena/scripts` with the following:
```bash
export BASE_URL=<YOUR_SERVER_URL_HERE>
export SHOPPING="$BASE_URL:7770/"
export SHOPPING_ADMIN="$BASE_URL:7780/admin"
export REDDIT="$BASE_URL:9999"
export GITLAB="$BASE_URL:8023"
export WIKIPEDIA="$BASE_URL:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing"
export MAP="$BASE_URL:3000"
export HOMEPAGE="$BASE_URL:4399"
export OPENAI_API_KEY="yourkey" # this key is required for some WebArena validators that utilize LLMs
```
## Test if your environment works
Open the WebArena website URLs above in a browser and check that they load correctly.
If a website does not load, make sure the firewall allows public access to the aforementioned ports on your server, and check the network security policy if you are using an AWS machine.
Follow the WebArena environment setup guide carefully, and make sure the URL fields are populated with the correct base URL of your server.
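To check reachability from the evaluation machine itself, a small Python probe can help (a hedged sketch: it assumes `webarena_env.sh` has been sourced so the variables below are in the environment, and some services may legitimately answer with redirects or error codes, so treat the output as a hint rather than a strict pass/fail):

```python
import os
import urllib.error
import urllib.request

# assumes webarena_env.sh has been sourced in the current shell environment
for name in ['SHOPPING', 'SHOPPING_ADMIN', 'REDDIT', 'GITLAB', 'WIKIPEDIA', 'MAP', 'HOMEPAGE']:
    url = os.environ[name]
    try:
        status = urllib.request.urlopen(url, timeout=10).status
        print(f'{name}: {url} -> HTTP {status}')
    except (urllib.error.URLError, OSError) as e:
        print(f'{name}: {url} -> not reachable ({e})')
```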
## Run Evaluation
```sh
bash evaluation/webarena/scripts/run_infer.sh
```
Results will be written to `evaluation/evaluation_outputs/outputs/webarena/`.
To calculate the success rate, run:
```sh
poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_outputs/outputs/webarena/SOME_AGENT/EXP_NAME/output.jsonl
```
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
## BrowsingAgent V1.0 result
Tested on BrowsingAgent V1.0.
WebArena, 812 tasks (single run due to high cost; tasks are fixed), max 15 steps per task:
- GPT4o: 0.1478
- GPT3.5: 0.0517

View File

View File

@ -0,0 +1,33 @@
import argparse
import json
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
import gymnasium as gym
parser = argparse.ArgumentParser(description='Calculate success rate.')
parser.add_argument('output_path', type=str, help='path to output.jsonl')
args = parser.parse_args()
if __name__ == '__main__':
env_ids = [
id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
]
total_num = len(env_ids)
print('Total number of tasks: ', total_num)
total_reward = 0
total_cost = 0
actual_num = 0
with open(args.output_path, 'r') as f:
for line in f:
data = json.loads(line)
actual_num += 1
total_cost += data['metrics']['accumulated_cost']
total_reward += data['test_result']
avg_reward = total_reward / total_num  # unfinished tasks count as zero reward
print('Success Rate: ', avg_reward)
avg_cost = total_cost / actual_num
print('Avg Cost: ', avg_cost)
print('Actual number of tasks finished: ', actual_num)

View File

@ -0,0 +1,214 @@
import asyncio
import json
import logging
import os
import pathlib
import subprocess
import time
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
import gymnasium as gym
from tqdm import tqdm
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.serialization.event import event_to_dict
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.tools import RuntimeTool
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
def process_instance(
env_id: str,
metadata: dict,
eval_output_dir: str,
docker_sandbox: DockerSSHBox,
reset_logger: bool = True,
):
# Set up the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
# Set up logger
log_file = os.path.join(eval_output_dir, 'logs', f'instance_{env_id}.log')
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
# add back the console handler to print ONE line
logger.addHandler(get_console_handler())
logger.info(
f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
)
# Remove all existing handlers from logger
for handler in logger.handlers[:]:
logger.removeHandler(handler)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)
else:
logger.info(f'Starting evaluation for instance {env_id}.')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime_tools_config = {
RuntimeTool.BROWSER: {
'browsergym_eval': env_id,
'browsergym_eval_save_dir': eval_output_dir,
}
}
state: State = asyncio.run(
main(
'PLACEHOLDER_GOAL',
runtime_tools_config=runtime_tools_config,
sandbox=docker_sandbox,
)
)
# ======= Attempt to evaluate the agent's environment impact =======
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
# read goal
with open(
os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
) as f:
instruction = f.read()
# read reward
with open(
os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
) as f:
rewards = json.load(f)
reward = max(rewards)
# Save the output
output = {
'instance_id': env_id,
'instruction': instruction,
'metadata': metadata,
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'metrics': metrics,
'error': state.error if state and state.error else None,
'test_result': reward,
}
return output
if __name__ == '__main__':
env_ids = [
id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
]
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
# for details of how to set `llm_config`
if args.llm_config:
specified_llm_config = get_llm_config_arg(args.llm_config)
if specified_llm_config:
config.llm = specified_llm_config
logger.info(f'Config for evaluation: {config}')
# TEST METADATA
agent_class = args.agent_cls
assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
model_name = config.llm.model.split('/')[-1]
max_iterations = args.max_iterations
eval_note = ''
if args.eval_note is not None:
eval_note += '_N_' + args.eval_note
eval_output_dir = os.path.join(
args.eval_output_dir,
'webarena',
agent_class,
model_name + '_maxiter_' + str(max_iterations) + eval_note,
)
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
parents=True, exist_ok=True
)
logger.info(f'Using evaluation output directory: {eval_output_dir}')
metadata = {
'agent_class': agent_class,
'model_name': model_name,
'max_iterations': max_iterations,
'eval_output_dir': eval_output_dir,
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
# get the commit id of the current repo for reproducibility
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
.decode('utf-8')
.strip(),
}
logger.info(f'Metadata: {metadata}')
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
json.dump(metadata, f)
# LIMIT EVALUATION
eval_n_limit = args.eval_n_limit
if eval_n_limit:
env_ids = env_ids[:eval_n_limit]
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
# OUTPUT FILE
output_file = os.path.join(eval_output_dir, 'output.jsonl')
logger.info(f'Writing evaluation output to {output_file}')
finished_instance_ids = set()
if os.path.exists(output_file):
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
finished_instance_ids.add(data['instance_id'])
logger.warning(
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
)
output_fp = open(output_file, 'a')
logger.info(
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
)
# =============================================
# filter out finished instances
new_env_ids = []
for idx in env_ids:
if idx in finished_instance_ids:
logger.info(f'Skipping instance {idx} as it is already finished.')
continue
new_env_ids.append(idx)
env_ids = new_env_ids
logger.info(
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
)
# =============================================
docker_sandbox = DockerSSHBox()
for env_id in tqdm(env_ids):
try:
output = process_instance(
env_id=env_id,
metadata=metadata,
eval_output_dir=eval_output_dir,
docker_sandbox=docker_sandbox,
reset_logger=False,
)
output_fp.write(json.dumps(output) + '\n')
output_fp.flush()
except Exception as e:
logger.error(f'Error processing instance {env_id}: {e}')
output_fp.close()
logger.info('Evaluation finished.')

View File

@ -0,0 +1,42 @@
#!/bin/bash
# configure webarena websites and environment
source evaluation/webarena/scripts/webarena_env.sh
# configure browsing agent
export USE_NAV="false"
export USE_CONCISE_ANSWER="true"
MODEL_CONFIG=$1
AGENT=$2
EVAL_LIMIT=$3
if [ -z "$AGENT" ]; then
echo "Agent not specified, use default BrowsingAgent"
AGENT="BrowsingAgent"
fi
# IMPORTANT: Because the agent prompt changes fairly often in the rapidly evolving OpenDevin codebase,
# we track the agent version in each evaluation run to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="$AGENT_VERSION"
COMMAND="poetry run python evaluation/webarena/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 15 \
--max-chars 10000000 \
--eval-note $EVAL_NOTE"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND

View File

@ -1,4 +1,5 @@
import asyncio
import os
import sys
from typing import Callable, Optional, Type
@ -34,6 +35,7 @@ async def main(
exit_on_message: bool = False,
fake_user_response_fn: Optional[Callable[[Optional[State]], str]] = None,
sandbox: Optional[Sandbox] = None,
runtime_tools_config: Optional[dict] = None,
) -> Optional[State]:
"""Main coroutine to run the agent controller with task input flexibility.
It's only used when you launch the opendevin backend directly from the command line.
@ -92,7 +94,21 @@ async def main(
)
runtime = ServerRuntime(event_stream=event_stream, sandbox=sandbox)
runtime.init_sandbox_plugins(controller.agent.sandbox_plugins)
runtime.init_runtime_tools(controller.agent.runtime_tools, is_async=False)
runtime.init_runtime_tools(
controller.agent.runtime_tools,
is_async=False,
runtime_tools_config=runtime_tools_config,
)
# browser eval specific
# TODO: move to a better place
if runtime.browser and runtime.browser.eval_dir:
logger.info(f'Evaluation directory: {runtime.browser.eval_dir}')
with open(
os.path.join(runtime.browser.eval_dir, 'goal.txt'), 'r', encoding='utf-8'
) as f:
task = f.read()
logger.info(f'Dynamic Eval task: {task}')
await event_stream.add_event(MessageAction(content=task), EventSource.USER)

View File

@ -29,6 +29,7 @@ class BrowseURLAction(Action):
class BrowseInteractiveAction(Action):
browser_actions: str
thought: str = ''
browsergym_send_msg_to_user: str = ''
action: str = ActionType.BROWSE_INTERACTIVE
runnable: ClassVar[bool] = True
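For reference, the new field rides on the action itself; a minimal sketch (the message text is illustrative) of constructing such an action, mirroring what `BrowsingAgent.parse_response` now does:

```python
from opendevin.events.action import BrowseInteractiveAction

action = BrowseInteractiveAction(
    browser_actions='send_msg_to_user("$279.49")',  # BrowserGym action string to execute
    thought='I will report the price back to the user.',
    browsergym_send_msg_to_user='$279.49',          # message surfaced back to the OpenDevin user
)
```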

View File

@ -21,6 +21,9 @@ class BrowserOutputObservation(Observation):
active_page_index: int = -1
dom_object: dict = field(default_factory=dict, repr=False) # don't show in repr
axtree_object: dict = field(default_factory=dict, repr=False) # don't show in repr
extra_element_properties: dict = field(
default_factory=dict, repr=False
) # don't show in repr
last_browser_action: str = ''
last_browser_action_error: str = ''
focused_element_bid: str = ''

View File

@ -20,6 +20,7 @@ DELETE_FROM_MEMORY_EXTRAS = {
'last_browser_action',
'last_browser_action_error',
'focused_element_bid',
'extra_element_properties',
}

View File

@ -1,7 +1,9 @@
import atexit
import base64
import io
import json
import multiprocessing
import os
import threading
import time
import uuid
@ -18,15 +20,27 @@ from opendevin.core.logger import opendevin_logger as logger
class BrowserEnv:
def __init__(self, is_async: bool = True):
self.html_text_converter = html2text.HTML2Text()
# ignore links and images
self.html_text_converter.ignore_links = False
self.html_text_converter.ignore_images = True
# use alt text for images
self.html_text_converter.images_to_alt = True
# disable auto text wrapping
self.html_text_converter.body_width = 0
def __init__(
self,
is_async: bool = True,
browsergym_eval: str = '',
browsergym_eval_save_dir: str = '',
):
self.html_text_converter = self.get_html_text_converter()
self.eval_mode = False
self.eval_dir = ''
# EVAL only: browsergym_eval and browsergym_eval_save_dir must be provided for evaluation
self.browsergym_eval = browsergym_eval
self.browsergym_eval_save_dir = browsergym_eval_save_dir
if self.browsergym_eval:
assert (
self.browsergym_eval_save_dir
), 'browsergym_eval_save_dir must be provided for evaluation.'
self.eval_mode = True
self.eval_dir = os.path.join(
self.browsergym_eval_save_dir, self.browsergym_eval.split('/')[1]
)
os.makedirs(self.eval_dir, exist_ok=True)
# Initialize browser environment process
multiprocessing.set_start_method('spawn', force=True)
self.browser_side, self.agent_side = multiprocessing.Pipe()
@ -39,6 +53,17 @@ class BrowserEnv:
self.init_browser()
atexit.register(self.close)
def get_html_text_converter(self):
html_text_converter = html2text.HTML2Text()
# ignore links and images
html_text_converter.ignore_links = False
html_text_converter.ignore_images = True
# use alt text for images
html_text_converter.images_to_alt = True
# disable auto text wrapping
html_text_converter.body_width = 0
return html_text_converter
def init_browser(self):
logger.info('Starting browser env...')
self.process.start()
@ -47,14 +72,26 @@ class BrowserEnv:
raise BrowserInitException('Failed to start browser environment.')
def browser_process(self):
env = gym.make(
'browsergym/openended',
task_kwargs={'start_url': 'about:blank'},
wait_for_user_message=False,
headless=True,
disable_env_checker=True,
)
if self.eval_mode:
logger.info('Creating browser env for evaluation purpose.')
env = gym.make(self.browsergym_eval)
else:
env = gym.make(
'browsergym/openended',
task_kwargs={'start_url': 'about:blank', 'goal': 'PLACEHOLDER_GOAL'},
wait_for_user_message=False,
headless=True,
disable_env_checker=True,
)
obs, info = env.reset()
# EVAL only: save the goal into file for evaluation
if self.eval_mode:
rewards = [] # store rewards if in eval mode
logger.info(obs['goal'])
with open(
os.path.join(self.eval_dir, 'goal.txt'), 'w', encoding='utf-8'
) as f:
f.write(obs['goal'])
logger.info('Browser env started.')
while True:
try:
@ -70,6 +107,15 @@ class BrowserEnv:
continue
action = action_data['action']
obs, reward, terminated, truncated, info = env.step(action)
# EVAL only: save the rewards into file for evaluation
if self.eval_mode:
rewards.append(reward)
with open(
os.path.join(self.eval_dir, 'rewards.json'),
'w',
encoding='utf-8',
) as f:
f.write(json.dumps(rewards))
# add text content of the page
html_str = flatten_dom_to_str(obs['dom_object'])
obs['text_content'] = self.html_text_converter.handle(html_str)
@ -86,7 +132,7 @@ class BrowserEnv:
pass
return
def step(self, action_str: str, timeout: float = 10) -> dict:
def step(self, action_str: str, timeout: float = 30) -> dict:
unique_request_id = str(uuid.uuid4())
self.agent_side.send((unique_request_id, {'action': action_str}))
start_time = time.time()

View File

@ -1,5 +1,6 @@
import asyncio
from abc import abstractmethod
from typing import Any, Optional
from opendevin.core.config import config
from opendevin.core.exceptions import BrowserInitException
@ -91,12 +92,18 @@ class Runtime:
self.sandbox.init_plugins(plugins)
def init_runtime_tools(
self, runtime_tools: list[RuntimeTool], is_async: bool = True
self,
runtime_tools: list[RuntimeTool],
runtime_tools_config: Optional[dict[RuntimeTool, Any]] = None,
is_async: bool = True,
) -> None:
# if browser in runtime_tools, init it
if RuntimeTool.BROWSER in runtime_tools:
if runtime_tools_config is None:
runtime_tools_config = {}
browser_env_config = runtime_tools_config.get(RuntimeTool.BROWSER, {})
try:
self.browser = BrowserEnv(is_async)
self.browser = BrowserEnv(is_async=is_async, **browser_env_config)
except BrowserInitException:
logger.warn(
'Failed to start browser environment, web browsing functionality will not work'
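Tying the runtime change to the evaluation scripts: `run_infer.py` builds a per-tool config dict and passes it through `main(...)`, and the runtime unpacks the browser entry into the `BrowserEnv` constructor (a sketch assembled from the snippets in this diff; the task id and save directory are illustrative):

```python
from opendevin.runtime.tools import RuntimeTool

# built by evaluation/miniwob/run_infer.py (and the webarena equivalent)
runtime_tools_config = {
    RuntimeTool.BROWSER: {
        'browsergym_eval': 'browsergym/miniwob.click-test',  # task to evaluate
        'browsergym_eval_save_dir': 'evaluation/evaluation_outputs/outputs/miniwob',  # base dir; goal.txt / rewards.json go in a per-task subfolder
    }
}

# inside Runtime.init_runtime_tools:
#   browser_env_config = runtime_tools_config.get(RuntimeTool.BROWSER, {})
#   self.browser = BrowserEnv(is_async=is_async, **browser_env_config)
```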

View File

@ -30,6 +30,9 @@ async def browse(action, browser: BrowserEnv | None) -> BrowserOutputObservation
active_page_index=obs['active_page_index'], # index of the active page
dom_object=obs['dom_object'], # DOM object
axtree_object=obs['axtree_object'], # accessibility tree object
extra_element_properties=obs[
'extra_element_properties'
], # extra element properties
last_browser_action=obs['last_action'], # last browser env action performed
focused_element_bid=obs['focused_element_bid'], # focused element bid
screenshot=obs['screenshot'], # base64-encoded screenshot, png

View File

@ -114,7 +114,7 @@ Don't execute multiple actions at once if you need feedback from the page.
----------
# Current Accessibility Tree:
RootWebArea '', focused
# Previous Actions

View File

@ -118,12 +118,11 @@ RootWebArea 'The Ultimate Answer', focused
[8] heading 'The Ultimate Answer'
[9] paragraph ''
StaticText 'Click the button to reveal the answer to life, the universe, and everything.'
[10] button 'Click me'
[10] button 'Click me', clickable
# Previous Actions
goto('http://localhost:8000')
Here is an example with chain of thought of a valid action when clicking on a button:
"
In order to accomplish my goal I need to click on the button with bid 12

View File

@ -118,14 +118,13 @@ RootWebArea 'The Ultimate Answer', focused
[8] heading 'The Ultimate Answer'
[9] paragraph ''
StaticText 'Click the button to reveal the answer to life, the universe, and everything.'
[10] button 'Click me', focused
[10] button 'Click me', clickable, focused
StaticText 'The answer is OpenDevin is all you need!'
# Previous Actions
goto('http://localhost:8000')
click("10")
Here is an example with chain of thought of a valid action when clicking on a button:
"
In order to accomplish my goal I need to click on the button with bid 12

View File

@ -112,7 +112,11 @@ def test_browse_url_action_serialization_deserialization():
def test_browse_interactive_action_serialization_deserialization():
original_action_dict = {
'action': 'browse_interactive',
'args': {'thought': '', 'browser_actions': 'goto("https://www.example.com")'},
'args': {
'thought': '',
'browser_actions': 'goto("https://www.example.com")',
'browsergym_send_msg_to_user': '',
},
}
serialization_deserialization(original_action_dict, BrowseInteractiveAction)