[feat] Integrate BrowserGym (#1452)

* add a single-threaded server serving browsergym

* update poetry

* update browser page content

* add import to make sure browsergym environments are registered properly

* remove flask server, use multiprocess impl and Pipe

* fix

* refactor BrowserEnv

* update browser action and obs to include more complete info

* fix screenshot

* update poetry lock

* add playwright install to workflow

* update

* add better html to text conversion

* update for better text conversion to maintain parity with the current handling of browseurlaction

* update

* update poetry

* update multiprocessing mp

* fix multiprocessing

* update

* update github workflow

---------

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
This commit is contained in:
Frank Xu 2024-05-02 07:52:53 -04:00 committed by GitHub
parent 0d77f495e3
commit 836864fa88
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 877 additions and 285 deletions

View File

@ -15,6 +15,7 @@ jobs:
run: |
curl -sSL https://install.python-poetry.org | python3 -
poetry install --without evaluation
poetry run playwright install --with-deps chromium
wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
- name: Run tests
run: |

View File

@ -159,7 +159,7 @@ build-frontend:
# Start backend
start-backend:
@echo "$(YELLOW)Starting backend...$(RESET)"
@poetry run uvicorn opendevin.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude workspace/*
@poetry run uvicorn opendevin.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"
# Start frontend
start-frontend:

View File

@ -0,0 +1,20 @@
---
sidebar_label: browser_env
title: opendevin.browser.browser_env
---
## BrowserEnv Objects
```python
class BrowserEnv()
```
#### image\_to\_png\_base64\_url
```python
@staticmethod
def image_to_png_base64_url(image: np.ndarray | Image.Image)
```
Convert a numpy array or PIL image to a base64-encoded PNG string.

View File

@ -80,6 +80,13 @@
"label": "opendevin.action",
"type": "category"
},
{
"items": [
"python/opendevin/browser/browser_env"
],
"label": "opendevin.browser",
"type": "category"
},
{
"items": [
"python/opendevin/controller/agent_controller"

View File

@ -1,10 +1,7 @@
import base64
import os
from dataclasses import dataclass
from typing import TYPE_CHECKING
from playwright.async_api import async_playwright
from opendevin.observation import BrowserOutputObservation
from opendevin.schema import ActionType
@ -25,29 +22,21 @@ class BrowseURLAction(ExecutableAction):
if not asked_url.startswith('http'):
asked_url = os.path.abspath(os.curdir) + self.url
try:
async with async_playwright() as p:
browser = await p.chromium.launch()
page = await browser.new_page()
response = await page.goto(asked_url)
try:
# domcontentloaded: Wait for the DOMContentLoaded event to be fired.
# load: Wait for the load event to be fired.
# networkidle: Wait until there are no more network connections
await page.wait_for_load_state('networkidle', timeout=3000)
except TimeoutError:
pass
# content = await page.content()
inner_text = await page.evaluate('() => document.body.innerText')
screenshot_bytes = await page.screenshot(full_page=True)
await browser.close()
screenshot_base64 = base64.b64encode(screenshot_bytes).decode('utf-8')
return BrowserOutputObservation(
content=inner_text, # HTML content of the page
screenshot=screenshot_base64, # Base64-encoded screenshot
url=asked_url,
status_code=response.status if response else 0, # HTTP status code
)
# action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py
action_str = f'goto("{asked_url}")'
# obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
obs = controller.browser.step(action_str)
return BrowserOutputObservation(
content=obs['text_content'], # text content of the page
open_pages_urls=obs['open_pages_urls'], # list of open pages
active_page_index=obs['active_page_index'], # index of the active page
dom_object=obs['dom_object'], # DOM object
axtree_object=obs['axtree_object'], # accessibility tree object
last_browser_action=obs['last_action'], # last browser env action performed
focused_element_bid=obs['focused_element_bid'], # focused element bid
screenshot=obs['screenshot'], # base64-encoded screenshot, png
url=asked_url,
)
except Exception as e:
return BrowserOutputObservation(
content=str(e), screenshot='', error=True, url=asked_url

View File

View File

@ -0,0 +1,102 @@
import atexit
import base64
import io
import multiprocessing
import time
import uuid
import browsergym.core # noqa F401 (we register the openended task as a gym environment)
import gymnasium as gym
import html2text
import numpy as np
from browsergym.utils.obs import flatten_dom_to_str
from PIL import Image
from opendevin.logger import opendevin_logger as logger
class BrowserException(Exception):
    """Error reported by the browser environment for the last action."""
class BrowserEnv:
    """Run a BrowserGym environment in a child process and talk to it over a pipe.

    The parent (agent) side sends ``(request_id, {'action': ...})`` tuples and
    receives ``(request_id, observation_dict)`` tuples back. The special
    request id ``'SHUTDOWN'`` asks the child process to close the environment
    and exit.
    """

    # Seconds to wait for a graceful shutdown before the child is terminated.
    SHUTDOWN_TIMEOUT = 10.0

    def __init__(self):
        self.html_text_converter = html2text.HTML2Text()
        # ignore links and images
        self.html_text_converter.ignore_links = True
        self.html_text_converter.ignore_images = True
        # use alt text for images
        self.html_text_converter.images_to_alt = True
        # disable auto text wrapping
        self.html_text_converter.body_width = 0

        # Initialize browser environment process.
        # NOTE: this sets the start method for the whole interpreter.
        multiprocessing.set_start_method('spawn', force=True)
        self.browser_side, self.agent_side = multiprocessing.Pipe()
        self.process = multiprocessing.Process(target=self.browser_process)
        logger.info('Starting browser env...')
        self.process.start()
        # Make sure the child process is shut down when the interpreter exits.
        atexit.register(self.close)

    def browser_process(self):
        """Entry point of the child process: create the env and serve requests.

        The environment is always closed on the way out, whether the loop ends
        because of a shutdown request, a keyboard interrupt, or an error.
        """
        env = gym.make(
            'browsergym/openended',
            start_url='about:blank',
            wait_for_user_message=False,
            headless=True,
            disable_env_checker=True,
        )
        try:
            env.reset()
            logger.info('Browser env started.')
            self._serve_requests(env)
        except KeyboardInterrupt:
            logger.info('Browser env process interrupted by user.')
        finally:
            env.close()

    def _serve_requests(self, env):
        """Handle requests from the agent side until 'SHUTDOWN' is received."""
        while True:
            if not self.browser_side.poll(timeout=0.01):
                continue
            unique_request_id, action_data = self.browser_side.recv()
            # shutdown the browser environment (closed by the caller)
            if unique_request_id == 'SHUTDOWN':
                return
            action = action_data['action']
            obs, _reward, _terminated, _truncated, _info = env.step(action)
            # add text content of the page
            html_str = flatten_dom_to_str(obs['dom_object'])
            obs['text_content'] = self.html_text_converter.handle(html_str)
            # make observation serializable
            obs['screenshot'] = self.image_to_png_base64_url(obs['screenshot'])
            obs['active_page_index'] = obs['active_page_index'].item()
            obs['elapsed_time'] = obs['elapsed_time'].item()
            self.browser_side.send((unique_request_id, obs))

    def step(self, action_str: str, timeout: float = 10) -> dict:
        """Send one action to the browser process and wait for its observation.

        Args:
            action_str: The action string passed to the environment's ``step``.
            timeout: Maximum number of seconds to wait for the response.

        Returns:
            The observation dict sent back by the browser process.

        Raises:
            TimeoutError: If no matching response arrives within ``timeout``.
            BrowserException: If the observation has a truthy
                ``'last_action_error'`` entry.
        """
        unique_request_id = str(uuid.uuid4())
        self.agent_side.send((unique_request_id, {'action': action_str}))
        # Monotonic clock: immune to wall-clock adjustments while waiting.
        deadline = time.monotonic() + timeout
        while True:
            if time.monotonic() > deadline:
                raise TimeoutError('Browser environment took too long to respond.')
            if not self.agent_side.poll(timeout=0.01):
                continue
            response_id, obs = self.agent_side.recv()
            # Responses with another id (e.g. from a request that already
            # timed out) are discarded.
            if response_id != unique_request_id:
                continue
            if obs['last_action_error']:
                raise BrowserException(obs['last_action_error'])
            return obs

    def close(self):
        """Shut down the browser process; safe to call more than once.

        A graceful shutdown is requested first. If the process has not exited
        after ``SHUTDOWN_TIMEOUT`` seconds it is terminated, so that this
        method (which is also registered with ``atexit``) cannot block forever.
        """
        if not self.process.is_alive():
            return
        try:
            self.agent_side.send(('SHUTDOWN', None))
            self.process.join(timeout=self.SHUTDOWN_TIMEOUT)
        except OSError:
            logger.warning('Failed to send shutdown request to browser env.')
        if self.process.is_alive():
            logger.warning('Browser env did not shut down in time; terminating.')
            self.process.terminate()
            self.process.join()

    @staticmethod
    def image_to_png_base64_url(image: np.ndarray | Image.Image) -> str:
        """Encode an image as PNG and return it as a base64 string.

        Despite the method name, the returned value is the bare base64 payload
        without any ``data:`` URL prefix. Numpy arrays are converted to PIL
        images first, and 'RGBA'/'LA' images are converted to 'RGB'.
        """
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        if image.mode in ('RGBA', 'LA'):
            image = image.convert('RGB')
        buffered = io.BytesIO()
        image.save(buffered, format='PNG')
        return base64.b64encode(buffered.getvalue()).decode()

View File

@ -12,6 +12,7 @@ from opendevin.action import (
)
from opendevin.action.tasks import TaskStateChangedAction
from opendevin.agent import Agent
from opendevin.browser.browser_env import BrowserEnv
from opendevin.controller.action_manager import ActionManager
from opendevin.exceptions import (
AgentMalformedActionError,
@ -43,6 +44,7 @@ class AgentController:
max_iterations: int
action_manager: ActionManager
callbacks: List[Callable]
browser: BrowserEnv
delegate: 'AgentController | None' = None
state: State | None = None
@ -67,6 +69,9 @@ class AgentController:
self.callbacks = callbacks
# Initialize agent-required plugins for sandbox (if any)
self.action_manager.init_sandbox_plugins(agent.sandbox_plugins)
# Initialize browser environment
self.browser = BrowserEnv()
if isinstance(agent, CodeActAgent) and not isinstance(self.action_manager.sandbox, DockerSSHBox):
logger.warning('CodeActAgent requires DockerSSHBox as sandbox! Using other sandbox that are not stateful (LocalBox, DockerExecBox) will not work properly.')

View File

@ -1,4 +1,4 @@
from dataclasses import dataclass
from dataclasses import dataclass, field
from opendevin.schema import ObservationType
@ -16,6 +16,25 @@ class BrowserOutputObservation(Observation):
status_code: int = 200
error: bool = False
observation: str = ObservationType.BROWSE
# do not include in the memory
open_pages_urls: list = field(default_factory=list)
active_page_index: int = -1
dom_object: dict = field(default_factory=dict)
axtree_object: dict = field(default_factory=dict)
last_browser_action: str = ''
focused_element_bid: str = ''
def to_memory(self) -> dict:
memory_dict = super().to_memory()
# remove some fields from the memory, as currently they are too big for LLMs
# TODO: find a more elegant way to handle this
memory_dict['extras'].pop('dom_object', None)
memory_dict['extras'].pop('axtree_object', None)
memory_dict['extras'].pop('open_pages_urls', None)
memory_dict['extras'].pop('active_page_index', None)
memory_dict['extras'].pop('last_browser_action', None)
memory_dict['extras'].pop('focused_element_bid', None)
return memory_dict
@property
def message(self) -> str:

960
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -22,7 +22,8 @@ uvicorn = "*"
types-toml = "*"
numpy = "*"
json-repair = "*"
playwright = "*"
browsergym = "*" # integrate browsergym as the browsing interface
html2text = "*"
e2b = "^0.14.13"
pexpect = "*"
jinja2 = "^3.1.3"