mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
[feat] Integrate BrowserGym (#1452)
* add a single-threaded server serving browsergym * update poetry * update browser page content * add import to make sure browsergym environments are registered properly * remove flask server, use multiprocess impl and Pipe * fix * refactor BrowserEnv * update browser action and obs to include more complete info * fix screenshot * update poetry lock * add playwright install to workflow * update * add better html to text conversion * update for better text conversion to maintain parity with the current handling of browseurlaction * update * update poetry * update multiprocessing mp * fix multiprocessing * update * update github workflow --------- Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
This commit is contained in:
parent
0d77f495e3
commit
836864fa88
1
.github/workflows/dummy-agent-test.yml
vendored
1
.github/workflows/dummy-agent-test.yml
vendored
@ -15,6 +15,7 @@ jobs:
|
||||
run: |
|
||||
curl -sSL https://install.python-poetry.org | python3 -
|
||||
poetry install --without evaluation
|
||||
poetry run playwright install --with-deps chromium
|
||||
wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
|
||||
- name: Run tests
|
||||
run: |
|
||||
|
||||
2
Makefile
2
Makefile
@ -159,7 +159,7 @@ build-frontend:
|
||||
# Start backend
|
||||
start-backend:
|
||||
@echo "$(YELLOW)Starting backend...$(RESET)"
|
||||
@poetry run uvicorn opendevin.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude workspace/*
|
||||
@poetry run uvicorn opendevin.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"
|
||||
|
||||
# Start frontend
|
||||
start-frontend:
|
||||
|
||||
20
docs/modules/python/opendevin/browser/browser_env.md
Normal file
20
docs/modules/python/opendevin/browser/browser_env.md
Normal file
@ -0,0 +1,20 @@
|
||||
---
|
||||
sidebar_label: browser_env
|
||||
title: opendevin.browser.browser_env
|
||||
---
|
||||
|
||||
## BrowserEnv Objects
|
||||
|
||||
```python
|
||||
class BrowserEnv()
|
||||
```
|
||||
|
||||
#### image\_to\_png\_base64\_url
|
||||
|
||||
```python
|
||||
@staticmethod
|
||||
def image_to_png_base64_url(image: np.ndarray | Image.Image)
|
||||
```
|
||||
|
||||
Convert a numpy array or PIL image to a base64-encoded PNG string (no `data:` URL prefix).
|
||||
|
||||
@ -80,6 +80,13 @@
|
||||
"label": "opendevin.action",
|
||||
"type": "category"
|
||||
},
|
||||
{
|
||||
"items": [
|
||||
"python/opendevin/browser/browser_env"
|
||||
],
|
||||
"label": "opendevin.browser",
|
||||
"type": "category"
|
||||
},
|
||||
{
|
||||
"items": [
|
||||
"python/opendevin/controller/agent_controller"
|
||||
|
||||
@ -1,10 +1,7 @@
|
||||
import base64
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
from opendevin.observation import BrowserOutputObservation
|
||||
from opendevin.schema import ActionType
|
||||
|
||||
@ -25,29 +22,21 @@ class BrowseURLAction(ExecutableAction):
|
||||
if not asked_url.startswith('http'):
|
||||
asked_url = os.path.abspath(os.curdir) + self.url
|
||||
try:
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch()
|
||||
page = await browser.new_page()
|
||||
response = await page.goto(asked_url)
|
||||
try:
|
||||
# domcontentloaded: Wait for the DOMContentLoaded event to be fired.
|
||||
# load: Wait for the load event to be fired.
|
||||
# networkidle: Wait until there are no more network connections
|
||||
await page.wait_for_load_state('networkidle', timeout=3000)
|
||||
except TimeoutError:
|
||||
pass
|
||||
# content = await page.content()
|
||||
inner_text = await page.evaluate('() => document.body.innerText')
|
||||
screenshot_bytes = await page.screenshot(full_page=True)
|
||||
await browser.close()
|
||||
|
||||
screenshot_base64 = base64.b64encode(screenshot_bytes).decode('utf-8')
|
||||
return BrowserOutputObservation(
|
||||
content=inner_text, # HTML content of the page
|
||||
screenshot=screenshot_base64, # Base64-encoded screenshot
|
||||
url=asked_url,
|
||||
status_code=response.status if response else 0, # HTTP status code
|
||||
)
|
||||
# action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py
|
||||
action_str = f'goto("{asked_url}")'
|
||||
# obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
|
||||
obs = controller.browser.step(action_str)
|
||||
return BrowserOutputObservation(
|
||||
content=obs['text_content'], # text content of the page
|
||||
open_pages_urls=obs['open_pages_urls'], # list of open pages
|
||||
active_page_index=obs['active_page_index'], # index of the active page
|
||||
dom_object=obs['dom_object'], # DOM object
|
||||
axtree_object=obs['axtree_object'], # accessibility tree object
|
||||
last_browser_action=obs['last_action'], # last browser env action performed
|
||||
focused_element_bid=obs['focused_element_bid'], # focused element bid
|
||||
screenshot=obs['screenshot'], # base64-encoded screenshot, png
|
||||
url=asked_url,
|
||||
)
|
||||
except Exception as e:
|
||||
return BrowserOutputObservation(
|
||||
content=str(e), screenshot='', error=True, url=asked_url
|
||||
|
||||
0
opendevin/browser/__init__.py
Normal file
0
opendevin/browser/__init__.py
Normal file
102
opendevin/browser/browser_env.py
Normal file
102
opendevin/browser/browser_env.py
Normal file
@ -0,0 +1,102 @@
|
||||
import atexit
|
||||
import base64
|
||||
import io
|
||||
import multiprocessing
|
||||
import time
|
||||
import uuid
|
||||
|
||||
import browsergym.core # noqa F401 (we register the openended task as a gym environment)
|
||||
import gymnasium as gym
|
||||
import html2text
|
||||
import numpy as np
|
||||
from browsergym.utils.obs import flatten_dom_to_str
|
||||
from PIL import Image
|
||||
|
||||
from opendevin.logger import opendevin_logger as logger
|
||||
|
||||
|
||||
class BrowserException(Exception):
    """Error reported by the browser environment for a failed action."""
|
||||
|
||||
class BrowserEnv:
    """Drive a headless BrowserGym environment that lives in a child process.

    The agent side sends ``(request_id, {'action': action_str})`` tuples over
    a ``multiprocessing.Pipe``. The child process executes each action with
    ``env.step`` and sends back ``(request_id, obs)``. The special request id
    ``'SHUTDOWN'`` asks the child to close the environment and exit.
    """

    def __init__(self):
        """Configure the HTML-to-text converter and start the browser process."""
        self.html_text_converter = html2text.HTML2Text()
        # ignore links and images
        self.html_text_converter.ignore_links = True
        self.html_text_converter.ignore_images = True
        # use alt text for images
        self.html_text_converter.images_to_alt = True
        # disable auto text wrapping
        self.html_text_converter.body_width = 0
        # Initialize browser environment process
        multiprocessing.set_start_method('spawn', force=True)
        self.browser_side, self.agent_side = multiprocessing.Pipe()
        self.process = multiprocessing.Process(target=self.browser_process,)
        logger.info('Starting browser env...')
        self.process.start()
        # make sure the child is shut down when the interpreter exits
        atexit.register(self.close)

    def browser_process(self):
        """Child-process main loop: execute requested actions until shutdown.

        Each received action is run with ``env.step``; the resulting
        observation is extended with ``text_content`` and made serializable
        before being sent back, tagged with the id of the request.
        """
        env = gym.make(
            'browsergym/openended',
            start_url='about:blank',
            wait_for_user_message=False,
            headless=True,
            disable_env_checker=True,
        )
        try:
            obs, info = env.reset()
            logger.info('Browser env started.')
            while True:
                if not self.browser_side.poll(timeout=0.01):
                    continue
                unique_request_id, action_data = self.browser_side.recv()
                # shutdown the browser environment
                if unique_request_id == 'SHUTDOWN':
                    return
                action = action_data['action']
                obs, reward, terminated, truncated, info = env.step(action)
                # add text content of the page
                html_str = flatten_dom_to_str(obs['dom_object'])
                obs['text_content'] = self.html_text_converter.handle(html_str)
                # make observation serializable
                obs['screenshot'] = self.image_to_png_base64_url(obs['screenshot'])
                obs['active_page_index'] = obs['active_page_index'].item()
                obs['elapsed_time'] = obs['elapsed_time'].item()
                self.browser_side.send((unique_request_id, obs))
        except KeyboardInterrupt:
            logger.info('Browser env process interrupted by user.')
        finally:
            # always release the browser, whichever way the loop ends
            env.close()

    def step(self, action_str: str, timeout: float = 10) -> dict:
        """Send one action to the browser process and wait for its observation.

        Args:
            action_str: BrowserGym action string, e.g. ``goto("...")``.
            timeout: Maximum number of seconds to wait for the response.

        Returns:
            The observation dict produced by the browser process.

        Raises:
            TimeoutError: No matching response arrived within ``timeout``.
            BrowserException: The observation reports a ``last_action_error``.
        """
        unique_request_id = str(uuid.uuid4())
        self.agent_side.send((unique_request_id, {'action': action_str}))
        # monotonic clock: immune to wall-clock adjustments while waiting
        deadline = time.monotonic() + timeout
        while True:
            if time.monotonic() > deadline:
                raise TimeoutError('Browser environment took too long to respond.')
            if self.agent_side.poll(timeout=0.01):
                response_id, obs = self.agent_side.recv()
                # responses to earlier (timed-out) requests are discarded
                if response_id == unique_request_id:
                    if obs['last_action_error']:
                        raise BrowserException(obs['last_action_error'])
                    return obs

    def close(self, timeout: float = 5):
        """Shut the browser process down; safe to call more than once.

        Args:
            timeout: Seconds to wait for a graceful exit before the child
                process is terminated.
        """
        if not self.process.is_alive():
            # never started, already closed, or already exited
            return
        try:
            self.agent_side.send(('SHUTDOWN', None))
        except OSError:
            # pipe is already closed; fall through to join/terminate
            pass
        self.process.join(timeout)
        if self.process.is_alive():
            # child did not react to SHUTDOWN: do not hang interpreter exit
            self.process.terminate()
            self.process.join()

    @staticmethod
    def image_to_png_base64_url(image: 'np.ndarray | Image.Image'):
        """Encode an image as a base64 PNG string.

        Despite the method name, the result is the bare base64 payload
        without a ``data:image/png;base64,`` prefix.

        Args:
            image: A numpy array or a PIL image. Images in ``RGBA`` or ``LA``
                mode are converted to ``RGB`` before encoding.

        Returns:
            The base64-encoded PNG bytes as a ``str``.
        """
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        if image.mode in ('RGBA', 'LA'):
            image = image.convert('RGB')
        buffered = io.BytesIO()
        image.save(buffered, format='PNG')
        return base64.b64encode(buffered.getvalue()).decode()
|
||||
@ -12,6 +12,7 @@ from opendevin.action import (
|
||||
)
|
||||
from opendevin.action.tasks import TaskStateChangedAction
|
||||
from opendevin.agent import Agent
|
||||
from opendevin.browser.browser_env import BrowserEnv
|
||||
from opendevin.controller.action_manager import ActionManager
|
||||
from opendevin.exceptions import (
|
||||
AgentMalformedActionError,
|
||||
@ -43,6 +44,7 @@ class AgentController:
|
||||
max_iterations: int
|
||||
action_manager: ActionManager
|
||||
callbacks: List[Callable]
|
||||
browser: BrowserEnv
|
||||
|
||||
delegate: 'AgentController | None' = None
|
||||
state: State | None = None
|
||||
@ -67,6 +69,9 @@ class AgentController:
|
||||
self.callbacks = callbacks
|
||||
# Initialize agent-required plugins for sandbox (if any)
|
||||
self.action_manager.init_sandbox_plugins(agent.sandbox_plugins)
|
||||
# Initialize browser environment
|
||||
self.browser = BrowserEnv()
|
||||
|
||||
|
||||
if isinstance(agent, CodeActAgent) and not isinstance(self.action_manager.sandbox, DockerSSHBox):
|
||||
logger.warning('CodeActAgent requires DockerSSHBox as sandbox! Using other sandbox that are not stateful (LocalBox, DockerExecBox) will not work properly.')
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from opendevin.schema import ObservationType
|
||||
|
||||
@ -16,6 +16,25 @@ class BrowserOutputObservation(Observation):
|
||||
status_code: int = 200
|
||||
error: bool = False
|
||||
observation: str = ObservationType.BROWSE
|
||||
# do not include in the memory
|
||||
open_pages_urls: list = field(default_factory=list)
|
||||
active_page_index: int = -1
|
||||
dom_object: dict = field(default_factory=dict)
|
||||
axtree_object: dict = field(default_factory=dict)
|
||||
last_browser_action: str = ''
|
||||
focused_element_bid: str = ''
|
||||
|
||||
def to_memory(self) -> dict:
    """Return the memory dict with the bulky browser fields stripped from its extras."""
    memory_dict = super().to_memory()
    extras = memory_dict['extras']
    # remove some fields from the memory, as currently they are too big for LLMs
    # TODO: find a more elegant way to handle this
    for bulky_key in (
        'dom_object',
        'axtree_object',
        'open_pages_urls',
        'active_page_index',
        'last_browser_action',
        'focused_element_bid',
    ):
        extras.pop(bulky_key, None)
    return memory_dict
|
||||
|
||||
@property
|
||||
def message(self) -> str:
|
||||
|
||||
960
poetry.lock
generated
960
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -22,7 +22,8 @@ uvicorn = "*"
|
||||
types-toml = "*"
|
||||
numpy = "*"
|
||||
json-repair = "*"
|
||||
playwright = "*"
|
||||
browsergym = "*" # integrate browsergym as the browsing interface
|
||||
html2text = "*"
|
||||
e2b = "^0.14.13"
|
||||
pexpect = "*"
|
||||
jinja2 = "^3.1.3"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user