mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
230 lines
9.8 KiB
Python
230 lines
9.8 KiB
Python
import atexit
|
|
import json
|
|
import multiprocessing
|
|
import time
|
|
import uuid
|
|
|
|
import browsergym.core # noqa F401 (we register the openended task as a gym environment)
|
|
import gymnasium as gym
|
|
import html2text
|
|
import tenacity
|
|
from browsergym.utils.obs import flatten_dom_to_str, overlay_som
|
|
|
|
from openhands.core.exceptions import BrowserInitException
|
|
from openhands.core.logger import openhands_logger as logger
|
|
from openhands.runtime.browser.base64 import image_to_png_base64_url
|
|
from openhands.utils.shutdown_listener import should_continue, should_exit
|
|
from openhands.utils.tenacity_stop import stop_if_should_exit
|
|
|
|
# Pseudo-actions understood by the browser process in addition to real
# BrowserGym actions. They are sent through the same request channel as a
# normal action string but are answered from cached evaluation state
# instead of being forwarded to the environment.

# Ask the browser process for the evaluation goal (text plus image URLs).
BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'

# Ask the browser process for the JSON-encoded list of rewards seen so far.
BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'
|
|
|
|
|
|
class BrowserEnv:
    """Drive a BrowserGym environment that lives in a separate process.

    The constructor spawns a child process running ``browser_process`` and
    talks to it over a ``multiprocessing.Pipe``. Requests are tuples of
    ``(request_id, payload)``; the reserved ids ``'IS_ALIVE'`` and
    ``'SHUTDOWN'`` are control messages, any other id is an action request
    whose payload is ``{'action': <str>}``.

    Args:
        browsergym_eval_env: Name of a BrowserGym evaluation environment
            (visualwebarena, webarena or miniwob). When falsy, an open-ended
            browsing environment is created instead.
    """

    def __init__(self, browsergym_eval_env: str | None = None):
        self.html_text_converter = self.get_html_text_converter()
        self.eval_dir = ''

        # EVAL only: browsergym_eval_env must be provided for evaluation
        self.browsergym_eval_env = browsergym_eval_env
        self.eval_mode = bool(browsergym_eval_env)

        # Initialize browser environment process.
        # NOTE: this changes the start method for the whole interpreter.
        multiprocessing.set_start_method('spawn', force=True)

        # init_browser() creates the pipe (one per attempt) and the process.
        self.init_browser()
        atexit.register(self.close)

    def get_html_text_converter(self) -> html2text.HTML2Text:
        """Build the html2text converter used to render page DOM as text."""
        html_text_converter = html2text.HTML2Text()
        # keep links in the output, but drop images
        html_text_converter.ignore_links = False
        html_text_converter.ignore_images = True
        # use alt text for images
        html_text_converter.images_to_alt = True
        # disable auto text wrapping
        html_text_converter.body_width = 0
        return html_text_converter

    @tenacity.retry(
        wait=tenacity.wait_fixed(1),
        stop=tenacity.stop_after_attempt(5) | stop_if_should_exit(),
        retry=tenacity.retry_if_exception_type(BrowserInitException),
    )
    def init_browser(self) -> None:
        """Start the browser process and wait until it answers an alive check.

        Retried by tenacity when ``BrowserInitException`` is raised.

        Raises:
            BrowserInitException: The process did not report ``ALIVE`` in time.
            Exception: Whatever ``multiprocessing`` raised while starting the
                process (logged, then re-raised unchanged).
        """
        logger.debug('Starting browser env...')

        # Every attempt gets its own pipe: a failed attempt ends in close(),
        # which closes both ends, and a reused pipe could also still hold
        # stale IS_ALIVE/SHUTDOWN messages from the previous attempt.
        self._close_pipes()
        self.browser_side, self.agent_side = multiprocessing.Pipe()

        try:
            self.process = multiprocessing.Process(target=self.browser_process)
            self.process.start()
        except Exception as e:
            logger.error(f'Failed to start browser process: {e}')
            raise

        if not self.check_alive(timeout=200):
            self.close()
            raise BrowserInitException('Failed to start browser environment.')

    def browser_process(self) -> None:
        """Entry point of the child process: create the env and serve requests.

        The environment is closed on every exit path (SHUTDOWN request,
        shutdown signal, KeyboardInterrupt, or an unexpected error).
        """
        env = self._make_env()
        try:
            obs, _info = env.reset()
            logger.info('Successfully called env.reset')
            self._record_eval_goal(obs)
            logger.info('Browser env started.')
            self._serve_requests(env)
        except KeyboardInterrupt:
            logger.debug('Browser env process interrupted by user.')
        finally:
            try:
                env.close()
            except Exception as e:
                # Best effort: the process is going away regardless.
                logger.debug(f'Ignoring error while closing browser env: {e}')

    def _make_env(self):
        """Create the gym environment for eval mode or open-ended browsing."""
        if not self.eval_mode:
            return gym.make(
                'browsergym/openended',
                task_kwargs={'start_url': 'about:blank', 'goal': 'PLACEHOLDER_GOAL'},
                wait_for_user_message=False,
                headless=True,
                disable_env_checker=True,
                tags_to_mark='all',
                timeout=100000,
                pw_context_kwargs={'accept_downloads': True},
                pw_chromium_kwargs={'downloads_path': '/workspace/.downloads/'},
            )

        assert self.browsergym_eval_env is not None
        logger.info('Initializing browser env for web browsing evaluation.')
        if not self.browsergym_eval_env.startswith('browsergym/'):
            self.browsergym_eval_env = 'browsergym/' + self.browsergym_eval_env

        # 'visualwebarena' must be tested before 'webarena' (substring match).
        if 'visualwebarena' in self.browsergym_eval_env:
            import browsergym.visualwebarena  # noqa F401 register visualwebarena tasks as gym environments
            import nltk

            nltk.download('punkt_tab')
        elif 'webarena' in self.browsergym_eval_env:
            import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
        elif 'miniwob' in self.browsergym_eval_env:
            import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
        else:
            raise ValueError(
                f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
            )
        return gym.make(self.browsergym_eval_env, tags_to_mark='all', timeout=100000)

    def _record_eval_goal(self, obs: dict) -> None:
        """Reset the evaluation state and, in eval mode, cache the task goal."""
        self.eval_goal = None
        self.goal_image_urls = []
        self.eval_rewards: list[float] = []
        if not self.eval_mode:
            return

        self.eval_goal = obs['goal']
        if 'goal_object' in obs:
            obs['goal_object'] = list(obs['goal_object'])
            if len(obs['goal_object']) > 0:
                # The first goal message's text replaces the plain goal.
                self.eval_goal = obs['goal_object'][0]['text']
            for message in obs['goal_object']:
                if message['type'] == 'image_url':
                    image_src = message['image_url']
                    # The image is either a URL or a dict with a 'url' key.
                    if isinstance(image_src, dict):
                        image_src = image_src['url']
                    self.goal_image_urls.append(image_src)
        logger.debug(f'Browsing goal: {self.eval_goal}')

    def _serve_requests(self, env) -> None:
        """Answer requests from the pipe until SHUTDOWN or a shutdown signal."""
        while should_continue():
            if not self.browser_side.poll(timeout=0.01):
                continue
            unique_request_id, action_data = self.browser_side.recv()

            # Control messages carry no payload.
            if unique_request_id == 'SHUTDOWN':
                logger.debug('SHUTDOWN recv, shutting down browser env...')
                return  # the caller closes the env
            if unique_request_id == 'IS_ALIVE':
                self.browser_side.send(('ALIVE', None))
                continue

            action = action_data['action']

            # EVAL ONLY: evaluation info is served from cached state.
            if action == BROWSER_EVAL_GET_GOAL_ACTION:
                self.browser_side.send(
                    (
                        unique_request_id,
                        {
                            'text_content': self.eval_goal,
                            'image_content': self.goal_image_urls,
                        },
                    )
                )
                continue
            if action == BROWSER_EVAL_GET_REWARDS_ACTION:
                self.browser_side.send(
                    (
                        unique_request_id,
                        {'text_content': json.dumps(self.eval_rewards)},
                    )
                )
                continue

            obs, reward, _terminated, _truncated, _info = env.step(action)

            # EVAL ONLY: remember the reward of every executed action.
            if self.eval_mode:
                self.eval_rewards.append(reward)

            self.browser_side.send((unique_request_id, self._serialize_obs(obs)))

    def _serialize_obs(self, obs: dict) -> dict:
        """Add page text to ``obs`` and convert fields for sending (in place)."""
        # add text content of the page
        html_str = flatten_dom_to_str(obs['dom_object'])
        obs['text_content'] = self.html_text_converter.handle(html_str)
        # make observation serializable; the set-of-marks overlay must be
        # built before the raw screenshot is replaced by its base64 URL.
        obs['set_of_marks'] = image_to_png_base64_url(
            overlay_som(obs['screenshot'], obs.get('extra_element_properties', {})),
            add_data_prefix=True,
        )
        obs['screenshot'] = image_to_png_base64_url(
            obs['screenshot'], add_data_prefix=True
        )
        obs['active_page_index'] = obs['active_page_index'].item()
        obs['elapsed_time'] = obs['elapsed_time'].item()
        return obs

    def step(self, action_str: str, timeout: float = 120) -> dict:
        """Execute an action in the browser environment and return the observation.

        Args:
            action_str: Action to run, or one of the ``BROWSER_EVAL_*`` actions.
            timeout: Maximum number of seconds to wait for the matching reply.

        Returns:
            The observation dict sent back by the browser process.

        Raises:
            TimeoutError: No matching reply arrived within ``timeout`` seconds,
                or a shutdown was requested while waiting.
        """
        unique_request_id = str(uuid.uuid4())
        self.agent_side.send((unique_request_id, {'action': action_str}))
        # Monotonic clock: the deadline must not move with wall-clock changes.
        deadline = time.monotonic() + timeout
        while True:
            if should_exit() or time.monotonic() > deadline:
                raise TimeoutError('Browser environment took too long to respond.')
            if self.agent_side.poll(timeout=0.01):
                response_id, obs = self.agent_side.recv()
                # Replies with a different id are dropped.
                if response_id == unique_request_id:
                    return dict(obs)

    def check_alive(self, timeout: float = 60) -> bool:
        """Ping the browser process; return True only if it answers ``ALIVE``.

        Args:
            timeout: Seconds to wait for a reply before giving up.
        """
        self.agent_side.send(('IS_ALIVE', None))
        if self.agent_side.poll(timeout=timeout):
            response_id, _ = self.agent_side.recv()
            if response_id == 'ALIVE':
                return True
            logger.debug(f'Browser env is not alive. Response ID: {response_id}')
        return False

    def close(self) -> None:
        """Stop the browser process (escalating if needed) and close the pipe.

        Safe to call more than once and before a process was ever started.
        Errors are logged rather than raised.
        """
        process = getattr(self, 'process', None)
        try:
            if process is not None and process.is_alive():
                try:
                    self.agent_side.send(('SHUTDOWN', None))
                except (OSError, ValueError) as e:
                    # Could not ask politely; fall through to terminate/kill.
                    logger.debug(f'Could not send SHUTDOWN to browser env: {e}')
                process.join(5)  # Wait for the process to terminate
                if process.is_alive():
                    logger.error(
                        'Browser process did not terminate, forcefully terminating...'
                    )
                    process.terminate()
                    process.join(5)  # Wait for the process to terminate
                    if process.is_alive():
                        process.kill()
                        process.join(5)  # Wait for the process to terminate
        except Exception as e:
            logger.error(f'Encountered an error when closing browser env: {e}')
        finally:
            # Release the pipe even when the process had already exited.
            self._close_pipes()

    def _close_pipes(self) -> None:
        """Close both pipe ends if they exist; closing twice is harmless."""
        for attr in ('agent_side', 'browser_side'):
            conn = getattr(self, attr, None)
            if conn is None:
                continue
            try:
                conn.close()
            except OSError as e:
                logger.debug(f'Error closing {attr} of browser pipe: {e}')
|