mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Graham Neubig <neubig@gmail.com>
This commit is contained in:
parent
a17c57d82e
commit
3d02c0c3a3
@ -14,6 +14,7 @@ class BrowserOutputObservation(Observation):
|
||||
url: str
|
||||
trigger_by_action: str
|
||||
screenshot: str = field(repr=False, default='') # don't show in repr
|
||||
screenshot_path: str | None = field(default=None) # path to saved screenshot file
|
||||
set_of_marks: str = field(default='', repr=False) # don't show in repr
|
||||
error: bool = False
|
||||
observation: str = ObservationType.BROWSE
|
||||
@ -49,6 +50,8 @@ class BrowserOutputObservation(Observation):
|
||||
f'Last browser action error: {self.last_browser_action_error}\n'
|
||||
f'Focused element bid: {self.focused_element_bid}\n'
|
||||
)
|
||||
if self.screenshot_path:
|
||||
ret += f'Screenshot saved to: {self.screenshot_path}\n'
|
||||
ret += '--- Agent Observation ---\n'
|
||||
ret += self.get_agent_obs_text()
|
||||
return ret
|
||||
@ -57,7 +60,14 @@ class BrowserOutputObservation(Observation):
|
||||
"""Get a concise text that will be shown to the agent."""
|
||||
if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE:
|
||||
text = f'[Current URL: {self.url}]\n'
|
||||
text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
|
||||
text += f'[Focused element bid: {self.focused_element_bid}]\n'
|
||||
|
||||
# Add screenshot path information if available
|
||||
if self.screenshot_path:
|
||||
text += f'[Screenshot saved to: {self.screenshot_path}]\n'
|
||||
|
||||
text += '\n'
|
||||
|
||||
if self.error:
|
||||
text += (
|
||||
'================ BEGIN error message ===============\n'
|
||||
@ -85,6 +95,7 @@ class BrowserOutputObservation(Observation):
|
||||
|
||||
elif self.trigger_by_action == ActionType.BROWSE:
|
||||
text = f'[Current URL: {self.url}]\n'
|
||||
|
||||
if self.error:
|
||||
text += (
|
||||
'================ BEGIN error message ===============\n'
|
||||
|
||||
@ -602,7 +602,7 @@ class ActionExecutor:
|
||||
'Browser functionality is not supported on Windows.'
|
||||
)
|
||||
await self._ensure_browser_ready()
|
||||
return await browse(action, self.browser)
|
||||
return await browse(action, self.browser, self.initial_cwd)
|
||||
|
||||
async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
|
||||
if self.browser is None:
|
||||
@ -610,7 +610,7 @@ class ActionExecutor:
|
||||
'Browser functionality is not supported on Windows.'
|
||||
)
|
||||
await self._ensure_browser_ready()
|
||||
return await browse(action, self.browser)
|
||||
return await browse(action, self.browser, self.initial_cwd)
|
||||
|
||||
def close(self):
|
||||
self.memory_monitor.stop_monitoring()
|
||||
|
||||
31
openhands/runtime/browser/base64.py
Normal file
31
openhands/runtime/browser/base64.py
Normal file
@ -0,0 +1,31 @@
|
||||
import io
|
||||
import base64
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
def image_to_png_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = False
) -> str:
    """Encode an image as base64 PNG text, optionally formatted as a data URL.

    Args:
        image: Either a numpy array (turned into a PIL image through
            ``Image.fromarray``) or an existing PIL image.
        add_data_prefix: When true, the result is prefixed with
            ``data:image/png;base64,``.

    Returns:
        The base64 text of the PNG-encoded image, with or without the prefix.
    """
    pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image

    # RGBA and LA images are converted to RGB before being saved.
    if pil_image.mode in ('RGBA', 'LA'):
        pil_image = pil_image.convert('RGB')

    png_buffer = io.BytesIO()
    pil_image.save(png_buffer, format='PNG')
    payload = base64.b64encode(png_buffer.getvalue()).decode()

    if add_data_prefix:
        return f'data:image/png;base64,{payload}'
    return payload
|
||||
|
||||
def png_base64_url_to_image(png_base64_url: str) -> Image.Image:
    """Decode base64 image text, with or without a data-URL prefix, to a PIL image.

    Args:
        png_base64_url: Either raw base64 text or a ``<prefix>,<base64>`` string.

    Returns:
        The image opened by PIL from the decoded bytes.
    """
    pieces = png_base64_url.split(',')
    # Exactly one comma means "<prefix>,<payload>"; any other shape is decoded
    # as-is, exactly like the input was raw base64.
    encoded = pieces[1] if len(pieces) == 2 else png_base64_url
    raw_bytes = base64.b64decode(encoded)
    return Image.open(io.BytesIO(raw_bytes))
|
||||
@ -1,6 +1,4 @@
|
||||
import atexit
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import multiprocessing
|
||||
import time
|
||||
@ -9,20 +7,18 @@ import uuid
|
||||
import browsergym.core # noqa F401 (we register the openended task as a gym environment)
|
||||
import gymnasium as gym
|
||||
import html2text
|
||||
import numpy as np
|
||||
import tenacity
|
||||
from browsergym.utils.obs import flatten_dom_to_str, overlay_som
|
||||
from PIL import Image
|
||||
|
||||
from openhands.core.exceptions import BrowserInitException
|
||||
from openhands.core.logger import openhands_logger as logger
|
||||
from openhands.utils.shutdown_listener import should_continue, should_exit
|
||||
from openhands.utils.tenacity_stop import stop_if_should_exit
|
||||
from openhands.runtime.browser.base64 import image_to_png_base64_url
|
||||
|
||||
BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'
|
||||
BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'
|
||||
|
||||
|
||||
class BrowserEnv:
|
||||
def __init__(self, browsergym_eval_env: str | None = None):
|
||||
self.html_text_converter = self.get_html_text_converter()
|
||||
@ -165,13 +161,13 @@ class BrowserEnv:
|
||||
html_str = flatten_dom_to_str(obs['dom_object'])
|
||||
obs['text_content'] = self.html_text_converter.handle(html_str)
|
||||
# make observation serializable
|
||||
obs['set_of_marks'] = self.image_to_png_base64_url(
|
||||
obs['set_of_marks'] = image_to_png_base64_url(
|
||||
overlay_som(
|
||||
obs['screenshot'], obs.get('extra_element_properties', {})
|
||||
),
|
||||
add_data_prefix=True,
|
||||
)
|
||||
obs['screenshot'] = self.image_to_png_base64_url(
|
||||
obs['screenshot'] = image_to_png_base64_url(
|
||||
obs['screenshot'], add_data_prefix=True
|
||||
)
|
||||
obs['active_page_index'] = obs['active_page_index'].item()
|
||||
@ -226,41 +222,3 @@ class BrowserEnv:
|
||||
self.browser_side.close()
|
||||
except Exception as e:
|
||||
logger.error(f'Encountered an error when closing browser env: {e}')
|
||||
|
||||
@staticmethod
def image_to_png_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = False
) -> str:
    """Encode an image as base64 PNG text, optionally formatted as a data URL.

    Args:
        image: Either a numpy array (turned into a PIL image through
            ``Image.fromarray``) or an existing PIL image.
        add_data_prefix: When true, the result is prefixed with
            ``data:image/png;base64,``.

    Returns:
        The base64 text of the PNG-encoded image, with or without the prefix.
    """
    pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image

    # RGBA and LA images are converted to RGB before being saved.
    if pil_image.mode in ('RGBA', 'LA'):
        pil_image = pil_image.convert('RGB')

    png_buffer = io.BytesIO()
    pil_image.save(png_buffer, format='PNG')
    payload = base64.b64encode(png_buffer.getvalue()).decode()

    if add_data_prefix:
        return f'data:image/png;base64,{payload}'
    return payload
|
||||
|
||||
@staticmethod
def image_to_jpg_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = False
) -> str:
    """Encode an image as base64 JPEG text, optionally formatted as a data URL.

    Args:
        image: Either a numpy array (turned into a PIL image through
            ``Image.fromarray``) or an existing PIL image.
        add_data_prefix: When true, the result is prefixed with
            ``data:image/jpeg;base64,``.

    Returns:
        The base64 text of the JPEG-encoded image, with or without the prefix.
    """
    pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image

    # RGBA and LA images are converted to RGB before being saved.
    if pil_image.mode in ('RGBA', 'LA'):
        pil_image = pil_image.convert('RGB')

    jpeg_buffer = io.BytesIO()
    pil_image.save(jpeg_buffer, format='JPEG')
    payload = base64.b64encode(jpeg_buffer.getvalue()).decode()

    if add_data_prefix:
        return f'data:image/jpeg;base64,{payload}'
    return payload
|
||||
|
||||
@ -1,15 +1,23 @@
|
||||
import base64
|
||||
import datetime
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from openhands.core.exceptions import BrowserUnavailableException
|
||||
from openhands.core.schema import ActionType
|
||||
from openhands.events.action import BrowseInteractiveAction, BrowseURLAction
|
||||
from openhands.events.observation import BrowserOutputObservation
|
||||
from openhands.runtime.browser.base64 import png_base64_url_to_image
|
||||
from openhands.runtime.browser.browser_env import BrowserEnv
|
||||
from openhands.utils.async_utils import call_sync_from_async
|
||||
|
||||
|
||||
async def browse(
|
||||
action: BrowseURLAction | BrowseInteractiveAction, browser: BrowserEnv | None
|
||||
action: BrowseURLAction | BrowseInteractiveAction,
|
||||
browser: BrowserEnv | None,
|
||||
workspace_dir: str | None = None,
|
||||
) -> BrowserOutputObservation:
|
||||
if browser is None:
|
||||
raise BrowserUnavailableException()
|
||||
@ -31,10 +39,50 @@ async def browse(
|
||||
try:
|
||||
# obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
|
||||
obs = await call_sync_from_async(browser.step, action_str)
|
||||
|
||||
# Save screenshot if workspace_dir is provided
|
||||
screenshot_path = None
|
||||
if workspace_dir is not None and obs.get('screenshot'):
|
||||
# Create screenshots directory if it doesn't exist
|
||||
screenshots_dir = Path(workspace_dir) / '.browser_screenshots'
|
||||
screenshots_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Generate a filename based on timestamp
|
||||
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')
|
||||
screenshot_filename = f'screenshot_{timestamp}.png'
|
||||
screenshot_path = str(screenshots_dir / screenshot_filename)
|
||||
|
||||
# Direct image saving from base64 data without using PIL's Image.open
|
||||
# This approach bypasses potential encoding issues that might occur when
|
||||
# converting between different image representations, ensuring the raw PNG
|
||||
# data from the browser is saved directly to disk.
|
||||
|
||||
# Extract the base64 data
|
||||
base64_data = obs.get('screenshot', '')
|
||||
if ',' in base64_data:
|
||||
base64_data = base64_data.split(',')[1]
|
||||
|
||||
try:
|
||||
# Decode base64 directly to binary
|
||||
image_data = base64.b64decode(base64_data)
|
||||
|
||||
# Write binary data directly to file
|
||||
with open(screenshot_path, 'wb') as f:
|
||||
f.write(image_data)
|
||||
|
||||
# Verify the image was saved correctly by opening it
|
||||
# This is just a verification step and can be removed in production
|
||||
Image.open(screenshot_path).verify()
|
||||
except Exception:
|
||||
# If direct saving fails, fall back to the original method
|
||||
image = png_base64_url_to_image(obs.get('screenshot'))
|
||||
image.save(screenshot_path, format='PNG', optimize=True)
|
||||
|
||||
return BrowserOutputObservation(
|
||||
content=obs['text_content'], # text content of the page
|
||||
url=obs.get('url', ''), # URL of the page
|
||||
screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png
|
||||
screenshot_path=screenshot_path, # path to saved screenshot file
|
||||
set_of_marks=obs.get(
|
||||
'set_of_marks', None
|
||||
), # base64-encoded Set-of-Marks annotated screenshot, png,
|
||||
@ -60,6 +108,7 @@ async def browse(
|
||||
return BrowserOutputObservation(
|
||||
content=str(e),
|
||||
screenshot='',
|
||||
screenshot_path=None,
|
||||
error=True,
|
||||
last_browser_action_error=str(e),
|
||||
url=asked_url if action.action == ActionType.BROWSE else '',
|
||||
|
||||
@ -117,7 +117,20 @@ def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands):
|
||||
observation_text = str(obs)
|
||||
assert '[Action executed successfully.]' in observation_text
|
||||
assert 'Canvas' in observation_text
|
||||
assert (
|
||||
'Screenshot saved to: /workspace/.browser_screenshots/screenshot_'
|
||||
in observation_text
|
||||
)
|
||||
|
||||
# Check the /workspace/.browser_screenshots folder
|
||||
action_cmd = CmdRunAction(command='ls /workspace/.browser_screenshots')
|
||||
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action_cmd)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 0
|
||||
assert 'screenshot_' in obs.content
|
||||
assert '.png' in obs.content
|
||||
finally:
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
@ -169,6 +182,19 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
|
||||
observation_text = str(obs)
|
||||
assert '[Action executed successfully.]' in observation_text
|
||||
assert 'File Viewer - test_image.png' in observation_text
|
||||
assert (
|
||||
'Screenshot saved to: /workspace/.browser_screenshots/screenshot_'
|
||||
in observation_text
|
||||
)
|
||||
|
||||
# Check the /workspace/.browser_screenshots folder
|
||||
action_cmd = CmdRunAction(command='ls /workspace/.browser_screenshots')
|
||||
logger.info(action_cmd, extra={'msg_type': 'ACTION'})
|
||||
obs = runtime.run_action(action_cmd)
|
||||
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
||||
assert isinstance(obs, CmdOutputObservation)
|
||||
assert obs.exit_code == 0
|
||||
assert 'screenshot_' in obs.content
|
||||
assert '.png' in obs.content
|
||||
finally:
|
||||
_close_test_runtime(runtime)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user