From 3d02c0c3a3985c40ae863ee4c48e2d7b5917f3fe Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Sun, 11 May 2025 15:51:18 +0800
Subject: [PATCH] Fix issue #8372: Implement browser screenshot saving
 functionality (#8383)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Graham Neubig <neubig@gmail.com>
---
 openhands/events/observation/browse.py       | 13 ++++-
 openhands/runtime/action_execution_server.py |  4 +-
 openhands/runtime/browser/base64.py          | 31 ++++++++++++
 openhands/runtime/browser/browser_env.py     | 48 ++----------------
 openhands/runtime/browser/utils.py           | 51 +++++++++++++++++++-
 tests/runtime/test_browsing.py               | 26 ++++++++++
 6 files changed, 124 insertions(+), 49 deletions(-)
 create mode 100644 openhands/runtime/browser/base64.py

diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py
index da268a16e8..4474cfcb66 100644
--- a/openhands/events/observation/browse.py
+++ b/openhands/events/observation/browse.py
@@ -14,6 +14,7 @@ class BrowserOutputObservation(Observation):
     url: str
     trigger_by_action: str
     screenshot: str = field(repr=False, default='')  # don't show in repr
+    screenshot_path: str | None = field(default=None)  # path to saved screenshot file
     set_of_marks: str = field(default='', repr=False)  # don't show in repr
     error: bool = False
     observation: str = ObservationType.BROWSE
@@ -49,6 +50,8 @@ class BrowserOutputObservation(Observation):
             f'Last browser action error: {self.last_browser_action_error}\n'
             f'Focused element bid: {self.focused_element_bid}\n'
         )
+        if self.screenshot_path:
+            ret += f'Screenshot saved to: {self.screenshot_path}\n'
         ret += '--- Agent Observation ---\n'
         ret += self.get_agent_obs_text()
         return ret
@@ -57,7 +60,14 @@ class BrowserOutputObservation(Observation):
         """Get a concise text that will be shown to the agent."""
         if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE:
             text = f'[Current URL: {self.url}]\n'
-            text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
+            text += f'[Focused element bid: {self.focused_element_bid}]\n'
+
+            # Add screenshot path information if available
+            if self.screenshot_path:
+                text += f'[Screenshot saved to: {self.screenshot_path}]\n'
+
+            text += '\n'
+
             if self.error:
                 text += (
                     '================ BEGIN error message ===============\n'
@@ -85,6 +95,7 @@ class BrowserOutputObservation(Observation):
 
         elif self.trigger_by_action == ActionType.BROWSE:
             text = f'[Current URL: {self.url}]\n'
+
             if self.error:
                 text += (
                     '================ BEGIN error message ===============\n'
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
index 3c82d51c09..ebd21e5634 100644
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@@ -602,7 +602,7 @@ class ActionExecutor:
                 'Browser functionality is not supported on Windows.'
             )
         await self._ensure_browser_ready()
-        return await browse(action, self.browser)
+        return await browse(action, self.browser, self.initial_cwd)
 
     async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
         if self.browser is None:
@@ -610,7 +610,7 @@ class ActionExecutor:
                 'Browser functionality is not supported on Windows.'
             )
         await self._ensure_browser_ready()
-        return await browse(action, self.browser)
+        return await browse(action, self.browser, self.initial_cwd)
 
     def close(self):
         self.memory_monitor.stop_monitoring()
diff --git a/openhands/runtime/browser/base64.py b/openhands/runtime/browser/base64.py
new file mode 100644
index 0000000000..94890e73c8
--- /dev/null
+++ b/openhands/runtime/browser/base64.py
@@ -0,0 +1,31 @@
+import io
+import base64
+from PIL import Image
+import numpy as np
+
+def image_to_png_base64_url(
+    image: np.ndarray | Image.Image, add_data_prefix: bool = False
+) -> str:
+    """Encode a numpy array or PIL image as a base64 PNG string or data URL."""
+    pil_image = image
+    if isinstance(pil_image, np.ndarray):
+        pil_image = Image.fromarray(pil_image)
+    # Images carrying an alpha channel are converted to RGB before encoding
+    if pil_image.mode in ('RGBA', 'LA'):
+        pil_image = pil_image.convert('RGB')
+
+    png_buffer = io.BytesIO()
+    pil_image.save(png_buffer, format='PNG')
+    encoded = base64.b64encode(png_buffer.getvalue()).decode()
+    if add_data_prefix:
+        return f'data:image/png;base64,{encoded}'
+    return encoded
+
+def png_base64_url_to_image(png_base64_url: str) -> Image.Image:
+    """Decode a base64 PNG (bare or data-URL form) into a PIL Image."""
+    parts = png_base64_url.split(',')
+    # Exactly two parts means '<prefix>,<payload>'; any other shape is treated
+    # as a bare base64 payload and decoded unchanged.
+    payload = parts[1] if len(parts) == 2 else png_base64_url
+    raw_bytes = base64.b64decode(payload)
+    return Image.open(io.BytesIO(raw_bytes))
diff --git a/openhands/runtime/browser/browser_env.py b/openhands/runtime/browser/browser_env.py
index 7880d52165..e7087a1458 100644
--- a/openhands/runtime/browser/browser_env.py
+++ b/openhands/runtime/browser/browser_env.py
@@ -1,6 +1,4 @@
 import atexit
-import base64
-import io
 import json
 import multiprocessing
 import time
@@ -9,20 +7,18 @@ import uuid
 import browsergym.core  # noqa F401 (we register the openended task as a gym environment)
 import gymnasium as gym
 import html2text
-import numpy as np
 import tenacity
 from browsergym.utils.obs import flatten_dom_to_str, overlay_som
-from PIL import Image
 
 from openhands.core.exceptions import BrowserInitException
 from openhands.core.logger import openhands_logger as logger
 from openhands.utils.shutdown_listener import should_continue, should_exit
 from openhands.utils.tenacity_stop import stop_if_should_exit
+from openhands.runtime.browser.base64 import image_to_png_base64_url
 
 BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'
 BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'
 
-
 class BrowserEnv:
     def __init__(self, browsergym_eval_env: str | None = None):
         self.html_text_converter = self.get_html_text_converter()
@@ -165,13 +161,13 @@ class BrowserEnv:
                     html_str = flatten_dom_to_str(obs['dom_object'])
                     obs['text_content'] = self.html_text_converter.handle(html_str)
                     # make observation serializable
-                    obs['set_of_marks'] = self.image_to_png_base64_url(
+                    obs['set_of_marks'] = image_to_png_base64_url(
                         overlay_som(
                             obs['screenshot'], obs.get('extra_element_properties', {})
                         ),
                         add_data_prefix=True,
                     )
-                    obs['screenshot'] = self.image_to_png_base64_url(
+                    obs['screenshot'] = image_to_png_base64_url(
                         obs['screenshot'], add_data_prefix=True
                     )
                     obs['active_page_index'] = obs['active_page_index'].item()
@@ -226,41 +222,3 @@ class BrowserEnv:
             self.browser_side.close()
         except Exception as e:
             logger.error(f'Encountered an error when closing browser env: {e}')
-
-    @staticmethod
-    def image_to_png_base64_url(
-        image: np.ndarray | Image.Image, add_data_prefix: bool = False
-    ) -> str:
-        """Convert a numpy array to a base64 encoded png image url."""
-        if isinstance(image, np.ndarray):
-            image = Image.fromarray(image)
-        if image.mode in ('RGBA', 'LA'):
-            image = image.convert('RGB')
-        buffered = io.BytesIO()
-        image.save(buffered, format='PNG')
-
-        image_base64 = base64.b64encode(buffered.getvalue()).decode()
-        return (
-            f'data:image/png;base64,{image_base64}'
-            if add_data_prefix
-            else f'{image_base64}'
-        )
-
-    @staticmethod
-    def image_to_jpg_base64_url(
-        image: np.ndarray | Image.Image, add_data_prefix: bool = False
-    ) -> str:
-        """Convert a numpy array to a base64 encoded jpeg image url."""
-        if isinstance(image, np.ndarray):
-            image = Image.fromarray(image)
-        if image.mode in ('RGBA', 'LA'):
-            image = image.convert('RGB')
-        buffered = io.BytesIO()
-        image.save(buffered, format='JPEG')
-
-        image_base64 = base64.b64encode(buffered.getvalue()).decode()
-        return (
-            f'data:image/jpeg;base64,{image_base64}'
-            if add_data_prefix
-            else f'{image_base64}'
-        )
diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py
index b029ac0841..ca9fe84143 100644
--- a/openhands/runtime/browser/utils.py
+++ b/openhands/runtime/browser/utils.py
@@ -1,15 +1,23 @@
+import base64
+import datetime
 import os
+from pathlib import Path
+
+from PIL import Image
 
 from openhands.core.exceptions import BrowserUnavailableException
 from openhands.core.schema import ActionType
 from openhands.events.action import BrowseInteractiveAction, BrowseURLAction
 from openhands.events.observation import BrowserOutputObservation
+from openhands.runtime.browser.base64 import png_base64_url_to_image
 from openhands.runtime.browser.browser_env import BrowserEnv
 from openhands.utils.async_utils import call_sync_from_async
 
 
 async def browse(
-    action: BrowseURLAction | BrowseInteractiveAction, browser: BrowserEnv | None
+    action: BrowseURLAction | BrowseInteractiveAction,
+    browser: BrowserEnv | None,
+    workspace_dir: str | None = None,
 ) -> BrowserOutputObservation:
     if browser is None:
         raise BrowserUnavailableException()
@@ -31,10 +39,50 @@ async def browse(
     try:
         # obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
         obs = await call_sync_from_async(browser.step, action_str)
+
+        # Best-effort: persist the screenshot to disk when a workspace is given
+        screenshot_path = None
+        if workspace_dir is not None and obs.get('screenshot'):
+            # Hidden directory inside the workspace; parents=True so a missing
+            # workspace directory does not turn a successful browse into an error
+            screenshots_dir = Path(workspace_dir) / '.browser_screenshots'
+            screenshots_dir.mkdir(parents=True, exist_ok=True)
+
+            # Microsecond-resolution timestamp makes name collisions unlikely
+            timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')
+            screenshot_filename = f'screenshot_{timestamp}.png'
+            screenshot_path = str(screenshots_dir / screenshot_filename)
+
+            # obs['screenshot'] is a base64 PNG, possibly in data-URL form
+            # ('data:image/png;base64,<payload>'). Write the decoded bytes as-is
+            # instead of re-encoding through PIL, so the file on disk is exactly
+            # the PNG that was produced for the observation.
+            base64_data = obs.get('screenshot', '')
+            if ',' in base64_data:
+                # Drop the data-URL prefix; keep only the base64 payload
+                base64_data = base64_data.split(',')[1]
+
+            try:
+                image_data = base64.b64decode(base64_data)
+                with open(screenshot_path, 'wb') as f:
+                    f.write(image_data)
+
+                # Integrity check: verify() raises if the file is not a valid
+                # image, which routes us to the re-encoding fallback below. The
+                # context manager closes the file handle that Image.open holds.
+                with Image.open(screenshot_path) as saved_image:
+                    saved_image.verify()
+            except Exception:
+                # Raw write or verification failed: decode through PIL and let
+                # it re-encode the PNG instead
+                image = png_base64_url_to_image(obs.get('screenshot'))
+                image.save(screenshot_path, format='PNG', optimize=True)
+
         return BrowserOutputObservation(
             content=obs['text_content'],  # text content of the page
             url=obs.get('url', ''),  # URL of the page
             screenshot=obs.get('screenshot', None),  # base64-encoded screenshot, png
+            screenshot_path=screenshot_path,  # path to saved screenshot file
             set_of_marks=obs.get(
                 'set_of_marks', None
             ),  # base64-encoded Set-of-Marks annotated screenshot, png,
@@ -60,6 +108,7 @@ async def browse(
         return BrowserOutputObservation(
             content=str(e),
             screenshot='',
+            screenshot_path=None,
             error=True,
             last_browser_action_error=str(e),
             url=asked_url if action.action == ActionType.BROWSE else '',
diff --git a/tests/runtime/test_browsing.py b/tests/runtime/test_browsing.py
index b8937f5c9f..79b46b93cb 100644
--- a/tests/runtime/test_browsing.py
+++ b/tests/runtime/test_browsing.py
@@ -117,7 +117,20 @@ def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands):
         observation_text = str(obs)
         assert '[Action executed successfully.]' in observation_text
         assert 'Canvas' in observation_text
+        assert (
+            'Screenshot saved to: /workspace/.browser_screenshots/screenshot_'
+            in observation_text
+        )
 
+        # Check the /workspace/.browser_screenshots folder
+        action_cmd = CmdRunAction(command='ls /workspace/.browser_screenshots')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert 'screenshot_' in obs.content
+        assert '.png' in obs.content
     finally:
         _close_test_runtime(runtime)
 
@@ -169,6 +182,19 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
         observation_text = str(obs)
         assert '[Action executed successfully.]' in observation_text
         assert 'File Viewer - test_image.png' in observation_text
+        assert (
+            'Screenshot saved to: /workspace/.browser_screenshots/screenshot_'
+            in observation_text
+        )
 
+        # Check the /workspace/.browser_screenshots folder
+        action_cmd = CmdRunAction(command='ls /workspace/.browser_screenshots')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert 'screenshot_' in obs.content
+        assert '.png' in obs.content
     finally:
         _close_test_runtime(runtime)