From dfa54673d20a764fb6528e47329de55a513f5cba Mon Sep 17 00:00:00 2001
From: "Ryan H. Tran" <descience.thh10@gmail.com>
Date: Wed, 25 Jun 2025 12:36:15 +0700
Subject: [PATCH] [OH-Versa] Add remaining browsing & GAIA eval improvement
 (#9015)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
---
 evaluation/benchmarks/gaia/run_infer.py       |  73 ++++++--
 evaluation/benchmarks/gaia/utils.py           |  43 +++++
 evaluation/utils/shared.py                    |   4 +-
 openhands/core/schema/observation.py          |   3 +
 openhands/events/observation/__init__.py      |   2 +
 openhands/events/observation/browse.py        |   1 +
 openhands/events/observation/file_download.py |  21 +++
 openhands/events/serialization/observation.py |   2 +
 openhands/memory/conversation_memory.py       |  14 +-
 openhands/runtime/action_execution_server.py  |  44 ++++-
 openhands/runtime/browser/browser_env.py      |   6 +-
 openhands/runtime/browser/utils.py            |  21 ++-
 .../utils/runtime_templates/Dockerfile.j2     |   2 +-
 tests/runtime/test_aci_edit.py                |   4 +-
 tests/runtime/test_browsing.py                | 166 ++++++++++++++++++
 tests/runtime/test_mcp_action.py              |   6 +-
 16 files changed, 383 insertions(+), 29 deletions(-)
 create mode 100644 evaluation/benchmarks/gaia/utils.py
 create mode 100644 openhands/events/observation/file_download.py

diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py
index 81420eb33c..82b656e52c 100644
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@@ -3,13 +3,20 @@ import copy
 import functools
 import os
 import re
+import shutil
+import zipfile
 
 import huggingface_hub
 import pandas as pd
 from datasets import load_dataset
+from PIL import Image
 from pydantic import SecretStr
 
 from evaluation.benchmarks.gaia.scorer import question_scorer
+from evaluation.benchmarks.gaia.utils import (
+    image_to_jpg_base64_url,
+    image_to_png_base64_url,
+)
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
@@ -97,27 +104,44 @@ def initialize_runtime(
     if instance['file_name'] != '':
         # if this question comes with a file, we need to save it to the workspace
         assert metadata.data_split is not None
+        extension_name = instance['file_name'].split('.')[-1]
         src_file = os.path.join(
             DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
         )
         assert os.path.exists(src_file)
-        dest_file = os.path.join('/workspace', instance['file_name'])
-        runtime.copy_to(src_file, dest_file)
+        if extension_name == 'zip':
+            temp_dir = os.path.join(
+                DATASET_CACHE_DIR, '2023', metadata.data_split, 'tmp_file'
+            )
+            os.makedirs(temp_dir, exist_ok=True)
+            with zipfile.ZipFile(src_file, 'r') as zip_ref:
+                zip_ref.extractall(temp_dir)
+            for root, dirs, files in os.walk(temp_dir):
+                for file in files:
+                    dest_file = '/workspace'
+                    runtime.copy_to(os.path.join(root, file), dest_file)
+            shutil.rmtree(temp_dir)
+        elif extension_name not in ['jpg', 'png']:
+            dest_file = '/workspace'
+            runtime.copy_to(src_file, dest_file)
 
-        # rename to file.extension_name
-        extension_name = instance['file_name'].split('.')[-1]
-        action = CmdRunAction(
-            command=f'mv /workspace/{instance["file_name"]} /workspace/file.{extension_name}'
-        )
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        assert obs.exit_code == 0
+            # rename to file.extension_name
+            action = CmdRunAction(
+                command=f'mv /workspace/{instance["file_name"]} /workspace/file.{extension_name}'
+            )
+            logger.info(action, extra={'msg_type': 'ACTION'})
+            obs = runtime.run_action(action)
+            assert obs.exit_code == 0
 
     action = CmdRunAction(command='cd /workspace')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     assert obs.exit_code == 0
 
+    action = CmdRunAction(
+        command='apt-get update && apt-get install -y ffmpeg && apt-get install -y ffprobe'
+    )
+    runtime.run_action(action)
     logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')
 
 
@@ -151,8 +175,31 @@ Here is the task:
         task_question=instance['Question'],
     )
     logger.info(f'Instruction: {instruction}')
+    image_urls = []
     if dest_file:
-        instruction += f'\n\nThe mentioned file is provided in the workspace at: {dest_file.split("/")[-1]}'
+        if extension_name not in ['jpg', 'png', 'zip']:
+            instruction += f'To solve this task you will have to use the attached file provided in the workspace at location: {dest_file}\n\n'
+        elif extension_name == 'zip':
+            filenames = []
+            src_file = os.path.join(
+                DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
+            )
+            with zipfile.ZipFile(src_file, 'r') as zip_ref:
+                filenames = zip_ref.namelist()
+
+            filenames = [f'/workspace/{file}' for file in filenames]
+            filenames = ', '.join(filenames)
+            instruction += f'To solve this task you will have to use the attached files provided in the workspace at locations: {filenames}\n\n'
+        else:  # Image files: jpg, png
+            src_file = os.path.join(
+                DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
+            )
+            instruction += 'Image: To solve this task you will have to use the image shown below.\n\n'
+            image = Image.open(src_file)
+            if extension_name == 'jpg':
+                image_urls.append(image_to_jpg_base64_url(image))
+            else:
+                image_urls.append(image_to_png_base64_url(image))
 
     instruction += """IMPORTANT: When seeking information from a website, REFRAIN from arbitrary URL navigation. You should utilize the designated search engine tool with precise keywords to obtain relevant URLs or use the specific website's search interface. DO NOT navigate directly to specific URLs as they may not exist.\n\nFor example: if you want to search for a research paper on Arxiv, either use the search engine tool with specific keywords or navigate to arxiv.org and then use its interface.\n"""
     instruction += 'IMPORTANT: You should NEVER ask for Human Help.\n'
@@ -174,7 +221,9 @@ Here is the task:
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            initial_user_action=MessageAction(content=instruction),
+            initial_user_action=MessageAction(
+                content=instruction, image_urls=image_urls
+            ),
             runtime=runtime,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                 metadata.agent_class
diff --git a/evaluation/benchmarks/gaia/utils.py b/evaluation/benchmarks/gaia/utils.py
new file mode 100644
index 0000000000..5ca6f34ca1
--- /dev/null
+++ b/evaluation/benchmarks/gaia/utils.py
@@ -0,0 +1,43 @@
+import base64
+import io
+
+import numpy as np
+from PIL import Image
+
+
+def image_to_png_base64_url(
+    image: np.ndarray | Image.Image, add_data_prefix: bool = True
+):
+    """Encode an image (numpy array or PIL image) as base64 PNG.
+
+    Images in RGBA or LA mode are converted to RGB before saving.
+    Returns a ``data:image/png;base64,`` URL, or only the base64 payload
+    when ``add_data_prefix`` is False.
+    """
+    pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image
+    if pil_image.mode in {'RGBA', 'LA'}:
+        pil_image = pil_image.convert('RGB')
+    png_bytes = io.BytesIO()
+    pil_image.save(png_bytes, format='PNG')
+    payload = base64.b64encode(png_bytes.getvalue()).decode()
+    prefix = 'data:image/png;base64,' if add_data_prefix else ''
+    return f'{prefix}{payload}'
+
+
+def image_to_jpg_base64_url(
+    image: np.ndarray | Image.Image, add_data_prefix: bool = True
+):
+    """Encode an image (numpy array or PIL image) as a base64 JPEG URL.
+
+    Modes outside the set Pillow's JPEG writer accepts (e.g. RGBA, LA, P)
+    are converted to RGB first so that saving does not raise OSError.
+    """
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    if image.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
+        image = image.convert('RGB')
+    buffered = io.BytesIO()
+    image.save(buffered, format='JPEG')
+    image_base64 = base64.b64encode(buffered.getvalue()).decode()
+    prefix = 'data:image/jpeg;base64,' if add_data_prefix else ''
+    return f'{prefix}{image_base64}'
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index f2e9542abb..2338277d48 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -109,7 +109,7 @@ def codeact_user_response(
 ) -> str:
     encaps_str = (
         (
-            'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
+            'Your final answer MUST be encapsulated within <solution> and </solution>.\n'
             'For example: The answer to the question is <solution> 42 </solution>.\n'
         )
         if encapsulate_solution
@@ -117,7 +117,7 @@ def codeact_user_response(
     )
     msg = (
         'Please continue working on the task on whatever approach you think is suitable.\n'
-        'If you think you have solved the task, please first send your answer to user through message and then finish the interaction.\n'
+        'When you think you have solved the question, please use the finish tool and include your final answer in the message parameter of the finish tool.\n'
         f'{encaps_str}'
         'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
     )
diff --git a/openhands/core/schema/observation.py b/openhands/core/schema/observation.py
index 5955c19884..aefeb512ca 100644
--- a/openhands/core/schema/observation.py
+++ b/openhands/core/schema/observation.py
@@ -52,3 +52,6 @@ class ObservationType(str, Enum):
 
     MCP = 'mcp'
     """Result of a MCP Server operation"""
+
+    DOWNLOAD = 'download'
+    """Result of downloading/opening a file via the browser"""
diff --git a/openhands/events/observation/__init__.py b/openhands/events/observation/__init__.py
index 2334cc0095..cc4e3637c5 100644
--- a/openhands/events/observation/__init__.py
+++ b/openhands/events/observation/__init__.py
@@ -16,6 +16,7 @@ from openhands.events.observation.empty import (
     NullObservation,
 )
 from openhands.events.observation.error import ErrorObservation
+from openhands.events.observation.file_download import FileDownloadObservation
 from openhands.events.observation.files import (
     FileEditObservation,
     FileReadObservation,
@@ -46,4 +47,5 @@ __all__ = [
     'RecallObservation',
     'RecallType',
     'MCPObservation',
+    'FileDownloadObservation',
 ]
diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py
index dcecd86123..9a565767ac 100644
--- a/openhands/events/observation/browse.py
+++ b/openhands/events/observation/browse.py
@@ -32,6 +32,7 @@ class BrowserOutputObservation(Observation):
     last_browser_action: str = ''
     last_browser_action_error: str = ''
     focused_element_bid: str = ''
+    filter_visible_only: bool = False
 
     @property
     def message(self) -> str:
diff --git a/openhands/events/observation/file_download.py b/openhands/events/observation/file_download.py
new file mode 100644
index 0000000000..f80b6019d4
--- /dev/null
+++ b/openhands/events/observation/file_download.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+
+from openhands.core.schema import ObservationType
+from openhands.events.observation.observation import Observation
+
+
+@dataclass
+class FileDownloadObservation(Observation):
+    """Observation carrying the path of a downloaded file."""
+
+    file_path: str
+    observation: str = ObservationType.DOWNLOAD
+
+    @property
+    def message(self) -> str:
+        return f'Downloaded the file at location: {self.file_path}'
+
+    def __str__(self) -> str:
+        header = '**FileDownloadObservation**\n'
+        location = f'Location of downloaded file: {self.file_path}\n'
+        return header + location
diff --git a/openhands/events/serialization/observation.py b/openhands/events/serialization/observation.py
index 98efad0d76..4722fcd03f 100644
--- a/openhands/events/serialization/observation.py
+++ b/openhands/events/serialization/observation.py
@@ -20,6 +20,7 @@ from openhands.events.observation.empty import (
     NullObservation,
 )
 from openhands.events.observation.error import ErrorObservation
+from openhands.events.observation.file_download import FileDownloadObservation
 from openhands.events.observation.files import (
     FileEditObservation,
     FileReadObservation,
@@ -47,6 +48,7 @@ observations = (
     AgentThinkObservation,
     RecallObservation,
     MCPObservation,
+    FileDownloadObservation,
 )
 
 OBSERVATION_TYPE_TO_CLASS = {
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index 4d8e45b4a5..7d226816d9 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -28,6 +28,7 @@ from openhands.events.observation import (
     AgentThinkObservation,
     BrowserOutputObservation,
     CmdOutputObservation,
+    FileDownloadObservation,
     FileEditObservation,
     FileReadObservation,
     IPythonRunCellObservation,
@@ -288,7 +289,12 @@ class ConversationMemory:
             role = 'user' if action.source == 'user' else 'assistant'
             content = [TextContent(text=action.content or '')]
             if vision_is_active and action.image_urls:
-                content.append(ImageContent(image_urls=action.image_urls))
+                if role == 'user':
+                    for idx, url in enumerate(action.image_urls):
+                        content.append(TextContent(text=f'Image {idx + 1}:'))
+                        content.append(ImageContent(image_urls=[url]))
+                else:
+                    content.append(ImageContent(image_urls=action.image_urls))
             if role not in ('user', 'system', 'assistant', 'tool'):
                 raise ValueError(f'Invalid role: {role}')
             return [
@@ -339,6 +345,7 @@ class ConversationMemory:
         - AgentDelegateObservation: Formats results from delegated agent tasks
         - ErrorObservation: Formats error messages from failed actions
         - UserRejectObservation: Formats user rejection messages
+        - FileDownloadObservation: Formats the result of a browsing action that opened/downloaded a file
 
         In function calling mode, observations with tool_call_metadata are stored in
         tool_call_id_to_message for later processing instead of being returned immediately.
@@ -429,7 +436,7 @@ class ConversationMemory:
                 and enable_som_visual_browsing
                 and vision_is_active
             ):
-                text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
+                text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. However, the Accessibility tree contains information from the entire webpage.)\n'
 
                 # Determine which image to use and validate it
                 image_url = None
@@ -492,6 +499,9 @@ class ConversationMemory:
         elif isinstance(obs, AgentCondensationObservation):
             text = truncate_content(obs.content, max_message_chars)
             message = Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, FileDownloadObservation):
+            text = truncate_content(obs.content, max_message_chars)
+            message = Message(role='user', content=[TextContent(text=text)])
         elif (
             isinstance(obs, RecallObservation)
             and self.agent_config.enable_prompt_extensions
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
index 2b628339b5..6da0f5e1bb 100644
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@@ -20,6 +20,7 @@ from contextlib import asynccontextmanager
 from pathlib import Path
 from zipfile import ZipFile
 
+import puremagic
 from binaryornot.check import is_binary
 from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
 from fastapi.exceptions import RequestValidationError
@@ -51,6 +52,7 @@ from openhands.events.event import FileEditSource, FileReadSource
 from openhands.events.observation import (
     CmdOutputObservation,
     ErrorObservation,
+    FileDownloadObservation,
     FileEditObservation,
     FileReadObservation,
     FileWriteObservation,
@@ -193,6 +195,8 @@ class ActionExecutor:
         self.start_time = time.time()
         self.last_execution_time = self.start_time
         self._initialized = False
+        self.downloaded_files: list[str] = []
+        self.downloads_directory = '/workspace/.downloads'
 
         self.max_memory_gb: int | None = None
         if _override_max_memory_gb := os.environ.get('RUNTIME_MAX_MEMORY_GB', None):
@@ -603,7 +607,45 @@ class ActionExecutor:
                 'Browser functionality is not supported on Windows.'
             )
         await self._ensure_browser_ready()
-        return await browse(action, self.browser, self.initial_cwd)
+        browser_observation = await browse(action, self.browser, self.initial_cwd)
+        if not browser_observation.error:
+            return browser_observation
+        else:
+            curr_files = os.listdir(self.downloads_directory)
+            new_download = False
+            for file in curr_files:
+                if file not in self.downloaded_files:
+                    new_download = True
+                    self.downloaded_files.append(file)
+                    break  # FIXME: assuming only one file will be downloaded for simplicity
+
+            if not new_download:
+                return browser_observation
+            else:
+                # A new file is downloaded in self.downloads_directory, shift file to /workspace
+                src_path = os.path.join(
+                    self.downloads_directory, self.downloaded_files[-1]
+                )
+                # Guess extension of file using puremagic and add it to tgt_path file name
+                file_ext = ''
+                try:
+                    guesses = puremagic.magic_file(src_path)
+                    if len(guesses) > 0:
+                        ext = guesses[0].extension.strip()
+                        if len(ext) > 0:
+                            file_ext = ext
+                except Exception as _:
+                    pass
+
+                tgt_path = os.path.join(
+                    '/workspace', f'file_{len(self.downloaded_files)}{file_ext}'
+                )
+                shutil.copy(src_path, tgt_path)
+                file_download_obs = FileDownloadObservation(
+                    content=f'Execution of the previous action {action.browser_actions} resulted in a file download. The downloaded file is saved at location: {tgt_path}',
+                    file_path=tgt_path,
+                )
+                return file_download_obs
 
     def close(self):
         self.memory_monitor.stop_monitoring()
diff --git a/openhands/runtime/browser/browser_env.py b/openhands/runtime/browser/browser_env.py
index e3dfc4c7cc..55e3ce1890 100644
--- a/openhands/runtime/browser/browser_env.py
+++ b/openhands/runtime/browser/browser_env.py
@@ -94,6 +94,9 @@ class BrowserEnv:
                 headless=True,
                 disable_env_checker=True,
                 tags_to_mark='all',
+                timeout=100000,
+                pw_context_kwargs={'accept_downloads': True},
+                pw_chromium_kwargs={'downloads_path': '/workspace/.downloads/'},
             )
         obs, info = env.reset()
 
@@ -105,6 +108,7 @@ class BrowserEnv:
         if self.eval_mode:
             self.eval_goal = obs['goal']
             if 'goal_object' in obs:
+                obs['goal_object'] = list(obs['goal_object'])
                 if len(obs['goal_object']) > 0:
                     self.eval_goal = obs['goal_object'][0]['text']
                 for message in obs['goal_object']:
@@ -182,7 +186,7 @@ class BrowserEnv:
                     pass
                 return
 
-    def step(self, action_str: str, timeout: float = 100) -> dict:
+    def step(self, action_str: str, timeout: float = 120) -> dict:
         """Execute an action in the browser environment and return the observation."""
         unique_request_id = str(uuid.uuid4())
         self.agent_side.send((unique_request_id, {'action': action_str}))
diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py
index cb8be5509d..378e3fea32 100644
--- a/openhands/runtime/browser/utils.py
+++ b/openhands/runtime/browser/utils.py
@@ -59,13 +59,22 @@ def get_agent_obs_text(obs: BrowserOutputObservation) -> str:
             cur_axtree_txt = get_axtree_str(
                 obs.axtree_object,
                 obs.extra_element_properties,
-                filter_visible_only=False,
-            )
-            text += (
-                f'============== BEGIN accessibility tree ==============\n'
-                f'{cur_axtree_txt}\n'
-                f'============== END accessibility tree ==============\n'
+                filter_visible_only=obs.filter_visible_only,
             )
+            if not obs.filter_visible_only:
+                text += (
+                    f'Accessibility tree of the COMPLETE webpage:\nNote: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.\n'
+                    f'============== BEGIN accessibility tree ==============\n'
+                    f'{cur_axtree_txt}\n'
+                    f'============== END accessibility tree ==============\n'
+                )
+            else:
+                text += (
+                    f'Accessibility tree of the VISIBLE portion of the webpage (accessibility tree of complete webpage is too large and you may need to scroll to view remaining portion of the webpage):\nNote: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.\n'
+                    f'============== BEGIN accessibility tree ==============\n'
+                    f'{cur_axtree_txt}\n'
+                    f'============== END accessibility tree ==============\n'
+                )
         except Exception as e:
             text += f'\n[Error encountered when processing the accessibility tree: {e}]'
         return text
diff --git a/openhands/runtime/utils/runtime_templates/Dockerfile.j2 b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
index d192c180ab..f9561574b9 100644
--- a/openhands/runtime/utils/runtime_templates/Dockerfile.j2
+++ b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
@@ -15,7 +15,7 @@ ENV POETRY_VIRTUALENVS_PATH=/openhands/poetry \
 # Install base system dependencies
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
-        wget curl ca-certificates sudo apt-utils git jq tmux build-essential ripgrep \
+        wget curl ca-certificates sudo apt-utils git jq tmux build-essential ripgrep ffmpeg \
         {%- if 'ubuntu' in base_image and (base_image.endswith(':latest') or base_image.endswith(':24.04')) -%}
         libgl1 \
         {%- else %}
diff --git a/tests/runtime/test_aci_edit.py b/tests/runtime/test_aci_edit.py
index 71a4b9a576..68d2f8c0c5 100644
--- a/tests/runtime/test_aci_edit.py
+++ b/tests/runtime/test_aci_edit.py
@@ -59,7 +59,9 @@ def test_view_directory(temp_dir, runtime_cls, run_as_openhands):
             obs.content
             == f"""Here's the files and directories up to 2 levels deep in {config.workspace_mount_path_in_sandbox}, excluding hidden items:
 {config.workspace_mount_path_in_sandbox}/
-{config.workspace_mount_path_in_sandbox}/test.txt"""
+{config.workspace_mount_path_in_sandbox}/test.txt
+
+1 hidden files/directories in this directory are excluded. You can use 'ls -la /workspace' to see them."""  # The hidden dir is the /workspace/.downloads
         )
 
     finally:
diff --git a/tests/runtime/test_browsing.py b/tests/runtime/test_browsing.py
index df42f5dff4..49b1ef7847 100644
--- a/tests/runtime/test_browsing.py
+++ b/tests/runtime/test_browsing.py
@@ -14,6 +14,7 @@ from openhands.events.action import (
 from openhands.events.observation import (
     BrowserOutputObservation,
     CmdOutputObservation,
+    FileDownloadObservation,
 )
 
 # ============================================================================================================================
@@ -215,3 +216,168 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
         assert '.png' in obs.content
     finally:
         _close_test_runtime(runtime)
+
+
+@pytest.mark.skipif(
+    os.environ.get('TEST_RUNTIME') == 'cli',
+    reason='CLIRuntime does not support browsing actions',
+)
+def test_download_file(temp_dir, runtime_cls, run_as_openhands):
+    """Serve a PDF over a local HTTP server, browse to it, and expect a FileDownloadObservation plus a renamed copy in /workspace."""
+    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Minimal PDF content for testing (NOTE(review): the '<<' dictionary openers look missing before each /Type line - confirm against upstream)
+        pdf_content = b"""%PDF-1.4
+        1 0 obj
+
+        /Type /Catalog
+        /Pages 2 0 R
+        >>
+        endobj
+        2 0 obj
+
+        /Type /Pages
+        /Kids [3 0 R]
+        /Count 1
+        >>
+        endobj
+        3 0 obj
+
+        /Type /Page
+        /Parent 2 0 R
+        /MediaBox [0 0 612 792]
+        >>
+        endobj
+        xref
+        0 4
+        0000000000 65535 f
+        0000000010 00000 n
+        0000000053 00000 n
+        0000000125 00000 n
+        trailer
+
+        /Size 4
+        /Root 1 0 R
+        >>
+        startxref
+        212
+        %%EOF"""
+
+        test_file_name = 'test_download.pdf'
+        test_file_path = os.path.join(temp_dir, test_file_name)
+        with open(test_file_path, 'wb') as f:
+            f.write(pdf_content)
+
+        # Copy the PDF into the sandbox workspace directory
+        sandbox_dir = config.workspace_mount_path_in_sandbox
+        runtime.copy_to(test_file_path, sandbox_dir)
+
+        # Create a simple HTML page with a download link pointing at the PDF
+        html_content = f"""
+        <!DOCTYPE html>
+        <html>
+        <head>
+            <title>Download Test</title>
+        </head>
+        <body>
+            <h1>Download Test Page</h1>
+            <p>Click the link below to download the test file:</p>
+            <a href="/{test_file_name}" download="{test_file_name}" id="download-link">Download Test File</a>
+        </body>
+        </html>
+        """
+
+        html_file_path = os.path.join(temp_dir, 'download_test.html')
+        with open(html_file_path, 'w') as f:
+            f.write(html_content)
+
+        # Copy the HTML file into the same sandbox directory as the PDF
+        runtime.copy_to(html_file_path, sandbox_dir)
+
+        # Verify both files show up in the sandbox's current working directory
+        action_cmd = CmdRunAction(command='ls -alh')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert test_file_name in obs.content
+        assert 'download_test.html' in obs.content
+
+        # Ensure downloads directory exists
+        action_cmd = CmdRunAction(command='mkdir -p /workspace/.downloads')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+
+        # Start an HTTP server on port 8000 in the background, logging to server.log
+        action_cmd = CmdRunAction(
+            command='python3 -m http.server 8000 > server.log 2>&1 &'
+        )
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+
+        # Give the server a fixed 2 seconds to start (no readiness check is performed)
+        action_cmd = CmdRunAction(command='sleep 2')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # Browse to the HTML page
+        action_browse = BrowseURLAction(url='http://localhost:8000/download_test.html')
+        logger.info(action_browse, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_browse)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # A regular page must still produce an error-free BrowserOutputObservation
+        assert isinstance(obs, BrowserOutputObservation)
+        assert 'http://localhost:8000/download_test.html' in obs.url
+        assert not obs.error
+        assert 'Download Test Page' in obs.content
+
+        # Go to the PDF file url directly - this should trigger download
+        file_url = f'http://localhost:8000/{test_file_name}'
+        action_browse = BrowseInteractiveAction(
+            browser_actions=f'goto("{file_url}")',
+        )
+        logger.info(action_browse, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_browse)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # Navigating to the PDF must yield a FileDownloadObservation naming the renamed file
+        downloaded_file_name = 'file_1.pdf'
+        assert isinstance(obs, FileDownloadObservation)
+        assert 'Location of downloaded file:' in str(obs)
+        assert downloaded_file_name in str(obs)  # File is renamed
+
+        # Extra 3-second grace period before listing /workspace (the observation above was already returned)
+        action_cmd = CmdRunAction(command='sleep 3')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        # Check that the renamed file is present in /workspace
+        action_cmd = CmdRunAction(command='ls -la /workspace')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
+        assert downloaded_file_name in obs.content
+
+        # Clean up: stop the HTTP server and remove its log (not reached if an assertion above fails, since this is inside the try body)
+        action_cmd = CmdRunAction(command='pkill -f "python3 -m http.server" || true')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+
+        action_cmd = CmdRunAction(command='rm -f server.log')
+        logger.info(action_cmd, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action_cmd)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    finally:
+        _close_test_runtime(runtime)
diff --git a/tests/runtime/test_mcp_action.py b/tests/runtime/test_mcp_action.py
index 9eb10b9bf0..983f2a22eb 100644
--- a/tests/runtime/test_mcp_action.py
+++ b/tests/runtime/test_mcp_action.py
@@ -160,7 +160,7 @@ async def test_fetch_mcp_via_stdio(temp_dir, runtime_cls, run_as_openhands):
     assert result_json['content'][0]['type'] == 'text'
     assert (
         result_json['content'][0]['text']
-        == 'Contents of http://localhost:8000/:\n---\n\n* <server.log>\n\n---'
+        == 'Contents of http://localhost:8000/:\n---\n\n* <.downloads/>\n* <server.log>\n\n---'
     )
 
     runtime.close()
@@ -269,7 +269,7 @@ async def test_both_stdio_and_sse_mcp(
         assert result_json['content'][0]['type'] == 'text'
         assert (
             result_json['content'][0]['text']
-            == 'Contents of http://localhost:8000/:\n---\n\n* <server.log>\n\n---'
+            == 'Contents of http://localhost:8000/:\n---\n\n* <.downloads/>\n* <server.log>\n\n---'
         )
     finally:
         if runtime:
@@ -354,7 +354,7 @@ async def test_microagent_and_one_stdio_mcp_in_config(
         assert result_json['content'][0]['type'] == 'text'
         assert (
             result_json['content'][0]['text']
-            == 'Contents of http://localhost:8000/:\n---\n\n* <server.log>\n\n---'
+            == 'Contents of http://localhost:8000/:\n---\n\n* <.downloads/>\n* <server.log>\n\n---'
         )
     finally:
         if runtime: