diff --git a/.github/workflows/ghcr-build.yml b/.github/workflows/ghcr-build.yml
index 5c2e6d111d..7948291d6e 100644
--- a/.github/workflows/ghcr-build.yml
+++ b/.github/workflows/ghcr-build.yml
@@ -291,7 +291,7 @@ jobs:
           SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
           TEST_IN_CI=true \
           RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:
@@ -368,7 +368,7 @@ jobs:
           SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
           TEST_IN_CI=true \
           RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:
diff --git a/poetry.lock b/poetry.lock
index d97ef683fe..85831d58dc 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -601,43 +601,6 @@ urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >
 [package.extras]
 crt = ["awscrt (==0.22.0)"]
 
-[[package]]
-name = "browsergym"
-version = "0.10.2"
-description = "BrowserGym: a gym environment for web task automation in the Chromium browser"
-optional = false
-python-versions = ">3.7"
-files = [
-    {file = "browsergym-0.10.2-py3-none-any.whl", hash = "sha256:9581d1d1f1fcd1cf35266cf30c881d60c147a0d374b3491eeaebb07d9690f868"},
-    {file = "browsergym-0.10.2.tar.gz", hash = "sha256:3cdd7520cca857421aa7ec0a965968df4bcef721299a424397f86d7cad078ab0"},
-]
-
-[package.dependencies]
-browsergym-assistantbench = "0.10.2"
-browsergym-core = "0.10.2"
-browsergym-experiments = "0.10.2"
-browsergym-miniwob = "0.10.2"
-browsergym-visualwebarena = "0.10.2"
-browsergym-webarena = "0.10.2"
-browsergym-workarena = ">=0.4.1"
-
-[[package]]
-name = "browsergym-assistantbench"
-version = "0.10.2"
-description = "AssistantBench benchmark for BrowserGym"
-optional = false
-python-versions = ">3.7"
-files = [
-    {file = "browsergym_assistantbench-0.10.2-py3-none-any.whl", hash = "sha256:af0d3a3e23686066b070feca38f8740262bed6d65ccf9098f393334a005987c0"},
-    {file = "browsergym_assistantbench-0.10.2.tar.gz", hash = "sha256:de18eb7c010403d5d467b927b4713b56f6e97a59493bee4c42599d4d7cb54dce"},
-]
-
-[package.dependencies]
-browsergym-core = "0.10.2"
-datasets = "*"
-numpy = "*"
-scipy = "*"
-
 [[package]]
 name = "browsergym-core"
 version = "0.10.2"
@@ -658,22 +621,6 @@ pillow = ">=10.1"
 playwright = ">=1.39,<2.0"
 pyparsing = ">=3"
 
-[[package]]
-name = "browsergym-experiments"
-version = "0.10.2"
-description = "Experimentation tools for BrowserGym"
-optional = false
-python-versions = ">3.7"
-files = [
-    {file = "browsergym_experiments-0.10.2-py3-none-any.whl", hash = "sha256:60a626b3159ef63b5ff72a6c8156c8f3cf82a9278dfc5a9d3ece39c2b1913595"},
-    {file = "browsergym_experiments-0.10.2.tar.gz", hash = "sha256:b49bc27f315ad12014ff21580c7c7aca6489ca4106e7ab46502f716674efa236"},
-]
-
-[package.dependencies]
-browsergym-core = "0.10.2"
-dataclasses-json = "*"
-tiktoken = ">=0.4"
-
 [[package]]
 name = "browsergym-miniwob"
 version = "0.10.2"
@@ -688,22 +635,6 @@ files = [
 [package.dependencies]
 browsergym-core = "0.10.2"
 
-[[package]]
-name = "browsergym-visualwebarena"
-version = "0.10.2"
-description = "VisualWebArena benchmark for BrowserGym"
-optional = false
-python-versions = ">3.7"
-files = [
-    {file = "browsergym_visualwebarena-0.10.2-py3-none-any.whl", hash = "sha256:87c913ccd4d12a79c625b5c4d9ead7e0bc50b298d19e413204bb586a67736d83"},
-    {file = "browsergym_visualwebarena-0.10.2.tar.gz", hash = "sha256:5f84a4f33a21106c9b650cecb0362b78af2546d9927255828c273fe800d776a1"},
-]
-
-[package.dependencies]
-browsergym-core = "0.10.2"
-libvisualwebarena = "0.0.14"
-requests = "*"
-
 [[package]]
 name = "browsergym-webarena"
 version = "0.10.2"
@@ -719,26 +650,6 @@ files = [
 browsergym-core = "0.10.2"
 libwebarena = "0.0.3"
 
-[[package]]
-name = "browsergym-workarena"
-version = "0.4.1"
-description = "WorkArena benchmark for BrowserGym"
-optional = false
-python-versions = ">3.7"
-files = [
-    {file = "browsergym_workarena-0.4.1-py3-none-any.whl", hash = "sha256:b8f04b2e3801fd32962b7d99f0685c507b258841e2b4bfdb46d041091d2f1b89"},
-    {file = "browsergym_workarena-0.4.1.tar.gz", hash = "sha256:ba2958d804b80836c7f81360d66b99c6c655c5070eddc5fae9c1c88306a23403"},
-]
-
-[package.dependencies]
-browsergym-core = ">=0.2"
-english-words = ">=2.0.1"
-faker = ">=24.8.0"
-numpy = ">=1.14"
-requests = ">=2.31"
-tenacity = ">=8.2.3"
-tqdm = ">=4.66.2"
-
 [[package]]
 name = "build"
 version = "1.2.2.post1"
@@ -1659,16 +1570,6 @@ typing-extensions = ">=4.8.0"
 urllib3 = ">=1.25.3"
 websockets = ">=11.0.3"
 
-[[package]]
-name = "english-words"
-version = "2.0.1"
-description = "Generate sets of english words by combining different word lists"
-optional = false
-python-versions = "*"
-files = [
-    {file = "english-words-2.0.1.tar.gz", hash = "sha256:a4105c57493bb757a3d8973fcf8e1dc05e7ca09c836dff467c3fb445f84bc43d"},
-]
-
 [[package]]
 name = "evaluate"
 version = "0.4.3"
@@ -1732,21 +1633,6 @@ files = [
 [package.extras]
 tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"]
 
-[[package]]
-name = "faker"
-version = "33.0.0"
-description = "Faker is a Python package that generates fake data for you."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "Faker-33.0.0-py3-none-any.whl", hash = "sha256:68e5580cb6b4226710886e595eabc13127149d6e71e9d1db65506a7fbe2c7fce"},
-    {file = "faker-33.0.0.tar.gz", hash = "sha256:9b01019c1ddaf2253ca2308c0472116e993f4ad8fc9905f82fa965e0c6f932e9"},
-]
-
-[package.dependencies]
-python-dateutil = ">=2.4"
-typing-extensions = "*"
-
 [[package]]
 name = "farama-notifications"
 version = "0.0.4"
@@ -3099,39 +2985,6 @@ files = [
 [package.extras]
 all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
 
-[[package]]
-name = "imageio"
-version = "2.36.0"
-description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats."
-optional = false
-python-versions = ">=3.9"
-files = [
-    {file = "imageio-2.36.0-py3-none-any.whl", hash = "sha256:471f1eda55618ee44a3c9960911c35e647d9284c68f077e868df633398f137f0"},
-    {file = "imageio-2.36.0.tar.gz", hash = "sha256:1c8f294db862c256e9562354d65aa54725b8dafed7f10f02bb3ec20ec1678850"},
-]
-
-[package.dependencies]
-numpy = "*"
-pillow = ">=8.3.2"
-
-[package.extras]
-all-plugins = ["astropy", "av", "imageio-ffmpeg", "numpy (>2)", "pillow-heif", "psutil", "rawpy", "tifffile"]
-all-plugins-pypy = ["av", "imageio-ffmpeg", "pillow-heif", "psutil", "tifffile"]
-build = ["wheel"]
-dev = ["black", "flake8", "fsspec[github]", "pytest", "pytest-cov"]
-docs = ["numpydoc", "pydata-sphinx-theme", "sphinx (<6)"]
-ffmpeg = ["imageio-ffmpeg", "psutil"]
-fits = ["astropy"]
-full = ["astropy", "av", "black", "flake8", "fsspec[github]", "gdal", "imageio-ffmpeg", "itk", "numpy (>2)", "numpydoc", "pillow-heif", "psutil", "pydata-sphinx-theme", "pytest", "pytest-cov", "rawpy", "sphinx (<6)", "tifffile", "wheel"]
-gdal = ["gdal"]
-itk = ["itk"]
-linting = ["black", "flake8"]
-pillow-heif = ["pillow-heif"]
-pyav = ["av"]
-rawpy = ["numpy (>2)", "rawpy"]
-test = ["fsspec[github]", "pytest", "pytest-cov"]
-tifffile = ["tifffile"]
-
 [[package]]
 name = "importlib-metadata"
 version = "7.1.0"
@@ -3885,52 +3738,6 @@ websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0"
 [package.extras]
 adal = ["adal (>=1.0.2)"]
 
-[[package]]
-name = "lazy-loader"
-version = "0.4"
-description = "Makes it easy to load subpackages and functions on demand."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"},
-    {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"},
-]
-
-[package.dependencies]
-packaging = "*"
-
-[package.extras]
-dev = ["changelist (==0.5)"]
-lint = ["pre-commit (==3.7.0)"]
-test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"]
-
-[[package]]
-name = "libvisualwebarena"
-version = "0.0.14"
-description = "This is an unofficial, use-at-your-own risks port of the visualwebarena benchmark, for use as a standalone library package."
-optional = false
-python-versions = "<4,>=3.7"
-files = [
-    {file = "libvisualwebarena-0.0.14-py3-none-any.whl", hash = "sha256:636b06ca1d52f1a363503b5b563492e83f2482efaf85bb26b69744565a499f0f"},
-    {file = "libvisualwebarena-0.0.14.tar.gz", hash = "sha256:7e660179f60f1df8d884204f2b742a2117e7fe050823d839ca5744ea1c0709a7"},
-]
-
-[package.dependencies]
-aiolimiter = "*"
-beartype = "0.12.0"
-evaluate = "*"
-flask = "*"
-gymnasium = "*"
-nltk = "*"
-openai = ">=1"
-Pillow = "*"
-playwright = ">=1.32,<1.40"
-scikit-image = ">=0.16"
-text-generation = "*"
-tiktoken = "*"
-transformers = "*"
-types-tqdm = "*"
-
 [[package]]
 name = "libwebarena"
 version = "0.0.3"
@@ -8114,54 +7921,6 @@ files = [
 attrs = ">=18.0.0"
 pathspec = ">=0.10.1"
 
-[[package]]
-name = "scikit-image"
-version = "0.24.0"
-description = "Image processing in Python"
-optional = false
-python-versions = ">=3.9"
-files = [
-    {file = "scikit_image-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cb3bc0264b6ab30b43c4179ee6156bc18b4861e78bb329dd8d16537b7bbf827a"},
-    {file = "scikit_image-0.24.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:9c7a52e20cdd760738da38564ba1fed7942b623c0317489af1a598a8dedf088b"},
-    {file = "scikit_image-0.24.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93f46e6ce42e5409f4d09ce1b0c7f80dd7e4373bcec635b6348b63e3c886eac8"},
-    {file = "scikit_image-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39ee0af13435c57351a3397eb379e72164ff85161923eec0c38849fecf1b4764"},
-    {file = "scikit_image-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:7ac7913b028b8aa780ffae85922894a69e33d1c0bf270ea1774f382fe8bf95e7"},
-    {file = "scikit_image-0.24.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:272909e02a59cea3ed4aa03739bb88df2625daa809f633f40b5053cf09241831"},
-    {file = "scikit_image-0.24.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:190ebde80b4470fe8838764b9b15f232a964f1a20391663e31008d76f0c696f7"},
-    {file = "scikit_image-0.24.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59c98cc695005faf2b79904e4663796c977af22586ddf1b12d6af2fa22842dc2"},
-    {file = "scikit_image-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c"},
-    {file = "scikit_image-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:dacf591ac0c272a111181afad4b788a27fe70d213cfddd631d151cbc34f8ca2c"},
-    {file = "scikit_image-0.24.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6fccceb54c9574590abcddc8caf6cefa57c13b5b8b4260ab3ff88ad8f3c252b3"},
-    {file = "scikit_image-0.24.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ccc01e4760d655aab7601c1ba7aa4ddd8b46f494ac46ec9c268df6f33ccddf4c"},
-    {file = "scikit_image-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18836a18d3a7b6aca5376a2d805f0045826bc6c9fc85331659c33b4813e0b563"},
-    {file = "scikit_image-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8579bda9c3f78cb3b3ed8b9425213c53a25fa7e994b7ac01f2440b395babf660"},
-    {file = "scikit_image-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:82ab903afa60b2da1da2e6f0c8c65e7c8868c60a869464c41971da929b3e82bc"},
-    {file = "scikit_image-0.24.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009"},
-    {file = "scikit_image-0.24.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:e9aadb442360a7e76f0c5c9d105f79a83d6df0e01e431bd1d5757e2c5871a1f3"},
-    {file = "scikit_image-0.24.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e37de6f4c1abcf794e13c258dc9b7d385d5be868441de11c180363824192ff7"},
-    {file = "scikit_image-0.24.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4688c18bd7ec33c08d7bf0fd19549be246d90d5f2c1d795a89986629af0a1e83"},
-    {file = "scikit_image-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:56dab751d20b25d5d3985e95c9b4e975f55573554bd76b0aedf5875217c93e69"},
-    {file = "scikit_image-0.24.0.tar.gz", hash = "sha256:5d16efe95da8edbeb363e0c4157b99becbd650a60b77f6e3af5768b66cf007ab"},
-]
-
-[package.dependencies]
-imageio = ">=2.33"
-lazy-loader = ">=0.4"
-networkx = ">=2.8"
-numpy = ">=1.23"
-packaging = ">=21"
-pillow = ">=9.1"
-scipy = ">=1.9"
-tifffile = ">=2022.8.12"
-
-[package.extras]
-build = ["Cython (>=3.0.4)", "build", "meson-python (>=0.15)", "ninja", "numpy (>=2.0.0rc1)", "packaging (>=21)", "pythran", "setuptools (>=67)", "spin (==0.8)", "wheel"]
-data = ["pooch (>=1.6.0)"]
-developer = ["ipython", "pre-commit", "tomli"]
-docs = ["PyWavelets (>=1.1.1)", "dask[array] (>=2022.9.2)", "ipykernel", "ipywidgets", "kaleido", "matplotlib (>=3.6)", "myst-parser", "numpydoc (>=1.7)", "pandas (>=1.5)", "plotly (>=5.10)", "pooch (>=1.6)", "pydata-sphinx-theme (>=0.15.2)", "pytest-doctestplus", "pytest-runner", "scikit-learn (>=1.1)", "seaborn (>=0.11)", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-gallery (>=0.14)", "sphinx_design (>=0.5)", "tifffile (>=2022.8.12)"]
-optional = ["PyWavelets (>=1.1.1)", "SimpleITK", "astropy (>=5.0)", "cloudpickle (>=0.2.1)", "dask[array] (>=2021.1.0)", "matplotlib (>=3.6)", "pooch (>=1.6.0)", "pyamg", "scikit-learn (>=1.1)"]
-test = ["asv", "numpydoc (>=1.7)", "pooch (>=1.6.0)", "pytest (>=7.0)", "pytest-cov (>=2.11.0)", "pytest-doctestplus", "pytest-faulthandler", "pytest-localserver"]
-
 [[package]]
 name = "scikit-learn"
 version = "1.5.2"
@@ -8843,28 +8602,6 @@ files = [
     {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"},
 ]
 
-[[package]]
-name = "tifffile"
-version = "2024.9.20"
-description = "Read and write TIFF files"
-optional = false
-python-versions = ">=3.10"
-files = [
-    {file = "tifffile-2024.9.20-py3-none-any.whl", hash = "sha256:c54dc85bc1065d972cb8a6ffb3181389d597876aa80177933459733e4ed243dd"},
-    {file = "tifffile-2024.9.20.tar.gz", hash = "sha256:3fbf3be2f995a7051a8ae05a4be70c96fc0789f22ed6f1c4104c973cf68a640b"},
-]
-
-[package.dependencies]
-numpy = "*"
-
-[package.extras]
-all = ["defusedxml", "fsspec", "imagecodecs (>=2023.8.12)", "lxml", "matplotlib", "zarr"]
-codecs = ["imagecodecs (>=2023.8.12)"]
-plot = ["matplotlib"]
-test = ["cmapfile", "czifile", "dask", "defusedxml", "fsspec", "imagecodecs", "lfdfiles", "lxml", "ndtiff", "oiffile", "psdtags", "pytest", "roifile", "xarray", "zarr"]
-xml = ["defusedxml", "lxml"]
-zarr = ["fsspec", "zarr"]
-
 [[package]]
 name = "tiktoken"
 version = "0.8.0"
@@ -10350,4 +10087,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "ff370b7b5077720b73fe3b90cc1b7fb9c7a262bfbd35885bb717369061e8a466"
+content-hash = "56a80082afb76e518239060855598921d94a0373123b2d9222cf8c7b6238b7ad"
diff --git a/pyproject.toml b/pyproject.toml
index ec148baadc..6f15945e22 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ uvicorn = "*"
 types-toml = "*"
 numpy = "*"
 json-repair = "*"
-browsergym = "0.10.2" # integrate browsergym as the browsing interface
+browsergym-core = "0.10.2" # integrate browsergym-core as the browsing interface
 html2text = "*"
 e2b = ">=0.17.1,<1.1.0"
 pexpect = "*"
@@ -63,6 +63,7 @@ opentelemetry-exporter-otlp-proto-grpc = "1.25.0"
 modal = "^0.66.26"
 runloop-api-client = "0.10.0"
 pygithub = "^2.5.0"
+joblib = "*"
 openhands-aci = "^0.1.1"
 python-socketio = "^5.11.4"
 redis = "^5.2.0"
@@ -142,6 +143,8 @@ gdown = "*"
 matplotlib = "*"
 seaborn = "*"
 tabulate = "*"
+browsergym-webarena = "0.10.2"
+browsergym-miniwob = "0.10.2"
 
 [tool.poetry-dynamic-versioning]
 enable = true
diff --git a/tests/runtime/test_browsergym_envs.py b/tests/runtime/test_browsergym_envs.py
new file mode 100644
index 0000000000..426ecacaf5
--- /dev/null
+++ b/tests/runtime/test_browsergym_envs.py
@@ -0,0 +1,73 @@
+import json
+
+import pytest
+
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action.browse import BrowseInteractiveAction
+from openhands.events.observation.browse import BrowserOutputObservation
+from tests.runtime.conftest import _close_test_runtime, _load_runtime
+
+
+def has_miniwob():
+    """Tell whether the optional ``browsergym.miniwob`` package can be located."""
+    try:
+        import importlib.util
+
+        # find_spec imports the parent ``browsergym`` package first, so a missing
+        # parent surfaces as ModuleNotFoundError (a subclass of ImportError).
+        miniwob_spec = importlib.util.find_spec('browsergym.miniwob')
+        if miniwob_spec is not None:
+            # Only builds a module object from the spec; the module is not executed.
+            importlib.util.module_from_spec(miniwob_spec)
+    except ImportError:
+        return False
+    return miniwob_spec is not None
+
+
+@pytest.mark.skipif(
+    not has_miniwob(),
+    reason='Requires browsergym-miniwob package to be installed',
+)
+def test_browsergym_eval_env(runtime_cls, temp_dir):
+    """Check the goal, page URL and reward of the MiniWoB choose-list eval env."""
+    runtime = _load_runtime(
+        temp_dir,
+        runtime_cls=runtime_cls,
+        run_as_openhands=False,  # need root permission to access file
+        base_container_image='xingyaoww/od-eval-miniwob:v1.0',
+        browsergym_eval_env='browsergym/miniwob.choose-list',
+        force_rebuild_runtime=True,
+    )
+    try:  # the finally clause closes the runtime even if a check below fails
+        from openhands.runtime.browser.browser_env import (
+            BROWSER_EVAL_GET_GOAL_ACTION,
+            BROWSER_EVAL_GET_REWARDS_ACTION,
+        )
+
+        # The goal action's output should contain the choose-list instructions
+        action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, BrowserOutputObservation)
+        assert not obs.error
+        assert 'Select' in obs.content
+        assert 'from the list and click Submit' in obs.content
+
+        # Make sure the browser can produce observation in eval env
+        action = BrowseInteractiveAction(browser_actions='noop()')
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        expected_url = 'file:///miniwob-plusplus/miniwob/html/miniwob/choose-list.html'
+        assert obs.url.strip() == expected_url
+
+        # Make sure the rewards are working
+        rewards_actions = BROWSER_EVAL_GET_REWARDS_ACTION
+        action = BrowseInteractiveAction(browser_actions=rewards_actions)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert json.loads(obs.content) == [0.0]
+    finally:
+        _close_test_runtime(runtime)
diff --git a/tests/runtime/test_browsing.py b/tests/runtime/test_browsing.py
index f24e37cd06..6097c89190 100644
--- a/tests/runtime/test_browsing.py
+++ b/tests/runtime/test_browsing.py
@@ -1,12 +1,9 @@
 """Browsing-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox."""
 
-import json
-
 from conftest import _close_test_runtime, _load_runtime
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import (
-    BrowseInteractiveAction,
     BrowseURLAction,
     CmdRunAction,
 )
@@ -16,7 +13,8 @@ from openhands.events.observation import (
 )
 
 # ============================================================================================================================
-# Browsing tests
+# Browsing tests that do not need the evaluation extras (poetry install --without evaluation)
+# Eval-environment tests live in test_browsergym_envs.py and require a full poetry install
 # ============================================================================================================================
 
 PY3_FOR_TESTING = '/openhands/micromamba/bin/micromamba run -n openhands python3'
@@ -66,48 +64,3 @@ def test_simple_browse(temp_dir, runtime_cls, run_as_openhands):
     assert obs.exit_code == 0
 
     _close_test_runtime(runtime)
-
-
-def test_browsergym_eval_env(runtime_cls, temp_dir):
-    runtime = _load_runtime(
-        temp_dir,
-        runtime_cls=runtime_cls,
-        run_as_openhands=False,  # need root permission to access file
-        base_container_image='xingyaoww/od-eval-miniwob:v1.0',
-        browsergym_eval_env='browsergym/miniwob.choose-list',
-        force_rebuild_runtime=True,
-    )
-    from openhands.runtime.browser.browser_env import (
-        BROWSER_EVAL_GET_GOAL_ACTION,
-        BROWSER_EVAL_GET_REWARDS_ACTION,
-    )
-
-    # Test browse
-    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-    assert isinstance(obs, BrowserOutputObservation)
-    assert not obs.error
-    assert 'Select' in obs.content
-    assert 'from the list and click Submit' in obs.content
-
-    # Make sure the browser can produce observation in eva[l
-    action = BrowseInteractiveAction(browser_actions='noop()')
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert (
-        obs.url.strip()
-        == 'file:///miniwob-plusplus/miniwob/html/miniwob/choose-list.html'
-    )
-
-    # Make sure the rewards are working
-    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert json.loads(obs.content) == [0.0]
-
-    _close_test_runtime(runtime)