diff --git a/.gitignore b/.gitignore index f61f3ced52..f9fb4dc408 100644 --- a/.gitignore +++ b/.gitignore @@ -257,3 +257,5 @@ containers/runtime/code # test results test-results + +.eval_sessions diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py index ff25172722..ed9c77ee7e 100644 --- a/evaluation/benchmarks/EDA/run_infer.py +++ b/evaluation/benchmarks/EDA/run_infer.py @@ -9,8 +9,8 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, - get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -61,18 +61,15 @@ AGENT_CLS_TO_INST_SUFFIX = { def get_config( metadata: EvalMetadata, ) -> OpenHandsConfig: - sandbox_config = get_default_sandbox_config_for_eval() - sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + # Create config with EDA-specific container image + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, ) + + # Override the container image for EDA + config.sandbox.base_container_image = 'python:3.12-bookworm' + config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py index 562df0024a..6a112c6abb 100644 --- a/evaluation/benchmarks/agent_bench/run_infer.py +++ b/evaluation/benchmarks/agent_bench/run_infer.py @@ -17,8 +17,8 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, - get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -41,19 +41,12 @@ from openhands.utils.async_utils import call_async_from_sync def get_config( metadata: EvalMetadata, ) -> OpenHandsConfig: - sandbox_config = get_default_sandbox_config_for_eval() - sandbox_config.base_container_image = 'python:3.12-slim' + # Create config with agent_bench-specific container image + config = get_openhands_config_for_eval(metadata=metadata) + + # Override the container image for agent_bench + config.sandbox.base_container_image = 'python:3.12-slim' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'docker'), - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, - ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index 338315747d..f9eb65a9f8 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -18,6 +18,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -50,15 +51,10 @@ def get_config( ) -> OpenHandsConfig: 
sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.11-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, + sandbox_config=sandbox_config, runtime=os.environ.get('RUNTIME', 'docker'), - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py index 3dbc632a1b..39a01e2e4f 100644 --- a/evaluation/benchmarks/biocoder/run_infer.py +++ b/evaluation/benchmarks/biocoder/run_infer.py @@ -16,6 +16,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -61,15 +62,10 @@ def get_config( sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py index 71886a0406..2f04011cc8 100644 --- a/evaluation/benchmarks/bird/run_infer.py +++ b/evaluation/benchmarks/bird/run_infer.py @@ -19,6 +19,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -75,15 +76,10 @@ def get_config( sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py index e6eb259d86..9f59817aed 100644 --- a/evaluation/benchmarks/browsing_delegation/run_infer.py +++ b/evaluation/benchmarks/browsing_delegation/run_infer.py @@ -12,6 +12,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -40,14 +41,8 @@ def get_config( ) sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - 
runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - workspace_base=None, - workspace_mount_path=None, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', sandbox_config=sandbox_config ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/commit0/run_infer.py b/evaluation/benchmarks/commit0/run_infer.py index bad4a735cd..fb125498c3 100644 --- a/evaluation/benchmarks/commit0/run_infer.py +++ b/evaluation/benchmarks/commit0/run_infer.py @@ -17,6 +17,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -114,16 +115,11 @@ def get_config( sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = base_container_image - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, - enable_browser=RUN_WITH_BROWSING, + config = get_openhands_config_for_eval( + metadata=metadata, + sandbox_config=sandbox_config, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + enable_browser=RUN_WITH_BROWSING, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py index 448bc18e7b..1ffacc4543 100644 --- a/evaluation/benchmarks/discoverybench/run_infer.py +++ b/evaluation/benchmarks/discoverybench/run_infer.py @@ -18,6 +18,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -65,15 +66,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py index a7e69489c3..480df59bd2 100644 --- a/evaluation/benchmarks/gaia/run_infer.py +++ b/evaluation/benchmarks/gaia/run_infer.py @@ -23,6 +23,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -60,15 +61,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'nikolaik/python-nodejs:python3.12-nodejs22' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, + sandbox_config=sandbox_config, runtime='docker', - max_iterations=metadata.max_iterations, - 
sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, ) config.set_llm_config(metadata.llm_config) if metadata.agent_config: diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py index 87a71b4f00..932b0f6f75 100644 --- a/evaluation/benchmarks/gorilla/run_infer.py +++ b/evaluation/benchmarks/gorilla/run_infer.py @@ -13,6 +13,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -43,15 +44,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py index 28ac39588a..ce1cc3c8d0 100644 --- a/evaluation/benchmarks/gpqa/run_infer.py +++ b/evaluation/benchmarks/gpqa/run_infer.py @@ -31,6 +31,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -64,15 +65,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py index 8cf8b7efa1..7b882eab88 100644 --- a/evaluation/benchmarks/humanevalfix/run_infer.py +++ b/evaluation/benchmarks/humanevalfix/run_infer.py @@ -24,6 +24,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -85,15 +86,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git 
a/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py b/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py index 2aad6fb1b5..9363651545 100644 --- a/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py +++ b/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py @@ -16,6 +16,7 @@ import ruamel.yaml from evaluation.utils.shared import ( EvalMetadata, get_default_sandbox_config_for_eval, + get_openhands_config_for_eval, make_metadata, ) from openhands.core.config import ( @@ -37,15 +38,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/lca_ci_build_repair/run_infer.py b/evaluation/benchmarks/lca_ci_build_repair/run_infer.py index cbb53259ba..e6938301e0 100644 --- a/evaluation/benchmarks/lca_ci_build_repair/run_infer.py +++ b/evaluation/benchmarks/lca_ci_build_repair/run_infer.py @@ -23,6 +23,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -48,15 +49,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py index 23fa4e1d88..86fe82e6db 100644 --- a/evaluation/benchmarks/logic_reasoning/run_infer.py +++ b/evaluation/benchmarks/logic_reasoning/run_infer.py @@ -11,6 +11,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -52,15 +53,10 @@ def get_config( '$OH_INTERPRETER_PATH -m pip install scitools-pyke' ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py index ef2fdc1412..a0e24a1bbb 100644 --- a/evaluation/benchmarks/miniwob/run_infer.py +++ 
b/evaluation/benchmarks/miniwob/run_infer.py @@ -14,6 +14,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -58,15 +59,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime=os.environ.get('RUNTIME', 'docker'), - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py index 72031e8d7b..890c4e5c0a 100644 --- a/evaluation/benchmarks/mint/run_infer.py +++ b/evaluation/benchmarks/mint/run_infer.py @@ -16,6 +16,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -110,15 +111,10 @@ def get_config( f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}' ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py index 32a8f76ca9..48e3d6785a 100644 --- a/evaluation/benchmarks/ml_bench/run_infer.py +++ b/evaluation/benchmarks/ml_bench/run_infer.py @@ -27,6 +27,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -80,15 +81,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/multi_swe_bench/eval_infer.py b/evaluation/benchmarks/multi_swe_bench/eval_infer.py index 74364f2fe5..22fdcc764b 100644 --- a/evaluation/benchmarks/multi_swe_bench/eval_infer.py +++ b/evaluation/benchmarks/multi_swe_bench/eval_infer.py @@ -23,6 +23,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, get_default_sandbox_config_for_eval, + get_openhands_config_for_eval, prepare_dataset, reset_logger_for_multiprocessing, 
run_evaluation, @@ -87,13 +88,9 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig: dataset_name=metadata.dataset, instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - run_as_openhands=False, + config = get_openhands_config_for_eval( runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) return config diff --git a/evaluation/benchmarks/multi_swe_bench/run_infer.py b/evaluation/benchmarks/multi_swe_bench/run_infer.py index ca33f65298..db6524a8d5 100644 --- a/evaluation/benchmarks/multi_swe_bench/run_infer.py +++ b/evaluation/benchmarks/multi_swe_bench/run_infer.py @@ -21,6 +21,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -341,16 +342,11 @@ def get_config( instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, + config = get_openhands_config_for_eval( + metadata=metadata, enable_browser=RUN_WITH_BROWSING, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/nocode_bench/run_infer_nc.py b/evaluation/benchmarks/nocode_bench/run_infer_nc.py index 0022ed484d..3c3d40bdfc 100644 --- a/evaluation/benchmarks/nocode_bench/run_infer_nc.py +++ b/evaluation/benchmarks/nocode_bench/run_infer_nc.py @@ -31,6 +31,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -174,15 +175,10 @@ def get_config( instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, + config = get_openhands_config_for_eval( + metadata=metadata, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py index 76b937308b..beabf3783f 100644 --- a/evaluation/benchmarks/scienceagentbench/run_infer.py +++ b/evaluation/benchmarks/scienceagentbench/run_infer.py @@ -13,6 +13,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -64,16 +65,10 @@ def get_config( sandbox_config.base_container_image = ( 'docker.io/xingyaoww/openhands-eval-scienceagentbench' ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime=os.environ.get('RUNTIME', 'docker'), - max_budget_per_task=4, - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) 
config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py index 2fdc3e2e2f..46f3629be8 100644 --- a/evaluation/benchmarks/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -19,6 +19,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, get_default_sandbox_config_for_eval, + get_openhands_config_for_eval, prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, @@ -83,13 +84,9 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig: dataset_name=metadata.dataset, instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - run_as_openhands=False, + config = get_openhands_config_for_eval( runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) return config diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index 4413f07d3b..3dbed38dbd 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -32,6 +32,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -227,16 +228,11 @@ def get_config( instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, + config = get_openhands_config_for_eval( + metadata=metadata, enable_browser=RUN_WITH_BROWSING, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( diff --git a/evaluation/benchmarks/swe_bench/run_localize.py b/evaluation/benchmarks/swe_bench/run_localize.py index 0c34991577..2f7f09912a 100644 --- a/evaluation/benchmarks/swe_bench/run_localize.py +++ b/evaluation/benchmarks/swe_bench/run_localize.py @@ -20,6 +20,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -199,16 +200,11 @@ def get_config( 'REPO_PATH': f'/workspace/{workspace_dir_name}/', } - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, + config = get_openhands_config_for_eval( + metadata=metadata, enable_browser=RUN_WITH_BROWSING, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/testgeneval/eval_infer.py b/evaluation/benchmarks/testgeneval/eval_infer.py index 27aaaee1ac..1004a2d55f 100644 --- a/evaluation/benchmarks/testgeneval/eval_infer.py +++ b/evaluation/benchmarks/testgeneval/eval_infer.py @@ -37,6 +37,7 @@ from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + get_openhands_config_for_eval, prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, @@ -58,20 +59,21 
@@ def get_config(instance: pd.Series) -> OpenHandsConfig: f'Invalid container image for instance {instance["instance_id_swebench"]}.' ) logger.info(f'Using instance container image: {base_container_image}.') - return OpenHandsConfig( - run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'eventstream'), - sandbox=SandboxConfig( - base_container_image=base_container_image, - use_host_network=False, - timeout=1800, - api_key=os.environ.get('ALLHANDS_API_KEY'), - remote_runtime_api_url=os.environ.get( - 'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000' - ), + + # Create custom sandbox config for testgeneval with specific requirements + sandbox_config = SandboxConfig( + base_container_image=base_container_image, + use_host_network=False, + timeout=1800, # Longer timeout than default (300) + api_key=os.environ.get('ALLHANDS_API_KEY'), + remote_runtime_api_url=os.environ.get( + 'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000' ), - workspace_base=None, - workspace_mount_path=None, + ) + + return get_openhands_config_for_eval( + sandbox_config=sandbox_config, + runtime=os.environ.get('RUNTIME', 'docker'), # Different default runtime ) diff --git a/evaluation/benchmarks/testgeneval/run_infer.py b/evaluation/benchmarks/testgeneval/run_infer.py index 39288ff537..c8171cca94 100644 --- a/evaluation/benchmarks/testgeneval/run_infer.py +++ b/evaluation/benchmarks/testgeneval/run_infer.py @@ -25,6 +25,7 @@ from evaluation.utils.shared import ( assert_and_raise, codeact_user_response, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -126,29 +127,26 @@ def get_config( f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.' ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, - runtime=os.environ.get('RUNTIME', 'eventstream'), - sandbox=SandboxConfig( - base_container_image=base_container_image, - enable_auto_lint=True, - use_host_network=False, - # large enough timeout, since some testcases take very long to run - timeout=300, - # Add platform to the sandbox config to solve issue 4401 - platform='linux/amd64', - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get( - 'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000' - ), - keep_runtime_alive=False, - remote_runtime_init_timeout=3600, + sandbox_config = SandboxConfig( + base_container_image=base_container_image, + enable_auto_lint=True, + use_host_network=False, + # large enough timeout, since some testcases take very long to run + timeout=300, + # Add platform to the sandbox config to solve issue 4401 + platform='linux/amd64', + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get( + 'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000' ), - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + keep_runtime_alive=False, + remote_runtime_init_timeout=3600, + ) + + config = get_openhands_config_for_eval( + metadata=metadata, + sandbox_config=sandbox_config, + runtime=os.environ.get('RUNTIME', 'docker'), ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py index 513b02d612..ee768e7f6d 100644 --- a/evaluation/benchmarks/the_agent_company/run_infer.py +++ b/evaluation/benchmarks/the_agent_company/run_infer.py @@ -12,7 
+12,10 @@ import tempfile import yaml from browsing import pre_login -from evaluation.utils.shared import get_default_sandbox_config_for_eval +from evaluation.utils.shared import ( + get_default_sandbox_config_for_eval, + get_openhands_config_for_eval, +) from openhands.controller.state.state import State from openhands.core.config import ( LLMConfig, @@ -42,19 +45,17 @@ def get_config( sandbox_config.enable_auto_lint = True # If the web services are running on the host machine, this must be set to True sandbox_config.use_host_network = True - config = OpenHandsConfig( - run_as_openhands=False, - max_budget_per_task=4, + config = get_openhands_config_for_eval( max_iterations=100, - save_trajectory_path=os.path.join( - mount_path_on_host, f'traj_{task_short_name}.json' - ), - sandbox=sandbox_config, # we mount trajectories path so that trajectories, generated by OpenHands # controller, can be accessible to the evaluator file in the runtime container + sandbox_config=sandbox_config, workspace_mount_path=mount_path_on_host, - workspace_mount_path_in_sandbox='/outputs', ) + config.save_trajectory_path = os.path.join( + mount_path_on_host, f'traj_{task_short_name}.json' + ) + config.max_budget_per_task = 4 config.set_llm_config(llm_config) if agent_config: config.set_agent_config(agent_config) diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py index 8353f76f77..3fa078648d 100644 --- a/evaluation/benchmarks/toolqa/run_infer.py +++ b/evaluation/benchmarks/toolqa/run_infer.py @@ -12,6 +12,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -44,15 +45,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/visual_swe_bench/run_infer.py b/evaluation/benchmarks/visual_swe_bench/run_infer.py index 215f7933b1..ca096d9e19 100644 --- a/evaluation/benchmarks/visual_swe_bench/run_infer.py +++ b/evaluation/benchmarks/visual_swe_bench/run_infer.py @@ -20,6 +20,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -160,16 +161,11 @@ def get_config( instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, + config = get_openhands_config_for_eval( + metadata=metadata, enable_browser=RUN_WITH_BROWSING, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/visualwebarena/run_infer.py 
b/evaluation/benchmarks/visualwebarena/run_infer.py index ef45663e6a..7c0695d585 100644 --- a/evaluation/benchmarks/visualwebarena/run_infer.py +++ b/evaluation/benchmarks/visualwebarena/run_infer.py @@ -13,6 +13,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -73,16 +74,10 @@ def get_config( 'VWA_WIKIPEDIA': f'{base_url}:8888', 'VWA_HOMEPAGE': f'{base_url}:4399', } - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, - attach_to_existing=True, + sandbox_config=sandbox_config, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py index 316dba63de..273a137124 100644 --- a/evaluation/benchmarks/webarena/run_infer.py +++ b/evaluation/benchmarks/webarena/run_infer.py @@ -13,6 +13,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -65,15 +66,10 @@ def get_config( 'MAP': f'{base_url}:3000', 'HOMEPAGE': f'{base_url}:4399', } - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index c493cc173a..c2ccf54bc9 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -10,6 +10,7 @@ from evaluation.utils.shared import ( EvalOutput, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -45,18 +46,12 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.platform = 'linux/amd64' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime=os.environ.get('RUNTIME', 'docker'), - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, - # debug - debug=True, + sandbox_config=sandbox_config, ) + config.debug = True config.set_llm_config( update_llm_config_for_completions_logging( metadata.llm_config, metadata.eval_output_dir, instance_id diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 76ed563ec2..a3d9c125af 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -703,3 +703,79 @@ def get_default_sandbox_config_for_eval() -> SandboxConfig: remote_runtime_enable_retries=True, remote_runtime_class='sysbox', ) + + +def get_openhands_config_for_eval( 
+    metadata: EvalMetadata | None = None,
+    sandbox_config: SandboxConfig | None = None,
+    runtime: str | None = None,
+    max_iterations: int | None = None,
+    default_agent: str | None = None,
+    enable_browser: bool = False,
+    workspace_base: str | None = None,
+    workspace_mount_path: str | None = None,
+):
+    """Create an OpenHandsConfig with common patterns used across evaluation scripts.
+
+    This function provides a standardized way to create OpenHands configurations
+    for evaluation runs, with sensible defaults that match the patterns used in
+    most run_infer.py scripts. Individual evaluation scripts can override specific
+    attributes as needed.
+
+    Args:
+        metadata: EvalMetadata containing agent class, max iterations, etc.
+        sandbox_config: Custom sandbox config. If None, uses get_default_sandbox_config_for_eval()
+        runtime: Runtime type. If None, uses environment RUNTIME or 'docker'
+        max_iterations: Max iterations for the agent. If None, uses metadata.max_iterations
+        default_agent: Agent class name. If None, uses metadata.agent_class
+        enable_browser: Whether to enable browser functionality
+        workspace_base: Workspace base path. Defaults to None
+        workspace_mount_path: Workspace mount path. Defaults to None
+
+    Returns:
+        OpenHandsConfig: Configured for evaluation with eval-specific overrides applied
+    """
+    # Defer import to avoid circular imports at module load time
+    from openhands.core.config.openhands_config import (
+        OpenHandsConfig as _OHConfig,  # type: ignore
+    )
+
+    # Use provided sandbox config or get default
+    if sandbox_config is None:
+        sandbox_config = get_default_sandbox_config_for_eval()
+
+    # Extract values from metadata if provided
+    if metadata is not None:
+        if max_iterations is None:
+            max_iterations = metadata.max_iterations
+        if default_agent is None:
+            default_agent = metadata.agent_class
+
+    # Use environment runtime or default
+    if runtime is None:
+        runtime = os.environ.get('RUNTIME', 'docker')
+
+    # Provide sensible defaults if still None
+    if default_agent is None:
+        default_agent = 'CodeActAgent'
+    if max_iterations is None:
+        max_iterations = 50
+
+    # Always use repo-local .eval_sessions directory (absolute path)
+    eval_store = os.path.abspath(os.path.join(os.getcwd(), '.eval_sessions'))
+
+    # Create the base config with evaluation-specific overrides
+    config = _OHConfig(
+        default_agent=default_agent,
+        run_as_openhands=False,
+        runtime=runtime,
+        max_iterations=max_iterations,
+        enable_browser=enable_browser,
+        sandbox=sandbox_config,
+        workspace_base=workspace_base,
+        workspace_mount_path=workspace_mount_path,
+        file_store='local',
+        file_store_path=eval_store,
+    )
+
+    return config
diff --git a/tests/unit/evaluation/test_eval_file_store_path.py b/tests/unit/evaluation/test_eval_file_store_path.py
new file mode 100644
index 0000000000..1314612729
--- /dev/null
+++ b/tests/unit/evaluation/test_eval_file_store_path.py
@@ -0,0 +1,26 @@
+import os
+from pathlib import Path
+
+from evaluation.utils.shared import get_openhands_config_for_eval
+
+
+def test_eval_file_store_defaults_to_repo_local(tmp_path, monkeypatch):
+    prev_cwd = Path.cwd()
+    try:
+        os.chdir(tmp_path)
+        cfg = get_openhands_config_for_eval()
+        assert Path(cfg.file_store_path) == (tmp_path / '.eval_sessions').resolve()
+        assert cfg.file_store == 'local'
+    finally:
+        os.chdir(prev_cwd)
+
+
+def test_eval_file_store_is_hard_coded_repo_local(tmp_path):
+    prev_cwd = Path.cwd()
+    try:
+        os.chdir(tmp_path)
+        cfg = get_openhands_config_for_eval()
+        assert Path(cfg.file_store_path) == (tmp_path / '.eval_sessions').resolve()
+        assert cfg.file_store == 'local'
+    finally:
+        os.chdir(prev_cwd)
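
Usage sketch (reviewer note, not part of the diff): the refactored benchmarks above all converge on the same pattern around the new helper. A minimal get_config built on it would look roughly like the following; the import paths, the 'python:3.12-bookworm' image, and the per-benchmark tweaks mirror the files changed above and are illustrative rather than prescriptive.

import os

from evaluation.utils.shared import (
    EvalMetadata,
    get_default_sandbox_config_for_eval,
    get_openhands_config_for_eval,
)
from openhands.core.config import OpenHandsConfig


def get_config(metadata: EvalMetadata) -> OpenHandsConfig:
    # Start from the shared eval sandbox defaults, then pin the benchmark's image.
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'

    # The helper supplies default_agent, max_iterations, run_as_openhands=False,
    # the unmounted workspace, and the repo-local .eval_sessions file store.
    config = get_openhands_config_for_eval(
        metadata=metadata,
        runtime=os.environ.get('RUNTIME', 'docker'),
        sandbox_config=sandbox_config,
    )

    # Per-benchmark adjustments are still applied on the returned config, as before.
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
    agent_config.enable_prompt_extensions = False
    return config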