Evaluation: redirect sessions to repo-local .eval_sessions via helper; apply across entrypoints; add tests (#10540)

Co-authored-by: openhands <openhands@all-hands.dev>
Xingyao Wang 2025-08-22 09:34:02 -04:00 committed by GitHub
parent d9cf5b7302
commit 4507a25b85
36 changed files with 274 additions and 293 deletions

.gitignore
@@ -257,3 +257,5 @@ containers/runtime/code
 # test results
 test-results
+
+.eval_sessions

@@ -9,8 +9,8 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
-    get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -61,18 +61,15 @@ AGENT_CLS_TO_INST_SUFFIX = {
 def get_config(
     metadata: EvalMetadata,
 ) -> OpenHandsConfig:
-    sandbox_config = get_default_sandbox_config_for_eval()
-    sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    # Create config with EDA-specific container image
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
     )
+    # Override the container image for EDA
+    config.sandbox.base_container_image = 'python:3.12-bookworm'
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
     agent_config.enable_prompt_extensions = False

@@ -17,8 +17,8 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     compatibility_for_eval_history_pairs,
-    get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -41,19 +41,12 @@ from openhands.utils.async_utils import call_async_from_sync
 def get_config(
     metadata: EvalMetadata,
 ) -> OpenHandsConfig:
-    sandbox_config = get_default_sandbox_config_for_eval()
-    sandbox_config.base_container_image = 'python:3.12-slim'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        runtime=os.environ.get('RUNTIME', 'docker'),
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-    )
+    # Create config with agent_bench-specific container image
+    config = get_openhands_config_for_eval(metadata=metadata)
+    # Override the container image for agent_bench
+    config.sandbox.base_container_image = 'python:3.12-slim'
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
     agent_config.enable_prompt_extensions = False

@@ -18,6 +18,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -50,15 +51,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.11-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
+        sandbox_config=sandbox_config,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -61,15 +62,10 @@ def get_config(
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -19,6 +19,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -75,15 +76,10 @@ def get_config(
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -40,14 +41,8 @@ def get_config(
     )
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        workspace_base=None,
-        workspace_mount_path=None,
+    config = get_openhands_config_for_eval(
+        metadata=metadata, runtime='docker', sandbox_config=sandbox_config
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -114,16 +115,11 @@ def get_config(
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = base_container_image
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
-        enable_browser=RUN_WITH_BROWSING,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
+        sandbox_config=sandbox_config,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        enable_browser=RUN_WITH_BROWSING,
     )
     config.set_llm_config(
         update_llm_config_for_completions_logging(

@@ -18,6 +18,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -65,15 +66,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -23,6 +23,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -60,15 +61,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'nikolaik/python-nodejs:python3.12-nodejs22'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
+        sandbox_config=sandbox_config,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
     )
     config.set_llm_config(metadata.llm_config)
     if metadata.agent_config:

@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -43,15 +44,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -31,6 +31,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -64,15 +65,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -24,6 +24,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -85,15 +86,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -16,6 +16,7 @@ import ruamel.yaml
 from evaluation.utils.shared import (
     EvalMetadata,
     get_default_sandbox_config_for_eval,
+    get_openhands_config_for_eval,
     make_metadata,
 )
 from openhands.core.config import (
@@ -37,15 +38,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -23,6 +23,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -48,15 +49,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -52,15 +53,10 @@ def get_config(
         '$OH_INTERPRETER_PATH -m pip install scitools-pyke'
     )
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
    )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -14,6 +14,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -58,15 +59,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(
         update_llm_config_for_completions_logging(

@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -110,15 +111,10 @@ def get_config(
         f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
     )
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -27,6 +27,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -80,15 +81,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -23,6 +23,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     get_default_sandbox_config_for_eval,
+    get_openhands_config_for_eval,
     prepare_dataset,
     reset_logger_for_multiprocessing,
     run_evaluation,
@@ -87,13 +88,9 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig:
         dataset_name=metadata.dataset,
         instance_id=instance['instance_id'],
     )
-    config = OpenHandsConfig(
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     return config

@@ -21,6 +21,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     is_fatal_evaluation_error,
     make_metadata,
     prepare_dataset,
@@ -341,16 +342,11 @@ def get_config(
         instance_id=instance['instance_id'],
     )
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         enable_browser=RUN_WITH_BROWSING,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(
         update_llm_config_for_completions_logging(

@@ -31,6 +31,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     is_fatal_evaluation_error,
     make_metadata,
     prepare_dataset,
@@ -174,15 +175,10 @@ def get_config(
         instance_id=instance['instance_id'],
     )
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(

@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -64,16 +65,10 @@ def get_config(
     sandbox_config.base_container_image = (
         'docker.io/xingyaoww/openhands-eval-scienceagentbench'
     )
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        max_budget_per_task=4,
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(
         update_llm_config_for_completions_logging(

@@ -19,6 +19,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     get_default_sandbox_config_for_eval,
+    get_openhands_config_for_eval,
     prepare_dataset,
     reset_logger_for_multiprocessing,
     run_evaluation,
@@ -83,13 +84,9 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig:
         dataset_name=metadata.dataset,
         instance_id=instance['instance_id'],
     )
-    config = OpenHandsConfig(
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
        runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     return config

@@ -32,6 +32,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     is_fatal_evaluation_error,
     make_metadata,
     prepare_dataset,
@@ -227,16 +228,11 @@ def get_config(
         instance_id=instance['instance_id'],
     )
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         enable_browser=RUN_WITH_BROWSING,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(

@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     is_fatal_evaluation_error,
     make_metadata,
     prepare_dataset,
@@ -199,16 +200,11 @@ def get_config(
         'REPO_PATH': f'/workspace/{workspace_dir_name}/',
     }
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         enable_browser=RUN_WITH_BROWSING,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(
         update_llm_config_for_completions_logging(

@@ -37,6 +37,7 @@ from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    get_openhands_config_for_eval,
     prepare_dataset,
     reset_logger_for_multiprocessing,
     run_evaluation,
@@ -58,20 +59,21 @@ def get_config(instance: pd.Series) -> OpenHandsConfig:
             f'Invalid container image for instance {instance["instance_id_swebench"]}.'
         )
     logger.info(f'Using instance container image: {base_container_image}.')
-    return OpenHandsConfig(
-        run_as_openhands=False,
-        runtime=os.environ.get('RUNTIME', 'eventstream'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            use_host_network=False,
-            timeout=1800,
-            api_key=os.environ.get('ALLHANDS_API_KEY'),
-            remote_runtime_api_url=os.environ.get(
-                'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
-            ),
-        ),
-        workspace_base=None,
-        workspace_mount_path=None,
+    # Create custom sandbox config for testgeneval with specific requirements
+    sandbox_config = SandboxConfig(
+        base_container_image=base_container_image,
+        use_host_network=False,
+        timeout=1800,  # Longer timeout than default (300)
+        api_key=os.environ.get('ALLHANDS_API_KEY'),
+        remote_runtime_api_url=os.environ.get(
+            'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
+        ),
+    )
+
+    return get_openhands_config_for_eval(
+        sandbox_config=sandbox_config,
+        runtime=os.environ.get('RUNTIME', 'docker'),  # Different default runtime
     )

@@ -25,6 +25,7 @@ from evaluation.utils.shared import (
     assert_and_raise,
     codeact_user_response,
     get_metrics,
+    get_openhands_config_for_eval,
     is_fatal_evaluation_error,
     make_metadata,
     prepare_dataset,
@@ -126,29 +127,26 @@ def get_config(
         f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
     )
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
-        runtime=os.environ.get('RUNTIME', 'eventstream'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            # Add platform to the sandbox config to solve issue 4401
-            platform='linux/amd64',
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get(
-                'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
-            ),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-        ),
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+    sandbox_config = SandboxConfig(
+        base_container_image=base_container_image,
+        enable_auto_lint=True,
+        use_host_network=False,
+        # large enough timeout, since some testcases take very long to run
+        timeout=300,
+        # Add platform to the sandbox config to solve issue 4401
+        platform='linux/amd64',
+        api_key=os.environ.get('ALLHANDS_API_KEY', None),
+        remote_runtime_api_url=os.environ.get(
+            'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
+        ),
+        keep_runtime_alive=False,
+        remote_runtime_init_timeout=3600,
+    )
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
+        sandbox_config=sandbox_config,
+        runtime=os.environ.get('RUNTIME', 'docker'),
    )
     config.set_llm_config(
         update_llm_config_for_completions_logging(

@@ -12,7 +12,10 @@ import tempfile
 import yaml
 from browsing import pre_login
-from evaluation.utils.shared import get_default_sandbox_config_for_eval
+from evaluation.utils.shared import (
+    get_default_sandbox_config_for_eval,
+    get_openhands_config_for_eval,
+)
 from openhands.controller.state.state import State
 from openhands.core.config import (
     LLMConfig,
@@ -42,19 +45,17 @@ def get_config(
     sandbox_config.enable_auto_lint = True
     # If the web services are running on the host machine, this must be set to True
     sandbox_config.use_host_network = True
-    config = OpenHandsConfig(
-        run_as_openhands=False,
-        max_budget_per_task=4,
+    config = get_openhands_config_for_eval(
         max_iterations=100,
-        save_trajectory_path=os.path.join(
-            mount_path_on_host, f'traj_{task_short_name}.json'
-        ),
-        sandbox=sandbox_config,
         # we mount trajectories path so that trajectories, generated by OpenHands
         # controller, can be accessible to the evaluator file in the runtime container
+        sandbox_config=sandbox_config,
         workspace_mount_path=mount_path_on_host,
-        workspace_mount_path_in_sandbox='/outputs',
     )
+    config.save_trajectory_path = os.path.join(
+        mount_path_on_host, f'traj_{task_short_name}.json'
+    )
+    config.max_budget_per_task = 4
     config.set_llm_config(llm_config)
     if agent_config:
         config.set_agent_config(agent_config)

@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -44,15 +45,10 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.base_container_image = 'python:3.12-bookworm'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -20,6 +20,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     is_fatal_evaluation_error,
     make_metadata,
     prepare_dataset,
@@ -160,16 +161,11 @@ def get_config(
         instance_id=instance['instance_id'],
     )
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
-        max_iterations=metadata.max_iterations,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         enable_browser=RUN_WITH_BROWSING,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(
         update_llm_config_for_completions_logging(

@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -73,16 +74,10 @@ def get_config(
         'VWA_WIKIPEDIA': f'{base_url}:8888',
         'VWA_HOMEPAGE': f'{base_url}:4399',
     }
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-        attach_to_existing=True,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(
         update_llm_config_for_completions_logging(

@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -65,15 +66,10 @@ def get_config(
         'MAP': f'{base_url}:3000',
         'HOMEPAGE': f'{base_url}:4399',
     }
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime='docker',
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
+        sandbox_config=sandbox_config,
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)

@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     get_default_sandbox_config_for_eval,
     get_metrics,
+    get_openhands_config_for_eval,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -45,18 +46,12 @@ def get_config(
 ) -> OpenHandsConfig:
     sandbox_config = get_default_sandbox_config_for_eval()
     sandbox_config.platform = 'linux/amd64'
-    config = OpenHandsConfig(
-        default_agent=metadata.agent_class,
-        run_as_openhands=False,
+    config = get_openhands_config_for_eval(
+        metadata=metadata,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        max_iterations=metadata.max_iterations,
-        sandbox=sandbox_config,
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-        # debug
-        debug=True,
+        sandbox_config=sandbox_config,
     )
+    config.debug = True
     config.set_llm_config(
         update_llm_config_for_completions_logging(
             metadata.llm_config, metadata.eval_output_dir, instance_id

@@ -703,3 +703,79 @@ def get_default_sandbox_config_for_eval() -> SandboxConfig:
         remote_runtime_enable_retries=True,
         remote_runtime_class='sysbox',
     )
+
+
+def get_openhands_config_for_eval(
+    metadata: EvalMetadata | None = None,
+    sandbox_config: SandboxConfig | None = None,
+    runtime: str | None = None,
+    max_iterations: int | None = None,
+    default_agent: str | None = None,
+    enable_browser: bool = False,
+    workspace_base: str | None = None,
+    workspace_mount_path: str | None = None,
+):
+    """Create an OpenHandsConfig with common patterns used across evaluation scripts.
+
+    This function provides a standardized way to create OpenHands configurations
+    for evaluation runs, with sensible defaults that match the patterns used in
+    most run_infer.py scripts. Individual evaluation scripts can override specific
+    attributes as needed.
+
+    Args:
+        metadata: EvalMetadata containing agent class, max iterations, etc.
+        sandbox_config: Custom sandbox config. If None, uses get_default_sandbox_config_for_eval()
+        runtime: Runtime type. If None, uses environment RUNTIME or 'docker'
+        max_iterations: Max iterations for the agent. If None, uses metadata.max_iterations
+        default_agent: Agent class name. If None, uses metadata.agent_class
+        enable_browser: Whether to enable browser functionality
+        workspace_base: Workspace base path. Defaults to None
+        workspace_mount_path: Workspace mount path. Defaults to None
+
+    Returns:
+        OpenHandsConfig: Configured for evaluation with eval-specific overrides applied
+    """
+    # Defer import to avoid circular imports at module load time
+    from openhands.core.config.openhands_config import (
+        OpenHandsConfig as _OHConfig,  # type: ignore
+    )
+
+    # Use provided sandbox config or get default
+    if sandbox_config is None:
+        sandbox_config = get_default_sandbox_config_for_eval()
+
+    # Extract values from metadata if provided
+    if metadata is not None:
+        if max_iterations is None:
+            max_iterations = metadata.max_iterations
+        if default_agent is None:
+            default_agent = metadata.agent_class
+
+    # Use environment runtime or default
+    if runtime is None:
+        runtime = os.environ.get('RUNTIME', 'docker')
+
+    # Provide sensible defaults if still None
+    if default_agent is None:
+        default_agent = 'CodeActAgent'
+    if max_iterations is None:
+        max_iterations = 50
+
+    # Always use repo-local .eval_sessions directory (absolute path)
+    eval_store = os.path.abspath(os.path.join(os.getcwd(), '.eval_sessions'))
+
+    # Create the base config with evaluation-specific overrides
+    config = _OHConfig(
+        default_agent=default_agent,
+        run_as_openhands=False,
+        runtime=runtime,
+        max_iterations=max_iterations,
+        enable_browser=enable_browser,
+        sandbox=sandbox_config,
+        workspace_base=workspace_base,
+        workspace_mount_path=workspace_mount_path,
+        file_store='local',
+        file_store_path=eval_store,
+    )
+
+    return config
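For reference, a minimal usage sketch of the new helper (illustrative only, not part of this commit). It mirrors the pattern the updated run_infer.py entrypoints above follow, and assumes an EvalMetadata instance `metadata` has already been built via make_metadata(...). Explicit keyword arguments take precedence over metadata-derived values, and script-specific attributes can still be set on the returned config afterwards.

from evaluation.utils.shared import (
    get_default_sandbox_config_for_eval,
    get_openhands_config_for_eval,
)

# Customize the sandbox, then let the helper apply the shared eval defaults:
# run_as_openhands=False, no mounted workspace, and a repo-local
# .eval_sessions file store (file_store='local').
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.12-bookworm'

config = get_openhands_config_for_eval(
    metadata=metadata,  # assumption: an EvalMetadata from make_metadata(...)
    sandbox_config=sandbox_config,
    runtime='docker',
)

# Per-script overrides still work on the returned config.
config.sandbox.base_container_image = 'python:3.12-slim'
assert config.file_store == 'local'
assert config.file_store_path.endswith('.eval_sessions')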

@@ -0,0 +1,26 @@
+import os
+from pathlib import Path
+
+from evaluation.utils.shared import get_openhands_config_for_eval
+
+
+def test_eval_file_store_defaults_to_repo_local(tmp_path, monkeypatch):
+    prev_cwd = Path.cwd()
+    try:
+        os.chdir(tmp_path)
+        cfg = get_openhands_config_for_eval()
+        assert Path(cfg.file_store_path) == (tmp_path / '.eval_sessions').resolve()
+        assert cfg.file_store == 'local'
+    finally:
+        os.chdir(prev_cwd)
+
+
+def test_eval_file_store_is_hard_coded_repo_local(tmp_path):
+    prev_cwd = Path.cwd()
+    try:
+        os.chdir(tmp_path)
+        cfg = get_openhands_config_for_eval()
+        assert Path(cfg.file_store_path) == (tmp_path / '.eval_sessions').resolve()
+        assert cfg.file_store == 'local'
+    finally:
+        os.chdir(prev_cwd)
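A quick local check of the redirect (illustrative only; the directory layout inside .eval_sessions is not specified by this commit): after running any of the updated run_infer.py entrypoints from the repository root, session data should land in the repo-local store rather than under the user's home directory.

import os

# Same path the helper computes from os.getcwd() at config-creation time.
eval_store = os.path.abspath('.eval_sessions')
if os.path.isdir(eval_store):
    print(f'{eval_store} contains {len(os.listdir(eval_store))} entries')
else:
    print(f'{eval_store} does not exist yet (no eval has been run from this directory)')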