diff --git a/.gitignore b/.gitignore index f61f3ced52..f9fb4dc408 100644 --- a/.gitignore +++ b/.gitignore @@ -257,3 +257,5 @@ containers/runtime/code # test results test-results + +.eval_sessions diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py index ff25172722..ed9c77ee7e 100644 --- a/evaluation/benchmarks/EDA/run_infer.py +++ b/evaluation/benchmarks/EDA/run_infer.py @@ -9,8 +9,8 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, - get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -61,18 +61,15 @@ AGENT_CLS_TO_INST_SUFFIX = { def get_config( metadata: EvalMetadata, ) -> OpenHandsConfig: - sandbox_config = get_default_sandbox_config_for_eval() - sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + # Create config with EDA-specific container image + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, ) + + # Override the container image for EDA + config.sandbox.base_container_image = 'python:3.12-bookworm' + config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py index 562df0024a..6a112c6abb 100644 --- a/evaluation/benchmarks/agent_bench/run_infer.py +++ b/evaluation/benchmarks/agent_bench/run_infer.py @@ -17,8 +17,8 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, compatibility_for_eval_history_pairs, - get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -41,19 +41,12 @@ from openhands.utils.async_utils import call_async_from_sync def get_config( metadata: EvalMetadata, ) -> OpenHandsConfig: - sandbox_config = get_default_sandbox_config_for_eval() - sandbox_config.base_container_image = 'python:3.12-slim' + # Create config with agent_bench-specific container image + config = get_openhands_config_for_eval(metadata=metadata) + + # Override the container image for agent_bench + config.sandbox.base_container_image = 'python:3.12-slim' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'docker'), - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, - ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) agent_config.enable_prompt_extensions = False diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index 338315747d..f9eb65a9f8 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -18,6 +18,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -50,15 +51,10 @@ def get_config( ) -> OpenHandsConfig: 
sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.11-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, + sandbox_config=sandbox_config, runtime=os.environ.get('RUNTIME', 'docker'), - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py index 3dbc632a1b..39a01e2e4f 100644 --- a/evaluation/benchmarks/biocoder/run_infer.py +++ b/evaluation/benchmarks/biocoder/run_infer.py @@ -16,6 +16,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -61,15 +62,10 @@ def get_config( sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py index 71886a0406..2f04011cc8 100644 --- a/evaluation/benchmarks/bird/run_infer.py +++ b/evaluation/benchmarks/bird/run_infer.py @@ -19,6 +19,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -75,15 +76,10 @@ def get_config( sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py index e6eb259d86..9f59817aed 100644 --- a/evaluation/benchmarks/browsing_delegation/run_infer.py +++ b/evaluation/benchmarks/browsing_delegation/run_infer.py @@ -12,6 +12,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -40,14 +41,8 @@ def get_config( ) sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - 
runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - workspace_base=None, - workspace_mount_path=None, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', sandbox_config=sandbox_config ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/commit0/run_infer.py b/evaluation/benchmarks/commit0/run_infer.py index bad4a735cd..fb125498c3 100644 --- a/evaluation/benchmarks/commit0/run_infer.py +++ b/evaluation/benchmarks/commit0/run_infer.py @@ -17,6 +17,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -114,16 +115,11 @@ def get_config( sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = base_container_image - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, - enable_browser=RUN_WITH_BROWSING, + config = get_openhands_config_for_eval( + metadata=metadata, + sandbox_config=sandbox_config, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + enable_browser=RUN_WITH_BROWSING, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py index 448bc18e7b..1ffacc4543 100644 --- a/evaluation/benchmarks/discoverybench/run_infer.py +++ b/evaluation/benchmarks/discoverybench/run_infer.py @@ -18,6 +18,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -65,15 +66,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py index a7e69489c3..480df59bd2 100644 --- a/evaluation/benchmarks/gaia/run_infer.py +++ b/evaluation/benchmarks/gaia/run_infer.py @@ -23,6 +23,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -60,15 +61,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'nikolaik/python-nodejs:python3.12-nodejs22' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, + sandbox_config=sandbox_config, runtime='docker', - max_iterations=metadata.max_iterations, - 
sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, ) config.set_llm_config(metadata.llm_config) if metadata.agent_config: diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py index 87a71b4f00..932b0f6f75 100644 --- a/evaluation/benchmarks/gorilla/run_infer.py +++ b/evaluation/benchmarks/gorilla/run_infer.py @@ -13,6 +13,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -43,15 +44,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py index 28ac39588a..ce1cc3c8d0 100644 --- a/evaluation/benchmarks/gpqa/run_infer.py +++ b/evaluation/benchmarks/gpqa/run_infer.py @@ -31,6 +31,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -64,15 +65,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py index 8cf8b7efa1..7b882eab88 100644 --- a/evaluation/benchmarks/humanevalfix/run_infer.py +++ b/evaluation/benchmarks/humanevalfix/run_infer.py @@ -24,6 +24,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -85,15 +86,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git 
a/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py b/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py index 2aad6fb1b5..9363651545 100644 --- a/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py +++ b/evaluation/benchmarks/lca_ci_build_repair/eval_infer.py @@ -16,6 +16,7 @@ import ruamel.yaml from evaluation.utils.shared import ( EvalMetadata, get_default_sandbox_config_for_eval, + get_openhands_config_for_eval, make_metadata, ) from openhands.core.config import ( @@ -37,15 +38,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/lca_ci_build_repair/run_infer.py b/evaluation/benchmarks/lca_ci_build_repair/run_infer.py index cbb53259ba..e6938301e0 100644 --- a/evaluation/benchmarks/lca_ci_build_repair/run_infer.py +++ b/evaluation/benchmarks/lca_ci_build_repair/run_infer.py @@ -23,6 +23,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -48,15 +49,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py index 23fa4e1d88..86fe82e6db 100644 --- a/evaluation/benchmarks/logic_reasoning/run_infer.py +++ b/evaluation/benchmarks/logic_reasoning/run_infer.py @@ -11,6 +11,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -52,15 +53,10 @@ def get_config( '$OH_INTERPRETER_PATH -m pip install scitools-pyke' ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py index ef2fdc1412..a0e24a1bbb 100644 --- a/evaluation/benchmarks/miniwob/run_infer.py +++ 
b/evaluation/benchmarks/miniwob/run_infer.py @@ -14,6 +14,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -58,15 +59,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime=os.environ.get('RUNTIME', 'docker'), - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py index 72031e8d7b..890c4e5c0a 100644 --- a/evaluation/benchmarks/mint/run_infer.py +++ b/evaluation/benchmarks/mint/run_infer.py @@ -16,6 +16,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -110,15 +111,10 @@ def get_config( f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}' ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py index 32a8f76ca9..48e3d6785a 100644 --- a/evaluation/benchmarks/ml_bench/run_infer.py +++ b/evaluation/benchmarks/ml_bench/run_infer.py @@ -27,6 +27,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -80,15 +81,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/multi_swe_bench/eval_infer.py b/evaluation/benchmarks/multi_swe_bench/eval_infer.py index 74364f2fe5..22fdcc764b 100644 --- a/evaluation/benchmarks/multi_swe_bench/eval_infer.py +++ b/evaluation/benchmarks/multi_swe_bench/eval_infer.py @@ -23,6 +23,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, get_default_sandbox_config_for_eval, + get_openhands_config_for_eval, prepare_dataset, reset_logger_for_multiprocessing, 
run_evaluation, @@ -87,13 +88,9 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig: dataset_name=metadata.dataset, instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - run_as_openhands=False, + config = get_openhands_config_for_eval( runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) return config diff --git a/evaluation/benchmarks/multi_swe_bench/run_infer.py b/evaluation/benchmarks/multi_swe_bench/run_infer.py index ca33f65298..db6524a8d5 100644 --- a/evaluation/benchmarks/multi_swe_bench/run_infer.py +++ b/evaluation/benchmarks/multi_swe_bench/run_infer.py @@ -21,6 +21,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -341,16 +342,11 @@ def get_config( instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, + config = get_openhands_config_for_eval( + metadata=metadata, enable_browser=RUN_WITH_BROWSING, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/nocode_bench/run_infer_nc.py b/evaluation/benchmarks/nocode_bench/run_infer_nc.py index 0022ed484d..3c3d40bdfc 100644 --- a/evaluation/benchmarks/nocode_bench/run_infer_nc.py +++ b/evaluation/benchmarks/nocode_bench/run_infer_nc.py @@ -31,6 +31,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -174,15 +175,10 @@ def get_config( instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, + config = get_openhands_config_for_eval( + metadata=metadata, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py index 76b937308b..beabf3783f 100644 --- a/evaluation/benchmarks/scienceagentbench/run_infer.py +++ b/evaluation/benchmarks/scienceagentbench/run_infer.py @@ -13,6 +13,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -64,16 +65,10 @@ def get_config( sandbox_config.base_container_image = ( 'docker.io/xingyaoww/openhands-eval-scienceagentbench' ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime=os.environ.get('RUNTIME', 'docker'), - max_budget_per_task=4, - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) 
config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py index 2fdc3e2e2f..46f3629be8 100644 --- a/evaluation/benchmarks/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -19,6 +19,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, get_default_sandbox_config_for_eval, + get_openhands_config_for_eval, prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, @@ -83,13 +84,9 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig: dataset_name=metadata.dataset, instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - run_as_openhands=False, + config = get_openhands_config_for_eval( runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) return config diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index 4413f07d3b..3dbed38dbd 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -32,6 +32,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -227,16 +228,11 @@ def get_config( instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, + config = get_openhands_config_for_eval( + metadata=metadata, enable_browser=RUN_WITH_BROWSING, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( diff --git a/evaluation/benchmarks/swe_bench/run_localize.py b/evaluation/benchmarks/swe_bench/run_localize.py index 0c34991577..2f7f09912a 100644 --- a/evaluation/benchmarks/swe_bench/run_localize.py +++ b/evaluation/benchmarks/swe_bench/run_localize.py @@ -20,6 +20,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -199,16 +200,11 @@ def get_config( 'REPO_PATH': f'/workspace/{workspace_dir_name}/', } - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, + config = get_openhands_config_for_eval( + metadata=metadata, enable_browser=RUN_WITH_BROWSING, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/testgeneval/eval_infer.py b/evaluation/benchmarks/testgeneval/eval_infer.py index 27aaaee1ac..1004a2d55f 100644 --- a/evaluation/benchmarks/testgeneval/eval_infer.py +++ b/evaluation/benchmarks/testgeneval/eval_infer.py @@ -37,6 +37,7 @@ from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + get_openhands_config_for_eval, prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, @@ -58,20 +59,21 
@@ def get_config(instance: pd.Series) -> OpenHandsConfig: f'Invalid container image for instance {instance["instance_id_swebench"]}.' ) logger.info(f'Using instance container image: {base_container_image}.') - return OpenHandsConfig( - run_as_openhands=False, - runtime=os.environ.get('RUNTIME', 'eventstream'), - sandbox=SandboxConfig( - base_container_image=base_container_image, - use_host_network=False, - timeout=1800, - api_key=os.environ.get('ALLHANDS_API_KEY'), - remote_runtime_api_url=os.environ.get( - 'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000' - ), + + # Create custom sandbox config for testgeneval with specific requirements + sandbox_config = SandboxConfig( + base_container_image=base_container_image, + use_host_network=False, + timeout=1800, # Longer timeout than default (300) + api_key=os.environ.get('ALLHANDS_API_KEY'), + remote_runtime_api_url=os.environ.get( + 'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000' ), - workspace_base=None, - workspace_mount_path=None, + ) + + return get_openhands_config_for_eval( + sandbox_config=sandbox_config, + runtime=os.environ.get('RUNTIME', 'docker'), # Different default runtime ) diff --git a/evaluation/benchmarks/testgeneval/run_infer.py b/evaluation/benchmarks/testgeneval/run_infer.py index 39288ff537..c8171cca94 100644 --- a/evaluation/benchmarks/testgeneval/run_infer.py +++ b/evaluation/benchmarks/testgeneval/run_infer.py @@ -25,6 +25,7 @@ from evaluation.utils.shared import ( assert_and_raise, codeact_user_response, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -126,29 +127,26 @@ def get_config( f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.' ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, - runtime=os.environ.get('RUNTIME', 'eventstream'), - sandbox=SandboxConfig( - base_container_image=base_container_image, - enable_auto_lint=True, - use_host_network=False, - # large enough timeout, since some testcases take very long to run - timeout=300, - # Add platform to the sandbox config to solve issue 4401 - platform='linux/amd64', - api_key=os.environ.get('ALLHANDS_API_KEY', None), - remote_runtime_api_url=os.environ.get( - 'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000' - ), - keep_runtime_alive=False, - remote_runtime_init_timeout=3600, + sandbox_config = SandboxConfig( + base_container_image=base_container_image, + enable_auto_lint=True, + use_host_network=False, + # large enough timeout, since some testcases take very long to run + timeout=300, + # Add platform to the sandbox config to solve issue 4401 + platform='linux/amd64', + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get( + 'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000' ), - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + keep_runtime_alive=False, + remote_runtime_init_timeout=3600, + ) + + config = get_openhands_config_for_eval( + metadata=metadata, + sandbox_config=sandbox_config, + runtime=os.environ.get('RUNTIME', 'docker'), ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py index 513b02d612..ee768e7f6d 100644 --- a/evaluation/benchmarks/the_agent_company/run_infer.py +++ b/evaluation/benchmarks/the_agent_company/run_infer.py @@ -12,7 
+12,10 @@ import tempfile import yaml from browsing import pre_login -from evaluation.utils.shared import get_default_sandbox_config_for_eval +from evaluation.utils.shared import ( + get_default_sandbox_config_for_eval, + get_openhands_config_for_eval, +) from openhands.controller.state.state import State from openhands.core.config import ( LLMConfig, @@ -42,19 +45,17 @@ def get_config( sandbox_config.enable_auto_lint = True # If the web services are running on the host machine, this must be set to True sandbox_config.use_host_network = True - config = OpenHandsConfig( - run_as_openhands=False, - max_budget_per_task=4, + config = get_openhands_config_for_eval( max_iterations=100, - save_trajectory_path=os.path.join( - mount_path_on_host, f'traj_{task_short_name}.json' - ), - sandbox=sandbox_config, # we mount trajectories path so that trajectories, generated by OpenHands # controller, can be accessible to the evaluator file in the runtime container + sandbox_config=sandbox_config, workspace_mount_path=mount_path_on_host, - workspace_mount_path_in_sandbox='/outputs', ) + config.save_trajectory_path = os.path.join( + mount_path_on_host, f'traj_{task_short_name}.json' + ) + config.max_budget_per_task = 4 config.set_llm_config(llm_config) if agent_config: config.set_agent_config(agent_config) diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py index 8353f76f77..3fa078648d 100644 --- a/evaluation/benchmarks/toolqa/run_infer.py +++ b/evaluation/benchmarks/toolqa/run_infer.py @@ -12,6 +12,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -44,15 +45,10 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.base_container_image = 'python:3.12-bookworm' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/benchmarks/visual_swe_bench/run_infer.py b/evaluation/benchmarks/visual_swe_bench/run_infer.py index 215f7933b1..ca096d9e19 100644 --- a/evaluation/benchmarks/visual_swe_bench/run_infer.py +++ b/evaluation/benchmarks/visual_swe_bench/run_infer.py @@ -20,6 +20,7 @@ from evaluation.utils.shared import ( codeact_user_response, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, is_fatal_evaluation_error, make_metadata, prepare_dataset, @@ -160,16 +161,11 @@ def get_config( instance_id=instance['instance_id'], ) - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, - max_iterations=metadata.max_iterations, + config = get_openhands_config_for_eval( + metadata=metadata, enable_browser=RUN_WITH_BROWSING, runtime=os.environ.get('RUNTIME', 'docker'), - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/visualwebarena/run_infer.py 
b/evaluation/benchmarks/visualwebarena/run_infer.py index ef45663e6a..7c0695d585 100644 --- a/evaluation/benchmarks/visualwebarena/run_infer.py +++ b/evaluation/benchmarks/visualwebarena/run_infer.py @@ -13,6 +13,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -73,16 +74,10 @@ def get_config( 'VWA_WIKIPEDIA': f'{base_url}:8888', 'VWA_HOMEPAGE': f'{base_url}:4399', } - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, - attach_to_existing=True, + sandbox_config=sandbox_config, ) config.set_llm_config( update_llm_config_for_completions_logging( diff --git a/evaluation/benchmarks/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py index 316dba63de..273a137124 100644 --- a/evaluation/benchmarks/webarena/run_infer.py +++ b/evaluation/benchmarks/webarena/run_infer.py @@ -13,6 +13,7 @@ from evaluation.utils.shared import ( compatibility_for_eval_history_pairs, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -65,15 +66,10 @@ def get_config( 'MAP': f'{base_url}:3000', 'HOMEPAGE': f'{base_url}:4399', } - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime='docker', - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, + sandbox_config=sandbox_config, ) config.set_llm_config(metadata.llm_config) agent_config = config.get_agent_config(metadata.agent_class) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index c493cc173a..c2ccf54bc9 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -10,6 +10,7 @@ from evaluation.utils.shared import ( EvalOutput, get_default_sandbox_config_for_eval, get_metrics, + get_openhands_config_for_eval, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -45,18 +46,12 @@ def get_config( ) -> OpenHandsConfig: sandbox_config = get_default_sandbox_config_for_eval() sandbox_config.platform = 'linux/amd64' - config = OpenHandsConfig( - default_agent=metadata.agent_class, - run_as_openhands=False, + config = get_openhands_config_for_eval( + metadata=metadata, runtime=os.environ.get('RUNTIME', 'docker'), - max_iterations=metadata.max_iterations, - sandbox=sandbox_config, - # do not mount workspace - workspace_base=None, - workspace_mount_path=None, - # debug - debug=True, + sandbox_config=sandbox_config, ) + config.debug = True config.set_llm_config( update_llm_config_for_completions_logging( metadata.llm_config, metadata.eval_output_dir, instance_id diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 76ed563ec2..a3d9c125af 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -703,3 +703,79 @@ def get_default_sandbox_config_for_eval() -> SandboxConfig: remote_runtime_enable_retries=True, remote_runtime_class='sysbox', ) + + +def get_openhands_config_for_eval( 
+    metadata: EvalMetadata | None = None,
+    sandbox_config: SandboxConfig | None = None,
+    runtime: str | None = None,
+    max_iterations: int | None = None,
+    default_agent: str | None = None,
+    enable_browser: bool = False,
+    workspace_base: str | None = None,
+    workspace_mount_path: str | None = None,
+):
+    """Create an OpenHandsConfig with common patterns used across evaluation scripts.
+
+    This function provides a standardized way to create OpenHands configurations
+    for evaluation runs, with sensible defaults that match the patterns used in
+    most run_infer.py scripts. Individual evaluation scripts can override specific
+    attributes as needed.
+
+    Args:
+        metadata: EvalMetadata containing agent class, max iterations, etc.
+        sandbox_config: Custom sandbox config. If None, uses get_default_sandbox_config_for_eval()
+        runtime: Runtime type. If None, uses environment RUNTIME or 'docker'
+        max_iterations: Max iterations for the agent. If None, uses metadata.max_iterations
+        default_agent: Agent class name. If None, uses metadata.agent_class
+        enable_browser: Whether to enable browser functionality
+        workspace_base: Workspace base path. Defaults to None
+        workspace_mount_path: Workspace mount path. Defaults to None
+
+    Returns:
+        OpenHandsConfig: Configured for evaluation with eval-specific overrides applied
+    """
+    # Defer import to avoid circular imports at module load time
+    from openhands.core.config.openhands_config import (
+        OpenHandsConfig as _OHConfig,  # type: ignore
+    )
+
+    # Use provided sandbox config or get default
+    if sandbox_config is None:
+        sandbox_config = get_default_sandbox_config_for_eval()
+
+    # Extract values from metadata if provided
+    if metadata is not None:
+        if max_iterations is None:
+            max_iterations = metadata.max_iterations
+        if default_agent is None:
+            default_agent = metadata.agent_class
+
+    # Use environment runtime or default
+    if runtime is None:
+        runtime = os.environ.get('RUNTIME', 'docker')
+
+    # Provide sensible defaults if still None
+    if default_agent is None:
+        default_agent = 'CodeActAgent'
+    if max_iterations is None:
+        max_iterations = 50
+
+    # Always use repo-local .eval_sessions directory (absolute path)
+    eval_store = os.path.abspath(os.path.join(os.getcwd(), '.eval_sessions'))
+
+    # Create the base config with evaluation-specific overrides
+    config = _OHConfig(
+        default_agent=default_agent,
+        run_as_openhands=False,
+        runtime=runtime,
+        max_iterations=max_iterations,
+        enable_browser=enable_browser,
+        sandbox=sandbox_config,
+        workspace_base=workspace_base,
+        workspace_mount_path=workspace_mount_path,
+        file_store='local',
+        file_store_path=eval_store,
+    )
+
+    return config
diff --git a/tests/unit/evaluation/test_eval_file_store_path.py b/tests/unit/evaluation/test_eval_file_store_path.py
new file mode 100644
index 0000000000..1314612729
--- /dev/null
+++ b/tests/unit/evaluation/test_eval_file_store_path.py
@@ -0,0 +1,26 @@
+import os
+from pathlib import Path
+
+from evaluation.utils.shared import get_openhands_config_for_eval
+
+
+def test_eval_file_store_defaults_to_repo_local(tmp_path, monkeypatch):
+    prev_cwd = Path.cwd()
+    try:
+        os.chdir(tmp_path)
+        cfg = get_openhands_config_for_eval()
+        assert Path(cfg.file_store_path) == (tmp_path / '.eval_sessions').resolve()
+        assert cfg.file_store == 'local'
+    finally:
+        os.chdir(prev_cwd)
+
+
+def test_eval_file_store_is_hard_coded_repo_local(tmp_path):
+    prev_cwd = Path.cwd()
+    try:
+        os.chdir(tmp_path)
+        cfg = get_openhands_config_for_eval()
+        assert Path(cfg.file_store_path) == (tmp_path / '.eval_sessions').resolve()
+        assert cfg.file_store == 'local'
+    finally:
+        os.chdir(prev_cwd)
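
Usage sketch (reviewer note, not part of the diff): the refactored benchmarks above all converge on the same pattern around the new helper. A minimal get_config built on it would look roughly like the following; the import paths, the 'python:3.12-bookworm' image, and the per-benchmark tweaks mirror the files changed above and are illustrative rather than prescriptive.

import os

from evaluation.utils.shared import (
    EvalMetadata,
    get_default_sandbox_config_for_eval,
    get_openhands_config_for_eval,
)
from openhands.core.config import OpenHandsConfig


def get_config(metadata: EvalMetadata) -> OpenHandsConfig:
    # Start from the shared eval sandbox defaults, then pin the benchmark's image.
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.12-bookworm'

    # The helper supplies default_agent, max_iterations, run_as_openhands=False,
    # the unmounted workspace, and the repo-local .eval_sessions file store.
    config = get_openhands_config_for_eval(
        metadata=metadata,
        runtime=os.environ.get('RUNTIME', 'docker'),
        sandbox_config=sandbox_config,
    )

    # Per-benchmark adjustments are still applied on the returned config, as before.
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
    agent_config.enable_prompt_extensions = False
    return config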