mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
Evaluation: redirect sessions to repo-local .eval_sessions via helper; apply across entrypoints; add tests (#10540)
Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
parent
d9cf5b7302
commit
4507a25b85
2
.gitignore
vendored
2
.gitignore
vendored
@ -257,3 +257,5 @@ containers/runtime/code
|
|||||||
|
|
||||||
# test results
|
# test results
|
||||||
test-results
|
test-results
|
||||||
|
|
||||||
|
.eval_sessions
|
||||||
|
|||||||
@ -9,8 +9,8 @@ from evaluation.utils.shared import (
|
|||||||
EvalMetadata,
|
EvalMetadata,
|
||||||
EvalOutput,
|
EvalOutput,
|
||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -61,18 +61,15 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
|||||||
def get_config(
|
def get_config(
|
||||||
metadata: EvalMetadata,
|
metadata: EvalMetadata,
|
||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
# Create config with EDA-specific container image
|
||||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
config = get_openhands_config_for_eval(
|
||||||
config = OpenHandsConfig(
|
metadata=metadata,
|
||||||
default_agent=metadata.agent_class,
|
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Override the container image for EDA
|
||||||
|
config.sandbox.base_container_image = 'python:3.12-bookworm'
|
||||||
|
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
agent_config.enable_prompt_extensions = False
|
agent_config.enable_prompt_extensions = False
|
||||||
|
|||||||
@ -17,8 +17,8 @@ from evaluation.utils.shared import (
|
|||||||
EvalMetadata,
|
EvalMetadata,
|
||||||
EvalOutput,
|
EvalOutput,
|
||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -41,19 +41,12 @@ from openhands.utils.async_utils import call_async_from_sync
|
|||||||
def get_config(
|
def get_config(
|
||||||
metadata: EvalMetadata,
|
metadata: EvalMetadata,
|
||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
# Create config with agent_bench-specific container image
|
||||||
sandbox_config.base_container_image = 'python:3.12-slim'
|
config = get_openhands_config_for_eval(metadata=metadata)
|
||||||
|
|
||||||
|
# Override the container image for agent_bench
|
||||||
|
config.sandbox.base_container_image = 'python:3.12-slim'
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
|
||||||
default_agent=metadata.agent_class,
|
|
||||||
run_as_openhands=False,
|
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
agent_config.enable_prompt_extensions = False
|
agent_config.enable_prompt_extensions = False
|
||||||
|
|||||||
@ -18,6 +18,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -50,15 +51,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'python:3.11-bookworm'
|
sandbox_config.base_container_image = 'python:3.11-bookworm'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
sandbox_config=sandbox_config,
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -16,6 +16,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -61,15 +62,10 @@ def get_config(
|
|||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE
|
sandbox_config.base_container_image = BIOCODER_BENCH_CONTAINER_IMAGE
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -19,6 +19,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -75,15 +76,10 @@ def get_config(
|
|||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -12,6 +12,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -40,14 +41,8 @@ def get_config(
|
|||||||
)
|
)
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata, runtime='docker', sandbox_config=sandbox_config
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
sandbox=sandbox_config,
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -17,6 +17,7 @@ from evaluation.utils.shared import (
|
|||||||
codeact_user_response,
|
codeact_user_response,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -114,16 +115,11 @@ def get_config(
|
|||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = base_container_image
|
sandbox_config.base_container_image = base_container_image
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
sandbox_config=sandbox_config,
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
enable_browser=RUN_WITH_BROWSING,
|
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
sandbox=sandbox_config,
|
enable_browser=RUN_WITH_BROWSING,
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
update_llm_config_for_completions_logging(
|
update_llm_config_for_completions_logging(
|
||||||
|
|||||||
@ -18,6 +18,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -65,15 +66,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -23,6 +23,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -60,15 +61,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'nikolaik/python-nodejs:python3.12-nodejs22'
|
sandbox_config.base_container_image = 'nikolaik/python-nodejs:python3.12-nodejs22'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
sandbox_config=sandbox_config,
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
if metadata.agent_config:
|
if metadata.agent_config:
|
||||||
|
|||||||
@ -13,6 +13,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -43,15 +44,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -31,6 +31,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -64,15 +65,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -24,6 +24,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -85,15 +86,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -16,6 +16,7 @@ import ruamel.yaml
|
|||||||
from evaluation.utils.shared import (
|
from evaluation.utils.shared import (
|
||||||
EvalMetadata,
|
EvalMetadata,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
)
|
)
|
||||||
from openhands.core.config import (
|
from openhands.core.config import (
|
||||||
@ -37,15 +38,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -23,6 +23,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -48,15 +49,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -11,6 +11,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -52,15 +53,10 @@ def get_config(
|
|||||||
'$OH_INTERPRETER_PATH -m pip install scitools-pyke'
|
'$OH_INTERPRETER_PATH -m pip install scitools-pyke'
|
||||||
)
|
)
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -14,6 +14,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -58,15 +59,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0'
|
sandbox_config.base_container_image = 'xingyaoww/od-eval-miniwob:v1.0'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
update_llm_config_for_completions_logging(
|
update_llm_config_for_completions_logging(
|
||||||
|
|||||||
@ -16,6 +16,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -110,15 +111,10 @@ def get_config(
|
|||||||
f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
|
f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
|
||||||
)
|
)
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -27,6 +27,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -80,15 +81,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench'
|
sandbox_config.base_container_image = 'public.ecr.aws/i5g0m1f6/ml-bench'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -23,6 +23,7 @@ from evaluation.utils.shared import (
|
|||||||
EvalMetadata,
|
EvalMetadata,
|
||||||
EvalOutput,
|
EvalOutput,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
run_evaluation,
|
run_evaluation,
|
||||||
@ -87,13 +88,9 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig:
|
|||||||
dataset_name=metadata.dataset,
|
dataset_name=metadata.dataset,
|
||||||
instance_id=instance['instance_id'],
|
instance_id=instance['instance_id'],
|
||||||
)
|
)
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
run_as_openhands=False,
|
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
sandbox=sandbox_config,
|
sandbox_config=sandbox_config,
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|||||||
@ -21,6 +21,7 @@ from evaluation.utils.shared import (
|
|||||||
codeact_user_response,
|
codeact_user_response,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
is_fatal_evaluation_error,
|
is_fatal_evaluation_error,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
@ -341,16 +342,11 @@ def get_config(
|
|||||||
instance_id=instance['instance_id'],
|
instance_id=instance['instance_id'],
|
||||||
)
|
)
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
enable_browser=RUN_WITH_BROWSING,
|
enable_browser=RUN_WITH_BROWSING,
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
sandbox=sandbox_config,
|
sandbox_config=sandbox_config,
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
update_llm_config_for_completions_logging(
|
update_llm_config_for_completions_logging(
|
||||||
|
|||||||
@ -31,6 +31,7 @@ from evaluation.utils.shared import (
|
|||||||
codeact_user_response,
|
codeact_user_response,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
is_fatal_evaluation_error,
|
is_fatal_evaluation_error,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
@ -174,15 +175,10 @@ def get_config(
|
|||||||
instance_id=instance['instance_id'],
|
instance_id=instance['instance_id'],
|
||||||
)
|
)
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
sandbox=sandbox_config,
|
sandbox_config=sandbox_config,
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
|
|||||||
@ -13,6 +13,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -64,16 +65,10 @@ def get_config(
|
|||||||
sandbox_config.base_container_image = (
|
sandbox_config.base_container_image = (
|
||||||
'docker.io/xingyaoww/openhands-eval-scienceagentbench'
|
'docker.io/xingyaoww/openhands-eval-scienceagentbench'
|
||||||
)
|
)
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
max_budget_per_task=4,
|
sandbox_config=sandbox_config,
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
update_llm_config_for_completions_logging(
|
update_llm_config_for_completions_logging(
|
||||||
|
|||||||
@ -19,6 +19,7 @@ from evaluation.utils.shared import (
|
|||||||
EvalMetadata,
|
EvalMetadata,
|
||||||
EvalOutput,
|
EvalOutput,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
run_evaluation,
|
run_evaluation,
|
||||||
@ -83,13 +84,9 @@ def get_config(metadata: EvalMetadata, instance: pd.Series) -> OpenHandsConfig:
|
|||||||
dataset_name=metadata.dataset,
|
dataset_name=metadata.dataset,
|
||||||
instance_id=instance['instance_id'],
|
instance_id=instance['instance_id'],
|
||||||
)
|
)
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
run_as_openhands=False,
|
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
sandbox=sandbox_config,
|
sandbox_config=sandbox_config,
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|||||||
@ -32,6 +32,7 @@ from evaluation.utils.shared import (
|
|||||||
codeact_user_response,
|
codeact_user_response,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
is_fatal_evaluation_error,
|
is_fatal_evaluation_error,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
@ -227,16 +228,11 @@ def get_config(
|
|||||||
instance_id=instance['instance_id'],
|
instance_id=instance['instance_id'],
|
||||||
)
|
)
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
enable_browser=RUN_WITH_BROWSING,
|
enable_browser=RUN_WITH_BROWSING,
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
sandbox=sandbox_config,
|
sandbox_config=sandbox_config,
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
|
|||||||
@ -20,6 +20,7 @@ from evaluation.utils.shared import (
|
|||||||
codeact_user_response,
|
codeact_user_response,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
is_fatal_evaluation_error,
|
is_fatal_evaluation_error,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
@ -199,16 +200,11 @@ def get_config(
|
|||||||
'REPO_PATH': f'/workspace/{workspace_dir_name}/',
|
'REPO_PATH': f'/workspace/{workspace_dir_name}/',
|
||||||
}
|
}
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
enable_browser=RUN_WITH_BROWSING,
|
enable_browser=RUN_WITH_BROWSING,
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
sandbox=sandbox_config,
|
sandbox_config=sandbox_config,
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
update_llm_config_for_completions_logging(
|
update_llm_config_for_completions_logging(
|
||||||
|
|||||||
@ -37,6 +37,7 @@ from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset
|
|||||||
from evaluation.utils.shared import (
|
from evaluation.utils.shared import (
|
||||||
EvalMetadata,
|
EvalMetadata,
|
||||||
EvalOutput,
|
EvalOutput,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
run_evaluation,
|
run_evaluation,
|
||||||
@ -58,20 +59,21 @@ def get_config(instance: pd.Series) -> OpenHandsConfig:
|
|||||||
f'Invalid container image for instance {instance["instance_id_swebench"]}.'
|
f'Invalid container image for instance {instance["instance_id_swebench"]}.'
|
||||||
)
|
)
|
||||||
logger.info(f'Using instance container image: {base_container_image}.')
|
logger.info(f'Using instance container image: {base_container_image}.')
|
||||||
return OpenHandsConfig(
|
|
||||||
run_as_openhands=False,
|
# Create custom sandbox config for testgeneval with specific requirements
|
||||||
runtime=os.environ.get('RUNTIME', 'eventstream'),
|
sandbox_config = SandboxConfig(
|
||||||
sandbox=SandboxConfig(
|
base_container_image=base_container_image,
|
||||||
base_container_image=base_container_image,
|
use_host_network=False,
|
||||||
use_host_network=False,
|
timeout=1800, # Longer timeout than default (300)
|
||||||
timeout=1800,
|
api_key=os.environ.get('ALLHANDS_API_KEY'),
|
||||||
api_key=os.environ.get('ALLHANDS_API_KEY'),
|
remote_runtime_api_url=os.environ.get(
|
||||||
remote_runtime_api_url=os.environ.get(
|
'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
|
||||||
'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
|
|
||||||
),
|
|
||||||
),
|
),
|
||||||
workspace_base=None,
|
)
|
||||||
workspace_mount_path=None,
|
|
||||||
|
return get_openhands_config_for_eval(
|
||||||
|
sandbox_config=sandbox_config,
|
||||||
|
runtime=os.environ.get('RUNTIME', 'docker'), # Different default runtime
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -25,6 +25,7 @@ from evaluation.utils.shared import (
|
|||||||
assert_and_raise,
|
assert_and_raise,
|
||||||
codeact_user_response,
|
codeact_user_response,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
is_fatal_evaluation_error,
|
is_fatal_evaluation_error,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
@ -126,29 +127,26 @@ def get_config(
|
|||||||
f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
|
f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
|
||||||
)
|
)
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
sandbox_config = SandboxConfig(
|
||||||
default_agent=metadata.agent_class,
|
base_container_image=base_container_image,
|
||||||
run_as_openhands=False,
|
enable_auto_lint=True,
|
||||||
max_iterations=metadata.max_iterations,
|
use_host_network=False,
|
||||||
runtime=os.environ.get('RUNTIME', 'eventstream'),
|
# large enough timeout, since some testcases take very long to run
|
||||||
sandbox=SandboxConfig(
|
timeout=300,
|
||||||
base_container_image=base_container_image,
|
# Add platform to the sandbox config to solve issue 4401
|
||||||
enable_auto_lint=True,
|
platform='linux/amd64',
|
||||||
use_host_network=False,
|
api_key=os.environ.get('ALLHANDS_API_KEY', None),
|
||||||
# large enough timeout, since some testcases take very long to run
|
remote_runtime_api_url=os.environ.get(
|
||||||
timeout=300,
|
'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
|
||||||
# Add platform to the sandbox config to solve issue 4401
|
|
||||||
platform='linux/amd64',
|
|
||||||
api_key=os.environ.get('ALLHANDS_API_KEY', None),
|
|
||||||
remote_runtime_api_url=os.environ.get(
|
|
||||||
'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
|
|
||||||
),
|
|
||||||
keep_runtime_alive=False,
|
|
||||||
remote_runtime_init_timeout=3600,
|
|
||||||
),
|
),
|
||||||
# do not mount workspace
|
keep_runtime_alive=False,
|
||||||
workspace_base=None,
|
remote_runtime_init_timeout=3600,
|
||||||
workspace_mount_path=None,
|
)
|
||||||
|
|
||||||
|
config = get_openhands_config_for_eval(
|
||||||
|
metadata=metadata,
|
||||||
|
sandbox_config=sandbox_config,
|
||||||
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
)
|
)
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
update_llm_config_for_completions_logging(
|
update_llm_config_for_completions_logging(
|
||||||
|
|||||||
@ -12,7 +12,10 @@ import tempfile
|
|||||||
import yaml
|
import yaml
|
||||||
from browsing import pre_login
|
from browsing import pre_login
|
||||||
|
|
||||||
from evaluation.utils.shared import get_default_sandbox_config_for_eval
|
from evaluation.utils.shared import (
|
||||||
|
get_default_sandbox_config_for_eval,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
|
)
|
||||||
from openhands.controller.state.state import State
|
from openhands.controller.state.state import State
|
||||||
from openhands.core.config import (
|
from openhands.core.config import (
|
||||||
LLMConfig,
|
LLMConfig,
|
||||||
@ -42,19 +45,17 @@ def get_config(
|
|||||||
sandbox_config.enable_auto_lint = True
|
sandbox_config.enable_auto_lint = True
|
||||||
# If the web services are running on the host machine, this must be set to True
|
# If the web services are running on the host machine, this must be set to True
|
||||||
sandbox_config.use_host_network = True
|
sandbox_config.use_host_network = True
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
run_as_openhands=False,
|
|
||||||
max_budget_per_task=4,
|
|
||||||
max_iterations=100,
|
max_iterations=100,
|
||||||
save_trajectory_path=os.path.join(
|
|
||||||
mount_path_on_host, f'traj_{task_short_name}.json'
|
|
||||||
),
|
|
||||||
sandbox=sandbox_config,
|
|
||||||
# we mount trajectories path so that trajectories, generated by OpenHands
|
# we mount trajectories path so that trajectories, generated by OpenHands
|
||||||
# controller, can be accessible to the evaluator file in the runtime container
|
# controller, can be accessible to the evaluator file in the runtime container
|
||||||
|
sandbox_config=sandbox_config,
|
||||||
workspace_mount_path=mount_path_on_host,
|
workspace_mount_path=mount_path_on_host,
|
||||||
workspace_mount_path_in_sandbox='/outputs',
|
|
||||||
)
|
)
|
||||||
|
config.save_trajectory_path = os.path.join(
|
||||||
|
mount_path_on_host, f'traj_{task_short_name}.json'
|
||||||
|
)
|
||||||
|
config.max_budget_per_task = 4
|
||||||
config.set_llm_config(llm_config)
|
config.set_llm_config(llm_config)
|
||||||
if agent_config:
|
if agent_config:
|
||||||
config.set_agent_config(agent_config)
|
config.set_agent_config(agent_config)
|
||||||
|
|||||||
@ -12,6 +12,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -44,15 +45,10 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
sandbox_config.base_container_image = 'python:3.12-bookworm'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -20,6 +20,7 @@ from evaluation.utils.shared import (
|
|||||||
codeact_user_response,
|
codeact_user_response,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
is_fatal_evaluation_error,
|
is_fatal_evaluation_error,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
@ -160,16 +161,11 @@ def get_config(
|
|||||||
instance_id=instance['instance_id'],
|
instance_id=instance['instance_id'],
|
||||||
)
|
)
|
||||||
|
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
max_iterations=metadata.max_iterations,
|
|
||||||
enable_browser=RUN_WITH_BROWSING,
|
enable_browser=RUN_WITH_BROWSING,
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
sandbox=sandbox_config,
|
sandbox_config=sandbox_config,
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
update_llm_config_for_completions_logging(
|
update_llm_config_for_completions_logging(
|
||||||
|
|||||||
@ -13,6 +13,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -73,16 +74,10 @@ def get_config(
|
|||||||
'VWA_WIKIPEDIA': f'{base_url}:8888',
|
'VWA_WIKIPEDIA': f'{base_url}:8888',
|
||||||
'VWA_HOMEPAGE': f'{base_url}:4399',
|
'VWA_HOMEPAGE': f'{base_url}:4399',
|
||||||
}
|
}
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
attach_to_existing=True,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
update_llm_config_for_completions_logging(
|
update_llm_config_for_completions_logging(
|
||||||
|
|||||||
@ -13,6 +13,7 @@ from evaluation.utils.shared import (
|
|||||||
compatibility_for_eval_history_pairs,
|
compatibility_for_eval_history_pairs,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -65,15 +66,10 @@ def get_config(
|
|||||||
'MAP': f'{base_url}:3000',
|
'MAP': f'{base_url}:3000',
|
||||||
'HOMEPAGE': f'{base_url}:4399',
|
'HOMEPAGE': f'{base_url}:4399',
|
||||||
}
|
}
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime='docker',
|
runtime='docker',
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
)
|
)
|
||||||
config.set_llm_config(metadata.llm_config)
|
config.set_llm_config(metadata.llm_config)
|
||||||
agent_config = config.get_agent_config(metadata.agent_class)
|
agent_config = config.get_agent_config(metadata.agent_class)
|
||||||
|
|||||||
@ -10,6 +10,7 @@ from evaluation.utils.shared import (
|
|||||||
EvalOutput,
|
EvalOutput,
|
||||||
get_default_sandbox_config_for_eval,
|
get_default_sandbox_config_for_eval,
|
||||||
get_metrics,
|
get_metrics,
|
||||||
|
get_openhands_config_for_eval,
|
||||||
make_metadata,
|
make_metadata,
|
||||||
prepare_dataset,
|
prepare_dataset,
|
||||||
reset_logger_for_multiprocessing,
|
reset_logger_for_multiprocessing,
|
||||||
@ -45,18 +46,12 @@ def get_config(
|
|||||||
) -> OpenHandsConfig:
|
) -> OpenHandsConfig:
|
||||||
sandbox_config = get_default_sandbox_config_for_eval()
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
sandbox_config.platform = 'linux/amd64'
|
sandbox_config.platform = 'linux/amd64'
|
||||||
config = OpenHandsConfig(
|
config = get_openhands_config_for_eval(
|
||||||
default_agent=metadata.agent_class,
|
metadata=metadata,
|
||||||
run_as_openhands=False,
|
|
||||||
runtime=os.environ.get('RUNTIME', 'docker'),
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
||||||
max_iterations=metadata.max_iterations,
|
sandbox_config=sandbox_config,
|
||||||
sandbox=sandbox_config,
|
|
||||||
# do not mount workspace
|
|
||||||
workspace_base=None,
|
|
||||||
workspace_mount_path=None,
|
|
||||||
# debug
|
|
||||||
debug=True,
|
|
||||||
)
|
)
|
||||||
|
config.debug = True
|
||||||
config.set_llm_config(
|
config.set_llm_config(
|
||||||
update_llm_config_for_completions_logging(
|
update_llm_config_for_completions_logging(
|
||||||
metadata.llm_config, metadata.eval_output_dir, instance_id
|
metadata.llm_config, metadata.eval_output_dir, instance_id
|
||||||
|
|||||||
@ -703,3 +703,79 @@ def get_default_sandbox_config_for_eval() -> SandboxConfig:
|
|||||||
remote_runtime_enable_retries=True,
|
remote_runtime_enable_retries=True,
|
||||||
remote_runtime_class='sysbox',
|
remote_runtime_class='sysbox',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_openhands_config_for_eval(
|
||||||
|
metadata: EvalMetadata | None = None,
|
||||||
|
sandbox_config: SandboxConfig | None = None,
|
||||||
|
runtime: str | None = None,
|
||||||
|
max_iterations: int | None = None,
|
||||||
|
default_agent: str | None = None,
|
||||||
|
enable_browser: bool = False,
|
||||||
|
workspace_base: str | None = None,
|
||||||
|
workspace_mount_path: str | None = None,
|
||||||
|
):
|
||||||
|
"""Create an OpenHandsConfig with common patterns used across evaluation scripts.
|
||||||
|
|
||||||
|
This function provides a standardized way to create OpenHands configurations
|
||||||
|
for evaluation runs, with sensible defaults that match the patterns used in
|
||||||
|
most run_infer.py scripts. Individual evaluation scripts can override specific
|
||||||
|
attributes as needed.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
metadata: EvalMetadata containing agent class, max iterations, etc.
|
||||||
|
sandbox_config: Custom sandbox config. If None, uses get_default_sandbox_config_for_eval()
|
||||||
|
runtime: Runtime type. If None, uses environment RUNTIME or 'docker'
|
||||||
|
max_iterations: Max iterations for the agent. If None, uses metadata.max_iterations
|
||||||
|
default_agent: Agent class name. If None, uses metadata.agent_class
|
||||||
|
enable_browser: Whether to enable browser functionality
|
||||||
|
workspace_base: Workspace base path. Defaults to None
|
||||||
|
workspace_mount_path: Workspace mount path. Defaults to None
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
OpenHandsConfig: Configured for evaluation with eval-specific overrides applied
|
||||||
|
"""
|
||||||
|
# Defer import to avoid circular imports at module load time
|
||||||
|
from openhands.core.config.openhands_config import (
|
||||||
|
OpenHandsConfig as _OHConfig, # type: ignore
|
||||||
|
)
|
||||||
|
|
||||||
|
# Use provided sandbox config or get default
|
||||||
|
if sandbox_config is None:
|
||||||
|
sandbox_config = get_default_sandbox_config_for_eval()
|
||||||
|
|
||||||
|
# Extract values from metadata if provided
|
||||||
|
if metadata is not None:
|
||||||
|
if max_iterations is None:
|
||||||
|
max_iterations = metadata.max_iterations
|
||||||
|
if default_agent is None:
|
||||||
|
default_agent = metadata.agent_class
|
||||||
|
|
||||||
|
# Use environment runtime or default
|
||||||
|
if runtime is None:
|
||||||
|
runtime = os.environ.get('RUNTIME', 'docker')
|
||||||
|
|
||||||
|
# Provide sensible defaults if still None
|
||||||
|
if default_agent is None:
|
||||||
|
default_agent = 'CodeActAgent'
|
||||||
|
if max_iterations is None:
|
||||||
|
max_iterations = 50
|
||||||
|
|
||||||
|
# Always use repo-local .eval_sessions directory (absolute path)
|
||||||
|
eval_store = os.path.abspath(os.path.join(os.getcwd(), '.eval_sessions'))
|
||||||
|
|
||||||
|
# Create the base config with evaluation-specific overrides
|
||||||
|
config = _OHConfig(
|
||||||
|
default_agent=default_agent,
|
||||||
|
run_as_openhands=False,
|
||||||
|
runtime=runtime,
|
||||||
|
max_iterations=max_iterations,
|
||||||
|
enable_browser=enable_browser,
|
||||||
|
sandbox=sandbox_config,
|
||||||
|
workspace_base=workspace_base,
|
||||||
|
workspace_mount_path=workspace_mount_path,
|
||||||
|
file_store='local',
|
||||||
|
file_store_path=eval_store,
|
||||||
|
)
|
||||||
|
|
||||||
|
return config
|
||||||
|
|||||||
26
tests/unit/evaluation/test_eval_file_store_path.py
Normal file
26
tests/unit/evaluation/test_eval_file_store_path.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from evaluation.utils.shared import get_openhands_config_for_eval
|
||||||
|
|
||||||
|
|
||||||
|
def test_eval_file_store_defaults_to_repo_local(tmp_path, monkeypatch):
|
||||||
|
prev_cwd = Path.cwd()
|
||||||
|
try:
|
||||||
|
os.chdir(tmp_path)
|
||||||
|
cfg = get_openhands_config_for_eval()
|
||||||
|
assert Path(cfg.file_store_path) == (tmp_path / '.eval_sessions').resolve()
|
||||||
|
assert cfg.file_store == 'local'
|
||||||
|
finally:
|
||||||
|
os.chdir(prev_cwd)
|
||||||
|
|
||||||
|
|
||||||
|
def test_eval_file_store_is_hard_coded_repo_local(tmp_path):
|
||||||
|
prev_cwd = Path.cwd()
|
||||||
|
try:
|
||||||
|
os.chdir(tmp_path)
|
||||||
|
cfg = get_openhands_config_for_eval()
|
||||||
|
assert Path(cfg.file_store_path) == (tmp_path / '.eval_sessions').resolve()
|
||||||
|
assert cfg.file_store == 'local'
|
||||||
|
finally:
|
||||||
|
os.chdir(prev_cwd)
|
||||||
Loading…
x
Reference in New Issue
Block a user