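"""Run inference for SWE-bench-style benchmarks with OpenHands.

This script builds a per-instance sandbox, runs the agent on each task, and
collects the resulting git patch for later evaluation. It supports the
SWE-bench, SWE-Gym, SWE-bench-Live, SWE-rebench, and multimodal dataset
variants, the 'swe', 'swt', and 'swt-ci' modes, and an optional iterative
evaluation mode that retries instances rejected by a critic.
"""
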
import asyncio
import copy
import json
import os
import tempfile
from typing import Any, Literal

import pandas as pd
import toml
from datasets import load_dataset
from jinja2 import Environment, FileSystemLoader

import openhands.agenthub
from evaluation.benchmarks.swe_bench.binary_patch_utils import (
    remove_binary_diffs,
    remove_binary_files_from_git,
)
from evaluation.benchmarks.swe_bench.resource.mapping import (
    get_instance_resource_factor,
)
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
    MAP_REPO_TO_INSTALL,
    MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
    MAP_VERSION_TO_INSTALL,
)
from evaluation.utils.shared import (
    EvalException,
    EvalMetadata,
    EvalOutput,
    assert_and_raise,
    check_maximum_retries_exceeded,
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
    get_openhands_config_for_eval,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
    get_agent_config_arg,
    get_evaluation_parser,
    get_llm_config_arg,
    get_llms_for_routing_config,
    get_model_routing_config_arg,
)
from openhands.core.config.condenser_config import NoOpCondenserConfig
from openhands.core.config.utils import get_condenser_config_arg
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.critic import AgentFinishedCritic
from openhands.events.action import CmdRunAction, FileReadAction, MessageAction
from openhands.events.observation import (
    CmdOutputObservation,
    ErrorObservation,
    FileReadObservation,
)
from openhands.events.serialization.event import event_from_dict, event_to_dict
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
from openhands.utils.shutdown_listener import sleep_if_should_continue

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
ENABLE_LLM_EDITOR = os.environ.get('ENABLE_LLM_EDITOR', 'false').lower() == 'true'
BenchMode = Literal['swe', 'swt', 'swt-ci']

# Global variable to track dataset type
DATASET_TYPE = 'SWE-bench'


def set_dataset_type(dataset_name: str) -> None:
    """Set dataset type based on dataset name."""
    global DATASET_TYPE
    name_lower = dataset_name.lower()

    if 'swe-gym' in name_lower:
        DATASET_TYPE = 'SWE-Gym'
    elif 'swe-bench-live' in name_lower:
        DATASET_TYPE = 'SWE-bench-Live'
    elif 'swe-rebench' in name_lower:
        DATASET_TYPE = 'SWE-rebench'
    elif 'multimodal' in name_lower:
        DATASET_TYPE = 'Multimodal'
    else:
        DATASET_TYPE = 'SWE-bench'

    logger.info(f'Dataset type set to: {DATASET_TYPE}')


AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
}


def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
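    """Return the name of the workspace directory used for this instance inside the sandbox."""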
    if DATASET_TYPE == 'SWE-bench-Live':
        return instance.instance_id
    else:
        return f'{instance.repo}__{instance.version}'.replace('/', '__')


def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
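    """Build the initial user instruction for an instance.

    The prompt template is chosen based on the evaluation mode and LLM, rendered
    with Jinja2, and returned as a MessageAction (including image URLs for
    multimodal instances).
    """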
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
    mode = metadata.details['mode']
    llm_model = metadata.llm_config.model

    # Determine the template file based on mode and LLM
    if metadata.instruction_template_name:
        template_name = metadata.instruction_template_name
    elif mode.startswith('swt'):
        template_name = 'swt.j2'
    elif mode == 'swe':
        if 'gpt-4.1' in llm_model:
            template_name = 'swe_gpt4.j2'
        else:
            template_name = (
                'swe_default.j2'  # Default for 'swe' mode (regular swe-bench)
            )
    else:
        # Fallback or error handling if mode is unexpected
        logger.error(f'Unexpected evaluation mode: {mode}. Falling back to default.')
        template_name = 'swe_default.j2'

    logger.debug(f'Using instruction template file: {template_name}')
    # Set up Jinja2 environment
    # Assuming templates are in 'evaluation/benchmarks/swe_bench/prompts' relative to this script
    prompts_dir = os.path.join(os.path.dirname(__file__), 'prompts')
    env = Environment(loader=FileSystemLoader(prompts_dir))
    template = env.get_template(template_name)

    # Prepare context for rendering
    context = {
        'instance': instance,
        'workspace_dir_name': workspace_dir_name,
        'metadata': metadata,  # Pass metadata if needed in templates
    }

    # Add specific context for swt-ci mode if needed
    if mode == 'swt-ci':
        context['test_instructions'] = (
            f'The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n'
        )
    else:
        context['test_instructions'] = ''  # Ensure it's defined for other modes

    # Render the instruction
    instruction = template.render(context)

    if RUN_WITH_BROWSING:
        instruction += (
            '<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
        )

    if 'image_assets' in instance:
        assets = json.loads(instance['image_assets'])
        assert 'problem_statement' in assets, (
            'problem_statement is required in image_assets'
        )
        image_urls = assets['problem_statement']
        return MessageAction(content=instruction, image_urls=image_urls)
    return MessageAction(content=instruction)


# TODO: migrate all swe-bench docker to ghcr.io/openhands
DEFAULT_DOCKER_IMAGE_PREFIX = os.environ.get(
    'EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/'
)
logger.info(f'Default docker image prefix: {DEFAULT_DOCKER_IMAGE_PREFIX}')


def get_instance_docker_image(
    instance_id: str,
    swebench_official_image: bool = False,
) -> str:
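    """Return the Docker image name to use for an instance.

    Uses the official SWE-bench-style images when swebench_official_image is
    True, and the OpenHands-built images under DEFAULT_DOCKER_IMAGE_PREFIX
    otherwise.
    """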
    if swebench_official_image:
        # Official SWE-Bench image
        # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
        # SWE-bench-Live uses the same naming convention as SWE-Bench
        if DATASET_TYPE == 'SWE-bench-Live':
            docker_image_prefix = 'docker.io/starryzhang/'
        elif DATASET_TYPE == 'SWE-bench':
            docker_image_prefix = 'docker.io/swebench/'
        elif DATASET_TYPE == 'SWE-rebench':
            docker_image_prefix = 'docker.io/swerebench/'
        repo, name = instance_id.split('__')
        image_name = f'{docker_image_prefix.rstrip("/")}/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
        logger.debug(f'Using official SWE-Bench image: {image_name}')
        return image_name
    else:
        # OpenHands version of the image
        docker_image_prefix = DEFAULT_DOCKER_IMAGE_PREFIX
        image_name = 'sweb.eval.x86_64.' + instance_id
        image_name = image_name.replace(
            '__', '_s_'
        )  # to comply with docker image naming convention
        return (docker_image_prefix.rstrip('/') + '/' + image_name).lower()


def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
) -> OpenHandsConfig:
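    """Build the OpenHands configuration (sandbox, LLM, and agent settings) for one instance."""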
    # We use a different instance image for each instance of swe-bench eval
    use_swebench_official_image = DATASET_TYPE != 'SWE-Gym'

    base_container_image = get_instance_docker_image(
        instance['instance_id'],
        swebench_official_image=use_swebench_official_image,
    )
    logger.info(
        f'Using instance container image: {base_container_image}. '
        f'Please make sure this image exists. '
        f'Submit an issue on https://github.com/OpenHands/OpenHands if you run into any issues.'
    )

    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image
    sandbox_config.enable_auto_lint = True
    sandbox_config.use_host_network = False
    # Add platform to the sandbox config to solve issue 4401
    sandbox_config.platform = 'linux/amd64'
    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
        dataset_name=metadata.dataset,
        instance_id=instance['instance_id'],
    )

    config = get_openhands_config_for_eval(
        metadata=metadata,
        enable_browser=RUN_WITH_BROWSING,
        runtime=os.environ.get('RUNTIME', 'docker'),
        sandbox_config=sandbox_config,
    )

    config.set_llm_config(
        update_llm_config_for_completions_logging(
            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
        )
    )
    # get 'draft_editor' config if it exists
    config.set_llm_config(get_llm_config_arg('draft_editor'), 'draft_editor')

    model_routing_config = get_model_routing_config_arg()
    model_routing_config.llms_for_routing = (
        get_llms_for_routing_config()
    )  # Populate with LLMs for routing from config.toml file

    agent_config = AgentConfig(
        enable_jupyter=False,
        enable_browsing=RUN_WITH_BROWSING,
        enable_llm_editor=ENABLE_LLM_EDITOR,
        enable_mcp=False,
        condenser=metadata.condenser_config,
        enable_prompt_extensions=False,
        model_routing=model_routing_config,
        system_prompt_filename=metadata.agent_config.system_prompt_filename
        if metadata.agent_config
        else 'system_prompt.j2',
    )
    config.set_agent_config(agent_config)

    return config


def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # used to set up the sandbox for this specific instance
    metadata: EvalMetadata,
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Initialization Fn')
    logger.info('-' * 30)
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
    obs: CmdOutputObservation

    # Set instance id and git configuration
    action = CmdRunAction(
        command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc && git config --global core.pager "" && git config --global diff.binary false"""
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to export SWE_INSTANCE_ID and configure git: {str(obs)}',
    )

    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')

    # inject the init script
    script_dir = os.path.dirname(__file__)

    # inject the instance info
    action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
    )

    swe_instance_json_name = 'swe-bench-instance.json'
    with tempfile.TemporaryDirectory() as temp_dir:
        # Construct the full path for the desired file name within the temporary directory
        temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
        # Write to the file with the desired name within the temporary directory
        with open(temp_file_path, 'w') as f:
            if not isinstance(instance, dict):
                json.dump([instance.to_dict()], f)
            else:
                json.dump([instance], f)

        # Copy the file to the desired location
        runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')

    # inject the instance swe entry
    if DATASET_TYPE == 'SWE-bench-Live':
        entry_script_path = 'instance_swe_entry_live.sh'
    elif DATASET_TYPE == 'SWE-rebench':
        entry_script_path = 'instance_swe_entry_rebench.sh'
    else:
        entry_script_path = 'instance_swe_entry.sh'
    runtime.copy_to(
        str(os.path.join(script_dir, f'scripts/setup/{entry_script_path}')),
        '/swe_util/',
    )

    action = CmdRunAction(command='cat ~/.bashrc')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command='source ~/.bashrc')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    if isinstance(obs, ErrorObservation):
        logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
    assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command=f'source /swe_util/{entry_script_path}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to source /swe_util/{entry_script_path}: {str(obs)}',
    )

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git reset --hard')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')

    action = CmdRunAction(
        command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

    if metadata.details['mode'] == 'swt-ci':
        # set up repo
        setup_commands = []
        if instance['repo'] in MAP_REPO_TO_INSTALL:
            setup_commands.append(MAP_REPO_TO_INSTALL[instance['repo']])

        # Run pre-install setup if provided
        install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(
            instance['version'], []
        )
        if 'pre_install' in install:
            for pre_install in install['pre_install']:
                setup_commands.append(pre_install)

        if 'install' in install:
            setup_commands.append(install['install'])

        for command in setup_commands:
            action = CmdRunAction(command=command)
            action.set_hard_timeout(600)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if DATASET_TYPE != 'Multimodal' and DATASET_TYPE != 'SWE-bench-Live':
        # Only non-multimodal datasets use the testbed Python environment;
        # SWE-bench multimodal and SWE-bench-Live do not, so skip this check for them
        action = CmdRunAction(command='which python')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert_and_raise(
            obs.exit_code == 0 and 'testbed' in obs.content,
            f'Expected to find python interpreter from testbed, but got: {str(obs)}',
        )

    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
    logger.info('-' * 30)


def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
    """Complete the runtime for the agent.

    This function is called after the agent has finished running.
    If you need to do something in the sandbox to get the correctness metric after
    the agent has run, modify this function.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Completion Fn')
    logger.info('-' * 30)
    obs: CmdOutputObservation
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if obs.exit_code == -1:
        # The previous command is still running
        # We need to kill the previous command
        logger.info('The previous command is still running, trying to kill it...')
        action = CmdRunAction(command='C-c')
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        # Then run the command again
        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if obs.exit_code == -1:
        # The previous command is still running
        # We need to suspend the previous command
        logger.info('The previous command is still running, trying to ctrl+z it...')
        action = CmdRunAction(command='C-z')
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        # Then run the command again
        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git config --global core.pager ""')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git config --global core.pager "": {str(obs)}',
    )

    # First check for any git repositories in subdirectories
    action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to find git repositories: {str(obs)}',
    )

    git_dirs = [p for p in obs.content.strip().split('\n') if p]
    if git_dirs:
        # Remove all .git directories in subdirectories
        for git_dir in git_dirs:
            action = CmdRunAction(command=f'rm -rf "{git_dir}"')
            action.set_hard_timeout(600)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
            assert_and_raise(
                isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
                f'Failed to remove git directory {git_dir}: {str(obs)}',
            )

    # add all files
    action = CmdRunAction(command='git add -A')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git add -A: {str(obs)}',
    )

    # Remove binary files from git staging
    action = CmdRunAction(command=remove_binary_files_from_git())
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to remove binary files: {str(obs)}',
    )

    n_retries = 0
    git_patch = None
    while n_retries < 5:
        action = CmdRunAction(
            command=f'git diff --no-color --cached {instance["base_commit"]} > patch.diff'
        )
        action.set_hard_timeout(max(300 + 100 * n_retries, 600))
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        n_retries += 1
        if isinstance(obs, CmdOutputObservation):
            if obs.exit_code == 0:
                # Read the patch file
                action = FileReadAction(path='patch.diff')
                action.set_hard_timeout(max(300 + 100 * n_retries, 600))
                logger.info(action, extra={'msg_type': 'ACTION'})
                obs = runtime.run_action(action)
                logger.info(obs, extra={'msg_type': 'OBSERVATION'})
                if isinstance(obs, FileReadObservation):
                    git_patch = obs.content
                    break
                elif isinstance(obs, ErrorObservation):
                    # Fall back to cat "patch.diff" to get the patch
                    assert 'File could not be decoded as utf-8' in obs.content
                    action = CmdRunAction(command='cat patch.diff')
                    action.set_hard_timeout(max(300 + 100 * n_retries, 600))
                    logger.info(action, extra={'msg_type': 'ACTION'})
                    obs = runtime.run_action(action)
                    assert isinstance(obs, CmdOutputObservation) and obs.exit_code == 0
                    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
                    git_patch = obs.content
                    break
                else:
                    assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
            else:
                logger.info('Failed to get git diff, retrying...')
                sleep_if_should_continue(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
            sleep_if_should_continue(10)
        else:
            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')

    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')

    # Remove binary diffs from the patch
    git_patch = remove_binary_diffs(git_patch)

    logger.info('-' * 30)
    logger.info('END Runtime Completion Fn')
    logger.info('-' * 30)
    return {'git_patch': git_patch}


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    runtime_failure_count: int = 0,
) -> EvalOutput:
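    """Run the agent on a single benchmark instance and collect its git patch.

    Sets up a sandboxed runtime for the instance, runs the agent controller on
    the rendered instruction, extracts the resulting git patch, and returns an
    EvalOutput with the history, metrics, and test_result.
    """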
    config = get_config(instance, metadata)

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # Increase resource_factor with increasing attempt_id
    if runtime_failure_count > 0:
        config.sandbox.remote_runtime_resource_factor = min(
            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
            8,
        )
        logger.warning(
            f'This is attempt {runtime_failure_count + 1} for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
        )

    metadata = copy.deepcopy(metadata)
    metadata.details['runtime_failure_count'] = runtime_failure_count
    metadata.details['remote_runtime_resource_factor'] = (
        config.sandbox.remote_runtime_resource_factor
    )

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    try:
        initialize_runtime(runtime, instance, metadata)

        message_action = get_instruction(instance, metadata)

        # Here's how you can run the agent (similar to the `main` function) and get the final task state
        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=message_action,
                runtime=runtime,
                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                    metadata.agent_class
                ],
            )
        )

        # if fatal error, raise EvalException to trigger re-run
        if is_fatal_evaluation_error(state.last_error):
            raise EvalException('Fatal error detected: ' + state.last_error)

        # ======= THIS IS SWE-Bench specific =======
        # Get git patch
        if DATASET_TYPE == 'SWE-bench-Live':
            from evaluation.benchmarks.swe_bench.live_utils import (
                complete_runtime as complete_runtime_fn,
            )
        else:
            complete_runtime_fn = complete_runtime
        return_val = complete_runtime_fn(runtime, instance)
        git_patch = return_val['git_patch']
        logger.info(
            f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
        )
    finally:
        runtime.close()
    # ==========================================

    # ======= Attempt to evaluate the agent's edits =======
    # we use eval_infer.sh to evaluate the agent's edits, not here
    # because the agent may alter the environment / testcases
    test_result = {
        'git_patch': git_patch,
    }

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction),
    # you can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')

    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
    histories = [event_to_dict(event) for event in state.history]
    metrics = get_metrics(state)

    # Save the output
    instruction = message_action.content
    if message_action.image_urls:
        instruction += (
            '\n\n<image_urls>' + '\n'.join(message_action.image_urls) + '</image_urls>'
        )
    output = EvalOutput(
        instance_id=instance.instance_id,
        instruction=instruction,
        instance=instance.to_dict(),  # SWE Bench specific
        test_result=test_result,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
    )
    return output


def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
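    """Optionally filter the dataset by 'selected_ids' or 'selected_repos' from config.toml, or by the SKIP_IDS environment variable."""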
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            data = toml.load(file)
            if 'selected_ids' in data:
                selected_ids = data['selected_ids']
                logger.info(
                    f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
                )
                subset = dataset[dataset[filter_column].isin(selected_ids)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset
            if 'selected_repos' in data:
                # repos for the swe-bench instances:
                # ['astropy/astropy', 'django/django', 'matplotlib/matplotlib', 'mwaskom/seaborn', 'pallets/flask', 'psf/requests', 'pydata/xarray', 'pylint-dev/pylint', 'pytest-dev/pytest', 'scikit-learn/scikit-learn', 'sphinx-doc/sphinx', 'sympy/sympy']
                selected_repos = data['selected_repos']
                if isinstance(selected_repos, str):
                    selected_repos = [selected_repos]
                assert isinstance(selected_repos, list)
                logger.info(
                    f'Filtering {selected_repos} tasks from "selected_repos"...'
                )
                subset = dataset[dataset['repo'].isin(selected_repos)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset

    skip_ids = os.environ.get('SKIP_IDS', '').split(',')
    if len(skip_ids) > 0:
        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
        return dataset[~dataset[filter_column].isin(skip_ids)]
    return dataset


if __name__ == '__main__':
    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
        default='princeton-nlp/SWE-bench',
        help='dataset to evaluate on, either full-test or lite-test',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='split to evaluate on',
    )
    parser.add_argument(
        '--mode',
        type=str,
        default='swe',
        choices=['swe', 'swt', 'swt-ci'],
        help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
    )

    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenHands's repo
    dataset = load_dataset(args.dataset, split=args.split)

    # Set the global dataset type based on dataset name
    set_dataset_type(args.dataset)

    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
    )
    if DATASET_TYPE == 'SWE-Gym':
        with open(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                'split',
                'swegym_verified_instances.json',
            ),
            'r',
        ) as f:
            swegym_verified_instances = json.load(f)
        swe_bench_tests = swe_bench_tests[
            swe_bench_tests['instance_id'].isin(swegym_verified_instances)
        ]
        logger.info(
            f'{len(swe_bench_tests)} tasks left after filtering for SWE-Gym verified instances'
        )

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config, args.config_file)
        llm_config.log_completions = True
        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
        llm_config.modify_params = False

    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    # Get condenser config from environment variable
    condenser_name = os.environ.get('EVAL_CONDENSER')
    if condenser_name:
        condenser_config = get_condenser_config_arg(condenser_name, args.config_file)
        if condenser_config is None:
            raise ValueError(
                f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
            )
    else:
        # If no specific condenser config is provided via env var, default to NoOpCondenser
        condenser_config = NoOpCondenserConfig()
        logger.debug(
            'No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.'
        )

    agent_config = None
    if args.agent_config:
        agent_config = get_agent_config_arg(args.agent_config, args.config_file)

    details = {'mode': args.mode}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

    dataset_description = (
        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
    )
    metadata = make_metadata(
        llm_config,
        dataset_description,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details=details,
        agent_config=agent_config,
        condenser_config=condenser_config,
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    print(f'### OUTPUT FILE: {output_file} ###')

    # Run evaluation in iterative mode:
    # If a rollout fails to produce an AgentFinishAction, we try again until it succeeds
    # or the maximum number of attempts (3 by default) has been made.
    ITERATIVE_EVAL_MODE = (
        os.environ.get('ITERATIVE_EVAL_MODE', 'false').lower() == 'true'
    )
    ITERATIVE_EVAL_MODE_MAX_ATTEMPTS = int(
        os.environ.get('ITERATIVE_EVAL_MODE_MAX_ATTEMPTS', '3')
    )

    if not ITERATIVE_EVAL_MODE:
        # load the dataset
        instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)
        if len(instances) > 0 and not isinstance(
            instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
        ):
            for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
                instances[col] = instances[col].apply(lambda x: str(x))

        run_evaluation(
            instances,
            metadata,
            output_file,
            args.eval_num_workers,
            process_instance,
            timeout_seconds=8
            * 60
            * 60,  # 8 hours per instance should be more than enough
            max_retries=5,
        )
    else:
        critic = AgentFinishedCritic()

        def get_cur_output_file_path(attempt: int) -> str:
            return (
                f'{output_file.removesuffix(".jsonl")}.critic_attempt_{attempt}.jsonl'
            )

        eval_ids = None
        for attempt in range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1):
            cur_output_file = get_cur_output_file_path(attempt)
            logger.info(
                f'Running evaluation with critic {critic.__class__.__name__} for attempt {attempt} of {ITERATIVE_EVAL_MODE_MAX_ATTEMPTS}.'
            )

            # If temperature is 0 (deterministic), bump it to 0.1 on retry attempts
            # so we hopefully get slightly different rollouts
            if attempt > 1 and metadata.llm_config.temperature == 0:
                logger.info(
                    f'Detected temperature is 0 for (>1) attempt {attempt}. Setting temperature to 0.1...'
                )
                metadata.llm_config.temperature = 0.1

            # Load instances - on the first attempt, we evaluate all instances
            # On subsequent attempts, we only evaluate the instances that the critic marked as failed in the previous attempt
            instances = prepare_dataset(
                swe_bench_tests, cur_output_file, args.eval_n_limit, eval_ids=eval_ids
            )
            if len(instances) > 0 and not isinstance(
                instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
            ):
                for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
                    instances[col] = instances[col].apply(lambda x: str(x))

            # Run evaluation - but save the results to cur_output_file
            logger.info(
                f'Evaluating {len(instances)} instances for attempt {attempt}...'
            )
            run_evaluation(
                instances,
                metadata,
                cur_output_file,
                args.eval_num_workers,
                process_instance,
                timeout_seconds=8
                * 60
                * 60,  # 8 hours per instance should be more than enough
                max_retries=5,
            )

            # When eval is done, we update eval_ids to the instances that failed the current attempt
            instances_failed = []
            logger.info(
                f'Use critic {critic.__class__.__name__} to check {len(instances)} instances for attempt {attempt}...'
            )
            with open(cur_output_file, 'r') as f:
                for line in f:
                    instance = json.loads(line)
                    try:
                        history = [
                            event_from_dict(event) for event in instance['history']
                        ]
                        critic_result = critic.evaluate(
                            history, instance['test_result'].get('git_patch', '')
                        )
                        if not critic_result.success:
                            instances_failed.append(instance['instance_id'])
                    except Exception as e:
                        logger.error(
                            f'Error loading history for instance {instance["instance_id"]}: {e}'
                        )
                        instances_failed.append(instance['instance_id'])
            logger.info(
                f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
            )
            eval_ids = instances_failed

            # If no instances failed, we break
            if len(instances_failed) == 0:
                break

        # Then we should aggregate the results from all attempts into the original output file
        # and remove the intermediate files
        logger.info(
            'Aggregating results from all attempts into the original output file...'
        )
        fout = open(output_file, 'w')
        added_instance_ids = set()
        for attempt in reversed(range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1)):
            cur_output_file = get_cur_output_file_path(attempt)
            if not os.path.exists(cur_output_file):
                logger.warning(
                    f'Intermediate output file {cur_output_file} does not exist. Skipping...'
                )
                continue

            with open(cur_output_file, 'r') as f:
                for line in f:
                    instance = json.loads(line)
                    # Also make sure git_patch is not empty - otherwise we fall back to a previous attempt (an empty patch is worse than anything else)
                    if (
                        instance['instance_id'] not in added_instance_ids
                        and instance['test_result'].get('git_patch', '').strip()
                    ):
                        fout.write(line)
                        added_instance_ids.add(instance['instance_id'])
            logger.info(
                f'Aggregated instances from {cur_output_file}. Total instances added so far: {len(added_instance_ids)}'
            )
        fout.close()
        logger.info(
            f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
        )
    # Check if any instances reached maximum retries
    check_maximum_retries_exceeded(metadata.eval_output_dir)