mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
942 lines
38 KiB
Python
942 lines
38 KiB
Python
import asyncio
|
|
import copy
|
|
import json
|
|
import os
|
|
import tempfile
|
|
from typing import Any, Literal
|
|
|
|
import pandas as pd
|
|
import toml
|
|
from datasets import load_dataset
|
|
|
|
import openhands.agenthub
|
|
from evaluation.benchmarks.swe_bench.binary_patch_utils import (
|
|
remove_binary_diffs,
|
|
remove_binary_files_from_git,
|
|
)
|
|
from evaluation.benchmarks.swe_bench.resource.mapping import (
|
|
get_instance_resource_factor,
|
|
)
|
|
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
|
|
MAP_REPO_TO_INSTALL,
|
|
MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
|
|
MAP_VERSION_TO_INSTALL,
|
|
)
|
|
from evaluation.utils.shared import (
|
|
EvalException,
|
|
EvalMetadata,
|
|
EvalOutput,
|
|
assert_and_raise,
|
|
codeact_user_response,
|
|
get_default_sandbox_config_for_eval,
|
|
get_metrics,
|
|
is_fatal_evaluation_error,
|
|
make_metadata,
|
|
prepare_dataset,
|
|
reset_logger_for_multiprocessing,
|
|
run_evaluation,
|
|
update_llm_config_for_completions_logging,
|
|
)
|
|
from openhands.controller.state.state import State
|
|
from openhands.core.config import (
|
|
AgentConfig,
|
|
AppConfig,
|
|
get_llm_config_arg,
|
|
get_parser,
|
|
)
|
|
from openhands.core.logger import openhands_logger as logger
|
|
from openhands.core.main import create_runtime, run_controller
|
|
from openhands.critic import AgentFinishedCritic
|
|
from openhands.events.action import CmdRunAction, FileReadAction, MessageAction
|
|
from openhands.events.observation import (
|
|
CmdOutputObservation,
|
|
ErrorObservation,
|
|
FileReadObservation,
|
|
)
|
|
from openhands.events.serialization.event import event_from_dict, event_to_dict
|
|
from openhands.runtime.base import Runtime
|
|
from openhands.utils.async_utils import call_async_from_sync
|
|
from openhands.utils.shutdown_listener import sleep_if_should_continue
|
|
|
|
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
|
|
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
|
|
BenchMode = Literal['swe', 'swt', 'swt-ci']
|
|
|
|
|
|
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
|
'CodeActAgent': codeact_user_response,
|
|
}
|
|
|
|
|
|
def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
|
|
return f'{instance.repo}__{instance.version}'.replace('/', '__')
|
|
|
|
|
|
def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
|
|
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
|
|
mode = metadata.details['mode']
|
|
if mode.startswith('swt'):
|
|
test_instructions = (
|
|
f'The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n'
|
|
if mode.endswith('ci')
|
|
else ''
|
|
)
|
|
instruction = f"""\
|
|
<uploaded_files>
|
|
/workspace/{workspace_dir_name}
|
|
</uploaded_files>
|
|
I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:
|
|
|
|
<issue_description>
|
|
{instance.problem_statement}
|
|
</issue_description>
|
|
|
|
|
|
Can you help me implement the necessary changes to the repository to test whether the issue in <issue_description> was resolved?
|
|
I will take care of all changes to any of the non-test files. This means you DON'T have to modify the actual logic and ONLY have to update test logic and tests!
|
|
Your task is to make the minimal changes to tests files in the /workspace directory to reproduce the issue in the <issue_description>, i.e., such that the generated tests fail in the current state (where the issue is unresolved) and pass when the issue will be resolved.
|
|
Follow these steps to reproduce the issue:
|
|
1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.
|
|
2. Create a script `reproduction.py` to reproduce the error and execute it with `python reproduction.py` using the BashTool, to confirm the error
|
|
3. Edit the sourcecode of the repo to integrate your reproduction script into the test framework
|
|
4. Run the test framework and make sure your tests fail! Only submit FAILING tests! Never submit passing tests.
|
|
{test_instructions}Your thinking should be thorough and so it's fine if it's very long.
|
|
"""
|
|
else:
|
|
instruction = f"""
|
|
<uploaded_files>
|
|
/workspace/{workspace_dir_name}
|
|
</uploaded_files>
|
|
|
|
I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:
|
|
|
|
<issue_description>
|
|
{instance.problem_statement}
|
|
</issue_description>
|
|
|
|
Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?
|
|
I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
|
|
Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
|
|
Your task is to make the minimal changes to non-test files in the /workspace/{workspace_dir_name} directory to ensure the <issue_description> is satisfied.
|
|
|
|
Follow these phases to resolve the issue:
|
|
|
|
Phase 1. READING: read the problem and reword it in clearer terms
|
|
1.1 If there are code or config snippets. Express in words any best practices or conventions in them.
|
|
1.2 Hightlight message errors, method names, variables, file names, stack traces, and technical details.
|
|
1.3 Explain the problem in clear terms.
|
|
1.4 Enumerate the steps to reproduce the problem.
|
|
1.5 Hightlight any best practices to take into account when testing and fixing the issue
|
|
|
|
Phase 2. RUNNING: install and run the tests on the repository
|
|
2.1 Follow the readme
|
|
2.2 Install the environment and anything needed
|
|
2.2 Iterate and figure out how to run the tests
|
|
|
|
Phase 3. EXPLORATION: find the files that are related to the problem and possible solutions
|
|
3.1 Use `grep` to search for relevant methods, classes, keywords and error messages.
|
|
3.2 Identify all files related to the problem statement.
|
|
3.3 Propose the methods and files to fix the issue and explain why.
|
|
3.4 From the possible file locations, select the most likely location to fix the issue.
|
|
|
|
Phase 4. TEST CREATION: before implementing any fix, create a script to reproduce and verify the issue.
|
|
4.1 Look at existing test files in the repository to understand the test format/structure.
|
|
4.2 Create a minimal reproduction script that reproduces the located issue.
|
|
4.3 Run the reproduction script to confirm you are reproducing the issue.
|
|
4.4 Adjust the reproduction script as necessary.
|
|
|
|
Phase 5. FIX ANALYSIS: state clearly the problem and how to fix it
|
|
5.1 State clearly what the problem is.
|
|
5.2 State clearly where the problem is located.
|
|
5.3 State clearly how the test reproduces the issue.
|
|
5.4 State clearly the best practices to take into account in the fix.
|
|
5.5 State clearly how to fix the problem.
|
|
|
|
Phase 6. FIX IMPLEMENTATION: Edit the source code to implement your chosen solution.
|
|
6.1 Make minimal, focused changes to fix the issue.
|
|
|
|
Phase 7. VERIFICATION: Test your implementation thoroughly.
|
|
7.1 Run your reproduction script to verify the fix works.
|
|
7.2 Add edge cases to your test script to ensure comprehensive coverage.
|
|
7.3 Run existing tests related to the modified code to ensure you haven't broken anything.
|
|
|
|
8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance['base_commit']}.
|
|
8.1 Ensure you've fully addressed all requirements.
|
|
8.2 Run any tests in the repository related to:
|
|
8.2.1 The issue you are fixing
|
|
8.2.2 The files you modified
|
|
8.2.3 The functions you changed
|
|
8.3 If any tests fail, revise your implementation until all tests pass
|
|
|
|
Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.
|
|
"""
|
|
|
|
if RUN_WITH_BROWSING:
|
|
instruction += (
|
|
'<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
|
|
)
|
|
|
|
if 'image_assets' in instance:
|
|
assets = json.loads(instance['image_assets'])
|
|
assert 'problem_statement' in assets, (
|
|
'problem_statement is required in image_assets'
|
|
)
|
|
image_urls = assets['problem_statement']
|
|
return MessageAction(content=instruction, image_urls=image_urls)
|
|
return MessageAction(content=instruction)
|
|
|
|
|
|
# TODO: migrate all swe-bench docker to ghcr.io/openhands
|
|
DEFAULT_DOCKER_IMAGE_PREFIX = os.environ.get(
|
|
'EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/'
|
|
)
|
|
logger.info(f'Default docker image prefix: {DEFAULT_DOCKER_IMAGE_PREFIX}')
|
|
|
|
|
|
def get_instance_docker_image(
|
|
instance_id: str,
|
|
swebench_official_image: bool = False,
|
|
) -> str:
|
|
if swebench_official_image:
|
|
# Official SWE-Bench image
|
|
# swebench/sweb.eval.x86_64.django_1776_django-11333:v1
|
|
docker_image_prefix = 'docker.io/swebench/'
|
|
repo, name = instance_id.split('__')
|
|
image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
|
|
logger.debug(f'Using official SWE-Bench image: {image_name}')
|
|
return image_name
|
|
else:
|
|
# OpenHands version of the image
|
|
docker_image_prefix = DEFAULT_DOCKER_IMAGE_PREFIX
|
|
image_name = 'sweb.eval.x86_64.' + instance_id
|
|
image_name = image_name.replace(
|
|
'__', '_s_'
|
|
) # to comply with docker image naming convention
|
|
return (docker_image_prefix.rstrip('/') + '/' + image_name).lower()
|
|
|
|
|
|
def get_config(
|
|
instance: pd.Series,
|
|
metadata: EvalMetadata,
|
|
) -> AppConfig:
|
|
# We use a different instance image for the each instance of swe-bench eval
|
|
use_swebench_official_image = 'swe-gym' not in metadata.dataset.lower()
|
|
base_container_image = get_instance_docker_image(
|
|
instance['instance_id'],
|
|
swebench_official_image=use_swebench_official_image,
|
|
)
|
|
logger.info(
|
|
f'Using instance container image: {base_container_image}. '
|
|
f'Please make sure this image exists. '
|
|
f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
|
|
)
|
|
|
|
sandbox_config = get_default_sandbox_config_for_eval()
|
|
sandbox_config.base_container_image = base_container_image
|
|
sandbox_config.enable_auto_lint = True
|
|
sandbox_config.use_host_network = False
|
|
# Add platform to the sandbox config to solve issue 4401
|
|
sandbox_config.platform = 'linux/amd64'
|
|
sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
|
|
dataset_name=metadata.dataset,
|
|
instance_id=instance['instance_id'],
|
|
)
|
|
|
|
config = AppConfig(
|
|
default_agent=metadata.agent_class,
|
|
run_as_openhands=False,
|
|
max_iterations=metadata.max_iterations,
|
|
runtime=os.environ.get('RUNTIME', 'docker'),
|
|
sandbox=sandbox_config,
|
|
# do not mount workspace
|
|
workspace_base=None,
|
|
workspace_mount_path=None,
|
|
)
|
|
config.set_llm_config(
|
|
update_llm_config_for_completions_logging(
|
|
metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
|
|
)
|
|
)
|
|
agent_config = AgentConfig(
|
|
enable_jupyter=False,
|
|
enable_browsing=RUN_WITH_BROWSING,
|
|
enable_llm_editor=False,
|
|
condenser=metadata.condenser_config,
|
|
enable_prompt_extensions=False,
|
|
)
|
|
config.set_agent_config(agent_config)
|
|
return config
|
|
|
|
|
|
def initialize_runtime(
|
|
runtime: Runtime,
|
|
instance: pd.Series, # this argument is not required
|
|
metadata: EvalMetadata,
|
|
):
|
|
"""Initialize the runtime for the agent.
|
|
|
|
This function is called before the runtime is used to run the agent.
|
|
"""
|
|
logger.info('-' * 30)
|
|
logger.info('BEGIN Runtime Initialization Fn')
|
|
logger.info('-' * 30)
|
|
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
|
|
obs: CmdOutputObservation
|
|
|
|
# Set instance id and git configuration
|
|
action = CmdRunAction(
|
|
command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc && git config --global core.pager "" && git config --global diff.binary false"""
|
|
)
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(
|
|
obs.exit_code == 0,
|
|
f'Failed to export SWE_INSTANCE_ID and configure git: {str(obs)}',
|
|
)
|
|
|
|
action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
|
|
|
|
# inject the init script
|
|
script_dir = os.path.dirname(__file__)
|
|
|
|
# inject the instance info
|
|
action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(
|
|
obs.exit_code == 0,
|
|
f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
|
|
)
|
|
|
|
swe_instance_json_name = 'swe-bench-instance.json'
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
# Construct the full path for the desired file name within the temporary directory
|
|
temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
|
|
# Write to the file with the desired name within the temporary directory
|
|
with open(temp_file_path, 'w') as f:
|
|
if not isinstance(instance, dict):
|
|
json.dump([instance.to_dict()], f)
|
|
else:
|
|
json.dump([instance], f)
|
|
|
|
# Copy the file to the desired location
|
|
runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
|
|
|
|
# inject the instance swe entry
|
|
runtime.copy_to(
|
|
str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
|
|
'/swe_util/',
|
|
)
|
|
|
|
action = CmdRunAction(command='cat ~/.bashrc')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')
|
|
|
|
action = CmdRunAction(command='source ~/.bashrc')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
if isinstance(obs, ErrorObservation):
|
|
logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
|
|
assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')
|
|
|
|
action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(
|
|
obs.exit_code == 0,
|
|
f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
|
|
)
|
|
|
|
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(
|
|
obs.exit_code == 0,
|
|
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
|
|
)
|
|
|
|
action = CmdRunAction(command='git reset --hard')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')
|
|
|
|
action = CmdRunAction(
|
|
command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
|
|
)
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
|
|
|
|
if metadata.details['mode'] == 'swt-ci':
|
|
# set up repo
|
|
setup_commands = []
|
|
if instance['repo'] in MAP_REPO_TO_INSTALL:
|
|
setup_commands.append(MAP_REPO_TO_INSTALL[instance['repo']])
|
|
|
|
# Run pre-install set up if provided
|
|
install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(
|
|
instance['version'], []
|
|
)
|
|
if 'pre_install' in install:
|
|
for pre_install in install['pre_install']:
|
|
setup_commands.append(pre_install)
|
|
|
|
if 'install' in install:
|
|
setup_commands.append(install['install'])
|
|
|
|
for command in setup_commands:
|
|
action = CmdRunAction(command=command)
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
|
|
if 'multimodal' not in metadata.dataset.lower():
|
|
# Only for non-multimodal datasets, we need to activate the testbed environment for Python
|
|
# SWE-Bench multimodal datasets are not using the testbed environment
|
|
action = CmdRunAction(command='which python')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(
|
|
obs.exit_code == 0 and 'testbed' in obs.content,
|
|
f'Expected to find python interpreter from testbed, but got: {str(obs)}',
|
|
)
|
|
|
|
logger.info('-' * 30)
|
|
logger.info('END Runtime Initialization Fn')
|
|
logger.info('-' * 30)
|
|
|
|
|
|
def complete_runtime(
|
|
runtime: Runtime,
|
|
instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name
|
|
) -> dict[str, Any]:
|
|
"""Complete the runtime for the agent.
|
|
|
|
This function is called before the runtime is used to run the agent.
|
|
If you need to do something in the sandbox to get the correctness metric after
|
|
the agent has run, modify this function.
|
|
"""
|
|
logger.info('-' * 30)
|
|
logger.info('BEGIN Runtime Completion Fn')
|
|
logger.info('-' * 30)
|
|
obs: CmdOutputObservation
|
|
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
|
|
|
|
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
|
|
if obs.exit_code == -1:
|
|
# The previous command is still running
|
|
# We need to kill previous command
|
|
logger.info('The previous command is still running, trying to kill it...')
|
|
action = CmdRunAction(command='C-c')
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
|
|
# Then run the command again
|
|
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
|
|
if obs.exit_code == -1:
|
|
# The previous command is still running
|
|
# We need to kill previous command
|
|
logger.info('The previous command is still running, trying to ctrl+z it...')
|
|
action = CmdRunAction(command='C-z')
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
|
|
# Then run the command again
|
|
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
|
|
assert_and_raise(
|
|
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
|
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
|
|
)
|
|
|
|
action = CmdRunAction(command='git config --global core.pager ""')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(
|
|
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
|
f'Failed to git config --global core.pager "": {str(obs)}',
|
|
)
|
|
|
|
# First check for any git repositories in subdirectories
|
|
action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(
|
|
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
|
f'Failed to find git repositories: {str(obs)}',
|
|
)
|
|
|
|
git_dirs = [p for p in obs.content.strip().split('\n') if p]
|
|
if git_dirs:
|
|
# Remove all .git directories in subdirectories
|
|
for git_dir in git_dirs:
|
|
action = CmdRunAction(command=f'rm -rf "{git_dir}"')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(
|
|
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
|
f'Failed to remove git directory {git_dir}: {str(obs)}',
|
|
)
|
|
|
|
# add all files
|
|
action = CmdRunAction(command='git add -A')
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(
|
|
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
|
f'Failed to git add -A: {str(obs)}',
|
|
)
|
|
|
|
# Remove binary files from git staging
|
|
action = CmdRunAction(command=remove_binary_files_from_git())
|
|
action.set_hard_timeout(600)
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
assert_and_raise(
|
|
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
|
|
f'Failed to remove binary files: {str(obs)}',
|
|
)
|
|
|
|
n_retries = 0
|
|
git_patch = None
|
|
while n_retries < 5:
|
|
action = CmdRunAction(
|
|
command=f'git diff --no-color --cached {instance["base_commit"]} > patch.diff'
|
|
)
|
|
action.set_hard_timeout(max(300 + 100 * n_retries, 600))
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
n_retries += 1
|
|
if isinstance(obs, CmdOutputObservation):
|
|
if obs.exit_code == 0:
|
|
# Read the patch file
|
|
action = FileReadAction(path='patch.diff')
|
|
action.set_hard_timeout(max(300 + 100 * n_retries, 600))
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
if isinstance(obs, FileReadObservation):
|
|
git_patch = obs.content
|
|
break
|
|
elif isinstance(obs, ErrorObservation):
|
|
# Fall back to cat "patch.diff" to get the patch
|
|
assert 'File could not be decoded as utf-8' in obs.content
|
|
action = CmdRunAction(command='cat patch.diff')
|
|
action.set_hard_timeout(max(300 + 100 * n_retries, 600))
|
|
logger.info(action, extra={'msg_type': 'ACTION'})
|
|
obs = runtime.run_action(action)
|
|
assert isinstance(obs, CmdOutputObservation) and obs.exit_code == 0
|
|
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
|
|
git_patch = obs.content
|
|
break
|
|
else:
|
|
assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
|
|
else:
|
|
logger.info('Failed to get git diff, retrying...')
|
|
sleep_if_should_continue(10)
|
|
elif isinstance(obs, ErrorObservation):
|
|
logger.error(f'Error occurred: {obs.content}. Retrying...')
|
|
sleep_if_should_continue(10)
|
|
else:
|
|
assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
|
|
|
|
assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
|
|
|
|
# Remove binary diffs from the patch
|
|
git_patch = remove_binary_diffs(git_patch)
|
|
|
|
logger.info('-' * 30)
|
|
logger.info('END Runtime Completion Fn')
|
|
logger.info('-' * 30)
|
|
return {'git_patch': git_patch}
|
|
|
|
|
|
def process_instance(
|
|
instance: pd.Series,
|
|
metadata: EvalMetadata,
|
|
reset_logger: bool = True,
|
|
runtime_failure_count: int = 0,
|
|
) -> EvalOutput:
|
|
config = get_config(instance, metadata)
|
|
|
|
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
|
if reset_logger:
|
|
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
|
|
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
|
|
else:
|
|
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
|
|
|
|
# Increase resource_factor with increasing attempt_id
|
|
if runtime_failure_count > 0:
|
|
config.sandbox.remote_runtime_resource_factor = min(
|
|
config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
|
|
8,
|
|
)
|
|
logger.warning(
|
|
f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
|
|
)
|
|
|
|
metadata = copy.deepcopy(metadata)
|
|
metadata.details['runtime_failure_count'] = runtime_failure_count
|
|
metadata.details['remote_runtime_resource_factor'] = (
|
|
config.sandbox.remote_runtime_resource_factor
|
|
)
|
|
|
|
runtime = create_runtime(config)
|
|
call_async_from_sync(runtime.connect)
|
|
|
|
try:
|
|
initialize_runtime(runtime, instance, metadata)
|
|
|
|
message_action = get_instruction(instance, metadata)
|
|
|
|
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
|
state: State | None = asyncio.run(
|
|
run_controller(
|
|
config=config,
|
|
initial_user_action=message_action,
|
|
runtime=runtime,
|
|
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
|
metadata.agent_class
|
|
],
|
|
)
|
|
)
|
|
|
|
# if fatal error, throw EvalError to trigger re-run
|
|
if is_fatal_evaluation_error(state.last_error):
|
|
raise EvalException('Fatal error detected: ' + state.last_error)
|
|
|
|
# ======= THIS IS SWE-Bench specific =======
|
|
# Get git patch
|
|
return_val = complete_runtime(runtime, instance)
|
|
git_patch = return_val['git_patch']
|
|
logger.info(
|
|
f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
|
|
)
|
|
finally:
|
|
runtime.close()
|
|
# ==========================================
|
|
|
|
# ======= Attempt to evaluate the agent's edits =======
|
|
# we use eval_infer.sh to evaluate the agent's edits, not here
|
|
# because the agent may alter the environment / testcases
|
|
test_result = {
|
|
'git_patch': git_patch,
|
|
}
|
|
|
|
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
|
|
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
|
|
if state is None:
|
|
raise ValueError('State should not be None.')
|
|
|
|
# NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
|
|
histories = [event_to_dict(event) for event in state.history]
|
|
metrics = get_metrics(state)
|
|
|
|
# Save the output
|
|
instruction = message_action.content
|
|
if message_action.image_urls:
|
|
instruction += (
|
|
'\n\n<image_urls>' + '\n'.join(message_action.image_urls) + '</image_urls>'
|
|
)
|
|
output = EvalOutput(
|
|
instance_id=instance.instance_id,
|
|
instruction=instruction,
|
|
instance=instance.to_dict(), # SWE Bench specific
|
|
test_result=test_result,
|
|
metadata=metadata,
|
|
history=histories,
|
|
metrics=metrics,
|
|
error=state.last_error if state and state.last_error else None,
|
|
)
|
|
return output
|
|
|
|
|
|
def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
|
|
file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
|
|
if os.path.exists(file_path):
|
|
with open(file_path, 'r') as file:
|
|
data = toml.load(file)
|
|
if 'selected_ids' in data:
|
|
selected_ids = data['selected_ids']
|
|
logger.info(
|
|
f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
|
|
)
|
|
subset = dataset[dataset[filter_column].isin(selected_ids)]
|
|
logger.info(f'Retained {subset.shape[0]} tasks after filtering')
|
|
return subset
|
|
skip_ids = os.environ.get('SKIP_IDS', '').split(',')
|
|
if len(skip_ids) > 0:
|
|
logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
|
|
return dataset[~dataset[filter_column].isin(skip_ids)]
|
|
return dataset
|
|
|
|
|
|
if __name__ == '__main__':
|
|
parser = get_parser()
|
|
parser.add_argument(
|
|
'--dataset',
|
|
type=str,
|
|
default='princeton-nlp/SWE-bench',
|
|
help='data set to evaluate on, either full-test or lite-test',
|
|
)
|
|
parser.add_argument(
|
|
'--split',
|
|
type=str,
|
|
default='test',
|
|
help='split to evaluate on',
|
|
)
|
|
parser.add_argument(
|
|
'--mode',
|
|
type=str,
|
|
default='swe',
|
|
choices=['swe', 'swt', 'swt-ci'],
|
|
help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
|
|
)
|
|
args, _ = parser.parse_known_args()
|
|
|
|
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
|
# so we don't need to manage file uploading to OpenHands's repo
|
|
dataset = load_dataset(args.dataset, split=args.split)
|
|
swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
|
|
logger.info(
|
|
f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
|
|
)
|
|
if 'SWE-Gym' in args.dataset:
|
|
with open(
|
|
os.path.join(
|
|
os.path.dirname(os.path.abspath(__file__)),
|
|
'split',
|
|
'swegym_verified_instances.json',
|
|
),
|
|
'r',
|
|
) as f:
|
|
swegym_verified_instances = json.load(f)
|
|
swe_bench_tests = swe_bench_tests[
|
|
swe_bench_tests['instance_id'].isin(swegym_verified_instances)
|
|
]
|
|
logger.info(
|
|
f'{len(swe_bench_tests)} tasks left after filtering for SWE-Gym verified instances'
|
|
)
|
|
|
|
llm_config = None
|
|
if args.llm_config:
|
|
llm_config = get_llm_config_arg(args.llm_config)
|
|
llm_config.log_completions = True
|
|
# modify_params must be False for evaluation purpose, for reproducibility and accurancy of results
|
|
llm_config.modify_params = False
|
|
|
|
if llm_config is None:
|
|
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
|
|
|
|
details = {'mode': args.mode}
|
|
_agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
|
|
|
|
dataset_descrption = (
|
|
args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
|
|
)
|
|
metadata = make_metadata(
|
|
llm_config,
|
|
dataset_descrption,
|
|
args.agent_cls,
|
|
args.max_iterations,
|
|
args.eval_note,
|
|
args.eval_output_dir,
|
|
details=details,
|
|
)
|
|
|
|
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
|
print(f'### OUTPUT FILE: {output_file} ###')
|
|
|
|
# Run evaluation in iterative mode:
|
|
# If a rollout fails to output AgentFinishAction, we will try again until it succeeds OR total 3 attempts have been made.
|
|
ITERATIVE_EVAL_MODE = (
|
|
os.environ.get('ITERATIVE_EVAL_MODE', 'false').lower() == 'true'
|
|
)
|
|
ITERATIVE_EVAL_MODE_MAX_ATTEMPTS = int(
|
|
os.environ.get('ITERATIVE_EVAL_MODE_MAX_ATTEMPTS', '3')
|
|
)
|
|
|
|
if not ITERATIVE_EVAL_MODE:
|
|
# load the dataset
|
|
instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)
|
|
if len(instances) > 0 and not isinstance(
|
|
instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
|
|
):
|
|
for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
|
|
instances[col] = instances[col].apply(lambda x: str(x))
|
|
|
|
run_evaluation(
|
|
instances,
|
|
metadata,
|
|
output_file,
|
|
args.eval_num_workers,
|
|
process_instance,
|
|
timeout_seconds=8
|
|
* 60
|
|
* 60, # 8 hour PER instance should be more than enough
|
|
max_retries=5,
|
|
)
|
|
else:
|
|
critic = AgentFinishedCritic()
|
|
|
|
def get_cur_output_file_path(attempt: int) -> str:
|
|
return (
|
|
f'{output_file.removesuffix(".jsonl")}.critic_attempt_{attempt}.jsonl'
|
|
)
|
|
|
|
eval_ids = None
|
|
for attempt in range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1):
|
|
cur_output_file = get_cur_output_file_path(attempt)
|
|
logger.info(
|
|
f'Running evaluation with critic {critic.__class__.__name__} for attempt {attempt} of {ITERATIVE_EVAL_MODE_MAX_ATTEMPTS}.'
|
|
)
|
|
|
|
# For deterministic eval, we set temperature to 0.1 for (>1) attempt
|
|
# so hopefully we get slightly different results
|
|
if attempt > 1 and metadata.llm_config.temperature == 0:
|
|
logger.info(
|
|
f'Detected temperature is 0 for (>1) attempt {attempt}. Setting temperature to 0.1...'
|
|
)
|
|
metadata.llm_config.temperature = 0.1
|
|
|
|
# Load instances - at first attempt, we evaluate all instances
|
|
# On subsequent attempts, we only evaluate the instances that failed the previous attempt determined by critic
|
|
instances = prepare_dataset(
|
|
swe_bench_tests, cur_output_file, args.eval_n_limit, eval_ids=eval_ids
|
|
)
|
|
if len(instances) > 0 and not isinstance(
|
|
instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
|
|
):
|
|
for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
|
|
instances[col] = instances[col].apply(lambda x: str(x))
|
|
|
|
# Run evaluation - but save them to cur_output_file
|
|
logger.info(
|
|
f'Evaluating {len(instances)} instances for attempt {attempt}...'
|
|
)
|
|
run_evaluation(
|
|
instances,
|
|
metadata,
|
|
cur_output_file,
|
|
args.eval_num_workers,
|
|
process_instance,
|
|
timeout_seconds=8
|
|
* 60
|
|
* 60, # 8 hour PER instance should be more than enough
|
|
max_retries=5,
|
|
)
|
|
|
|
# When eval is done, we update eval_ids to the instances that failed the current attempt
|
|
instances_failed = []
|
|
logger.info(
|
|
f'Use critic {critic.__class__.__name__} to check {len(instances)} instances for attempt {attempt}...'
|
|
)
|
|
with open(cur_output_file, 'r') as f:
|
|
for line in f:
|
|
instance = json.loads(line)
|
|
try:
|
|
history = [
|
|
event_from_dict(event) for event in instance['history']
|
|
]
|
|
critic_result = critic.evaluate(
|
|
history, instance['test_result'].get('git_patch', '')
|
|
)
|
|
if not critic_result.success:
|
|
instances_failed.append(instance['instance_id'])
|
|
except Exception as e:
|
|
logger.error(
|
|
f'Error loading history for instance {instance["instance_id"]}: {e}'
|
|
)
|
|
instances_failed.append(instance['instance_id'])
|
|
logger.info(
|
|
f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
|
|
)
|
|
eval_ids = instances_failed
|
|
|
|
# If no instances failed, we break
|
|
if len(instances_failed) == 0:
|
|
break
|
|
|
|
# Then we should aggregate the results from all attempts into the original output file
|
|
# and remove the intermediate files
|
|
logger.info(
|
|
'Aggregating results from all attempts into the original output file...'
|
|
)
|
|
fout = open(output_file, 'w')
|
|
added_instance_ids = set()
|
|
for attempt in reversed(range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1)):
|
|
cur_output_file = get_cur_output_file_path(attempt)
|
|
if not os.path.exists(cur_output_file):
|
|
logger.warning(
|
|
f'Intermediate output file {cur_output_file} does not exist. Skipping...'
|
|
)
|
|
continue
|
|
|
|
with open(cur_output_file, 'r') as f:
|
|
for line in f:
|
|
instance = json.loads(line)
|
|
# Also make sure git_patch is not empty - otherwise we fall back to previous attempt (empty patch is worse than anything else)
|
|
if (
|
|
instance['instance_id'] not in added_instance_ids
|
|
and instance['test_result'].get('git_patch', '').strip()
|
|
):
|
|
fout.write(line)
|
|
added_instance_ids.add(instance['instance_id'])
|
|
logger.info(
|
|
f'Aggregated instances from {cur_output_file}. Total instances added so far: {len(added_instance_ids)}'
|
|
)
|
|
fout.close()
|
|
logger.info(
|
|
f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
|
|
)
|