import asyncio
import json
import os
import tempfile
from typing import Any

import pandas as pd
import toml
from datasets import load_dataset

import openhands.agenthub
from evaluation.benchmarks.swe_bench.resource.mapping import (
    get_instance_resource_factor,
)
from evaluation.utils.shared import (
    EvalException,
    EvalMetadata,
    EvalOutput,
    assert_and_raise,
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
    get_openhands_config_for_eval,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
    get_evaluation_parser,
    get_llm_config_arg,
)
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation, ErrorObservation
from openhands.events.serialization.event import event_to_dict
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
from openhands.utils.shutdown_listener import sleep_if_should_continue

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
INDEX_BASE_DIR = os.environ.get('INDEX_BASE_DIR', '')

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    'LocAgent': codeact_user_response,
}

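# Workspace layout note: each instance's repo is checked out under
# /workspace/<repo>__<version>; e.g. (illustrative values) repo='django/django',
# version='3.0' maps to 'django__django__3.0'.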
def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
    return f'{instance.repo}__{instance.version}'.replace('/', '__')


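# The LocAgent task is fault localization only: the prompt below asks the agent
# to identify the files/classes/functions/lines relevant to the issue, not to
# produce a fix.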
def get_instruction(instance: pd.Series, metadata: EvalMetadata):
    instruction = f"""
Consider the following issue description:

<issue_description>
{instance.problem_statement}
</issue_description>

Your objective is to localize the specific files, classes or functions, and lines of code that need modification or contain key information to resolve the issue.

Follow these steps to localize the issue:
## Step 1: Categorize and Extract Key Problem Information
- Classify the problem statement into the following categories:
  problem description, error trace, code to reproduce the bug, and additional context.
- Identify modules in the "{instance.instance_id.split('_')[0]}" package mentioned in each category.
- Use extracted keywords and line numbers to search for relevant code references for additional context.

## Step 2: Locate Referenced Modules
- Accurately determine specific modules
  - Explore the repo to familiarize yourself with its structure.
  - Analyze the described execution flow to identify specific modules or components being referenced.
  - Pay special attention to distinguishing between modules with similar names, using context and the described execution flow.
- Output format for collected relevant modules:
  - Use the format: 'file_path:QualifiedName'
  - E.g., for a function `calculate_sum` in the `MathUtils` class located in `src/helpers/math_helpers.py`, represent it as: 'src/helpers/math_helpers.py:MathUtils.calculate_sum'.

## Step 3: Analyze and Reproduce the Problem
- Clarify the purpose of the issue
  - If expanding capabilities: identify where and how to incorporate new behavior, fields, or modules.
  - If addressing unexpected behavior: focus on localizing modules containing potential bugs.
- Reconstruct the execution flow
  - Identify main entry points triggering the issue.
  - Trace function calls, class interactions, and sequences of events.
  - Identify potential breakpoints causing the issue.
  Important: Keep the reconstructed flow focused on the problem, avoiding irrelevant details.

## Step 4: Locate Areas for Modification
- Locate specific files, functions, or lines of code requiring changes or containing critical information for resolving the issue.
- Consider upstream and downstream dependencies that may affect or be affected by the issue.
- If applicable, identify where to introduce new fields, functions, or variables.
- Think thoroughly: list multiple potential solutions and consider edge cases that could impact the resolution.

## Output Format for Final Results:
Your final output should list the locations requiring modification, wrapped with triple backticks ```
Each location should include the file path, class name (if applicable), function name, or line numbers, ordered by importance.
Your answer should ideally include about 5 files.

### Examples:
```
full_path1/file1.py
line: 10
class: MyClass1
function: my_function1

full_path2/file2.py
line: 76
function: MyClass2.my_function2

full_path3/file3.py
line: 24
line: 156
function: my_function3
```

Return just the location(s).

Note: Your thinking should be thorough, so it's fine if it's very long.
"""
    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
        "Don't include any lambda functions!\n"
        'You should NOT modify any files!\n'
    )
    if RUN_WITH_BROWSING:
        instruction += """
<IMPORTANT!>
You SHOULD NEVER attempt to browse the web.
</IMPORTANT!>
"""
    return instruction


# TODO: migrate all swe-bench docker to ghcr.io/openhands
DEFAULT_DOCKER_IMAGE_PREFIX = os.environ.get(
    'EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/'
)
logger.info(f'Default docker image prefix: {DEFAULT_DOCKER_IMAGE_PREFIX}')


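# Image naming, e.g. (illustrative) for instance_id 'django__django-11333':
#   official:  docker.io/swebench/sweb.eval.x86_64.django_1776_django-11333:latest
#   openhands: docker.io/xingyaoww/sweb.eval.x86_64.django_s_django-11333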
def get_instance_docker_image(instance_id: str, official_image: bool = False) -> str:
    if official_image:
        # Official SWE-Bench image
        # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
        docker_image_prefix = 'docker.io/swebench/'
        repo, name = instance_id.split('__')
        image_name = f'sweb.eval.x86_64.{repo}_1776_{name}:latest'
        logger.warning(f'Using official SWE-Bench image: {image_name}')
    else:
        # OpenHands version of the image
        docker_image_prefix = DEFAULT_DOCKER_IMAGE_PREFIX
        image_name = 'sweb.eval.x86_64.' + instance_id
        image_name = image_name.replace(
            '__', '_s_'
        )  # to comply with docker image naming convention
    return (docker_image_prefix.rstrip('/') + '/' + image_name).lower()


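# Per-instance config: every SWE-Bench instance runs in its own container image,
# with auto-lint enabled, host networking disabled, and the LocAgent runtime
# dependency (openhands-aci[llama]) installed at sandbox startup.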
def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
) -> OpenHandsConfig:
    # We use a different instance image for each instance of the SWE-Bench eval
    use_official_image = bool(
        'verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower()
    )
    base_container_image = get_instance_docker_image(
        instance['instance_id'], use_official_image
    )
    logger.info(
        f'Using instance container image: {base_container_image}. '
        f'Please make sure this image exists. '
        f'Submit an issue on https://github.com/OpenHands/OpenHands if you run into any issues.'
    )

    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image
    sandbox_config.enable_auto_lint = True
    sandbox_config.use_host_network = False
    # Add platform to the sandbox config to solve issue 4401
    sandbox_config.platform = 'linux/amd64'
    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
        dataset_name=metadata.dataset,
        instance_id=instance['instance_id'],
    )
    oh_aci_li_cmd = '/openhands/micromamba/bin/micromamba run -n openhands poetry run pip install openhands-aci[llama]'
    sandbox_config.runtime_extra_deps = oh_aci_li_cmd
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
    sandbox_config.runtime_startup_env_vars = {
        'REPO_PATH': f'/workspace/{workspace_dir_name}/',
    }

    config = get_openhands_config_for_eval(
        metadata=metadata,
        enable_browser=RUN_WITH_BROWSING,
        runtime=os.environ.get('RUNTIME', 'docker'),
        sandbox_config=sandbox_config,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
        )
    )
    agent_config = AgentConfig(
        enable_jupyter=False,
        enable_browsing=RUN_WITH_BROWSING,
        enable_llm_editor=False,
        # os.environ.get returns a string, so parse it instead of passing the
        # raw (always truthy) string where a bool is expected
        enable_mcp=os.environ.get('ENABLE_MCP', 'false').lower() == 'true',
        condenser=metadata.condenser_config,
        enable_prompt_extensions=False,
    )
    config.set_agent_config(agent_config)
    return config


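# Runs once per instance before the agent starts: exports SWE_INSTANCE_ID,
# ships the instance metadata and setup script into the sandbox, activates the
# testbed environment, and resets the repo to a clean state.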
def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Initialization Fn')
    logger.info('-' * 30)
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
    obs: CmdOutputObservation

    # Set instance id
    action = CmdRunAction(
        command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc"""
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0, f'Failed to export SWE_INSTANCE_ID: {str(obs)}'
    )

    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')

    # inject the init script
    script_dir = os.path.dirname(__file__)

    # inject the instance info
    action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
    )

    swe_instance_json_name = 'swe-bench-instance.json'
    with tempfile.TemporaryDirectory() as temp_dir:
        # Construct the full path for the desired file name within the temporary directory
        temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
        # Write to the file with the desired name within the temporary directory
        with open(temp_file_path, 'w') as f:
            if not isinstance(instance, dict):
                json.dump([instance.to_dict()], f)
            else:
                json.dump([instance], f)

        # Copy the file to the desired location
        runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')

    # inject the instance swe entry
    runtime.copy_to(
        str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
        '/swe_util/',
    )

    action = CmdRunAction(command='cat ~/.bashrc')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command='source ~/.bashrc')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    if isinstance(obs, ErrorObservation):
        logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
    assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
    )

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git reset --hard')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')

    action = CmdRunAction(
        command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

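    # Assumed index layout under INDEX_BASE_DIR (derived from the paths below):
    #   $INDEX_BASE_DIR/graph_index_v2.3/<instance_id>.pkl
    #   $INDEX_BASE_DIR/BM25_index/<instance_id>/...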
    # Copy the processed indexes if available
    action = CmdRunAction(command='mkdir -p _index_data/graph_index_v2.3')
    obs = runtime.run_action(action)

    # Check if an existing graph index file is available
    graph_index_file_path = os.path.join(
        INDEX_BASE_DIR, 'graph_index_v2.3', f'{instance["instance_id"]}.pkl'
    )
    if INDEX_BASE_DIR and os.path.exists(graph_index_file_path):
        logger.info(
            f'Copying graph index from {graph_index_file_path} to /workspace/{workspace_dir_name}/_index_data/graph_index_v2.3'
        )

        runtime.copy_to(
            graph_index_file_path,
            f'/workspace/{workspace_dir_name}/_index_data/graph_index_v2.3',
        )
        action = CmdRunAction(
            command=f'mv _index_data/graph_index_v2.3/{instance["instance_id"]}.pkl _index_data/graph_index_v2.3/code_graph.pkl'
        )
        obs = runtime.run_action(action)

        bm25_index_dir = os.path.join(
            INDEX_BASE_DIR, 'BM25_index', instance['instance_id']
        )
        runtime.copy_to(
            bm25_index_dir,
            f'/workspace/{workspace_dir_name}/_index_data',
            recursive=True,
        )
        action = CmdRunAction(
            command=f'mv _index_data/{instance["instance_id"]} _index_data/bm25_index'
        )
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert_and_raise(obs.exit_code == 0, f'Failed to mv file: {str(obs)}')

    action = CmdRunAction(command='which python')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0 and 'testbed' in obs.content,
        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
    )

    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
    logger.info('-' * 30)


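# After the agent finishes: stage every change (git add -A) and diff the staged
# tree against the instance's base commit to recover the agent's patch.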
def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
    """Complete the runtime for the agent.

    This function is called after the agent has finished running.
    If you need to do something in the sandbox to get the correctness metric after
    the agent has run, modify this function.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Completion Fn')
    logger.info('-' * 30)
    obs: CmdOutputObservation
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if obs.exit_code == -1:
        # The previous command is still running
        # We need to kill the previous command
        logger.info('The previous command is still running, trying to kill it...')
        action = CmdRunAction(command='C-c')
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        # Then run the command again
        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git config --global core.pager ""')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git config --global core.pager "": {str(obs)}',
    )

    # First check for any git repositories in subdirectories
    action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to find git repositories: {str(obs)}',
    )

    git_dirs = [p for p in obs.content.strip().split('\n') if p]
    if git_dirs:
        # Remove all .git directories in subdirectories
        for git_dir in git_dirs:
            action = CmdRunAction(command=f'rm -rf "{git_dir}"')
            action.set_hard_timeout(600)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
            assert_and_raise(
                isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
                f'Failed to remove git directory {git_dir}: {str(obs)}',
            )

    # add all files
    action = CmdRunAction(command='git add -A')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git add -A: {str(obs)}',
    )

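    # 'git diff --cached <base_commit>' compares the staged tree (which now
    # includes untracked files added by the agent) against the base commit;
    # the retry loop guards against transient runtime failures on large diffs.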
    n_retries = 0
    git_patch = None
    while n_retries < 5:
        action = CmdRunAction(
            command=f'git diff --no-color --cached {instance["base_commit"]}'
        )
        action.set_hard_timeout(max(300 + 100 * n_retries, 600))
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        n_retries += 1
        if isinstance(obs, CmdOutputObservation):
            if obs.exit_code == 0:
                git_patch = obs.content.strip()
                break
            else:
                logger.info('Failed to get git diff, retrying...')
                sleep_if_should_continue(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
            sleep_if_should_continue(10)
        else:
            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')

    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')

    logger.info('-' * 30)
    logger.info('END Runtime Completion Fn')
    logger.info('-' * 30)
    return {'git_patch': git_patch}


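# Per-instance driver: builds the config, spins up the runtime, runs the agent,
# extracts the git patch, and packages everything into an EvalOutput row. On
# repeated runtime failures the sandbox resource factor is doubled, capped at 8.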
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    runtime_failure_count: int = 0,
) -> EvalOutput:
    config = get_config(instance, metadata)

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # Increase resource_factor with increasing attempt_id
    if runtime_failure_count > 0:
        config.sandbox.remote_runtime_resource_factor = min(
            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
            8,
        )
        logger.warning(
            f'Attempt {runtime_failure_count + 1} for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
        )
    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    try:
        initialize_runtime(runtime, instance)

        instruction = get_instruction(instance, metadata)

        # Here's how you can run the agent (similar to the `main` function) and get the final task state
        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=MessageAction(content=instruction),
                runtime=runtime,
                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                    metadata.agent_class
                ],
            )
        )

        # if a fatal error occurred, throw EvalException to trigger a re-run
        if is_fatal_evaluation_error(state.last_error):
            raise EvalException('Fatal error detected: ' + state.last_error)

        # ======= THIS IS SWE-Bench specific =======
        # Get git patch
        return_val = complete_runtime(runtime, instance)
        git_patch = return_val['git_patch']
        logger.info(
            f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
        )
    finally:
        runtime.close()
    # ==========================================

    # ======= Attempt to evaluate the agent's edits =======
    # we use eval_infer.sh to evaluate the agent's edits, not here
    # because the agent may alter the environment / testcases
    test_result = {
        'git_patch': git_patch,
    }

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')

    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
    histories = [event_to_dict(event) for event in state.history]
    metrics = get_metrics(state)

    # Save the output
    output = EvalOutput(
        instance_id=instance.instance_id,
        instruction=instruction,
        instance=instance.to_dict(),  # SWE Bench specific
        test_result=test_result,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
    )
    return output


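# Instance selection: an optional config.toml next to this script can pin a
# subset via 'selected_ids'; otherwise the SKIP_IDS env var (comma-separated
# instance ids) excludes instances. Illustrative config.toml:
#   selected_ids = ['django__django-11333']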
def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            data = toml.load(file)
            if 'selected_ids' in data:
                selected_ids = data['selected_ids']
                logger.info(
                    f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
                )
                subset = dataset[dataset[filter_column].isin(selected_ids)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset
    # Drop empty strings so an unset SKIP_IDS ('' -> ['']) filters nothing
    skip_ids = [sid for sid in os.environ.get('SKIP_IDS', '').split(',') if sid]
    if len(skip_ids) > 0:
        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
        return dataset[~dataset[filter_column].isin(skip_ids)]
    return dataset


# A list of instances that are known to be tricky to infer
# (will cause runtime failure even with resource factor = 8)
SWEGYM_EXCLUDE_IDS = [
    'dask__dask-10422',
    'pandas-dev__pandas-50548',
    'pandas-dev__pandas-53672',
    'pandas-dev__pandas-54174',
    'pandas-dev__pandas-55518',
    'pandas-dev__pandas-58383',
    'pydata__xarray-6721',
    'pytest-dev__pytest-10081',
    'pytest-dev__pytest-7236',
]

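# Example invocation (illustrative; only --dataset and --split are defined here,
# the remaining flag names come from get_evaluation_parser):
#   python run_infer.py --dataset princeton-nlp/SWE-bench_Lite --split test \
#       --llm-config my_llm --agent-cls LocAgent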
if __name__ == '__main__':
    parser = get_evaluation_parser()
    parser.add_argument(
        '--dataset',
        type=str,
        default='princeton-nlp/SWE-bench',
        help='dataset to evaluate on, either full-test or lite-test',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='split to evaluate on',
    )
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenHands's repo
    dataset = load_dataset(args.dataset, split=args.split)
    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
    )
    if 'SWE-Gym' in args.dataset:
        swe_bench_tests = swe_bench_tests[
            ~swe_bench_tests['instance_id'].isin(SWEGYM_EXCLUDE_IDS)
        ]
        logger.info(
            f'{len(swe_bench_tests)} tasks left after excluding SWE-Gym excluded tasks'
        )

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)
        llm_config.log_completions = True
        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
        llm_config.modify_params = False

    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    details = {}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

    dataset_description = (
        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
    )
    metadata = make_metadata(
        llm_config,
        dataset_description,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details=details,
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    print(f'### OUTPUT FILE: {output_file} ###')
    instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)

    # Serialize list-valued PASS_TO_PASS / FAIL_TO_PASS columns so every row is a string
    if len(instances) > 0 and not isinstance(
        instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
    ):
        for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
            instances[col] = instances[col].apply(lambda x: str(x))

    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        timeout_seconds=8 * 60 * 60,  # 8 hours per instance should be more than enough
        max_retries=5,
    )