Mirror of https://github.com/OpenHands/OpenHands.git (synced 2025-12-26 05:48:36 +08:00)
412 lines
15 KiB
Python
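"""Run an OpenDevin agent on SWE-bench Lite instances inside a SWEBenchSSHBox sandbox,
evaluate the resulting patches, and append one JSON record per instance to output.jsonl."""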
import asyncio
import json
import logging
import multiprocessing as mp
import os
import pathlib
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor

import pandas as pd
import whatthepatch
from datasets import load_dataset
from tqdm import tqdm

from evaluation.swe_bench.swe_env_box import SWEBenchSSHBox
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict


def cleanup():
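    """Terminate and join any still-running child worker processes."""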
    print('Cleaning up child processes...')
    for process in mp.active_children():
        print(f'Terminating child process: {process.name}')
        process.terminate()
        process.join()


def codeact_user_response(state: State) -> str:
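    """Fake user reply sent to CodeActAgent whenever it asks for input.

    Once two fake user replies are already in the history, the message also
    tells the agent how to give up.
    """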
    msg = (
        'Please continue working on the task on whatever approach you think is suitable.\n'
        'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
    )
    if state.history:
        user_msgs = [
            action
            for action, _ in state.history
            if isinstance(action, MessageAction) and action.source == 'user'
        ]
        if len(user_msgs) >= 2:
            # let the agent know that it can give up when it has tried 3 times
            return (
                msg
                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
            )
    return msg


def monologue_user_response(state: State) -> str:
    raise NotImplementedError('MonologueAgent should never ask for user responses.')

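# Per-agent-class hooks: fake user responses injected when the agent asks for
# input, and extra instruction text appended to every task prompt.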
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    'MonologueAgent': monologue_user_response,
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have fixed the issue through code changes, please run the following command: <execute_bash> exit </execute_bash>.\n'
}


def get_test_result(instance, sandbox, workspace_dir_name):
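    """Apply the instance's test patch inside the sandbox and collect the test report.

    Steps (each recorded under test_result['metadata']):
    1. parse the test patch to find the files it touches;
    2. revert those files to the instance's base commit;
    3. apply $SWE_TASK_DIR/test.patch;
    4. run $TEST_CMD and capture its log;
    5. reformat instance.json into a one-element list;
    6. run OD-SWE-bench's get_instance_report.py and parse its output.
    """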
    test_result = {'result': {}, 'metadata': {}}
    try:
        test_patch_parsed = whatthepatch.parse_patch(instance.test_patch)
        # get a list of filepaths that are involved in the patch
        involved_filepaths = set()
        for patch in test_patch_parsed:
            involved_filepaths.add(patch.header.old_path.removeprefix('a/'))
            involved_filepaths.add(patch.header.new_path.removeprefix('b/'))
        involved_filepaths = list(involved_filepaths)
        test_result['metadata']['1_test_patch_parse_success'] = True
        test_result['metadata']['1_test_involved_filepaths'] = involved_filepaths
    except Exception as e:
        logger.error(
            f'Error parsing test patch for instance {instance.instance_id}: {e}'
        )
        test_result['metadata']['1_test_patch_parse_success'] = False
        test_result['metadata']['1_test_patch_parse_error'] = str(e)
        test_result['metadata']['1_test_involved_filepaths'] = None
        involved_filepaths = []

    # Try to revert the changes for involved filepaths
    err_code, output = sandbox.execute(f'cd /workspace/{workspace_dir_name}')
    test_result['metadata']['2_revert_test_involved_filepaths_success'] = []
    for filepath in involved_filepaths:
        err_code, output = sandbox.execute(
            f'git checkout {instance["base_commit"]} -- {filepath}'
        )
        if err_code != 0:
            logger.error(f'Error reverting changes for {filepath}: {output}')
            test_result['metadata']['2_revert_test_involved_filepaths_success'].append(
                False
            )
        else:
            test_result['metadata']['2_revert_test_involved_filepaths_success'].append(
                True
            )

    # Apply the testcase
    err_code, output = sandbox.execute('git apply $SWE_TASK_DIR/test.patch')
    if err_code != 0:
        logger.error(f'Error applying test patch: {output}')
        test_result['metadata']['3_apply_test_patch_success'] = False
        test_result['metadata']['3_apply_test_patch_error'] = output
    else:
        test_result['metadata']['3_apply_test_patch_success'] = True

    # Run the test command
    err_code, output = sandbox.execute(
        '$TEST_CMD > /workspace/$SWE_INSTANCE_ID.log 2>&1'
    )
    if err_code != 0:
        logger.error(f'Error running test command: {output}')
        test_result['metadata']['4_run_test_command_success'] = False
        test_result['metadata']['4_run_test_command_error'] = output
    else:
        test_result['metadata']['4_run_test_command_success'] = True

    # Get the test output
    err_code, output = sandbox.execute('cat /workspace/$SWE_INSTANCE_ID.log')
    if err_code != 0:
        logger.error(f'Error getting test output: {output}')
        test_result['metadata']['4_get_test_output_success'] = False
        test_result['metadata']['4_get_test_output_error'] = output
    else:
        test_result['metadata']['4_get_test_output_success'] = True
        test_result['test_output'] = output

    # Reformat instance.json
    # $SWE_TASK_DIR/instance.json is a dict {"XXX": "YYY"}, add a [ before and a ] after
    err_code, output = sandbox.execute(
        (
            'cat $SWE_TASK_DIR/instance.json | sed "s/^{/[{/" | sed "s/}$/}]/" > /workspace/instance.json'
        )
    )
    if err_code != 0:
        logger.error(f'Error creating instance.json: {output}')
        test_result['metadata']['5_reformat_instance_json_success'] = False
        test_result['metadata']['5_reformat_instance_json_error'] = output
    else:
        test_result['metadata']['5_reformat_instance_json_success'] = True

    # Get the instance report
    err_code, output = sandbox.execute(
        (
            'cd /swe_util/OD-SWE-bench '
            '&& export PYTHONPATH=$(pwd):$PYTHONPATH '
            '&& conda run -n swe-bench-eval python swebench/metrics/get_instance_report.py --swe_bench_task /workspace/instance.json --log_path /workspace/$SWE_INSTANCE_ID.log'
        )
    )
    if err_code != 0:
        logger.error(f'Error getting instance report: {output}')
        test_result['metadata']['6_get_instance_report_success'] = False
        test_result['metadata']['6_get_instance_report_error'] = output
    else:
        test_result['metadata']['6_get_instance_report_success'] = True
        test_result['result_raw'] = output

        # try to parse output
        for line in output.strip().split('\n'):
            line = line.strip('-')
            try:
                key, value = line.split(':')
            except ValueError:
                # skip this line
                print(f'Error parsing result line: {line}')
                continue
            value = value.strip()
            try:
                value = int(value)
            except ValueError:
                pass
            test_result['result'][key.strip()] = value
    return test_result


def process_instance(
    instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True
):
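    """Run one SWE-bench instance end to end.

    Sets up per-process logging and (optionally) a process-specific workspace,
    starts a SWEBenchSSHBox for the instance, runs the agent on the generated
    instruction, extracts the git patch, evaluates it with get_test_result,
    and returns a JSON-serializable output record.
    """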
    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
    # create process-specific workspace dir
    if not skip_workspace_mount:
        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)

    if reset_logger:
        # Set up logger
        log_file = os.path.join(
            eval_output_dir, 'logs', f'instance_{instance.instance_id}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {instance.instance_id}.\nLOG: tail -f {log_file}'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)

    if not skip_workspace_mount:
        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')

    workspace_dir_name = f'{instance.repo}__{instance.version}'.replace('/', '__')
    sandbox = SWEBenchSSHBox.get_box_for_instance(
        instance,
        workspace_dir_name,
        skip_workspace_mount=skip_workspace_mount,
        workspace_mount_path=workspace_mount_path,
    )

    # Prepare instruction
    instruction = (
        f'Please fix the following issue for the repository in /workspace/{workspace_dir_name}.\n'
        'Environment has been set up for you to start working. You may assume all necessary tools are installed.\n\n'
        '# Problem Statement\n'
        f'{instance.problem_statement}\n\n'
    )
    if instance.hints_text:
        instruction += f'# Hints\n{instance.hints_text}\n\n'
    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
        'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n'
        'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
    )
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

    # Run the agent
    state: State = asyncio.run(
        main(
            instruction,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
            sandbox=sandbox,
        )
    )

    # Get git patch
    git_patch = sandbox.get_diff_patch()
    logger.info(f'Got git diff for instance {instance.instance_id}')

    # ======= Attempt to evaluate the agent's edits =======
    # Attempt to analyze the test patch to get involved filepaths
    test_result = get_test_result(instance, sandbox, workspace_dir_name)

    if state is None:
        raise ValueError('State should not be None.')

    # Save the output
    output = {
        'instance_id': instance.instance_id,
        'swe_instance': instance.to_dict(),
        'instruction': instruction,
        'git_patch': git_patch,
        'metadata': metadata,
        'history': [
            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
        ],
        'error': state.error if state and state.error else None,
        'test_result': test_result,
    }

    # Close the sandbox
    sandbox.close()
    return output


if __name__ == '__main__':
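    # Entry point: load SWE-bench Lite, prepare the output directory and
    # metadata, then evaluate the remaining instances with a process pool.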
    # Load the dataset
    dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
    swe_bench_tests = dataset['test'].to_pandas()

    if args.llm_config:
        specified_llm_config = get_llm_config_arg(args.llm_config)
        if specified_llm_config:
            config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')

    # TEST METADATA
    agent_class = args.agent_cls
    assert (
        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
    ), f'Unsupported agent class: {agent_class}'
    model_name = config.llm.model.split('/')[-1]
    max_iterations = args.max_iterations
    eval_note = ''
    if args.eval_note is not None:
        eval_note += '_N_' + args.eval_note
    eval_output_dir = os.path.join(
        args.eval_output_dir,
        'swe_bench',
        agent_class,
        model_name + '_maxiter_' + str(max_iterations) + eval_note,
    )

    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
        parents=True, exist_ok=True
    )
    logger.info(f'Using evaluation output directory: {eval_output_dir}')

    metadata = {
        'agent_class': agent_class,
        'model_name': model_name,
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
        # get the commit id of current repo
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
    }
    logger.info(f'Metadata: {metadata}')
    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
        json.dump(metadata, f)

    # LIMIT EVALUATION
    eval_n_limit = args.eval_n_limit
    if eval_n_limit:
        swe_bench_tests = swe_bench_tests.head(eval_n_limit)
        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')

    # OUTPUT FILE
    output_file = os.path.join(eval_output_dir, 'output.jsonl')
    logger.info(f'Writing evaluation output to {output_file}')
    finished_instance_ids = set()
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            for line in f:
                data = json.loads(line)
                finished_instance_ids.add(data['instance_id'])
        logger.warning(
            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
        )
    output_fp = open(output_file, 'a')

    logger.info(
        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
    )

    # filter out finished instances
    new_swe_bench_tests = []
    for idx, instance in swe_bench_tests.iterrows():
        if instance.instance_id in finished_instance_ids:
            logger.info(
                f'Skipping instance {instance.instance_id} as it is already finished.'
            )
            continue
        new_swe_bench_tests.append(instance)

    swe_bench_tests = pd.DataFrame(new_swe_bench_tests)
    logger.info(
        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(swe_bench_tests)}'
    )

    pbar = tqdm(total=len(swe_bench_tests))

    def update_progress(future):
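        """Progress-bar callback: record the finished instance and append its output to output.jsonl."""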
        pbar.update(1)
        output = future.result()
        pbar.set_description(f'Instance {output["instance_id"]}')
        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
        logger.info(
            f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
        )
        output_fp.write(json.dumps(output) + '\n')
        output_fp.flush()

    num_workers = args.eval_num_workers
    logger.info(f'Using {num_workers} workers for evaluation.')

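    # CodeActAgent does all of its editing inside the sandbox, so it does not
    # need a process-specific workspace mount.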
    skip_workspace_mount = agent_class == 'CodeActAgent'
    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
    try:
        with ProcessPoolExecutor(num_workers) as executor:
            futures = []
            for row_idx, instance in swe_bench_tests.iterrows():
                future = executor.submit(
                    process_instance,
                    instance,
                    agent_class,
                    metadata,
                    skip_workspace_mount,
                    reset_logger=bool(num_workers > 1),
                )
                future.add_done_callback(update_progress)
                futures.append(future)

            # Wait for all futures to complete
            for future in futures:
                future.result()
    except KeyboardInterrupt:
        print('KeyboardInterrupt received. Cleaning up...')
        cleanup()

    output_fp.close()
    logger.info('Evaluation finished.')