From be251b11de8da7785bdef3a8cc0eaf66c5eca836 Mon Sep 17 00:00:00 2001 From: Leo Date: Sat, 1 Jun 2024 15:58:14 +0800 Subject: [PATCH] Add AgentBench. (#2012) * Add AgentBench. * Load the datasets from HF. Signed-off-by: ifuryst * Add helper functions. * Add mock executor. Signed-off-by: ifuryst * Add retriv agent answer cmd. * Adjust the dataset. * Refine test results. Signed-off-by: ifuryst * Consolidate all AgentBench datasets and scripts into a single CSV dataset. * Refactor dataset source. * Update helper functions. Signed-off-by: ifuryst * Fix the CRLF problem. Signed-off-by: ifuryst * Separate the instance's workspace. Signed-off-by: ifuryst * Add cleanup logic and error handling for sandbox closure. * Normalized dataset Signed-off-by: ifuryst * Update README. Signed-off-by: ifuryst * Update the prompt to capture the answer. Signed-off-by: ifuryst * Refactor script execution paths to use absolute container workspace path. Signed-off-by: ifuryst * Update AgentBench README. Signed-off-by: ifuryst * Delete useless functions. Signed-off-by: ifuryst * Update evaluation/agent_bench/README.md * Add script to summarize test results from JSONL file in AgentBench Signed-off-by: ifuryst * Delete useless script and codes. Signed-off-by: ifuryst * Update evaluation/agent_bench/scripts/summarise_results.py --------- Signed-off-by: ifuryst Co-authored-by: Boxuan Li --- evaluation/agent_bench/README.md | 59 +++ evaluation/agent_bench/__init__.py | 0 evaluation/agent_bench/helper.py | 44 ++ evaluation/agent_bench/run_infer.py | 393 ++++++++++++++++++ evaluation/agent_bench/scripts/run_infer.sh | 33 ++ .../agent_bench/scripts/summarise_results.py | 37 ++ 6 files changed, 566 insertions(+) create mode 100644 evaluation/agent_bench/README.md create mode 100644 evaluation/agent_bench/__init__.py create mode 100644 evaluation/agent_bench/helper.py create mode 100644 evaluation/agent_bench/run_infer.py create mode 100755 evaluation/agent_bench/scripts/run_infer.sh create mode 100644 evaluation/agent_bench/scripts/summarise_results.py diff --git a/evaluation/agent_bench/README.md b/evaluation/agent_bench/README.md new file mode 100644 index 0000000000..9753aa3da0 --- /dev/null +++ b/evaluation/agent_bench/README.md @@ -0,0 +1,59 @@ +# AgentBench Evaluation + +This folder contains evaluation harness for evaluating agents on +the [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688). + +## Configure OpenDevin and your LLM + +Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) +for how to set this up. + +Here is an example `config.toml` file: + +```toml +[core] +max_iterations = 100 +cache_dir = "/path/to/cache" + +workspace_base = "/path/to/workspace" +workspace_mount_path = "/path/to/workspace" + +sandbox_container_image = "ghcr.io/opendevin/sandbox:latest" +sandbox_type = "ssh" +sandbox_timeout = 120 +ssh_hostname = "localhost" + +use_host_network = false +run_as_devin = false +enable_auto_lint = true + +[eval_gpt35_turbo] +model = "gpt-3.5-turbo" +api_key = "sk-123" +temperature = 0.0 + +[eval_gpt4o] +model = "gpt-4o" +api_key = "sk-123" +temperature = 0.0 +``` + +## Start the evaluation + +```bash +./evaluation/agent_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit] +``` + +Following is the basic command to start the evaluation. Here we are only evaluating the `osbench` for now. 
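Each run appends one JSON record per instance to `output.jsonl` inside the evaluation output directory created by `run_infer.py`. Once a run completes, you can summarise the pass/fail counts with the helper script added in this PR; the path below is illustrative and depends on where your outputs were written:

```bash
poetry run python evaluation/agent_bench/scripts/summarise_results.py <path_to_output_directory>/output.jsonl
```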
+ +You can update the arguments in the script `evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on. + +- `--agent-cls`, the agent to use. For example, `CodeActAgent`. +- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`. +- `--max-iterations`: the number of iterations to run the evaluation. For example, `30`. +- `--eval-num-workers`: the number of workers to use for evaluation. For example, `5`. +- `--eval-n-limit`: the number of examples to evaluate. For example, `100`. + +```bash +./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo CodeActAgent 1 +``` diff --git a/evaluation/agent_bench/__init__.py b/evaluation/agent_bench/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/evaluation/agent_bench/helper.py b/evaluation/agent_bench/helper.py new file mode 100644 index 0000000000..b28d60b0e2 --- /dev/null +++ b/evaluation/agent_bench/helper.py @@ -0,0 +1,44 @@ +import os + + +def analysis_size(size_str): + size_str = size_str.strip() + avails = { + 'B': 1, + 'Byte': 1, + 'K': 1024, + 'KB': 1024, + 'M': 1024 * 1024, + 'MB': 1024 * 1024, + 'G': 1024 * 1024 * 1024, + 'GB': 1024 * 1024 * 1024, + 'T': 1024 * 1024 * 1024 * 1024, + 'TB': 1024 * 1024 * 1024 * 1024, + 'P': 1024 * 1024 * 1024 * 1024 * 1024, + 'PB': 1024 * 1024 * 1024 * 1024 * 1024, + } + for size_unit in avails: + if size_str.endswith(size_unit): + return int(size_str[: -len(size_unit)]) * avails[size_unit] + return int(size_str) + + +def compare_results(check_method: str, model_answer: str, final_ans: str) -> bool: + try: + match check_method: + case 'check/integer-match.py': + return int(model_answer) == int(final_ans) + case 'check/size-match.py': + return analysis_size(model_answer) == analysis_size(final_ans) + return ( + model_answer.replace('\r\n', '\n').replace('\r', '\n').strip() + == final_ans.replace('\r\n', '\n').replace('\r', '\n').strip() + ) + except Exception: + return False + + +def create_sh_file(filename: str, cmds: str) -> None: + with open(filename, 'w', encoding='utf-8') as file: + file.write(cmds.replace('\r\n', '\n')) + os.chmod(filename, 0o755) diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py new file mode 100644 index 0000000000..90bb198e2d --- /dev/null +++ b/evaluation/agent_bench/run_infer.py @@ -0,0 +1,393 @@ +import asyncio +import json +import logging +import multiprocessing as mp +import os +import pathlib +import re +import shutil +import subprocess +import time +from concurrent.futures import ProcessPoolExecutor + +import docker +from datasets import load_dataset +from tqdm import tqdm + +from evaluation.agent_bench.helper import compare_results, create_sh_file +from opendevin.controller.state.state import State +from opendevin.core.config import args, config, get_llm_config_arg +from opendevin.core.logger import get_console_handler +from opendevin.core.logger import opendevin_logger as logger +from opendevin.core.main import main +from opendevin.events.action import MessageAction, CmdRunAction +from opendevin.events.serialization.event import event_to_dict +from opendevin.runtime.docker.ssh_box import DockerSSHBox + + +def cleanup(): + print('Cleaning up child processes...') + for process in mp.active_children(): + print(f'Terminating child process: {process.name}') + process.terminate() + process.join() + + +def codeact_user_response(state: State) -> str: + msg = ( + 'Please continue working on the task on whatever approach you think is 
suitable.\n'
        'If you think you have solved the task, please first send your answer to user through '
        'message and then <execute_bash> exit </execute_bash>.\n'
        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
        'For example: The answer to the question is <solution> 42 </solution>.\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
    )
    if state.history:
        user_msgs = [
            action
            for action, _ in state.history
            if isinstance(action, MessageAction) and action.source == 'user'
        ]
        if len(user_msgs) >= 2:
            # let the agent know that it can give up when it has tried 3 times
            return (
                msg
                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
            )
    return msg


AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have solved the question, '
    'please first send your answer to user through message and then exit.\n'
}


def process_instance(
    instance,
    agent_class,
    metadata,
    eval_output_dir,
    reset_logger: bool = True,
):
    # =============================================
    # preparation
    # =============================================

    inst_id = instance.instance_id
    question = instance.description
    # create a directory for the instance's workspace
    instance_workspace = str(os.path.join(config.workspace_base, inst_id))
    container_inst_workspace = str(os.path.join(config.workspace_mount_path_in_sandbox, inst_id))
    if os.path.exists(instance_workspace):
        shutil.rmtree(instance_workspace)
    os.makedirs(instance_workspace, exist_ok=True)

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        log_file = os.path.join(eval_output_dir, 'logs', f'instance_{inst_id}.log')
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)

    # =============================================
    # build instruction
    # =============================================

    # Prepare instruction
    instruction = (
        f'Please fix the following issue.\n'
        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
        'For example: The answer to the question is <solution> 42 </solution>.\n'
        '# Problem \n'
        f'{question}\n\n'
    )
    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided '
        'to you AND NEVER ASK FOR HUMAN HELP.\n'
    )
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

    # =============================================
    # create sandbox and run the agent
    # =============================================

    sandbox = DockerSSHBox()
    sandbox.execute(f'cd {inst_id}')

    init_cmd = instance.init
    if init_cmd is not None:
        scpt_name = f'{instance.instance_id}_init.sh'
        scpt_path = os.path.join(container_inst_workspace, scpt_name)
        host_scpt_path = os.path.join(instance_workspace, scpt_name)
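        # The init script is written into the instance workspace on the host (which is
        # mounted into the sandbox) and then executed via its path inside the container.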
        create_sh_file(host_scpt_path, init_cmd)
        logger.info(f'Running init script: {scpt_path}')
        _, init_res = sandbox.execute(scpt_path)
        logger.info(f'Init script result: {init_res}')

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State = asyncio.run(
        main(
            instruction,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                agent_class
            ),
            sandbox=sandbox,
        )
    )

    if state is None:
        raise ValueError('State should not be None.')

    # get the ground truth
    # OSBenchSSHBox.get_ground_truth(instance, state)

    # =============================================
    # result evaluation
    # =============================================

    agent_answer = ''
    get_agent_result_cmd = instance.get_agent_result
    if get_agent_result_cmd is not None:
        scpt_name = f'{instance.instance_id}_get_agent_result.sh'
        scpt_path = os.path.join(container_inst_workspace, scpt_name)
        host_scpt_path = os.path.join(instance_workspace, scpt_name)
        create_sh_file(host_scpt_path, get_agent_result_cmd)
        logger.info(f'Running get agent result cmd: {scpt_path}')
        _, agent_answer = sandbox.execute(scpt_path)
    else:
        logger.info('Retrieving agent answer from history.')
        raw_ans = ''
        for act, _ in reversed(state.history):
            if isinstance(act, MessageAction) and act.source == 'agent':
                raw_ans = act.content
                break
            if isinstance(act, CmdRunAction) and act.source == 'agent':
                raw_ans = act.thought
                break
        agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
        if len(agent_answer) == 0:
            logger.warning(f'Failed to parse model answer: {raw_ans}')
            agent_answer = raw_ans
        else:
            agent_answer = agent_answer[0]

    final_ans = ''
    if instance.ground_truth is not None:
        final_ans = instance.ground_truth
    else:
        get_ground_truth_cmd = instance.get_ground_truth
        if get_ground_truth_cmd is not None:
            scpt_name = f'{instance.instance_id}_get_ground_truth.sh'
            scpt_path = os.path.join(container_inst_workspace, scpt_name)
            host_scpt_path = os.path.join(instance_workspace, scpt_name)
            create_sh_file(host_scpt_path, get_ground_truth_cmd)
            logger.info(f'Running get ground truth cmd: {scpt_path}')
            sandbox.execute(f'cd {container_inst_workspace}')
            _, final_ans = sandbox.execute(scpt_path)

    comparison_method = instance.comparison_method
    logger.info(
        f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
    )
    test_result = compare_results(comparison_method, agent_answer, final_ans)

    histories = [
        (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
    ]

    # Save the output
    output = {
        'instance_id': inst_id,
        'instance': instance.to_dict(),
        'instruction': instruction,
        'metadata': metadata,
        'history': histories,
        'error': state.error if state and state.error else None,
        'test_result': {
            'agent_answer': agent_answer,
            'final_answer': final_ans,
            'check_method': comparison_method,
            'result': test_result,
        },
    }

    # clean up
    if os.path.exists(instance_workspace):
        shutil.rmtree(instance_workspace)
    # Close the sandbox
    try:
        sandbox.close()
    except docker.errors.NotFound as e:
        logger.error(f'Failed to close sandbox: {e}')
    return output


if __name__ == '__main__':
    # =============================================
    # load datasets
    # =============================================

    dataset = load_dataset('iFurySt/AgentBench')
    agent_bench_tests = 
dataset['osbench'].to_pandas() + logger.info(f'Loaded {len(agent_bench_tests)} tests.') + + # ============================================= + # handle arguments and prepare for evaluation + # ============================================= + + if args.llm_config: + specified_llm_config = get_llm_config_arg(args.llm_config) + if specified_llm_config: + config.llm = specified_llm_config + logger.info(f'Config for evaluation: {config}') + + # TEST METADATA + agent_cls = args.agent_cls + assert ( + agent_cls in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN + ), f'Unsupported agent class: {agent_cls}' + model_name = config.llm.model.split('/')[-1] + max_iterations = args.max_iterations + eval_note = '' + if args.eval_note is not None: + eval_note += '_N_' + args.eval_note + eval_op_dir = str( + os.path.join( + args.eval_output_dir, + 'agent_bench', + agent_cls, + model_name + '_maxiter_' + str(max_iterations) + eval_note, + ) + ) + + pathlib.Path(eval_op_dir).mkdir(parents=True, exist_ok=True) + pathlib.Path(str(os.path.join(eval_op_dir, 'logs'))).mkdir( + parents=True, exist_ok=True + ) + logger.info(f'Using evaluation output directory: {eval_op_dir}') + + meta = { + 'agent_class': agent_cls, + 'model_name': model_name, + 'max_iterations': max_iterations, + 'eval_output_dir': eval_op_dir, + 'start_time': time.strftime('%Y-%m-%d %H:%M:%S'), + # get the commit id of current repo for reproducibility + 'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD']) + .decode('utf-8') + .strip(), + } + logger.info(f'Metadata: {meta}') + with open(os.path.join(eval_op_dir, 'metadata.json'), 'w') as f: + json.dump(meta, f) + + # LIMIT EVALUATION + eval_n_limit = args.eval_n_limit + if eval_n_limit: + agent_bench_tests = agent_bench_tests[:eval_n_limit] + logger.info(f'Limiting evaluation to first {eval_n_limit} instances.') + + # OUTPUT FILE + output_file = os.path.join(eval_op_dir, 'output.jsonl') + logger.info(f'Writing evaluation output to {output_file}') + finished_instance_ids = set() + if os.path.exists(output_file): + with open(output_file, 'r') as f: + for line in f: + data = json.loads(line) + finished_instance_ids.add(data['instance_id']) + logger.warning( + f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.' + ) + output_fp = open(output_file, 'a') + + logger.info( + f'Evaluation started with Agent {agent_cls}, model {model_name}, max iterations {max_iterations}.' + ) + + # ============================================= + # filter out finished instances + # ============================================= + + new_agent_bench_tests = [] + for idx, inst in agent_bench_tests.iterrows(): + if inst.instance_id in finished_instance_ids: + logger.info( + f'Skipping instance {inst.instance_id} as it is already finished.' 
+ ) + continue + new_agent_bench_tests.append(inst) + + agent_bench_tests = new_agent_bench_tests + logger.info( + f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(agent_bench_tests)}' + ) + + # ============================================= + # start task + # ============================================= + + pbar = tqdm(total=len(agent_bench_tests)) + + # This function tracks the progress AND write the output to a JSONL file + def update_progress(fut): + pbar.update(1) + output = fut.result() + pbar.set_description(f'Instance {output["instance_id"]}') + pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}') + logger.info( + f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}' + ) + output_fp.write(json.dumps(output) + '\n') + output_fp.flush() + + # This sets the multiprocessing + num_workers = args.eval_num_workers + logger.info(f'Using {num_workers} workers for evaluation.') + + try: + with ProcessPoolExecutor(num_workers) as executor: + futures = [] + # This is how we perform multiprocessing + for inst in agent_bench_tests: + future = executor.submit( + process_instance, + inst, + agent_cls, + meta, + eval_op_dir, + reset_logger=bool(num_workers > 1), + ) + future.add_done_callback(update_progress) + futures.append(future) + + # Wait for all futures to complete + for future in futures: + future.result() + except KeyboardInterrupt: + print('KeyboardInterrupt received. Cleaning up...') + cleanup() + + output_fp.close() + logger.info('Evaluation finished.') diff --git a/evaluation/agent_bench/scripts/run_infer.sh b/evaluation/agent_bench/scripts/run_infer.sh new file mode 100755 index 0000000000..65e9d36d50 --- /dev/null +++ b/evaluation/agent_bench/scripts/run_infer.sh @@ -0,0 +1,33 @@ +#!/bin/bash +MODEL_CONFIG=$1 +AGENT=$2 +EVAL_LIMIT=$3 + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default CodeActAgent" + AGENT="CodeActAgent" +fi + +# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin +# We need to track the version of Agent in the evaluation to make sure results are comparable +AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)") + +echo "AGENT: $AGENT" +echo "AGENT_VERSION: $AGENT_VERSION" +echo "MODEL_CONFIG: $MODEL_CONFIG" + +COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run python evaluation/agent_bench/run_infer.py \ + --agent-cls $AGENT \ + --llm-config $MODEL_CONFIG \ + --max-iterations 30 \ + --max-chars 10000000 \ + --eval-num-workers 5 \ + --eval-note $AGENT_VERSION" + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Run the command +eval $COMMAND diff --git a/evaluation/agent_bench/scripts/summarise_results.py b/evaluation/agent_bench/scripts/summarise_results.py new file mode 100644 index 0000000000..67a8964b1d --- /dev/null +++ b/evaluation/agent_bench/scripts/summarise_results.py @@ -0,0 +1,37 @@ +import json +import sys + + +def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]: + passed = [] + failed = [] + with open(res_file_path, 'r') as file: + for line in file: + data = json.loads(line.strip()) + instance_id = data['instance_id'] + resolved = False + if 'test_result' in data and 'result' in data['test_result']: + resolved = data['test_result']['result'] + if resolved: + passed.append(instance_id) + else: + 
                failed.append(instance_id)
    return passed, failed


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print(
            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
        )
        sys.exit(1)
    json_file_path = sys.argv[1]
    passed_tests, failed_tests = extract_test_results(json_file_path)
    succ_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
    )
    print('PASSED TESTS:')
    print(passed_tests)
    print('FAILED TESTS:')
    print(failed_tests)