Mirror of https://github.com/OpenHands/OpenHands.git (synced 2025-12-26 05:48:36 +08:00)
Add AgentBench. (#2012)
* Add AgentBench.
* Load the datasets from HF.
* Add helper functions.
* Add mock executor.
* Add retrieve-agent-answer cmd.
* Adjust the dataset.
* Refine test results.
* Consolidate all AgentBench datasets and scripts into a single CSV dataset.
* Refactor dataset source.
* Update helper functions.
* Fix the CRLF problem.
* Separate the instance's workspace.
* Add cleanup logic and error handling for sandbox closure.
* Normalize dataset.
* Update README.
* Update the prompt to capture the answer.
* Refactor script execution paths to use absolute container workspace path.
* Update AgentBench README.
* Delete useless functions.
* Update evaluation/agent_bench/README.md
* Add script to summarize test results from JSONL file in AgentBench.
* Delete useless scripts and code.
* Update evaluation/agent_bench/scripts/summarise_results.py

Signed-off-by: ifuryst <ifuryst@gmail.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
This commit is contained in:
parent 04d7354501
commit be251b11de
59  evaluation/agent_bench/README.md  Normal file
@@ -0,0 +1,59 @@
# AgentBench Evaluation

This folder contains the evaluation harness for evaluating agents on [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688).

## Configure OpenDevin and your LLM

Create a `config.toml` file at the root of the workspace if it does not already exist. Please check [README.md](../../README.md) for how to set this up.

Here is an example `config.toml` file:

```toml
[core]
max_iterations = 100
cache_dir = "/path/to/cache"

workspace_base = "/path/to/workspace"
workspace_mount_path = "/path/to/workspace"

sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
sandbox_type = "ssh"
sandbox_timeout = 120
ssh_hostname = "localhost"

use_host_network = false
run_as_devin = false
enable_auto_lint = true

[eval_gpt35_turbo]
model = "gpt-3.5-turbo"
api_key = "sk-123"
temperature = 0.0

[eval_gpt4o]
model = "gpt-4o"
api_key = "sk-123"
temperature = 0.0
```

## Start the evaluation

The following is the basic command to start the evaluation; for now only the `osbench` subset is evaluated:

```bash
./evaluation/agent_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit]
```

You can update the arguments in the script `evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers`, and so on:

- `--agent-cls`: the agent to use. For example, `CodeActAgent`.
- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`.
- `--max-iterations`: the maximum number of iterations to run. For example, `30`.
- `--eval-num-workers`: the number of workers to use for evaluation. For example, `5`.
- `--eval-n-limit`: the number of examples to evaluate. For example, `100`.

For example:

```bash
./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo CodeActAgent 1
```
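For orientation, here is a minimal sketch (not part of this commit) of peeking at the Hugging Face dataset that `run_infer.py` below loads. The dataset id, the `osbench` split, and the column names are the ones the harness consumes; treat the printout as illustrative.

```python
# Minimal sketch: inspect the AgentBench data that run_infer.py consumes.
# Dataset id and the 'osbench' split come from run_infer.py; the columns listed
# below are the fields the harness reads from each instance.
from datasets import load_dataset

dataset = load_dataset('iFurySt/AgentBench')
osbench = dataset['osbench'].to_pandas()
print(f'{len(osbench)} osbench instances')

expected_cols = [
    'instance_id', 'description', 'init', 'get_agent_result',
    'get_ground_truth', 'ground_truth', 'comparison_method',
]
print(osbench.columns.tolist())
print([c for c in expected_cols if c in osbench.columns])
```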
0  evaluation/agent_bench/__init__.py  Normal file
44  evaluation/agent_bench/helper.py  Normal file
@@ -0,0 +1,44 @@
import os


def analysis_size(size_str):
    # Parse a human-readable size string (e.g. '4K', '2MB') into a number of bytes.
    size_str = size_str.strip()
    avails = {
        'B': 1,
        'Byte': 1,
        'K': 1024,
        'KB': 1024,
        'M': 1024 * 1024,
        'MB': 1024 * 1024,
        'G': 1024 * 1024 * 1024,
        'GB': 1024 * 1024 * 1024,
        'T': 1024 * 1024 * 1024 * 1024,
        'TB': 1024 * 1024 * 1024 * 1024,
        'P': 1024 * 1024 * 1024 * 1024 * 1024,
        'PB': 1024 * 1024 * 1024 * 1024 * 1024,
    }
    # Check longer unit suffixes first so that e.g. 'KB' is not matched as 'B'.
    for size_unit in sorted(avails, key=len, reverse=True):
        if size_str.endswith(size_unit):
            return int(size_str[: -len(size_unit)]) * avails[size_unit]
    return int(size_str)


def compare_results(check_method: str, model_answer: str, final_ans: str) -> bool:
    # Compare the agent's answer against the ground truth using the per-instance
    # check method; any other method falls back to a newline-normalized string match.
    try:
        match check_method:
            case 'check/integer-match.py':
                return int(model_answer) == int(final_ans)
            case 'check/size-match.py':
                return analysis_size(model_answer) == analysis_size(final_ans)
        return (
            model_answer.replace('\r\n', '\n').replace('\r', '\n').strip()
            == final_ans.replace('\r\n', '\n').replace('\r', '\n').strip()
        )
    except Exception:
        return False


def create_sh_file(filename: str, cmds: str) -> None:
    # Write the commands as a shell script with LF line endings and make it executable.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(cmds.replace('\r\n', '\n'))
    os.chmod(filename, 0o755)
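A few illustrative calls (not part of the commit) showing how the helpers above behave; the values are made up for the example.

```python
# Illustrative usage of the helpers defined above; values are hypothetical.
from evaluation.agent_bench.helper import analysis_size, compare_results

assert analysis_size('4096') == 4096           # bare byte count
assert analysis_size('4K') == 4 * 1024         # unit suffix
assert compare_results('check/integer-match.py', '42', '42')
assert compare_results('check/size-match.py', '4K', '4096')
# Any other check method falls back to a newline-normalized string comparison.
assert compare_results('default', 'foo\r\n', 'foo\n')
```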
393  evaluation/agent_bench/run_infer.py  Normal file
@@ -0,0 +1,393 @@
import asyncio
import json
import logging
import multiprocessing as mp
import os
import pathlib
import re
import shutil
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor

import docker
from datasets import load_dataset
from tqdm import tqdm

from evaluation.agent_bench.helper import compare_results, create_sh_file
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.action import CmdRunAction, MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.runtime.docker.ssh_box import DockerSSHBox


def cleanup():
    print('Cleaning up child processes...')
    for process in mp.active_children():
        print(f'Terminating child process: {process.name}')
        process.terminate()
        process.join()


def codeact_user_response(state: State) -> str:
    msg = (
        'Please continue working on the task on whatever approach you think is suitable.\n'
        'If you think you have solved the task, please first send your answer to user through '
        'message and then <execute_bash> exit </execute_bash>.\n'
        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
        'For example: The answer to the question is <solution> 42 </solution>.\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
    )
    if state.history:
        user_msgs = [
            action
            for action, _ in state.history
            if isinstance(action, MessageAction) and action.source == 'user'
        ]
        if len(user_msgs) >= 2:
            # let the agent know that it can give up when it has tried 3 times
            return (
                msg
                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
            )
    return msg


AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have solved the question, '
    'please first send your answer to user through message and then exit.\n'
}
def process_instance(
    instance,
    agent_class,
    metadata,
    eval_output_dir,
    reset_logger: bool = True,
):
    # =============================================
    # preparation
    # =============================================

    inst_id = instance.instance_id
    question = instance.description
    # create a directory for the instance's workspace
    instance_workspace = str(os.path.join(config.workspace_base, inst_id))
    container_inst_workspace = str(
        os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
    )
    if os.path.exists(instance_workspace):
        shutil.rmtree(instance_workspace)
    os.makedirs(instance_workspace, exist_ok=True)

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        log_file = os.path.join(eval_output_dir, 'logs', f'instance_{inst_id}.log')
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)

    # =============================================
    # build instruction
    # =============================================

    # Prepare instruction
    instruction = (
        f'Please fix the following issue.\n'
        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
        'For example: The answer to the question is <solution> 42 </solution>.\n'
        '# Problem \n'
        f'{question}\n\n'
    )
    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided '
        'to you AND NEVER ASK FOR HUMAN HELP.\n'
    )
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

    # =============================================
    # create sandbox and run the agent
    # =============================================

    sandbox = DockerSSHBox()
    sandbox.execute(f'cd {inst_id}')

    init_cmd = instance.init
    if init_cmd is not None:
        scpt_name = f'{instance.instance_id}_init.sh'
        scpt_path = os.path.join(container_inst_workspace, scpt_name)
        host_scpt_path = os.path.join(instance_workspace, scpt_name)
        create_sh_file(host_scpt_path, init_cmd)
        logger.info(f'Running init script: {scpt_path}')
        _, init_res = sandbox.execute(scpt_path)
        logger.info(f'Init script result: {init_res}')

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State = asyncio.run(
        main(
            instruction,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                agent_class
            ),
            sandbox=sandbox,
        )
    )

    if state is None:
        raise ValueError('State should not be None.')

    # get the ground truth
    # OSBenchSSHBox.get_ground_truth(instance, state)

    # =============================================
    # result evaluation
    # =============================================

    agent_answer = ''
    get_agent_result_cmd = instance.get_agent_result
    if get_agent_result_cmd is not None:
        scpt_name = f'{instance.instance_id}_get_agent_result.sh'
        scpt_path = os.path.join(container_inst_workspace, scpt_name)
        host_scpt_path = os.path.join(instance_workspace, scpt_name)
        create_sh_file(host_scpt_path, get_agent_result_cmd)
        logger.info(f'Running get agent result cmd: {scpt_path}')
        _, agent_answer = sandbox.execute(scpt_path)
    else:
        logger.info('Retrieving agent answer from history.')
        raw_ans = ''
        for act, _ in reversed(state.history):
            if isinstance(act, MessageAction) and act.source == 'agent':
                raw_ans = act.content
                break
            if isinstance(act, CmdRunAction) and act.source == 'agent':
                raw_ans = act.thought
                break
        agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
        if len(agent_answer) == 0:
            logger.warning(f'Failed to parse model answer: {raw_ans}')
            agent_answer = raw_ans
        else:
            agent_answer = agent_answer[0]

    final_ans = ''
    if instance.ground_truth is not None:
        final_ans = instance.ground_truth
    else:
        get_ground_truth_cmd = instance.get_ground_truth
        if get_ground_truth_cmd is not None:
            scpt_name = f'{instance.instance_id}_get_ground_truth.sh'
            scpt_path = os.path.join(container_inst_workspace, scpt_name)
            host_scpt_path = os.path.join(instance_workspace, scpt_name)
            create_sh_file(host_scpt_path, get_ground_truth_cmd)
            logger.info(f'Running get ground truth cmd: {scpt_path}')
            sandbox.execute(f'cd {container_inst_workspace}')
            _, final_ans = sandbox.execute(scpt_path)

    comparison_method = instance.comparison_method
    logger.info(
        f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
    )
    test_result = compare_results(comparison_method, agent_answer, final_ans)

    histories = [
        (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
    ]

    # Save the output
    output = {
        'instance_id': inst_id,
        'instance': instance.to_dict(),
        'instruction': instruction,
        'metadata': metadata,
        'history': histories,
        'error': state.error if state and state.error else None,
        'test_result': {
            'agent_answer': agent_answer,
            'final_answer': final_ans,
            'check_method': comparison_method,
            'result': test_result,
        },
    }

    # clean up
    if os.path.exists(instance_workspace):
        shutil.rmtree(instance_workspace)
    # Close the sandbox
    try:
        sandbox.close()
    except docker.errors.NotFound as e:
        logger.error(f'Failed to close sandbox: {e}')
    return output
if __name__ == '__main__':
    # =============================================
    # load datasets
    # =============================================

    dataset = load_dataset('iFurySt/AgentBench')
    agent_bench_tests = dataset['osbench'].to_pandas()
    logger.info(f'Loaded {len(agent_bench_tests)} tests.')

    # =============================================
    # handle arguments and prepare for evaluation
    # =============================================

    if args.llm_config:
        specified_llm_config = get_llm_config_arg(args.llm_config)
        if specified_llm_config:
            config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')

    # TEST METADATA
    agent_cls = args.agent_cls
    assert (
        agent_cls in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
    ), f'Unsupported agent class: {agent_cls}'
    model_name = config.llm.model.split('/')[-1]
    max_iterations = args.max_iterations
    eval_note = ''
    if args.eval_note is not None:
        eval_note += '_N_' + args.eval_note
    eval_op_dir = str(
        os.path.join(
            args.eval_output_dir,
            'agent_bench',
            agent_cls,
            model_name + '_maxiter_' + str(max_iterations) + eval_note,
        )
    )

    pathlib.Path(eval_op_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(str(os.path.join(eval_op_dir, 'logs'))).mkdir(
        parents=True, exist_ok=True
    )
    logger.info(f'Using evaluation output directory: {eval_op_dir}')

    meta = {
        'agent_class': agent_cls,
        'model_name': model_name,
        'max_iterations': max_iterations,
        'eval_output_dir': eval_op_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
        # get the commit id of current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
    }
    logger.info(f'Metadata: {meta}')
    with open(os.path.join(eval_op_dir, 'metadata.json'), 'w') as f:
        json.dump(meta, f)

    # LIMIT EVALUATION
    eval_n_limit = args.eval_n_limit
    if eval_n_limit:
        agent_bench_tests = agent_bench_tests[:eval_n_limit]
        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')

    # OUTPUT FILE
    output_file = os.path.join(eval_op_dir, 'output.jsonl')
    logger.info(f'Writing evaluation output to {output_file}')
    finished_instance_ids = set()
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            for line in f:
                data = json.loads(line)
                finished_instance_ids.add(data['instance_id'])
        logger.warning(
            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
        )
    output_fp = open(output_file, 'a')

    logger.info(
        f'Evaluation started with Agent {agent_cls}, model {model_name}, max iterations {max_iterations}.'
    )

    # =============================================
    # filter out finished instances
    # =============================================

    new_agent_bench_tests = []
    for idx, inst in agent_bench_tests.iterrows():
        if inst.instance_id in finished_instance_ids:
            logger.info(
                f'Skipping instance {inst.instance_id} as it is already finished.'
            )
            continue
        new_agent_bench_tests.append(inst)

    agent_bench_tests = new_agent_bench_tests
    logger.info(
        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(agent_bench_tests)}'
    )

    # =============================================
    # start task
    # =============================================

    pbar = tqdm(total=len(agent_bench_tests))

    # This function tracks the progress AND writes the output to a JSONL file
    def update_progress(fut):
        pbar.update(1)
        output = fut.result()
        pbar.set_description(f'Instance {output["instance_id"]}')
        pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
        logger.info(
            f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
        )
        output_fp.write(json.dumps(output) + '\n')
        output_fp.flush()

    # This sets up the multiprocessing
    num_workers = args.eval_num_workers
    logger.info(f'Using {num_workers} workers for evaluation.')

    try:
        with ProcessPoolExecutor(num_workers) as executor:
            futures = []
            # This is how we perform multiprocessing
            for inst in agent_bench_tests:
                future = executor.submit(
                    process_instance,
                    inst,
                    agent_cls,
                    meta,
                    eval_op_dir,
                    reset_logger=bool(num_workers > 1),
                )
                future.add_done_callback(update_progress)
                futures.append(future)

            # Wait for all futures to complete
            for future in futures:
                future.result()
    except KeyboardInterrupt:
        print('KeyboardInterrupt received. Cleaning up...')
        cleanup()

    output_fp.close()
    logger.info('Evaluation finished.')
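As a side note, here is a small sketch (not part of the commit) of the `<solution>` extraction that `process_instance` applies to the agent's final message when no `get_agent_result` script is provided; the sample message mirrors the prompt's own example.

```python
# Illustrative only: the same regex run_infer.py uses to pull the final answer
# out of the agent's last message; the sample string is hypothetical.
import re

raw_ans = 'The answer to the question is <solution> 42 </solution>.'
matches = re.findall(r'<solution>(.*?)</solution>', raw_ans)
agent_answer = matches[0] if matches else raw_ans
print(agent_answer.strip())  # -> 42
```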
33  evaluation/agent_bench/scripts/run_infer.sh  Executable file
@@ -0,0 +1,33 @@
#!/bin/bash
MODEL_CONFIG=$1
AGENT=$2
EVAL_LIMIT=$3

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

# IMPORTANT: Because the agent's prompt changes fairly often in the rapidly evolving OpenDevin codebase,
# we need to track the agent version in the evaluation to make sure results are comparable.
AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run python evaluation/agent_bench/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
  --max-chars 10000000 \
  --eval-num-workers 5 \
  --eval-note $AGENT_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND
37  evaluation/agent_bench/scripts/summarise_results.py  Normal file
@@ -0,0 +1,37 @@
import json
import sys


def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
    passed = []
    failed = []
    with open(res_file_path, 'r') as file:
        for line in file:
            data = json.loads(line.strip())
            instance_id = data['instance_id']
            resolved = False
            if 'test_result' in data and 'result' in data['test_result']:
                resolved = data['test_result']['result']
            if resolved:
                passed.append(instance_id)
            else:
                failed.append(instance_id)
    return passed, failed


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print(
            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
        )
        sys.exit(1)
    json_file_path = sys.argv[1]
    passed_tests, failed_tests = extract_test_results(json_file_path)
    succ_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
    )
    print('PASSED TESTS:')
    print(passed_tests)
    print('FAILED TESTS:')
    print(failed_tests)
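For context, a minimal sketch (not part of the commit) of the per-line shape that `extract_test_results` expects, matching the record `run_infer.py` writes to `output.jsonl`; the instance id is hypothetical.

```python
# Illustrative only: one output.jsonl record with the fields summarise_results.py
# reads ('instance_id' and 'test_result.result'); the id is hypothetical.
import json

line = json.dumps({
    'instance_id': 'os-001',
    'test_result': {
        'agent_answer': '42',
        'final_answer': '42',
        'check_method': 'check/integer-match.py',
        'result': True,
    },
})
data = json.loads(line)
resolved = data.get('test_result', {}).get('result', False)
print(data['instance_id'], 'passed' if resolved else 'failed')  # -> os-001 passed
```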