mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
* Remove global config from memory
* Remove runtime global config
* Remove from storage
* Remove global config
* Fix event stream tests
* Fix sandbox issue
* Change config
* Removed transferred tests
* Add swe env box
* Fixes on testing
* Fixed some tests
* Fix typing
* Fix ipython test
* Revive function
* Make temp_dir fixture
* Remove test to avoid circular import
249 lines
8.9 KiB
Python
249 lines
8.9 KiB
Python
import asyncio
|
|
import logging
|
|
import os
|
|
import re
|
|
import shutil
|
|
|
|
import docker
|
|
import pandas as pd
|
|
from datasets import load_dataset
|
|
|
|
from evaluation.agent_bench.helper import (
|
|
FAKE_RESPONSES,
|
|
INST_SUFFIXES,
|
|
compare_results,
|
|
create_sh_file,
|
|
)
|
|
from evaluation.utils.shared import (
|
|
EvalMetadata,
|
|
make_metadata,
|
|
prepare_dataset,
|
|
run_evaluation,
|
|
)
|
|
from opendevin.controller.agent import Agent
|
|
from opendevin.controller.state.state import State
|
|
from opendevin.core.config import get_llm_config_arg, load_app_config, parse_arguments
|
|
from opendevin.core.logger import get_console_handler
|
|
from opendevin.core.logger import opendevin_logger as logger
|
|
from opendevin.core.main import run_agent_controller
|
|
from opendevin.events.action import CmdRunAction, MessageAction
|
|
from opendevin.llm.llm import LLM
|
|
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
|
|
|
# Application configuration, loaded once when the module is imported.
# process_instance reads workspace paths and the per-task budget from it, and
# the __main__ entry point falls back to its `llm` settings.
config = load_app_config()
|
|
|
|
|
|
def _setup_instance_logger(log_file: str, inst_id: str) -> None:
    """Redirect the shared logger to a per-instance log file.

    One line is printed to the console (with a hint on how to follow the log
    file); after that the console handler is dropped and every record goes to
    `log_file` only.
    """
    # Remove all existing handlers from logger
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
    # add back the console handler to print ONE line
    logger.addHandler(get_console_handler())
    logger.info(
        f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
    )
    # Remove all existing handlers from logger
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
    logger.addHandler(file_handler)


def _stage_script(
    inst_id: str, kind: str, script_body: str, host_dir: str, container_dir: str
) -> str:
    """Write `script_body` into the host workspace and return its sandbox path.

    The script is named `<inst_id>_<kind>.sh`. It is created under `host_dir`
    via `create_sh_file`; the returned path is the same file name joined onto
    `container_dir`.
    """
    script_name = f'{inst_id}_{kind}.sh'
    create_sh_file(os.path.join(host_dir, script_name), script_body)
    return os.path.join(container_dir, script_name)


def _last_agent_response(state: State) -> str:
    """Return the most recent agent message content or command thought.

    Events are requested with `reverse=True` (presumably newest first -- confirm
    against the history implementation), so the scan stops at the first match.
    Returns an empty string when the history holds no matching agent event.
    """
    for event in state.history.get_events(reverse=True):
        if event.source != 'agent':
            continue
        if isinstance(event, MessageAction):
            return event.content
        if isinstance(event, CmdRunAction):
            return event.thought
    return ''


def _parse_solution(raw_ans: str) -> str:
    """Extract the first `<solution>...</solution>` payload from `raw_ans`.

    Falls back to the whole of `raw_ans` (and logs a warning) when no solution
    tag is present.
    """
    matches = re.findall(r'<solution>(.*?)</solution>', raw_ans)
    if not matches:
        logger.warning(f'Failed to parse model answer: {raw_ans}')
        return raw_ans
    return matches[0]


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
) -> dict:
    """Run the agent on one benchmark instance and score its answer.

    Args:
        instance: One dataset row. The fields read here are `instance_id`,
            `description`, `init`, `get_agent_result`, `ground_truth`,
            `get_ground_truth` and `comparison_method`.
        metadata: Evaluation settings (agent class, LLM config, iteration
            limit, output directory).
        reset_logger: When true, log output for this instance is redirected to
            `<eval_output_dir>/logs/instance_<id>.log` so parallel workers do
            not interleave their logs.

    Returns:
        A dict with the instance, instruction, metadata, event history,
        metrics, last error and a `test_result` sub-dict holding the agent
        answer, the expected answer, the comparison method and the verdict.

    Raises:
        ValueError: If the agent controller returns no final state.
    """
    # Create the agent
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    agent_name = agent.__class__.__name__

    inst_id = instance.instance_id
    question = instance.description

    # create a fresh directory for the instance's workspace
    instance_workspace = os.path.join(config.workspace_base, inst_id)
    container_inst_workspace = os.path.join(
        config.workspace_mount_path_in_sandbox, inst_id
    )
    if os.path.exists(instance_workspace):
        shutil.rmtree(instance_workspace)
    os.makedirs(instance_workspace, exist_ok=True)

    # Set up the logger properly, so you can run multiprocessing to parallel the evaluation
    if reset_logger:
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{inst_id}.log'
        )
        _setup_instance_logger(log_file, inst_id)

    # =============================================
    # build instruction
    # =============================================

    instruction = (
        'Please fix the following issue.\n'
        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
        'For example: The answer to the question is <solution> 42 </solution>.\n'
        '# Problem \n'
        f'{question}\n\n'
    )
    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided '
        'to you AND NEVER ASK FOR HUMAN HELP.\n'
    )
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += INST_SUFFIXES[agent_name]

    # =============================================
    # create sandbox and run the agent
    # =============================================

    sandbox = DockerSSHBox()
    # Everything that uses the sandbox runs inside try/finally so the sandbox
    # and the host workspace are released even when the run fails part-way.
    try:
        sandbox.execute(f'cd {inst_id}')

        init_cmd = instance.init
        if init_cmd is not None:
            scpt_path = _stage_script(
                inst_id, 'init', init_cmd, instance_workspace, container_inst_workspace
            )
            logger.info(f'Running init script: {scpt_path}')
            _, init_res = sandbox.execute(scpt_path)
            logger.info(f'Init script result: {init_res}')

        # Here's how you can run the agent (similar to the `main` function) and get the final task state
        state: State | None = asyncio.run(
            run_agent_controller(
                agent,
                instruction,
                max_iterations=metadata.max_iterations,
                max_budget_per_task=config.max_budget_per_task,
                fake_user_response_fn=FAKE_RESPONSES[agent_name],
                sandbox=sandbox,
                sid=inst_id,
            )
        )

        if state is None:
            raise ValueError('State should not be None.')

        # =============================================
        # result evaluation
        # =============================================

        get_agent_result_cmd = instance.get_agent_result
        if get_agent_result_cmd is not None:
            # The instance supplies a script that reads the answer out of the sandbox.
            scpt_path = _stage_script(
                inst_id,
                'get_agent_result',
                get_agent_result_cmd,
                instance_workspace,
                container_inst_workspace,
            )
            logger.info(f'Running get agent result cmd: {scpt_path}')
            _, agent_answer = sandbox.execute(scpt_path)
        else:
            # Otherwise take the last thing the agent said and parse the solution tag.
            logger.info('Retrieving agent answer from history.')
            agent_answer = _parse_solution(_last_agent_response(state))

        final_ans = ''
        if instance.ground_truth is not None:
            final_ans = instance.ground_truth
        else:
            get_ground_truth_cmd = instance.get_ground_truth
            if get_ground_truth_cmd is not None:
                scpt_path = _stage_script(
                    inst_id,
                    'get_ground_truth',
                    get_ground_truth_cmd,
                    instance_workspace,
                    container_inst_workspace,
                )
                logger.info(f'Running get ground truth cmd: {scpt_path}')
                sandbox.execute(f'cd {container_inst_workspace}')
                _, final_ans = sandbox.execute(scpt_path)

        comparison_method = instance.comparison_method
        logger.info(
            f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
        )
        test_result = compare_results(comparison_method, agent_answer, final_ans)

        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
        # for compatibility with the existing output format, we can remake the pairs here
        # remove when it becomes unnecessary
        histories = state.history.compatibility_for_eval_history_pairs()

        metrics = state.metrics.get() if state.metrics else None

        # Save the output
        output = {
            'instance_id': inst_id,
            'instance': instance.to_dict(),
            'instruction': instruction,
            'metadata': metadata.model_dump(),
            'history': histories,
            'metrics': metrics,
            'error': state.last_error if state.last_error else None,
            'test_result': {
                'agent_answer': agent_answer,
                'final_answer': final_ans,
                'check_method': comparison_method,
                'result': test_result,
            },
        }
    finally:
        # clean up
        if os.path.exists(instance_workspace):
            shutil.rmtree(instance_workspace)
        # Close the sandbox
        try:
            sandbox.close()
        except docker.errors.NotFound as e:
            logger.error(f'Failed to close sandbox: {e}')
    return output
|
|
|
|
|
|
if __name__ == '__main__':
    # Entry point: load the benchmark, build the run metadata, then evaluate
    # every pending instance with process_instance.
    id_column = 'instance_id'
    args = parse_arguments()

    # Only the 'osbench' split is evaluated; it is converted to a DataFrame so
    # each row can be handed to process_instance as a pd.Series.
    dataset = load_dataset('iFurySt/AgentBench')
    agent_bench_tests = dataset['osbench'].to_pandas()

    # An LLM config named on the command line takes precedence over the
    # application default.
    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        args.dataset_name,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    # Pass the osbench DataFrame, not the raw load_dataset() result: the
    # DataFrame was previously built and then never used.
    instances = prepare_dataset(
        agent_bench_tests, output_file, args.eval_n_limit, id_column
    )

    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )
|