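"""Run inference for the MINT benchmark (multi-turn interaction with tools)
inside OpenDevin's Docker sandbox.

For each instance this script builds a tool-use prompt, drives the agent via
`run_agent_controller` with a simulated user, and appends one JSON record per
instance to `output.jsonl`.

Example invocation (a sketch; it assumes this file lives at
evaluation/mint/run_infer.py, and flags other than --subset and
--max-propose-solution come from OpenDevin's shared evaluation parser):

    python -m evaluation.mint.run_infer --subset math --max-propose-solution 2
"""
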
import asyncio
import functools
import logging
import os
import pathlib
from typing import Any, Dict

from datasets import load_dataset

from evaluation.swe_bench.swe_env_box import DockerSSHBox
from evaluation.utils.shared import (
    EvalMetadata,
    make_metadata,
    prepare_dataset,
    run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM

from .datatypes import TaskState
from .env import SimplifiedEnv
from .prompts import ToolPromptTemplate
from .tasks import Task


def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str, int]):
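    """Simulate the user's side of the conversation for a MINT task.

    Steps the SimplifiedEnv with the agent's latest action and returns the
    feedback message to send back to the agent, or '/exit' once the task is
    finished (i.e. the environment produced no further output).
    """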
    logger.info(f'Gold reference: {task.reference}')
    logger.info(f'Task config: {task_config}')

    env = SimplifiedEnv(
        agent_state=state,
        task=task,
        task_config=task_config,
    )
    last_action = state.history.get_last_action()
    result_state: TaskState = env.step(last_action.message or '')

    state.task_state = result_state

    if not result_state.latest_output:
        # Task is finished
        msg = '/exit'
    else:
        msg = result_state.latest_output['content']

    logger.info('User response: ' + msg)
    return msg


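# Map an agent class name to the function that plays the simulated user.
# Agent classes without an entry here are not supported by this script.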
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response_mint,
}

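# Per-agent suffix appended to the instruction, telling the agent how to exit
# once the simulated user confirms its answer.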
AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': '\nIMPORTANT: When your answer is confirmed by the user to be correct, you can exit using the following command: <execute_bash> exit </execute_bash>.\n'
}


def process_instance(
    instance: Any,
    metadata: EvalMetadata,
    reset_logger: bool = True,
):
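    """Evaluate a single MINT instance end to end.

    Creates a process-specific workspace and Docker sandbox, installs the MINT
    requirements inside it, runs the agent against the instance with a
    simulated user, and returns the record that `run_evaluation` writes to the
    output JSONL file.
    """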
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
    # create a process-specific workspace dir
    workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
        )
        # Remove all existing handlers from the logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {instance.task_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove the console handler again and route all further logging for
        # this instance to its own file
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)

    logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')

    # use a session id for concurrent processing
    sid = instance.task_id + '_' + str(os.getpid())
    sandbox = DockerSSHBox(sid=sid)

    requirements_host_src = 'evaluation/mint/requirements.txt'
    requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
    sandbox.copy_to(
        host_src=requirements_host_src,
        sandbox_dest=requirements_sandbox_dest,
        recursive=False,
    )
    logger.info(
        f'Copied files from [{requirements_host_src}] to [{requirements_sandbox_dest}] inside sandbox.'
    )
    exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}')
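    # NOTE: exit_code and output are currently unused, so a failed install would
    # go unnoticed; consider checking exit_code == 0 if that matters for your run.
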
    # Prepare instruction
    assert metadata.details is not None
    instruction = ToolPromptTemplate(use_tool=True)(
        max_total_steps=metadata.max_iterations,
        max_propose_solution=metadata.details['max_propose_solution'],
        in_context_example=instance.in_context_example(
            use_tool=True, with_feedback=False
        ),
        task_prompt='Task:\n' + instance.prompt,
    )
    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the concise RESULT inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'

    # NOTE: You can set a slightly different instruction suffix for each agent class
    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]

    # Run the agent (similar to the `main` function) and get the final task state;
    # the partial binds the task so the controller only has to pass the state.
    fake_user_response_fn = functools.partial(
        AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent.__class__.__name__],
        task=instance,
        task_config={
            'max_iterations': metadata.max_iterations,
            'max_propose_solution': metadata.details['max_propose_solution'],
        },
    )

    state: State | None = asyncio.run(
        run_agent_controller(
            agent,
            instruction,
            max_iterations=metadata.max_iterations,
            fake_user_response_fn=fake_user_response_fn,
            sandbox=sandbox,
            sid=sid,
        )
    )

    if state is None:
        raise ValueError('State should not be None.')

    task_state = None
    if hasattr(state, 'task_state'):
        task_state = state.task_state
        logger.info('Task state: ' + str(task_state.to_dict()))

    metrics = state.metrics.get() if state.metrics else None

    # History is now available as a stream of events rather than a list of
    # (Action, Observation) pairs; remake the pairs here for compatibility with
    # the existing output format. Remove once this is no longer necessary.
    histories = state.history.compatibility_for_eval_history_pairs()

    # Save the output
    output = {
        'id': instance.task_id,
        'instance': instance.to_dict(),
        'instruction': instruction,
        'metadata': metadata.model_dump(),
        'history': histories,
        'metrics': metrics,
        'error': state.last_error if state.last_error else None,
        'test_result': task_state.success if task_state else False,
    }

    # Close the sandbox
    sandbox.close()

    return output


if __name__ == '__main__':
    parser = get_parser()

    parser.add_argument(
        '--subset',
        default='math',
        choices=['math', 'gsm8k', 'mmlu', 'theoremqa', 'mbpp', 'humaneval'],
        type=str,
        help='subset of the dataset to be used',
    )
    parser.add_argument(
        '--max-propose-solution',
        default=2,
        type=int,
        help='maximum number of times the agent can propose a solution',
    )

    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from Hugging Face and post-process
    # them here, so we don't need to manage file uploads to OpenDevin's repo.
    mint_dataset = load_dataset(
        'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test'
    )
    logger.info(f'Evaluating MINT - {args.subset} subset')
    mint_tests = mint_dataset.to_pandas()

    id_column = 'id'
    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        args.dataset_name,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details={'max_propose_solution': args.max_propose_solution},
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    # pass the pandas DataFrame built above (mint_tests was otherwise unused),
    # assuming prepare_dataset expects a DataFrame rather than the raw HF dataset
    instances = prepare_dataset(mint_tests, output_file, args.eval_n_limit, id_column)
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )