OpenHands/evaluation/mint/run_infer.py

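"""Run inference for the MINT benchmark with an OpenDevin agent.

For each task instance, the script builds a tool-use prompt, runs the agent in a
Docker sandbox against a simulated user (SimplifiedEnv), and writes per-instance
results to output.jsonl in the evaluation output directory.
"""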
import asyncio
import functools
import logging
import os
import pathlib
from typing import Any, Dict

from datasets import load_dataset

from evaluation.swe_bench.swe_env_box import DockerSSHBox
from evaluation.utils.shared import (
    EvalMetadata,
    make_metadata,
    prepare_dataset,
    run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM

from .datatypes import TaskState
from .env import SimplifiedEnv
from .prompts import ToolPromptTemplate
from .tasks import Task


def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str, int]):
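    """Simulate the MINT user: step SimplifiedEnv with the agent's last action and
    return the feedback message, or '/exit' once the task is finished."""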
    logger.info(f'Gold reference: {task.reference}')
    logger.info(f'Task config: {task_config}')

    env = SimplifiedEnv(
        agent_state=state,
        task=task,
        task_config=task_config,
    )
    last_action = state.history.get_last_action()
    result_state: TaskState = env.step(last_action.message or '')

    state.task_state = result_state

    if not result_state.latest_output:
        # Task is finished
        msg = '/exit'
    else:
        msg = result_state.latest_output['content']

    logger.info('User response: ' + msg)
    return msg


# Per-agent hooks: the fake-user response function that drives the MINT interaction
# loop, and an instruction suffix telling the agent how to terminate.
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response_mint,
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': '\nIMPORTANT: When your answer is confirmed by the user to be correct, you can exit using the following command: <execute_bash> exit </execute_bash>.\n'
}


def process_instance(
    instance: Any,
    metadata: EvalMetadata,
    reset_logger: bool = True,
):
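    """Evaluate a single MINT instance: set up the sandbox and logger, build the
    prompt, run the agent controller, and return a result dict for output.jsonl."""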
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))

    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
    # create process-specific workspace dir
    workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)

    # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {instance.task_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)

    logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')

    # use a session id for concurrent processing
    sid = instance.task_id + '_' + str(os.getpid())
    sandbox = DockerSSHBox(sid=sid)

    requirements_host_src = 'evaluation/mint/requirements.txt'
    requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
    sandbox.copy_to(
        host_src=requirements_host_src,
        sandbox_dest=requirements_sandbox_dest,
        recursive=False,
    )
    logger.info(
        f'Copied files from [{requirements_host_src}] to [{requirements_sandbox_dest}] inside sandbox.'
    )
    exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}')

    # Prepare instruction
    assert metadata.details is not None
    instruction = ToolPromptTemplate(use_tool=True)(
        max_total_steps=metadata.max_iterations,
        max_propose_solution=metadata.details['max_propose_solution'],
        in_context_example=instance.in_context_example(
            use_tool=True, with_feedback=False
        ),
        task_prompt='Task:\n' + instance.prompt,
    )
    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the concise RESULT inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    fake_user_response_fn = functools.partial(
        AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent.__class__.__name__],
        task=instance,
        task_config={
            'max_iterations': metadata.max_iterations,
            'max_propose_solution': metadata.details['max_propose_solution'],
        },
    )

    state: State | None = asyncio.run(
        run_agent_controller(
            agent,
            instruction,
            max_iterations=metadata.max_iterations,
            fake_user_response_fn=fake_user_response_fn,
            sandbox=sandbox,
            sid=sid,
        )
    )

    if state is None:
        raise ValueError('State should not be None.')

    task_state = None
    if hasattr(state, 'task_state'):
        task_state = state.task_state
        logger.info('Task state: ' + str(task_state.to_dict()))

    metrics = state.metrics.get() if state.metrics else None

    # History is now available as a stream of events, rather than a list of
    # (Action, Observation) pairs; for compatibility with the existing output
    # format, we remake the pairs here. Remove when it becomes unnecessary.
    histories = state.history.compatibility_for_eval_history_pairs()

    # Save the output
    output = {
        'id': instance.task_id,
        'instance': instance.to_dict(),
        'instruction': instruction,
        'metadata': metadata.model_dump(),
        'history': histories,
        'metrics': metrics,
        'error': state.last_error if state and state.last_error else None,
        'test_result': task_state.success if task_state else False,
    }

    # Close the sandbox
    sandbox.close()
    return output


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--subset',
        default='math',
        choices=['math', 'gsm8k', 'mmlu', 'theoremqa', 'mbpp', 'humaneval'],
        type=str,
        help='subset of the dataset to be used',
    )
    parser.add_argument(
        '--max-propose-solution',
        default=2,
        type=int,
        help='maximum number of times the agent can propose a solution',
    )
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenDevin's repo
    mint_dataset = load_dataset(
        'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test'
    )
    logger.info(f'Evaluating MINT - {args.subset} subset')
    mint_tests = mint_dataset.to_pandas()
    id_column = 'id'

    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        args.dataset_name,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details={'max_propose_solution': args.max_propose_solution},
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    # Pass the pandas DataFrame (mint_tests) built above; prepare_dataset filters
    # already-finished instances and applies the eval_n_limit cap.
    instances = prepare_dataset(mint_tests, output_file, args.eval_n_limit, id_column)
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )
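# Example invocation (illustrative only): --subset and --max-propose-solution are
# defined above; the remaining flags (--llm-config, --agent-cls, --max-iterations,
# --eval-num-workers, ...) come from the shared get_parser(), so their exact
# spellings may differ in your checkout.
#
#   poetry run python evaluation/mint/run_infer.py \
#       --subset math \
#       --max-propose-solution 2 \
#       --agent-cls CodeActAgent \
#       --max-iterations 5 \
#       --eval-num-workers 2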