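"""Run inference for the GAIA benchmark with an OpenHands agent.

For every GAIA instance, this script builds an OpenHandsConfig, spins up a
Docker runtime (staging any attached task file in /workspace), asks the agent
the task question, extracts the answer enclosed in <solution> tags from the
agent's final message, and scores it against the ground truth with
`question_scorer`.
"""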
import asyncio
import copy
import functools
import os
import re
import shutil
import zipfile

import huggingface_hub
import pandas as pd
from datasets import load_dataset
from PIL import Image

from evaluation.benchmarks.gaia.scorer import question_scorer
from evaluation.benchmarks.gaia.utils import (
    image_to_jpg_base64_url,
    image_to_png_base64_url,
)
from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    codeact_user_response,
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    get_metrics,
    get_openhands_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
    OpenHandsConfig,
    get_evaluation_parser,
    get_llm_config_arg,
    load_from_toml,
)
from openhands.core.config.utils import (
    get_agent_config_arg,
    get_llms_for_routing_config,
    get_model_routing_config_arg,
)
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')

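
# Canned "user" reply and instruction suffix for each supported agent class: the
# fake user response keeps the agent going without a human in the loop, and the
# suffix tells it to wrap its final answer in <solution> tags via the finish tool.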
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': functools.partial(codeact_user_response, encapsulate_solution=True),
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have solved the question, please use the finish tool and include your final answer in the message parameter of the finish tool. Your final answer MUST be encapsulated within <solution> and </solution>.\n'
}


def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
) -> OpenHandsConfig:
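    """Build the per-instance OpenHandsConfig for this evaluation.

    The run uses a Docker sandbox based on the
    'nikolaik/python-nodejs:python3.12-nodejs22' image, logs LLM completions per
    instance, wires up the model-routing LLMs from config.toml, and copies the
    search API key from config.toml into the config.
    """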
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'nikolaik/python-nodejs:python3.12-nodejs22'
    config = get_openhands_config_for_eval(
        metadata=metadata,
        sandbox_config=sandbox_config,
        runtime='docker',
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
        )
    )
    model_routing_config = get_model_routing_config_arg()
    model_routing_config.llms_for_routing = (
        get_llms_for_routing_config()
    )  # Populate with LLMs for routing from config.toml file

    if metadata.agent_config:
        metadata.agent_config.model_routing = model_routing_config
        config.set_agent_config(metadata.agent_config, metadata.agent_class)
    else:
        logger.info('Agent config not provided, using default settings')
        agent_config = config.get_agent_config(metadata.agent_class)
        agent_config.enable_prompt_extensions = False
        agent_config.model_routing = model_routing_config

    config_copy = copy.deepcopy(config)
    load_from_toml(config_copy)
    config.search_api_key = config_copy.search_api_key
    return config

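
# NOTE: non-image, non-zip attachments are renamed to /workspace/file.<extension>
# inside the sandbox; process_instance points the agent at that stable path.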
def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,
    metadata: EvalMetadata,
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
    """
    logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}')
    obs: CmdOutputObservation

    action = CmdRunAction(command='mkdir -p /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    if instance['file_name'] != '':
        # if this question comes with a file, we need to save it to the workspace
        assert metadata.data_split is not None
        extension_name = instance['file_name'].split('.')[-1]
        src_file = os.path.join(
            DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
        )
        assert os.path.exists(src_file)
        if extension_name == 'zip':
            temp_dir = os.path.join(
                DATASET_CACHE_DIR, '2023', metadata.data_split, 'tmp_file'
            )
            os.makedirs(temp_dir, exist_ok=True)
            with zipfile.ZipFile(src_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
            for root, dirs, files in os.walk(temp_dir):
                for file in files:
                    dest_file = '/workspace'
                    runtime.copy_to(os.path.join(root, file), dest_file)
            shutil.rmtree(temp_dir)
        elif extension_name not in ['jpg', 'png']:
            dest_file = '/workspace'
            runtime.copy_to(src_file, dest_file)

            # rename to file.extension_name
            action = CmdRunAction(
                command=f'mv /workspace/{instance["file_name"]} /workspace/file.{extension_name}'
            )
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            assert obs.exit_code == 0

    action = CmdRunAction(command='cd /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    # ffprobe ships with the ffmpeg package, so installing ffmpeg is sufficient
    action = CmdRunAction(command='apt-get update && apt-get install -y ffmpeg')
    runtime.run_action(action)
    logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}')


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
) -> EvalOutput:
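    """Run the agent on a single GAIA instance and score its answer.

    Builds the config and the task instruction (attaching any provided file or
    image), runs the controller to completion, pulls the text between
    <solution> tags out of the agent's final message, and scores it against the
    ground-truth 'Final answer' with `question_scorer`.
    """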
    config = get_config(instance, metadata)

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance['instance_id'], log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance["instance_id"]}.')

    if instance['file_name'] != '':
        extension_name = instance['file_name'].split('.')[-1]
        dest_file = os.path.join('/workspace', f'file.{extension_name}')
    else:
        dest_file = None

    # Prepare instruction
    instruction = """You have one question to answer. It is paramount that you provide a correct answer.
Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded.
You must make sure you find the correct answer! You MUST strictly follow the task-specific formatting instructions for your final answer.
Here is the task:
{task_question}
""".format(
        task_question=instance['Question'],
    )
    logger.info(f'Instruction: {instruction}')
    image_urls = []
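    # Attached files are referenced by their path inside /workspace; images are
    # instead passed to the agent as base64 data URLs alongside the instruction.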
    if dest_file:
        if extension_name not in ['jpg', 'png', 'zip']:
            instruction += f'To solve this task you will have to use the attached file provided in the workspace at location: {dest_file}\n\n'
        elif extension_name == 'zip':
            filenames = []
            src_file = os.path.join(
                DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
            )
            with zipfile.ZipFile(src_file, 'r') as zip_ref:
                filenames = zip_ref.namelist()

            filenames = [f'/workspace/{file}' for file in filenames]
            filenames = ', '.join(filenames)
            instruction += f'To solve this task you will have to use the attached files provided in the workspace at locations: {filenames}\n\n'
        else:  # Image files: jpg, png
            src_file = os.path.join(
                DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
            )
            instruction += 'Image: To solve this task you will have to use the image shown below.\n\n'
            image = Image.open(src_file)
            if extension_name == 'jpg':
                image_urls.append(image_to_jpg_base64_url(image))
            else:
                image_urls.append(image_to_png_base64_url(image))

    instruction += """IMPORTANT: When seeking information from a website, REFRAIN from arbitrary URL navigation. You should utilize the designated search engine tool with precise keywords to obtain relevant URLs or use the specific website's search interface. DO NOT navigate directly to specific URLs as they may not exist.\n\nFor example: if you want to search for a research paper on Arxiv, either use the search engine tool with specific keywords or navigate to arxiv.org and then use its interface.\n"""
    instruction += 'IMPORTANT: You should NEVER ask for Human Help.\n'
    instruction += 'IMPORTANT: Please encapsulate your final answer (answer ONLY) within <solution> and </solution>. Your answer will be evaluated using string matching approaches so it is important that you STRICTLY adhere to the output formatting instructions specified in the task (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)\n'
    instruction += (
        'For example: The answer to the question is <solution> 42 </solution>.\n'
    )
    instruction += "IMPORTANT: Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, express it numerically (i.e., with digits rather than words), do not use commas, and do not include units such as $ or percent signs unless specified otherwise. If you are asked for a string, don't use articles or abbreviations (e.g. for cities). If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.\n"

    # NOTE: You can actually set slightly different instructions for different agents
    instruction += AGENT_CLS_TO_INST_SUFFIX.get(metadata.agent_class, '')
    logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

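    # Create and connect the sandbox runtime, then stage the task's files in it.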
    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
    initialize_runtime(runtime, instance, metadata)

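    # `fake_user_response_fn` supplies the scripted reply whenever the agent asks
    # the (non-existent) user for input, pushing it to finish with a <solution> answer.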
    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State | None = asyncio.run(
        run_controller(
            config=config,
            initial_user_action=MessageAction(
                content=instruction, image_urls=image_urls
            ),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
            ],
        )
    )
    # ======= Attempt to evaluate the agent's answer =======
    # If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction),
    # you can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    model_answer_raw = ''
    # get the last message or thought from the agent
    for event in reversed(state.history):
        if event.source == 'agent':
            if isinstance(event, AgentFinishAction):
                model_answer_raw = event.final_thought
                break
            elif isinstance(event, CmdRunAction):
                model_answer_raw = event.thought
                break
            elif isinstance(event, MessageAction):
                model_answer_raw = event.content
                break

    # attempt to parse model_answer
    model_answer = re.findall(r'<solution>(.*?)</solution>', model_answer_raw)
    if len(model_answer) == 0:
        logger.warning(f'Failed to parse model answer: {model_answer_raw}')
        model_answer = model_answer_raw
    else:
        model_answer = model_answer[0]

    logger.info(
        f'Final message: {model_answer} | Ground truth: {instance["Final answer"]}'
    )
    score = question_scorer(
        model_answer=model_answer, ground_truth=instance['Final answer']
    )
    test_result = {
        'score': score,
        'model_answer_raw': model_answer_raw,
        'model_answer': model_answer,
        'ground_truth': instance['Final answer'],
    }
    metrics = get_metrics(state)

    # History is now available as a stream of events, rather than a list of (Action, Observation) pairs;
    # for compatibility with the existing output format, we remake the pairs here.
    # Remove this once it becomes unnecessary.
    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
        instance_id=instance['instance_id'],
        instance=instance.to_dict(),
        instruction=instance['Question'],
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
        test_result=test_result,
    )
    runtime.close()
    return output


if __name__ == '__main__':
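    # Example invocation (a sketch only; exact flag spellings come from
    # `get_evaluation_parser` and your local setup, so treat them as assumptions):
    #   python evaluation/benchmarks/gaia/run_infer.py \
    #       --llm_config <llm-config-name> --agent_cls CodeActAgent \
    #       --level 2023_level1 --data-split validation \
    #       --eval_n_limit 10 --eval_num_workers 1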
    parser = get_evaluation_parser()
    parser.add_argument(
        '--level',
        type=str,
        help='GAIA level to evaluate, e.g. 2023_level1',
    )
    parser.add_argument(
        '--data-split',
        type=str,
        help='data split to evaluate, e.g. test',
        default='validation',
    )
    args, _ = parser.parse_known_args()

    agent_config = None
    if args.agent_config:
        agent_config = get_agent_config_arg(args.agent_config)

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)
        # modify_params must be False for evaluation, for reproducibility and accuracy of results
        llm_config.modify_params = False

    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    toml_config = OpenHandsConfig()
    load_from_toml(toml_config)
    metadata = make_metadata(
        llm_config=llm_config,
        dataset_name='gaia',
        agent_class=args.agent_cls,
        max_iterations=args.max_iterations,
        eval_note=args.eval_note,
        eval_output_dir=args.eval_output_dir,
        data_split=args.data_split,
        details={
            'gaia-level': args.level,
            'mcp-servers': ['tavily'] if toml_config.search_api_key else [],
        },
        agent_config=agent_config,
    )

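    # Download the GAIA dataset and its attached files into DATASET_CACHE_DIR;
    # initialize_runtime later copies per-instance files from there into the sandbox.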
    dataset = load_dataset('gaia-benchmark/GAIA', args.level)
    huggingface_hub.snapshot_download(
        'gaia-benchmark/GAIA',
        repo_type='dataset',
        local_dir=DATASET_CACHE_DIR,
    )
    gaia_tests = dataset[metadata.data_split].to_pandas()
    gaia_tests.rename(columns={'task_id': 'instance_id'}, inplace=True)

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    prepared_dataset = prepare_dataset(gaia_tests, output_file, args.eval_n_limit)

    run_evaluation(
        dataset=prepared_dataset,
        metadata=metadata,
        output_file=output_file,
        num_workers=args.eval_num_workers,
        process_instance_func=process_instance,
    )