Simplify eval code (#2775)
* Start simplifying eval code
* Update
* Add EDA
* Updated GAIA
* Update gpqa
* Add humanevalfix
* Fix logic_reasoning
* Add miniwob
* Add mint and ml_bench
* toolqa
* Added swe-bench
* Fixed webarena
* Refactor parameters
This commit is contained in:
parent 038e8f8caa
commit a081935fd8
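All of the per-benchmark scripts in this diff converge on one pattern: build an EvalMetadata via make_metadata, resolve the output file and skip already-finished instances via prepare_dataset, construct the agent inside process_instance, and let run_evaluation drive the worker pool and the output.jsonl bookkeeping. Below is a minimal sketch of that pattern, assuming the helper signatures as they appear in this diff; the dataset name, split, and id column are illustrative placeholders, not part of the commit.

# Minimal sketch of the refactored eval entrypoint, assuming the helper
# signatures visible in this diff (evaluation/utils/shared.py). The dataset
# name and id column below are placeholders for illustration only.
import os

import pandas as pd
from datasets import load_dataset

from evaluation.utils.shared import (
    EvalMetadata,
    make_metadata,
    prepare_dataset,
    run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.core.config import LLMConfig, get_llm_config_arg, parse_arguments
from opendevin.llm.llm import LLM


def process_instance(
    instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True
):
    # Each worker now builds its own agent from the metadata instead of
    # receiving a pickled Agent argument.
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
    # ... run the agent on `instance` and collect its result ...
    return {'instance_id': instance['instance_id'], 'test_result': None}


if __name__ == '__main__':
    args = parse_arguments()
    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
    metadata = make_metadata(
        llm_config,
        'my-benchmark',  # placeholder dataset name
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    dataset = load_dataset('org/benchmark', split='test').to_pandas()  # placeholder
    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, 'instance_id')
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        'instance_id',
    )

The individual benchmark scripts differ mainly in how they load their dataset and in the details dict passed to make_metadata (for example, EDA's answerer model and OpenAI key, or GAIA's level).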
@@ -1,18 +1,21 @@
import asyncio
import json
import logging
import multiprocessing as mp
import os
import pathlib
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor

import pandas as pd

# import huggingface_hub
from datasets import load_dataset
from tqdm import tqdm

from evaluation.EDA.game import Q20Game, Q20GameCelebrity
from evaluation.utils.shared import (
    EvalMetadata,
    make_metadata,
    monologue_user_response,
    prepare_dataset,
    run_evaluation,
)
from opendevin.controller.agent import Agent

# from evaluation.EDA.scorer import question_scorer
@ -36,7 +39,7 @@ def cleanup():
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
def codeact_user_response_eda(state: State) -> str:
|
||||
global game
|
||||
model_guess = ''
|
||||
if state.history:
|
||||
@ -54,12 +57,8 @@ def codeact_user_response(state: State) -> str:
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'CodeActAgent': codeact_user_response_eda,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
@ -69,10 +68,14 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent, instance, metadata, openai_api_key, reset_logger: bool = True
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
# Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
|
||||
eval_output_dir = metadata['eval_output_dir']
|
||||
eval_output_dir = metadata.eval_output_dir
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(
|
||||
@ -107,12 +110,14 @@ def process_instance(
|
||||
|
||||
# Use codeactagent as guesser_model
|
||||
global game
|
||||
game = _game_class[metadata['dataset']](
|
||||
assert metadata.dataset is not None
|
||||
assert metadata.details is not None
|
||||
game = _game_class[metadata.dataset](
|
||||
item=instance['text'].strip(),
|
||||
answerer_model=metadata['answerer_model'],
|
||||
answerer_model=metadata.details['answerer_model'],
|
||||
guesser_model=None,
|
||||
num_turns=metadata['max_iterations'],
|
||||
openai_api_key=openai_api_key,
|
||||
num_turns=metadata.max_iterations,
|
||||
openai_api_key=metadata.details['openai_api_key'],
|
||||
guesser_kargs=guesser_kargs,
|
||||
)
|
||||
|
||||
@ -195,151 +200,42 @@ if __name__ == '__main__':
|
||||
help='data split, eg, test',
|
||||
)
|
||||
args, _ = parser.parse_known_args()
|
||||
if args.directory:
|
||||
config.workspace_base = os.path.abspath(args.directory)
|
||||
print(f'Setting workspace base to {config.workspace_base}')
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenDevin's repo
|
||||
eda_dataset = load_dataset(
|
||||
'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
|
||||
)
|
||||
logger.info(
|
||||
f'Evaluating Entity Deduction Arena {args.dataset} {args.data_split} split'
|
||||
)
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert (
|
||||
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
eda_dataset = load_dataset(
|
||||
'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
|
||||
)
|
||||
|
||||
metadata = make_metadata(
|
||||
config.llm,
|
||||
f'eda-{args.dataset}',
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
'eda',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
data_split=args.data_split,
|
||||
details={
|
||||
'answerer_model': str(args.answerer_model),
|
||||
'openai_api_key': str(args.OPENAI_API_KEY),
|
||||
},
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'dataset': args.dataset,
|
||||
'data_split': args.data_split,
|
||||
'answerer_model': args.answerer_model,
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
eda_dataset = eda_dataset.select(list(range(eval_n_limit)))
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_items = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_items.add(data['instance_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_items)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
prepared_dataset = prepare_dataset(
|
||||
eda_dataset.to_pandas(), output_file, args.eval_n_limit, 'text'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_eda_dataset = []
|
||||
for instance in eda_dataset:
|
||||
if instance['text'].strip() in finished_items:
|
||||
logger.info(
|
||||
f'Skipping instance {instance["text"].strip()} as it is already finished.'
|
||||
)
|
||||
continue
|
||||
new_eda_dataset.append(instance)
|
||||
agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
|
||||
|
||||
eda_dataset = new_eda_dataset
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_items)}, Remaining instances: {len(eda_dataset)}'
|
||||
run_evaluation(
|
||||
prepared_dataset,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
'text',
|
||||
)
|
||||
# =============================================
|
||||
|
||||
pbar = tqdm(total=len(eda_dataset))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
|
||||
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output["instance_id"]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["test_result"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
# This sets up multiprocessing
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multi-processing
|
||||
for instance in eda_dataset:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
metadata,
|
||||
args.OPENAI_API_KEY,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
|
||||
|
||||
@@ -1,9 +1,39 @@
import os
import re
from functools import partial

from evaluation.utils.shared import codeact_user_response
from opendevin.events.action import CmdRunAction, MessageAction


def try_parse_answer(act) -> str | None:
    raw_ans = ''
    if isinstance(act, MessageAction) and act.source == 'agent':
        raw_ans = act.content
    elif isinstance(act, CmdRunAction) and act.source == 'agent':
        raw_ans = act.thought
    else:
        return None
    agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
    if not agent_answer:
        return None
    return agent_answer[0].strip()


FAKE_RESPONSES = {
    'CodeActAgent': partial(
        codeact_user_response, encapsulate_solution=True, try_parse=try_parse_answer
    ),
}

INST_SUFFIXES: dict[str, str] = {
    'CodeActAgent': (
        'When you think you have solved the question, '
        'please first send your answer to user through message and then exit.\n'
    )
}


def analysis_size(size_str):
    size_str = size_str.strip()
    avails = {
@@ -45,17 +75,3 @@ def create_sh_file(filename: str, cmds: str) -> None:
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(cmds.replace('\r\n', '\n'))
    os.chmod(filename, 0o755)


def try_parse_answer(act) -> str | None:
    raw_ans = ''
    if isinstance(act, MessageAction) and act.source == 'agent':
        raw_ans = act.content
    elif isinstance(act, CmdRunAction) and act.source == 'agent':
        raw_ans = act.thought
    else:
        return None
    agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
    if not agent_answer:
        return None
    return agent_answer[0].strip()
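A side note on the FAKE_RESPONSES table defined above: functools.partial pre-binds keyword options onto the shared codeact_user_response helper, so each entry still looks like a plain state -> str callback to the controller. A toy illustration of that binding follows; the function below is invented purely for demonstration and is not part of the codebase.

from functools import partial


def fake_user_response(state, encapsulate_solution=False, try_parse=None):
    # Stand-in for evaluation.utils.shared.codeact_user_response; the real
    # helper inspects state.history, this toy only echoes its configuration.
    return f'encapsulate={encapsulate_solution}, parser={try_parse}'


# Pre-bind the benchmark-specific options; callers still pass only `state`.
responses = {'CodeActAgent': partial(fake_user_response, encapsulate_solution=True)}
print(responses['CodeActAgent'](state=None))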
@ -1,27 +1,28 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
|
||||
import docker
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.agent_bench.helper import (
|
||||
FAKE_RESPONSES,
|
||||
INST_SUFFIXES,
|
||||
compare_results,
|
||||
create_sh_file,
|
||||
try_parse_answer,
|
||||
)
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@ -31,64 +32,13 @@ from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
|
||||
|
||||
def cleanup():
|
||||
print('Cleaning up child processes...')
|
||||
for process in mp.active_children():
|
||||
print(f'Terminating child process: {process.name}')
|
||||
process.terminate()
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'If you think you have solved the task, please first send your answer to user through '
|
||||
'message and then <execute_bash> exit </execute_bash>.\n'
|
||||
'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
|
||||
'For example: The answer to the question is <solution> 42 </solution>.\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
|
||||
)
|
||||
if state.history:
|
||||
# check if the last action is an answer; if so, return '/exit' to stop early
|
||||
last_action, _ = state.history[-1]
|
||||
ans = try_parse_answer(last_action)
|
||||
if ans is not None:
|
||||
return '/exit'
|
||||
|
||||
user_msgs = [
|
||||
action
|
||||
for action, _ in state.history
|
||||
if isinstance(action, MessageAction) and action.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
}
|
||||
|
||||
AGENT_CLS_TO_INST_SUFFIX = {
|
||||
'CodeActAgent': 'When you think you have solved the question, '
|
||||
'please first send your answer to user through message and then exit.\n'
|
||||
}
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent,
|
||||
instance,
|
||||
metadata,
|
||||
eval_output_dir,
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# =============================================
|
||||
# preparation
|
||||
# =============================================
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
|
||||
inst_id = instance.instance_id
|
||||
question = instance.description
|
||||
@ -104,7 +54,9 @@ def process_instance(
|
||||
# Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(eval_output_dir, 'logs', f'instance_{inst_id}.log')
|
||||
log_file = os.path.join(
|
||||
metadata.eval_output_dir, 'logs', f'instance_{inst_id}.log'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
@ -140,7 +92,7 @@ def process_instance(
|
||||
'to you AND NEVER ASK FOR HUMAN HELP.\n'
|
||||
)
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
instruction += INST_SUFFIXES[agent.__class__.__name__]
|
||||
|
||||
# =============================================
|
||||
# create sandbox and run the agent
|
||||
@ -164,9 +116,7 @@ def process_instance(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
fake_user_response_fn=FAKE_RESPONSES[agent.__class__.__name__],
|
||||
sandbox=sandbox,
|
||||
sid=inst_id,
|
||||
)
|
||||
@ -262,154 +212,27 @@ def process_instance(
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
id_column = 'instance_id'
|
||||
args = parse_arguments()
|
||||
|
||||
# =============================================
|
||||
# load datasets
|
||||
# =============================================
|
||||
dataset = load_dataset('iFurySt/AgentBench')
|
||||
agent_bench_tests = dataset['osbench'].to_pandas()
|
||||
logger.info(f'Loaded {len(agent_bench_tests)} tests.')
|
||||
|
||||
# =============================================
|
||||
# handle arguments and prepare for evaluation
|
||||
# =============================================
|
||||
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_cls = args.agent_cls
|
||||
assert (
|
||||
agent_cls in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_cls}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_op_dir = str(
|
||||
os.path.join(
|
||||
args.eval_output_dir,
|
||||
'agent_bench',
|
||||
agent_cls,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
)
|
||||
|
||||
pathlib.Path(eval_op_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(str(os.path.join(eval_op_dir, 'logs'))).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
|
||||
run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_op_dir}')
|
||||
|
||||
meta = {
|
||||
'agent_class': agent_cls,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_op_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {meta}')
|
||||
with open(os.path.join(eval_op_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(meta, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
agent_bench_tests = agent_bench_tests[:eval_n_limit]
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_op_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_instance_ids.add(data['instance_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_cls}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
# =============================================
|
||||
|
||||
new_agent_bench_tests = []
|
||||
for idx, inst in agent_bench_tests.iterrows():
|
||||
if inst.instance_id in finished_instance_ids:
|
||||
logger.info(
|
||||
f'Skipping instance {inst.instance_id} as it is already finished.'
|
||||
)
|
||||
continue
|
||||
new_agent_bench_tests.append(inst)
|
||||
|
||||
agent_bench_tests = new_agent_bench_tests
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(agent_bench_tests)}'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# start task
|
||||
# =============================================
|
||||
|
||||
pbar = tqdm(total=len(agent_bench_tests))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
|
||||
def update_progress(fut):
|
||||
pbar.update(1)
|
||||
output = fut.result()
|
||||
pbar.set_description(f'Instance {output["instance_id"]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
# This sets up multiprocessing
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_cls)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multiprocessing
|
||||
for inst in agent_bench_tests:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
inst,
|
||||
meta,
|
||||
eval_op_dir,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
|
||||
|
||||
@ -4,22 +4,26 @@ import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from functools import partial
|
||||
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.biocoder.biocoder_env_box import BiocoderData, BiocoderSSHBox
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
@ -32,33 +36,10 @@ def cleanup():
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
|
||||
)
|
||||
if state.history:
|
||||
user_msgs = [
|
||||
action
|
||||
for action, _ in state.history
|
||||
if isinstance(action, MessageAction) and action.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'CodeActAgent': partial(
|
||||
codeact_user_response, encapsulate_solution=True, try_parse=None
|
||||
),
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
@ -112,13 +93,12 @@ def get_test_result(instance, sandbox, workspace_dir_name):
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent,
|
||||
instance,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
instance = BiocoderData(**instance)
|
||||
print(instance)
|
||||
workspace_dir_name = (
|
||||
@ -130,15 +110,14 @@ def process_instance(
|
||||
# create process-specific workspace dir
|
||||
# if `not skip_workspace_mount` - we will create a workspace directory for EACH process
|
||||
# so that different agents don't interfere with each other.
|
||||
if not skip_workspace_mount:
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(
|
||||
eval_output_dir, 'logs', f'instance_{instance.test_case_id}.log'
|
||||
metadata.eval_output_dir, 'logs', f'instance_{instance.test_case_id}.log'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
@ -157,8 +136,7 @@ def process_instance(
|
||||
)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
if not skip_workspace_mount:
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
|
||||
# NOTE: this is something special we do for SWE-Bench due to the reason described in the previous section
|
||||
# You can omit this if you don't need to set up a specialized sandbox
|
||||
@ -192,25 +170,6 @@ def process_instance(
|
||||
'You do not need to run the code to check if it works. \n'
|
||||
'Make sure to include proper formatting in Java and Python, including correct braces and/or indentation.\n'
|
||||
)
|
||||
|
||||
# instruction = (
|
||||
# f'In the file {instance.filePath}, there is a function with a signature and without a body. Your job is to complete the function, according to the given instructions. When you complete the function, respond with the function body, and nothing else.'
|
||||
# 'The repository has cloned for you to start working. You are not allowed to run any bash commands, just modify the files. \n\n'
|
||||
# '# Problem Statement\n'
|
||||
# 'Complete the following function signature:\n\n'
|
||||
# f'{instance.signature}'
|
||||
# 'The function should do the following:\n\n'
|
||||
# f'{instance.promptSummaryOnly}\n\n'
|
||||
# )
|
||||
#
|
||||
# instruction += (
|
||||
# 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
|
||||
# 'You should NOT modify any other files other than the file intended. This means that you should NOT write any test cases.\n'
|
||||
# 'Do NOT add any import statements or change anything else other than the writing the function body.\n'
|
||||
# 'You do not need to run the code to check if it works. The system will automatically check the correctness of your code.\n'
|
||||
# 'Make sure to include proper formatting in Java and Python, including correct braces and/or indentation.\n'
|
||||
# )
|
||||
|
||||
# NOTE: You can actually set slightly different instructions for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
|
||||
@ -257,150 +216,27 @@ def process_instance(
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
id_column = 'test_case_id'
|
||||
args = parse_arguments()
|
||||
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenDevin's repo
|
||||
dataset = load_dataset('lilbillbiscuit/biocoder_public')
|
||||
biocoder_tests = dataset['test'].to_pandas()
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert (
|
||||
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
'biocoder',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
|
||||
eval_output_dir = str(eval_output_dir)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
|
||||
run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
biocoder_tests = biocoder_tests.head(eval_n_limit)
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_test_case_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_test_case_ids.add(data['test_case_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_test_case_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_biocoder_tests = []
|
||||
for idx, instance in biocoder_tests.iterrows():
|
||||
if instance.test_case_id in finished_test_case_ids:
|
||||
logger.info(
|
||||
f'Skipping instance {instance.test_case_id} as it is already finished.'
|
||||
)
|
||||
continue
|
||||
new_biocoder_tests.append(instance)
|
||||
|
||||
biocoder_tests = pd.DataFrame(new_biocoder_tests)
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_test_case_ids)}, Remaining instances: {len(biocoder_tests)}'
|
||||
)
|
||||
# =============================================
|
||||
|
||||
pbar = tqdm(total=len(biocoder_tests))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
|
||||
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output["test_case_id"]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["test_result"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["test_case_id"]}: {output["test_result"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
# This sets up multiprocessing
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# This is SWE-Bench specific - CodeActAgent doesn't require mounted workspace to work
|
||||
skip_workspace_mount = agent_class == 'CodeActAgent'
|
||||
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multi-processing
|
||||
for row_idx, instance in biocoder_tests.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
|
||||
|
||||
@ -8,17 +8,21 @@ import re
|
||||
import shutil
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from func_timeout import FunctionTimedOut, func_timeout
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@ -128,17 +132,17 @@ def get_test_result(instance, path, timeout=30):
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent, instance, metadata, skip_workspace_mount, reset_logger: bool = True
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
workspace_mount_path = os.path.join(
|
||||
config.workspace_mount_path, 'bird_eval_workspace'
|
||||
)
|
||||
# create process-specific workspace dir
|
||||
# if `not skip_workspace_mount` - we will create a workspace directory for EACH process
|
||||
# so that different agent don't interfere with each other.
|
||||
if not skip_workspace_mount:
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# reset workspace to config
|
||||
config.workspace_mount_path = workspace_mount_path
|
||||
@ -162,7 +166,7 @@ def process_instance(
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(
|
||||
eval_output_dir,
|
||||
metadata.eval_output_dir,
|
||||
'logs',
|
||||
f'instance_{sid}.log',
|
||||
)
|
||||
@ -183,8 +187,7 @@ def process_instance(
|
||||
)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
if not skip_workspace_mount:
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
|
||||
# Create file with BIRD instance
|
||||
statements = f"""
|
||||
@ -386,143 +389,27 @@ def create_prompt(e, database_path):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
id_column = 'task_id'
|
||||
args = parse_arguments()
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenDevin's repo
|
||||
# Due to the large size of the BIRD database, it cannot be hosted on huggingface datasets, so it needs to be downloaded
|
||||
bird_dataset = load_bird()
|
||||
bird_tests = bird_dataset['test'].to_pandas()
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/humanevalfix/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert (
|
||||
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
dataset = bird_dataset['test'].to_pandas()
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
'bird',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
|
||||
run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
bird_tests = bird_tests.head(eval_n_limit)
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_instance_ids.add(data['task_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_bird_tests = []
|
||||
for idx, instance in bird_tests.iterrows():
|
||||
if instance.task_id in finished_instance_ids:
|
||||
logger.info(
|
||||
f'Skipping instance {instance.task_id} as it is already finished.'
|
||||
)
|
||||
continue
|
||||
new_bird_tests.append(instance)
|
||||
|
||||
bird_tests = pd.DataFrame(new_bird_tests)
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(bird_tests)}'
|
||||
)
|
||||
# =============================================
|
||||
|
||||
pbar = tqdm(total=len(bird_tests))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
|
||||
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output["task_id"]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["task_id"]}: {output["test_result"]["result"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
# This sets up multiprocessing
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multi-processing
|
||||
for row_idx, instance in bird_tests.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
metadata,
|
||||
skip_workspace_mount=False,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
|
||||
|
||||
@ -1,23 +1,27 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from functools import partial
|
||||
|
||||
import huggingface_hub
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.gaia.scorer import question_scorer
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import config, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@ -29,43 +33,8 @@ DATASET_CACHE_DIR = '~/.cache/open-devin/evals/gaia'
|
||||
DATASET_CACHE_DIR = os.path.expanduser(DATASET_CACHE_DIR)
|
||||
|
||||
|
||||
def cleanup():
|
||||
logger.info('Cleaning up child processes...')
|
||||
for process in mp.active_children():
|
||||
logger.info(f'Terminating child process: {process.name}')
|
||||
process.terminate()
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'If you think you have solved the task, please first send your answer to user through message and then <execute_bash> exit </execute_bash>.\n'
|
||||
'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
|
||||
'For example: The answer to the question is <solution> 42 </solution>.\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
|
||||
)
|
||||
if state.history:
|
||||
user_msgs = [
|
||||
action
|
||||
for action, _ in state.history
|
||||
if isinstance(action, MessageAction) and action.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'CodeActAgent': partial(codeact_user_response, encapsulate_solution=True),
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
@ -74,7 +43,13 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
||||
}
|
||||
|
||||
|
||||
def process_instance(agent, instance, metadata, reset_logger: bool = True):
|
||||
def process_instance(
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
# create process-specific workspace dir
|
||||
# we will create a workspace directory for EACH process
|
||||
# so that different agent don't interfere with each other.
|
||||
@ -89,7 +64,7 @@ def process_instance(agent, instance, metadata, reset_logger: bool = True):
|
||||
config.workspace_mount_path = workspace_mount_path
|
||||
|
||||
# Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
|
||||
eval_output_dir = metadata['eval_output_dir']
|
||||
eval_output_dir = metadata.eval_output_dir
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(
|
||||
@ -115,8 +90,9 @@ def process_instance(agent, instance, metadata, reset_logger: bool = True):
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
if instance['file_name'] != '':
|
||||
# if this question comes with a file, we need to save it to the workspace
|
||||
assert metadata.data_split is not None
|
||||
src_file = os.path.join(
|
||||
DATASET_CACHE_DIR, '2023', metadata['data_split'], instance['file_name']
|
||||
DATASET_CACHE_DIR, '2023', metadata.data_split, instance['file_name']
|
||||
)
|
||||
extension_name = instance['file_name'].split('.')[-1]
|
||||
dest_file = os.path.join(workspace_mount_path, f'file.{extension_name}')
|
||||
@ -218,159 +194,42 @@ if __name__ == '__main__':
|
||||
type=str,
|
||||
help='gaia level to evaluate, eg. 2023_level1',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--data-split',
|
||||
type=str,
|
||||
help='data split to evaluate, eg. validation',
|
||||
)
|
||||
args, _ = parser.parse_known_args()
|
||||
if args.directory:
|
||||
config.workspace_base = os.path.abspath(args.directory)
|
||||
logger.info(f'Setting workspace base to {config.workspace_base}')
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenDevin's repo
|
||||
level = args.level
|
||||
data_split = args.data_split
|
||||
dataset = load_dataset('gaia-benchmark/GAIA', level)
|
||||
|
||||
metadata = make_metadata(
|
||||
llm_config=config.llm,
|
||||
dataset_name='gaia',
|
||||
agent_class=args.agent_cls,
|
||||
max_iterations=args.max_iterations,
|
||||
eval_note=args.eval_note,
|
||||
eval_output_dir=args.eval_output_dir,
|
||||
data_split=args.data_split,
|
||||
details={'gaia-level': args.level},
|
||||
)
|
||||
|
||||
dataset = load_dataset('gaia-benchmark/GAIA', args.level)
|
||||
huggingface_hub.snapshot_download(
|
||||
'gaia-benchmark/GAIA',
|
||||
repo_type='dataset',
|
||||
local_dir=DATASET_CACHE_DIR,
|
||||
)
|
||||
gaia_tests = dataset[data_split]
|
||||
logger.info(f'Evaluating GAIA-Benchmark {level} {data_split} split')
|
||||
gaia_tests = dataset[metadata.data_split]
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert (
|
||||
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
args.eval_output_dir,
|
||||
'gaia',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
prepared_dataset = prepare_dataset(
|
||||
gaia_tests.to_pandas(), output_file, args.eval_n_limit, 'task_id'
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
|
||||
|
||||
run_evaluation(
|
||||
dataset=prepared_dataset,
|
||||
metadata=metadata,
|
||||
output_file=output_file,
|
||||
num_workers=args.eval_num_workers,
|
||||
process_instance_func=process_instance,
|
||||
id_column='task_id',
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'gaia-level': level,
|
||||
'data_split': data_split,
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
gaia_tests = gaia_tests.select(list(range(eval_n_limit)))
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_task_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_task_ids.add(data['instance_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_gaia_tests = []
|
||||
for instance in gaia_tests:
|
||||
if instance['task_id'] in finished_task_ids:
|
||||
logger.info(
|
||||
f'Skipping instance {instance["task_id"]} as it is already finished.'
|
||||
)
|
||||
continue
|
||||
new_gaia_tests.append(instance)
|
||||
|
||||
gaia_tests = new_gaia_tests
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_task_ids)}, Remaining instances: {len(gaia_tests)}'
|
||||
)
|
||||
# =============================================
|
||||
|
||||
pbar = tqdm(total=len(gaia_tests))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
|
||||
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output["instance_id"]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["test_result"]["score"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
# This sets up multiprocessing
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multi-processing
|
||||
for instance in gaia_tests:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
metadata,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
logger.info('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
|
||||
|
||||
@ -18,24 +18,26 @@ TODOs:
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import random
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import config, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@ -43,41 +45,6 @@ from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
|
||||
def cleanup():
|
||||
logger.info('Cleaning up child processes...')
|
||||
for process in mp.active_children():
|
||||
logger.info(f'Terminating child process: {process.name}')
|
||||
process.terminate()
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'Feel free to use all tools for calculations and solving the problem, and web-search for finding relevant facts during the process if needed\n'
|
||||
'If you think you have reliably finished solving the problem, first generate a message reporting the final concise answer to the user. Once that is done, please run the following command: <execute_bash> exit </execute_bash>.\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP TO SOLVE THIS TASK.\n'
|
||||
)
|
||||
if state.history:
|
||||
user_msgs = [
|
||||
action
|
||||
for action, _ in state.history
|
||||
if isinstance(action, MessageAction) and action.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, just generate a final answer message to the user and in the next turn --> run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
@ -156,16 +123,12 @@ def convert_instance_dict(instance):
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent,
|
||||
instance: dict,
|
||||
metadata: dict,
|
||||
skip_workspace_mount: bool,
|
||||
eval_output_dir: str,
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
"""
|
||||
Process a single instance from the dataset
|
||||
"""
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
old_workspace_mount_path = config.workspace_mount_path
|
||||
old_workspace_base = config.workspace_base
|
||||
try:
|
||||
@ -173,31 +136,18 @@ def process_instance(
|
||||
config.workspace_mount_path, '_eval_workspace'
|
||||
)
|
||||
# create process-specific workspace dir
|
||||
# if `not skip_workspace_mount` - we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
skip_workspace_mount = False
|
||||
if not skip_workspace_mount:
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# reset workspace to config
|
||||
config.workspace_base = workspace_mount_path
|
||||
config.workspace_mount_path = workspace_mount_path
|
||||
|
||||
# workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
|
||||
# workspace_mount_path = os.path.abspath(workspace_mount_path)
|
||||
# # create process-specific workspace dir
|
||||
# # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
# # so that different agents don't interfere with each other.
# if not skip_workspace_mount:
|
||||
# workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
# pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Set up the logger properly so you can run multi-processing to parallelize the evaluation
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(
|
||||
eval_output_dir, 'logs', f'instance_{instance.instance_id}.log'
|
||||
metadata.eval_output_dir, 'logs', f'instance_{instance.instance_id}.log'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
@ -218,8 +168,7 @@ def process_instance(
|
||||
else:
|
||||
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
|
||||
|
||||
if not skip_workspace_mount:
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
|
||||
# ======= Run the agent on the instance =======
|
||||
# Prepare instruction for the agent using suggested format in gpqa codebase
|
||||
@ -329,148 +278,28 @@ if __name__ == '__main__':
|
||||
gpqa_dataset['task_id'] = gpqa_dataset.index
|
||||
# gpqa_dataset = dataset['train'].to_pandas().sort_values(by='id').reset_index(drop=True)
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert (
|
||||
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
args.eval_output_dir,
|
||||
'gpqa',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
metadata = make_metadata(
|
||||
llm_config=config.llm,
|
||||
dataset_name='gpqa',
|
||||
agent_class=args.agent_cls,
|
||||
max_iterations=args.max_iterations,
|
||||
eval_note=args.eval_note,
|
||||
eval_output_dir=args.eval_output_dir,
|
||||
data_split=args.data_split,
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit # NOTE: This is useful for debugging and testing using a smaller subset of the dataset
|
||||
if eval_n_limit:
|
||||
# start_index = 20
|
||||
# gpqa_dataset = gpqa_dataset.iloc[start_index:]
|
||||
gpqa_dataset = gpqa_dataset.head(eval_n_limit)
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
logger.info('#############################################')
|
||||
logger.info(f'{eval_n_limit} instances will be evaluated.')
|
||||
logger.info('#############################################')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_instance_ids.add(data['instance_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
prepared_dataset = prepare_dataset(
|
||||
gpqa_dataset, output_file, args.eval_n_limit, 'task_id'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_gpqa_dataset = []
|
||||
for idx, instance in gpqa_dataset.iterrows():
|
||||
# instance = convert_instance_dict(instance) # preprocessing
|
||||
if instance.instance_id in finished_instance_ids:
|
||||
logger.info(
|
||||
f'Skipping instance {instance.instance_id} as it is already finished.'
|
||||
)
|
||||
continue
|
||||
new_gpqa_dataset.append(instance)
|
||||
agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
|
||||
|
||||
gpqa_dataset = pd.DataFrame(new_gpqa_dataset)
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(gpqa_dataset)}'
|
||||
run_evaluation(
|
||||
dataset=prepared_dataset,
|
||||
metadata=metadata,
|
||||
output_file=output_file,
|
||||
num_workers=args.eval_num_workers,
|
||||
process_instance_func=process_instance,
|
||||
id_column='task_id',
|
||||
)
|
||||
# =============================================
|
||||
|
||||
pbar = tqdm(total=len(gpqa_dataset))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output["instance_id"]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
# This sets up the multi-processing
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# This is SWE-Bench specific - CodeActAgent doesn't require a mounted workspace to work
skip_workspace_mount = agent_class == 'CodeActAgent'
|
||||
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multi-processing
|
||||
for row_idx, instance in gpqa_dataset.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
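In the old gpqa driver above, resuming an interrupted run meant re-reading output.jsonl and skipping the finished ids by hand; `prepare_dataset` is assumed to absorb that logic. A standalone sketch of the same behavior follows (the helper name `filter_finished` is illustrative, not part of the codebase):

import json
import os

import pandas as pd


def filter_finished(dataset: pd.DataFrame, output_file: str, id_column: str) -> pd.DataFrame:
    """Drop rows whose id is already present in output.jsonl so a rerun resumes."""
    finished_ids: set = set()
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            for line in f:
                finished_ids.add(json.loads(line)[id_column])
    remaining = dataset[~dataset[id_column].isin(finished_ids)]
    return remaining.reset_index(drop=True)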
@ -10,27 +10,28 @@ TODOs:
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from evaluate import load
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
@ -63,40 +64,6 @@ LANGUAGE_TO_NUM_WORKERS = {
|
||||
'python': 4,
|
||||
}
|
||||
|
||||
|
||||
def cleanup():
|
||||
logger.info('Cleaning up child processes...')
|
||||
for process in mp.active_children():
|
||||
logger.info(f'Terminating child process: {process.name}')
|
||||
process.terminate()
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
|
||||
)
|
||||
if state.history:
|
||||
user_msgs = [
|
||||
action
|
||||
for action, _ in state.history
|
||||
if isinstance(action, MessageAction) and action.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
@ -138,8 +105,12 @@ def get_test_result(instance, path, language='python', timeout=10):
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent, instance, metadata, skip_workspace_mount, reset_logger: bool = True
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
old_workspace_mount_path = config.workspace_mount_path
|
||||
old_workspace_base = config.workspace_base
|
||||
|
||||
@ -148,11 +119,8 @@ def process_instance(
|
||||
config.workspace_mount_path, '_eval_workspace'
|
||||
)
|
||||
# create process-specific workspace dir
|
||||
# if `not skip_workspace_mount` - we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
if not skip_workspace_mount:
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# reset workspace to config
|
||||
config.workspace_base = workspace_mount_path
|
||||
@ -165,7 +133,7 @@ def process_instance(
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(
|
||||
eval_output_dir,
|
||||
metadata.eval_output_dir,
|
||||
'logs',
|
||||
f'instance_{sid}.log',
|
||||
)
|
||||
@ -186,8 +154,7 @@ def process_instance(
|
||||
)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
if not skip_workspace_mount:
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
|
||||
# Create file with HumanEvalFix problem
|
||||
# Prompt reference: https://github.com/bigcode-project/bigcode-evaluation-harness/blob/84b96da31b7f840b55c5733325346176140cdb6b/bigcode_eval/tasks/humanevalpack.py#L509
|
||||
@ -266,136 +233,24 @@ if __name__ == '__main__':
|
||||
) # TODO: Support other languages
|
||||
hefix_tests = dataset['test'].to_pandas()
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/humanevalfix/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert (
|
||||
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
id_column = 'task_id'
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
'humanevalfix',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
|
||||
run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
hefix_tests = hefix_tests.head(eval_n_limit)
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_instance_ids.add(data['task_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_hefix_tests = []
|
||||
for idx, instance in hefix_tests.iterrows():
|
||||
if instance.task_id in finished_instance_ids:
|
||||
logger.info(
|
||||
f'Skipping instance {instance.task_id} as it is already finished.'
|
||||
)
|
||||
continue
|
||||
new_hefix_tests.append(instance)
|
||||
|
||||
hefix_tests = pd.DataFrame(new_hefix_tests)
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(hefix_tests)}'
|
||||
)
|
||||
# =============================================
|
||||
|
||||
pbar = tqdm(total=len(hefix_tests))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output["task_id"]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["task_id"]}: {output["test_result"]["result"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
# This sets up the multi-processing
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multi-processing
|
||||
for row_idx, instance in hefix_tests.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
metadata,
|
||||
skip_workspace_mount=False,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
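Every script here repeats the same per-process workspace setup so that parallel workers don't clobber each other's files. The pattern, pulled out into a small standalone helper (the function name is illustrative):

import os
import pathlib


def make_process_workspace(base_workspace: str) -> str:
    """Create and return a workspace directory keyed by the worker's PID."""
    path = os.path.join(base_workspace, '_eval_workspace', str(os.getpid()))
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)
    return path

# Usage inside a worker, mirroring the hunks above:
#     config.workspace_base = config.workspace_mount_path = make_process_workspace(config.workspace_mount_path)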
@ -1,61 +1,30 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import shutil
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.swe_bench.swe_env_box import DockerSSHBox
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
|
||||
def cleanup():
|
||||
logger.info('Cleaning up child processes...')
|
||||
for process in mp.active_children():
|
||||
logger.info(f'Terminating child process: {process.name}')
|
||||
process.terminate()
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'If you think you have solved the task, please run the following command: <execute_bash> exit </execute_bash>.\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
|
||||
)
|
||||
if state.history:
|
||||
user_msgs = [
|
||||
action
|
||||
for action, _ in state.history
|
||||
if isinstance(action, MessageAction) and action.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
@ -130,13 +99,12 @@ def get_test_result(
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent,
|
||||
instance,
|
||||
dataset_name,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
old_workspace_mount_path = config.workspace_mount_path
|
||||
old_workspace_base = config.workspace_base
|
||||
|
||||
@ -145,11 +113,8 @@ def process_instance(
|
||||
config.workspace_mount_path, '_eval_workspace'
|
||||
)
|
||||
# create process-specific workspace dir
|
||||
# if `not skip_workspace_mount` - we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
if not skip_workspace_mount:
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# reset workspace to config
|
||||
config.workspace_base = workspace_mount_path
|
||||
@ -159,7 +124,7 @@ def process_instance(
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(
|
||||
eval_output_dir, 'logs', f'instance_{instance["id"]}.log'
|
||||
metadata.eval_output_dir, 'logs', f'instance_{instance["id"]}.log'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
@ -178,8 +143,7 @@ def process_instance(
|
||||
)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
if not skip_workspace_mount:
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
|
||||
# sandbox = DockerSSHBox()
|
||||
logic_inference_path = os.path.join(workspace_mount_path, 'logic_inference.py')
|
||||
@ -300,162 +264,30 @@ if __name__ == '__main__':
|
||||
if args.directory:
|
||||
config.workspace_base = os.path.abspath(args.directory)
|
||||
print(f'Setting workspace base to {config.workspace_base}')
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenDevin's repo
|
||||
|
||||
dataset_name = args.dataset
|
||||
data_split = args.data_split
|
||||
dataset = load_dataset(f'renma/{dataset_name}')
|
||||
logic_reasoning_tests = dataset[data_split]
|
||||
logger.info(f'Evaluating logic reasoning dataset {dataset_name} {data_split} split')
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert (
|
||||
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
|
||||
eval_output_dir = os.path.join(
|
||||
id_column = 'id'
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
'logic_reasoning',
|
||||
agent_class,
|
||||
dataset_name,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
logic_reasoning_tests = logic_reasoning_tests.select(list(range(eval_n_limit)))
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
start_time = time.strftime('%Y-%m-%d %H:%M:%S')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_task_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_task_ids.add(data['id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_logic_reasoning_tests = []
|
||||
for instance in logic_reasoning_tests:
|
||||
if instance['id'] in finished_task_ids:
|
||||
logger.info(
|
||||
f'Skipping instance {instance["id"]} as it is already finished.'
|
||||
)
|
||||
continue
|
||||
new_logic_reasoning_tests.append(instance)
|
||||
|
||||
logic_reasoning_tests = new_logic_reasoning_tests
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_task_ids)}, Remaining instances: {len(logic_reasoning_tests)}'
|
||||
)
|
||||
# =============================================
|
||||
|
||||
pbar = tqdm(total=len(logic_reasoning_tests))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output["id"]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["id"]}: {output["test_result"]["result"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
# json.dump(output, output_fp, indent=4)
|
||||
output_fp.flush()
|
||||
|
||||
# This sets up the multi-processing
num_workers = args.eval_num_workers
|
||||
# num_workers = 1
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# This is SWE-Bench specific - CodeActAgent doesn't require a mounted workspace to work
skip_workspace_mount = False
|
||||
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multi-processing
|
||||
for instance in logic_reasoning_tests:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
dataset_name,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
|
||||
with open(output_file, 'r') as f:
|
||||
test_result = [(json.loads(line))['test_result']['result'] for line in f]
|
||||
|
||||
metadata = {
|
||||
'Dataset': dataset_name,
|
||||
'Data split': data_split,
|
||||
'Number of Samples': len(test_result),
|
||||
'Agent class': agent_class,
|
||||
'Model name': model_name,
|
||||
'Start_time': start_time,
|
||||
'End_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'Final Accuracy': f'{sum(test_result)/len(test_result):.2f}',
|
||||
}
|
||||
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f, indent=4)
|
||||
|
||||
logger.info(f'Metadata: {json.dumps(metadata, indent=4)}')
|
||||
logger.info(
|
||||
f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json'
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
|
||||
run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
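The `reset_logger` branch repeated in every `process_instance` redirects the shared logger to a per-instance file so parallel workers don't interleave their logs. A condensed sketch of that branch (the real code also installs a formatter and, in some scripts, keeps a console handler):

import logging
import os


def reset_instance_logger(logger: logging.Logger, eval_output_dir: str, instance_id: str) -> None:
    """Swap the shared logger's handlers for a per-instance log file."""
    log_file = os.path.join(eval_output_dir, 'logs', f'instance_{instance_id}.log')
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    # Remove all existing handlers before attaching the per-instance file handler.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)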
@ -2,17 +2,20 @@ import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
|
||||
import gymnasium as gym
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import LLMConfig, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@ -23,19 +26,30 @@ from opendevin.runtime.tools import RuntimeTool
|
||||
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
|
||||
docker_ssh_box: DockerSSHBox | None = None
|
||||
|
||||
|
||||
def get_sandbox():
|
||||
global docker_ssh_box
|
||||
if docker_ssh_box is None:
|
||||
docker_ssh_box = DockerSSHBox()
|
||||
return docker_ssh_box
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent,
|
||||
env_id: str,
|
||||
metadata: dict,
|
||||
eval_output_dir: str,
|
||||
docker_sandbox: DockerSSHBox,
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
env_id = instance.id
|
||||
# Set up the logger properly so you can run multi-processing to parallelize the evaluation
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(eval_output_dir, 'logs', f'instance_{env_id}.log')
|
||||
log_file = os.path.join(
|
||||
metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
@ -59,7 +73,7 @@ def process_instance(
|
||||
runtime_tools_config = {
|
||||
RuntimeTool.BROWSER: {
|
||||
'browsergym_eval': env_id,
|
||||
'browsergym_eval_save_dir': eval_output_dir,
|
||||
'browsergym_eval_save_dir': metadata.eval_output_dir,
|
||||
}
|
||||
}
|
||||
|
||||
@ -68,7 +82,7 @@ def process_instance(
|
||||
agent,
|
||||
'PLACEHOLDER_GOAL',
|
||||
runtime_tools_config=runtime_tools_config,
|
||||
sandbox=docker_sandbox,
|
||||
sandbox=get_sandbox(),
|
||||
sid=env_id,
|
||||
)
|
||||
)
|
||||
@ -82,7 +96,7 @@ def process_instance(
|
||||
raise ValueError('State should not be None.')
|
||||
|
||||
metrics = state.metrics.get() if state.metrics else None
|
||||
browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
|
||||
browsergym_eval_dir = os.path.join(metadata.eval_output_dir, env_id.split('/')[1])
|
||||
# read goal
|
||||
with open(
|
||||
os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
|
||||
@ -114,111 +128,35 @@ def process_instance(
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
|
||||
env_ids = [
|
||||
id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
|
||||
]
|
||||
dataset = pd.DataFrame(
|
||||
{
|
||||
'id': [
|
||||
id
|
||||
for id in gym.envs.registry.keys()
|
||||
if id.startswith('browsergym/miniwob')
|
||||
]
|
||||
}
|
||||
)
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
id_column = 'id'
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
'miniwob',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
|
||||
_ = get_sandbox() # Initialize the sandbox
|
||||
run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
env_ids = env_ids[:eval_n_limit]
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_instance_ids.add(data['instance_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_env_ids = []
|
||||
for idx in env_ids:
|
||||
if idx in finished_instance_ids:
|
||||
logger.info(f'Skipping instance {idx} as it is already finished.')
|
||||
continue
|
||||
new_env_ids.append(idx)
|
||||
|
||||
env_ids = new_env_ids
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
docker_sandbox = DockerSSHBox()
|
||||
for env_id in tqdm(env_ids):
|
||||
try:
|
||||
output = process_instance(
|
||||
agent=agent,
|
||||
env_id=env_id,
|
||||
metadata=metadata,
|
||||
eval_output_dir=eval_output_dir,
|
||||
docker_sandbox=docker_sandbox,
|
||||
reset_logger=False,
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
except Exception as e:
|
||||
logger.error(f'Error processing instance {env_id}: {e}')
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
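Because the MiniWoB worker now takes `(instance, metadata, reset_logger)` and builds its own agent and sandbox via `get_sandbox()`, a single environment can be exercised outside `run_evaluation` for debugging. A hypothetical snippet, assuming `metadata` is an `EvalMetadata` built by `make_metadata` and that the chosen env id exists in the gym registry:

import pandas as pd


def debug_single_env(process_instance, metadata, env_id: str = 'browsergym/miniwob.click-button'):
    # One-row stand-in for the DataFrame built from gym.envs.registry above.
    instance = pd.Series({'id': env_id})
    # reset_logger=False keeps log output on the console instead of a per-instance file.
    return process_instance(instance, metadata, reset_logger=False)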
@ -1,45 +1,36 @@
|
||||
import asyncio
|
||||
import functools
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from typing import Dict
|
||||
from typing import Any, Dict
|
||||
|
||||
import tasks
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.swe_bench.swe_env_box import DockerSSHBox
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
from .config_variables import TASK_INFO_MAP
|
||||
from .datatypes import TaskState
|
||||
from .env import SimplifiedEnv
|
||||
from .prompts import ToolPromptTemplate
|
||||
from .tasks import Task
|
||||
|
||||
|
||||
def cleanup():
|
||||
print('Cleaning up child processes...')
|
||||
for process in mp.active_children():
|
||||
print(f'Terminating child process: {process.name}')
|
||||
process.terminate()
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State, task: Task, task_config: Dict[str, int]):
|
||||
def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str, int]):
|
||||
logger.info(f'Gold reference: {task.reference}')
|
||||
logger.info(f'Task config: {task_config}')
|
||||
|
||||
@ -49,7 +40,7 @@ def codeact_user_response(state: State, task: Task, task_config: Dict[str, int])
|
||||
task_config=task_config,
|
||||
)
|
||||
last_action, _ = state.history[-1]
|
||||
result_state: TaskState = env.step(last_action.message)
|
||||
result_state: TaskState = env.step(last_action.message or '')
|
||||
|
||||
state.task_state = result_state
|
||||
|
||||
@ -63,12 +54,8 @@ def codeact_user_response(state: State, task: Task, task_config: Dict[str, int])
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'CodeActAgent': codeact_user_response_mint,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
}
|
||||
|
||||
@ -78,26 +65,21 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
||||
|
||||
|
||||
def process_instance(
|
||||
instance: Task,
|
||||
agent_class,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
agent: Agent,
|
||||
instance: Any,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
|
||||
# create process-specific workspace dir
|
||||
# if `not skip_workspace_mount` - we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
if not skip_workspace_mount:
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Set up the logger properly so you can run multi-processing to parallelize the evaluation
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(
|
||||
eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
|
||||
metadata.eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
@ -116,8 +98,7 @@ def process_instance(
|
||||
)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
if not skip_workspace_mount:
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
|
||||
# use a session id for concurrent processing
|
||||
sid = instance.task_id + '_' + str(os.getpid())
|
||||
@ -136,9 +117,10 @@ def process_instance(
|
||||
exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}')
|
||||
|
||||
# Prepare instruction
|
||||
assert metadata.details is not None
|
||||
instruction = ToolPromptTemplate(use_tool=True)(
|
||||
max_total_steps=metadata['max_iterations'],
|
||||
max_propose_solution=metadata['max_propose_solution'],
|
||||
max_total_steps=metadata.max_iterations,
|
||||
max_propose_solution=metadata.details['max_propose_solution'],
|
||||
in_context_example=instance.in_context_example(
|
||||
use_tool=True, with_feedback=False
|
||||
),
|
||||
@ -154,8 +136,8 @@ def process_instance(
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent.__class__.__name__],
|
||||
task=instance,
|
||||
task_config={
|
||||
'max_iterations': metadata['max_iterations'],
|
||||
'max_propose_solution': metadata['max_propose_solution'],
|
||||
'max_iterations': metadata.max_iterations,
|
||||
'max_propose_solution': metadata.details['max_propose_solution'],
|
||||
},
|
||||
)
|
||||
|
||||
@ -224,149 +206,28 @@ if __name__ == '__main__':
|
||||
'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test'
|
||||
)
|
||||
logger.info(f'Evaluating MINT - {args.subset} subset')
|
||||
mint_tests = mint_dataset.to_pandas()
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert (
|
||||
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
id_column = 'id'
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
'mint',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
args.subset,
|
||||
details={'max_propose_solution': args.max_propose_solution},
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(mint_dataset, output_file, args.eval_n_limit, id_column)
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
|
||||
run_evaluation(
|
||||
agent,
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'max_propose_solution': args.max_propose_solution,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
mint_dataset = mint_dataset.select(range(eval_n_limit))
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_instance_ids.add(data['id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}, max propose solution {args.max_propose_solution}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
task_class: Task = getattr(tasks, TASK_INFO_MAP[args.subset]['class'])
|
||||
new_mint_tests: list[Task] = []
|
||||
|
||||
for instance in mint_dataset:
|
||||
if instance['id'] in finished_instance_ids:
|
||||
logger.info(
|
||||
f'Skipping instance {instance["id"]} as it is already finished.'
|
||||
)
|
||||
continue
|
||||
# convert to Task object
|
||||
instance = task_class(**instance)
|
||||
new_mint_tests.append(instance)
|
||||
|
||||
mint_dataset = new_mint_tests
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(mint_dataset)}'
|
||||
)
|
||||
# =============================================
|
||||
|
||||
pbar = tqdm(total=len(mint_dataset))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
# logger.info('Output: ', output)
|
||||
# pbar.set_description(f'Instance {output["instance_id"]}')
|
||||
# pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
|
||||
# logger.info(
|
||||
# f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
|
||||
# )
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
# This sets up the multi-processing
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# This is SWE-Bench specific - CodeActAgent doesn't require a mounted workspace to work
skip_workspace_mount = agent_class == 'CodeActAgent'
|
||||
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multi-processing
|
||||
for instance in mint_dataset:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
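MINT is the one benchmark whose fake-user callback needs per-task context (`task` and `task_config`), which is why `functools` is imported and the renamed `codeact_user_response_mint` takes extra arguments. A minimal sketch of binding that context before handing the callback to the controller; the stub body only mirrors the signature and not the real SimplifiedEnv feedback loop, and both function names below are illustrative:

import functools
from typing import Callable, Dict


def codeact_user_response_mint_stub(state, task, task_config: Dict[str, int]) -> str:
    # The real callback steps SimplifiedEnv with the agent's last message and
    # returns the environment's feedback; this stub only mirrors the signature.
    return 'Please continue working on the task.'


def bind_mint_fake_user(task, metadata) -> Callable:
    """Pre-bind task context so the controller can call the result with just `state`."""
    return functools.partial(
        codeact_user_response_mint_stub,
        task=task,
        task_config={
            'max_iterations': metadata.max_iterations,
            'max_propose_solution': metadata.details['max_propose_solution'],
        },
    )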
@ -15,63 +15,31 @@ TODOs:
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from typing import Any
|
||||
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
|
||||
|
||||
def cleanup():
|
||||
logger.info('Cleaning up child processes...')
|
||||
for process in mp.active_children():
|
||||
logger.info(f'Terminating child process: {process.name}')
|
||||
process.terminate()
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'If you think you have completed the task, please run the following command: <execute_bash> exit </execute_bash>.\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
|
||||
)
|
||||
if state.history:
|
||||
user_msgs = [
|
||||
action
|
||||
for action, _ in state.history
|
||||
if isinstance(action, MessageAction) and action.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
@ -101,7 +69,7 @@ ID2CONDA = {
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent, instance, metadata, eval_output_dir, reset_logger: bool = True
|
||||
agent: Agent, instance: Any, metadata: EvalMetadata, reset_logger: bool = True
|
||||
):
|
||||
old_workspace_mount_path = config.workspace_mount_path
|
||||
old_workspace_base = config.workspace_base
|
||||
@ -122,7 +90,7 @@ def process_instance(
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(
|
||||
eval_output_dir,
|
||||
metadata.eval_output_dir,
|
||||
'logs',
|
||||
f"instance_{instance['id']}_pid_{os.getpid()}.log",
|
||||
)
|
||||
@ -268,129 +236,30 @@ if __name__ == '__main__':
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
data_split = args.eval_split
|
||||
agent_class = args.agent_cls
|
||||
num_workers = args.eval_num_workers
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenDevin's repo
|
||||
ml_bench = load_dataset('super-dainiu/ml-bench', split=data_split).to_pandas()
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
ml_bench = ml_bench.head(eval_n_limit)
|
||||
logger.info(f'Limiting evaluation to {eval_n_limit} instances.')
|
||||
|
||||
# TEST METADATA
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
id_column = 'instance_id'
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
'ml_bench',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
os.makedirs(eval_output_dir, exist_ok=True)
|
||||
os.makedirs(os.path.join(eval_output_dir, 'logs'), exist_ok=True)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Evaluating on data split: {data_split}')
|
||||
logger.info(f'Using {num_workers} worker processes')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
print(f'Error parsing line: {line}')
|
||||
finished_instance_ids.add(data['instance_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, data split {data_split}.'
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(ml_bench, output_file, args.eval_n_limit, id_column)
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
|
||||
run_evaluation(
|
||||
agent,
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
|
||||
|
||||
# Filter out finished instances
|
||||
new_instances = [
|
||||
instance
|
||||
for _, instance in ml_bench.iterrows()
|
||||
if instance['id'] not in finished_instance_ids
|
||||
]
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(new_instances)}'
|
||||
)
|
||||
|
||||
pbar = tqdm(total=len(new_instances))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
|
||||
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output["instance_id"]}')
|
||||
pbar.set_postfix_str(f'Metrics: {output["metrics"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["instance_id"]}: {output["metrics"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
# This sets the multi-processing
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
for _, instance in enumerate(new_instances):
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
metadata,
|
||||
eval_output_dir,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
for future in futures:
|
||||
output = future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
logger.info('Evaluation completed.')
|
||||
|
||||
@ -1,29 +1,30 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
import toml
|
||||
import whatthepatch
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
import agenthub
|
||||
from evaluation.swe_bench.swe_env_box import SWEBenchSSHBox
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
@ -38,31 +39,6 @@ def cleanup():
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
|
||||
)
|
||||
if state.history:
|
||||
user_msgs = [
|
||||
action
|
||||
for action, _ in state.history
|
||||
if isinstance(action, MessageAction) and action.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'CodeActSWEAgent': codeact_user_response,
|
||||
@ -193,30 +169,25 @@ def get_test_result(instance, sandbox, workspace_dir_name):
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent_class: str,
|
||||
llm_config: dict,
|
||||
instance: Any,
|
||||
metadata: dict,
|
||||
skip_workspace_mount: bool,
|
||||
eval_output_dir: str,
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(llm_config=llm_config))
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
|
||||
workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
|
||||
# create process-specific workspace dir
|
||||
# if `not skip_workspace_mount` - we will create a workspace directory for EACH process
|
||||
# so that different agents don't interfere with each other.
|
||||
if not skip_workspace_mount:
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(
|
||||
eval_output_dir, 'infer_logs', f'instance_{instance.instance_id}.log'
|
||||
metadata.eval_output_dir,
|
||||
'infer_logs',
|
||||
f'instance_{instance.instance_id}.log',
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
@ -237,22 +208,18 @@ def process_instance(
|
||||
else:
|
||||
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
|
||||
|
||||
if not skip_workspace_mount:
|
||||
logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
|
||||
|
||||
# NOTE: this is something special we do for SWE-Bench due to the reason described in the previous section
|
||||
# You can omit this if you don't need to setup specialized sandbox
|
||||
workspace_dir_name = f'{instance.repo}__{instance.version}'.replace('/', '__')
|
||||
sandbox = SWEBenchSSHBox.get_box_for_instance(
|
||||
instance,
|
||||
workspace_dir_name,
|
||||
skip_workspace_mount=skip_workspace_mount,
|
||||
workspace_mount_path=workspace_mount_path,
|
||||
sandbox_plugins=agenthub.Agent.get_cls(agent_class).sandbox_plugins,
|
||||
sandbox_plugins=agenthub.Agent.get_cls(metadata.agent_class).sandbox_plugins,
|
||||
)
|
||||
|
||||
# Prepare instruction
|
||||
if agent_class == 'CodeActSWEAgent':
|
||||
if metadata.agent_class == 'CodeActSWEAgent':
|
||||
instruction = (
|
||||
'We are currently solving the following issue within our repository. Here is the issue text:\n'
|
||||
'--- BEGIN ISSUE ---\n'
|
||||
@ -386,144 +353,35 @@ if __name__ == '__main__':
|
||||
dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
|
||||
swe_bench_tests = filter_dataset(dataset['test'].to_pandas(), 'instance_id')
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
id_column = 'instance_id'
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert (
|
||||
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
args.eval_output_dir,
|
||||
'swe_bench_lite',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'infer_logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
_agent_cls = agenthub.Agent.get_cls(agent_class)
|
||||
details = {}
|
||||
_agent_cls = agenthub.Agent.get_cls(args.agent_cls)
|
||||
if hasattr(_agent_cls, 'system_message'):
|
||||
metadata['system_message'] = _agent_cls.system_message
|
||||
details['system_message'] = _agent_cls.system_message
|
||||
if hasattr(_agent_cls, 'in_context_example'):
|
||||
metadata['in_context_example'] = _agent_cls.in_context_example
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
details['in_context_example'] = _agent_cls.in_context_example
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
swe_bench_tests = swe_bench_tests.head(eval_n_limit)
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_instance_ids.add(data['instance_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
'swe-bench-lite',
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
details=details,
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_swe_bench_tests = []
|
||||
for idx, instance in swe_bench_tests.iterrows():
|
||||
if instance.instance_id in finished_instance_ids:
|
||||
logger.info(
|
||||
f'Skipping instance {instance.instance_id} as it is already finished.'
|
||||
)
|
||||
continue
|
||||
new_swe_bench_tests.append(instance)
|
||||
|
||||
swe_bench_tests = pd.DataFrame(new_swe_bench_tests)
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(swe_bench_tests)}'
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(
|
||||
swe_bench_tests, output_file, args.eval_n_limit, id_column
|
||||
)
|
||||
run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
|
||||
# =============================================
|
||||
|
||||
pbar = tqdm(total=len(swe_bench_tests))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
|
||||
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output["instance_id"][:10]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
# This sets the multi-processing
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# This is SWE-Bench specific - CodeActAgent doesn't require mounted workspace to work
|
||||
skip_workspace_mount = agent_class == 'CodeActAgent'
|
||||
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multi-processing
|
||||
for row_idx, instance in swe_bench_tests.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent_class,
|
||||
config.llm,
|
||||
instance,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
|
||||
|
||||
@ -1,18 +1,22 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from typing import Any
|
||||
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
codeact_user_response,
|
||||
make_metadata,
|
||||
monologue_user_response,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@ -22,40 +26,6 @@ from opendevin.llm.llm import LLM
|
||||
|
||||
from .utils import download_data, download_tools, encode_question, eval_answer, get_data
|
||||
|
||||
|
||||
def cleanup():
|
||||
print('Cleaning up child processes...')
|
||||
for process in mp.active_children():
|
||||
print(f'Terminating child process: {process.name}')
|
||||
process.terminate()
|
||||
process.join()
|
||||
|
||||
|
||||
def codeact_user_response(state: State) -> str:
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'When you think you finished the task, respond with `Finish[answer]` where you include your answer in `[]`\n'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
|
||||
)
|
||||
if state.history:
|
||||
user_msgs = [
|
||||
action
|
||||
for action, _ in state.history
|
||||
if isinstance(action, MessageAction) and action.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'MonologueAgent': monologue_user_response,
|
||||
@ -66,7 +36,9 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
||||
}
|
||||
|
||||
|
||||
def process_instance(agent: Agent, task, metadata, reset_logger: bool = True):
|
||||
def process_instance(
|
||||
agent: Agent, instance: Any, metadata: EvalMetadata, reset_logger: bool = True
|
||||
):
|
||||
# create process-specific workspace dir
|
||||
# we will create a workspace directory for EACH process
|
||||
# so that different agents don't interfere with each other.
|
||||
@ -74,10 +46,10 @@ def process_instance(agent: Agent, task, metadata, reset_logger: bool = True):
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
eval_output_dir = metadata['eval_output_dir']
|
||||
qid = task['qid']
|
||||
question = task['question']
|
||||
answer = task['answer']
|
||||
eval_output_dir = metadata.eval_output_dir
|
||||
qid = instance.qid
|
||||
question = instance.question
|
||||
answer = instance.answer
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(eval_output_dir, 'logs', f'instance_{qid}.log')
|
||||
@ -139,7 +111,7 @@ def process_instance(agent: Agent, task, metadata, reset_logger: bool = True):
|
||||
'text': model_answer_raw,
|
||||
'correct': correct,
|
||||
'answer_id': 'None',
|
||||
'model_id': metadata['model_name'],
|
||||
'model_id': metadata.model_name,
|
||||
'metadata': metadata,
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
@ -171,36 +143,6 @@ if __name__ == '__main__':
|
||||
default='YOUR_WOLFRAMALPHA_APPID',
|
||||
)
|
||||
args, _ = parser.parse_known_args()
|
||||
if args.directory:
|
||||
config.workspace_base = os.path.abspath(args.directory)
|
||||
print(f'Setting workspace base to {config.workspace_base}')
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
agent_class = args.agent_cls
|
||||
assert (
|
||||
agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
|
||||
), f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
args.eval_output_dir,
|
||||
'toolqa',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
dataset = ''
|
||||
hardness = ''
|
||||
@ -215,149 +157,39 @@ if __name__ == '__main__':
|
||||
'yelp',
|
||||
'genda',
|
||||
]
|
||||
if args.dataset in dataset_choices:
|
||||
dataset = args.dataset
|
||||
else:
|
||||
if args.dataset not in dataset_choices:
|
||||
raise ValueError(
|
||||
'Please choose from agenda, airbnb, coffee, dblp, flight, gsm8k, scirex, yelp for dataset.'
|
||||
)
|
||||
if args.hardness == 'easy':
|
||||
hardness = 'easy'
|
||||
elif args.hardness == 'hard':
|
||||
hardness = 'hard'
|
||||
else:
|
||||
if args.hardness not in ['easy', 'hard']:
|
||||
raise ValueError('Please choose from easy and hard for hardness.')
|
||||
|
||||
logger.info(f'Evaluating ToolQA {dataset} {hardness} test')
|
||||
# workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
|
||||
workspace_mount_path = config.workspace_mount_path
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
toolqa_test = get_data(dataset, hardness)
|
||||
toolqa_test = pd.DataFrame(get_data(dataset, hardness))
|
||||
toolqa_data_path = download_data(workspace_mount_path)
|
||||
toolqa_tool_path = download_tools(workspace_mount_path, args.wolfram_alpha_appid)
|
||||
|
||||
# TEST METADATA
|
||||
metadata = {
|
||||
'dataset': dataset,
|
||||
'hardness': hardness,
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(
|
||||
os.path.join(eval_output_dir, f'metadata_{dataset}_{hardness}.json'), 'w'
|
||||
) as f:
|
||||
json.dump(metadata, f)
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
toolqa_test = toolqa_test[:eval_n_limit]
|
||||
logger.info(
|
||||
f'Limiting evaluation to a total of first {eval_n_limit} instances.'
|
||||
)
|
||||
output_file = os.path.join(
|
||||
eval_output_dir, f'output_{model_name}_{dataset}_{hardness}.jsonl'
|
||||
id_column = 'qid'
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
f'toolqa-{args.dataset}-{args.hardness}',
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
)
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_task_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
task = json.loads(line)
|
||||
finished_task_ids.add(task['qid'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_toolqa_test = []
|
||||
for task in toolqa_test:
|
||||
qid = task['qid']
|
||||
if qid in finished_task_ids:
|
||||
logger.info(f'Skipping instance {qid} as it is already finished.')
|
||||
continue
|
||||
new_toolqa_test.append(task)
|
||||
finished_task_number = len(finished_task_ids)
|
||||
toolqa_test = new_toolqa_test
|
||||
logger.info(
|
||||
f'Finished instances: {finished_task_number}, Remaining instances: {len(toolqa_test)}'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
pbar = tqdm(total=len(toolqa_test))
|
||||
|
||||
# This function tracks the progress AND writes the output to a JSONL file
|
||||
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output["qid"]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["correct"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output["qid"]}: {output["correct"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
finished_task_ids.add(output['qid'])
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
# This sets the multi-processing
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
# This is how we perform multi-processing
|
||||
for task in toolqa_test:
|
||||
try:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
task,
|
||||
metadata,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
except Exception:
|
||||
continue
|
||||
# Wait for all futures to complete
|
||||
for future in futures:
|
||||
try:
|
||||
future.result()
|
||||
except Exception:
|
||||
continue
|
||||
except KeyboardInterrupt:
|
||||
logger.info('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
output_fp.close()
|
||||
total_correct = 0
|
||||
output = []
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
output.append(data)
|
||||
if data['qid'] in finished_task_ids:
|
||||
if str(data['correct']).lower() == 'true':
|
||||
total_correct += 1
|
||||
# sort all output by question_id
|
||||
output = sorted(output, key=lambda x: x['qid'])
|
||||
with open(output_file, 'w') as f:
|
||||
for dat in output:
|
||||
f.write(json.dumps(dat) + '\n')
|
||||
f.flush()
|
||||
logger.info(
|
||||
f'Evaluation finished for {dataset}-{hardness}. Total: {len(toolqa_test)+finished_task_number}; Correct: {total_correct}; Accuracy: {total_correct / (len(toolqa_test)+finished_task_number)}'
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(toolqa_test, output_file, args.eval_n_limit, id_column)
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
|
||||
run_evaluation(
|
||||
agent,
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
|
||||
|
||||
evaluation/utils/shared.py (new file, 208 lines)
@ -0,0 +1,208 @@
|
||||
import json
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
from asyncio.log import logger
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from typing import Any, Callable
|
||||
|
||||
import pandas as pd
|
||||
from pydantic import BaseModel
|
||||
from tqdm import tqdm
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import LLMConfig
|
||||
from opendevin.events.action import Action
|
||||
from opendevin.events.action.message import MessageAction
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
|
||||
class EvalMetadata(BaseModel):
|
||||
agent_class: str
|
||||
llm_config: LLMConfig
|
||||
max_iterations: int
|
||||
eval_output_dir: str
|
||||
start_time: str
|
||||
git_commit: str
|
||||
dataset: str | None = None
|
||||
data_split: str | None = None
|
||||
details: dict[str, Any] | None = None
|
||||
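For orientation, a minimal illustrative sketch (not part of the commit) of how this pydantic model replaces the plain metadata dicts the scripts used before; every field value below is a placeholder:

# Illustrative sketch only: fields are read as attributes rather than dict keys,
# and the model serializes cleanly for metadata.json and for worker processes.
import time

from evaluation.utils.shared import EvalMetadata
from opendevin.core.config import LLMConfig

metadata = EvalMetadata(
    agent_class='CodeActAgent',
    llm_config=LLMConfig(),                        # real runs use get_llm_config_arg(...)
    max_iterations=30,                             # placeholder value
    eval_output_dir='evaluation/outputs/example',  # placeholder path
    start_time=time.strftime('%Y-%m-%d %H:%M:%S'),
    git_commit='0000000',                          # placeholder commit id
    dataset='example-dataset',
)
print(metadata.eval_output_dir)    # attribute access replaces metadata['eval_output_dir']
print(metadata.model_dump_json())  # JSON string written to metadata.json by make_metadata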
|
||||
|
||||
def codeact_user_response(
|
||||
state: State,
|
||||
encapsulate_solution: bool = False,
|
||||
try_parse: Callable[[Action], str] | None = None,
|
||||
) -> str:
|
||||
encaps_str = (
|
||||
(
|
||||
'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
|
||||
'For example: The answer to the question is <solution> 42 </solution>.\n'
|
||||
)
|
||||
if encapsulate_solution
|
||||
else ''
|
||||
)
|
||||
msg = (
|
||||
'Please continue working on the task on whatever approach you think is suitable.\n'
|
||||
'If you think you have solved the task, please first send your answer to user through message and then <execute_bash> exit </execute_bash>.\n'
|
||||
f'{encaps_str}'
|
||||
'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
|
||||
)
|
||||
if state.history:
|
||||
if try_parse is not None:
|
||||
last_action, _ = state.history[-1]
|
||||
ans = try_parse(last_action)
|
||||
if ans is not None:
|
||||
return '/exit'
|
||||
user_msgs = [
|
||||
action
|
||||
for action, _ in state.history
|
||||
if isinstance(action, MessageAction) and action.source == 'user'
|
||||
]
|
||||
if len(user_msgs) >= 2:
|
||||
# let the agent know that it can give up when it has tried 3 times
|
||||
return (
|
||||
msg
|
||||
+ 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
|
||||
)
|
||||
return msg
|
||||
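The encapsulate_solution and try_parse parameters let individual benchmarks adapt this shared reply without re-implementing it; a hypothetical binding (not taken from this commit) might look like:

# Hypothetical sketch: a benchmark that wants the final answer wrapped in <solution>
# tags can bind the shared helper instead of defining its own response function.
import functools

from evaluation.utils.shared import codeact_user_response

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': functools.partial(codeact_user_response, encapsulate_solution=True),
}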
|
||||
|
||||
def monologue_user_response(state: State) -> str:
|
||||
raise NotImplementedError('MonologueAgent should never ask for user responses.')
|
||||
|
||||
|
||||
def cleanup():
|
||||
print('Cleaning up child processes...')
|
||||
for process in mp.active_children():
|
||||
print(f'Terminating child process: {process.name}')
|
||||
process.terminate()
|
||||
process.join()
|
||||
|
||||
|
||||
def make_metadata(
|
||||
llm_config: LLMConfig,
|
||||
dataset_name: str,
|
||||
agent_class: str,
|
||||
max_iterations: int,
|
||||
eval_note: str | None,
|
||||
eval_output_dir: str,
|
||||
data_split: str | None = None,
|
||||
details: dict[str, Any] | None = None,
|
||||
) -> EvalMetadata:
|
||||
model_name = llm_config.model.split('/')[-1]
|
||||
eval_note = f'_N_{eval_note}' if eval_note else ''
|
||||
|
||||
eval_output_path = os.path.join(
|
||||
eval_output_dir,
|
||||
dataset_name,
|
||||
agent_class,
|
||||
f'{model_name}_maxiter_{max_iterations}{eval_note}',
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_path, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_path}')
|
||||
|
||||
metadata = EvalMetadata(
|
||||
agent_class=agent_class,
|
||||
llm_config=llm_config,
|
||||
max_iterations=max_iterations,
|
||||
eval_output_dir=eval_output_path,
|
||||
start_time=time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
dataset=dataset_name,
|
||||
data_split=data_split,
|
||||
details=details,
|
||||
)
|
||||
metadata_json = metadata.model_dump_json()
|
||||
logger.info(f'Metadata: {metadata_json}')
|
||||
with open(os.path.join(eval_output_path, 'metadata.json'), 'w') as f:
|
||||
f.write(metadata_json)
|
||||
|
||||
return metadata
|
||||
|
||||
|
||||
def prepare_dataset(dataset: pd.DataFrame, output_file, eval_n_limit, id_column):
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_ids.add(data[id_column])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_ids)} finished instances.'
|
||||
)
|
||||
|
||||
if eval_n_limit:
|
||||
dataset = dataset.head(eval_n_limit)
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
new_dataset = [
|
||||
instance
|
||||
for _, instance in dataset.iterrows()
|
||||
if instance[id_column] not in finished_ids
|
||||
]
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_ids)}, Remaining instances: {len(new_dataset)}'
|
||||
)
|
||||
|
||||
return pd.DataFrame(new_dataset)
|
||||
|
||||
|
||||
def run_evaluation(
|
||||
dataset: pd.DataFrame,
|
||||
metadata: EvalMetadata,
|
||||
output_file: str,
|
||||
num_workers: int,
|
||||
process_instance_func: Callable[[pd.Series, EvalMetadata, bool], Any],
|
||||
id_column: str,
|
||||
):
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent.__class__.__name__}, '
|
||||
f'model {agent.llm.model_name}, max iterations {metadata.max_iterations}.'
|
||||
)
|
||||
pbar = tqdm(total=len(dataset))
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
def update_progress(future):
|
||||
pbar.update(1)
|
||||
output = future.result()
|
||||
pbar.set_description(f'Instance {output[id_column]}')
|
||||
pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
|
||||
logger.info(
|
||||
f'Finished evaluation for instance {output[id_column]}: {output["test_result"]["result"]}'
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
for _, instance in dataset.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance_func,
|
||||
instance,
|
||||
metadata,
|
||||
bool(num_workers > 1),
|
||||
)
|
||||
future.add_done_callback(update_progress)
|
||||
futures.append(future)
|
||||
|
||||
for future in futures:
|
||||
future.result()
|
||||
except KeyboardInterrupt:
|
||||
print('KeyboardInterrupt received. Cleaning up...')
|
||||
cleanup()
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
|
||||
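Taken together, these helpers reduce a benchmark entry point to roughly the shape below. This is a sketch, not code from the commit: load_benchmark_dataframe and process_instance are benchmark-specific placeholders, and the call matches the run_evaluation signature defined above (which builds the agent from metadata itself).

# Sketch of a benchmark __main__ using the shared helpers; load_benchmark_dataframe()
# and process_instance are benchmark-specific placeholders.
import os

from evaluation.utils.shared import make_metadata, prepare_dataset, run_evaluation
from opendevin.core.config import LLMConfig, get_llm_config_arg, parse_arguments

if __name__ == '__main__':
    args = parse_arguments()
    id_column = 'instance_id'
    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()

    metadata = make_metadata(
        llm_config,
        'example-benchmark',      # dataset name, used to build the output directory
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

    # prepare_dataset drops ids already present in output.jsonl and applies --eval-n-limit
    dataset = load_benchmark_dataframe()  # placeholder: returns a pandas DataFrame
    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)

    # run_evaluation fans instances out over a process pool, one JSON line per instance
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )

Resumption comes for free with this shape, since prepare_dataset skips any instance id that already appears in output.jsonl.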
@ -2,17 +2,20 @@ import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
|
||||
import gymnasium as gym
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
||||
from evaluation.utils.shared import (
|
||||
EvalMetadata,
|
||||
make_metadata,
|
||||
prepare_dataset,
|
||||
run_evaluation,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.config import LLMConfig, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import run_agent_controller
|
||||
@ -24,18 +27,30 @@ from opendevin.runtime.tools import RuntimeTool
|
||||
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
|
||||
|
||||
docker_ssh_box: DockerSSHBox | None = None
|
||||
|
||||
|
||||
def get_sandbox():
|
||||
global docker_ssh_box
|
||||
if docker_ssh_box is None:
|
||||
docker_ssh_box = DockerSSHBox()
|
||||
return docker_ssh_box
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent,
|
||||
env_id: str,
|
||||
metadata: dict,
|
||||
eval_output_dir: str,
|
||||
docker_sandbox: DockerSSHBox,
|
||||
instance: pd.Series,
|
||||
metadata: EvalMetadata,
|
||||
reset_logger: bool = True,
|
||||
):
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
env_id = instance.id
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
if reset_logger:
|
||||
# Set up logger
|
||||
log_file = os.path.join(eval_output_dir, 'logs', f'instance_{env_id}.log')
|
||||
log_file = os.path.join(
|
||||
metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
|
||||
)
|
||||
# Remove all existing handlers from logger
|
||||
for handler in logger.handlers[:]:
|
||||
logger.removeHandler(handler)
|
||||
@ -59,7 +74,7 @@ def process_instance(
|
||||
runtime_tools_config = {
|
||||
RuntimeTool.BROWSER: {
|
||||
'browsergym_eval': env_id,
|
||||
'browsergym_eval_save_dir': eval_output_dir,
|
||||
'browsergym_eval_save_dir': metadata.eval_output_dir,
|
||||
}
|
||||
}
|
||||
|
||||
@ -68,7 +83,7 @@ def process_instance(
|
||||
agent,
|
||||
'PLACEHOLDER_GOAL',
|
||||
runtime_tools_config=runtime_tools_config,
|
||||
sandbox=docker_sandbox,
|
||||
sandbox=get_sandbox(),
|
||||
sid=env_id,
|
||||
)
|
||||
)
|
||||
@ -82,7 +97,7 @@ def process_instance(
|
||||
raise ValueError('State should not be None.')
|
||||
|
||||
metrics = state.metrics.get() if state.metrics else None
|
||||
browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
|
||||
browsergym_eval_dir = os.path.join(metadata.eval_output_dir, env_id.split('/')[1])
|
||||
# read goal
|
||||
with open(
|
||||
os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
|
||||
@ -118,108 +133,34 @@ if __name__ == '__main__':
|
||||
id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
|
||||
]
|
||||
|
||||
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
||||
# for details of how to set `llm_config`
|
||||
if args.llm_config:
|
||||
specified_llm_config = get_llm_config_arg(args.llm_config)
|
||||
if specified_llm_config:
|
||||
config.llm = specified_llm_config
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
dataset = pd.DataFrame(
|
||||
{
|
||||
'id': [
|
||||
id
|
||||
for id in gym.envs.registry.keys()
|
||||
if id.startswith('browsergym/webarena')
|
||||
]
|
||||
}
|
||||
)
|
||||
|
||||
# TEST METADATA
|
||||
agent_class = args.agent_cls
|
||||
assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
|
||||
model_name = config.llm.model.split('/')[-1]
|
||||
max_iterations = args.max_iterations
|
||||
eval_note = ''
|
||||
if args.eval_note is not None:
|
||||
eval_note += '_N_' + args.eval_note
|
||||
eval_output_dir = os.path.join(
|
||||
id_column = 'id'
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
||||
metadata = make_metadata(
|
||||
llm_config,
|
||||
args.dataset_name,
|
||||
args.agent_cls,
|
||||
args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
'webarena',
|
||||
agent_class,
|
||||
model_name + '_maxiter_' + str(max_iterations) + eval_note,
|
||||
)
|
||||
|
||||
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
|
||||
parents=True, exist_ok=True
|
||||
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
||||
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
|
||||
_ = get_sandbox() # Initialize the sandbox
|
||||
run_evaluation(
|
||||
instances,
|
||||
metadata,
|
||||
output_file,
|
||||
args.eval_num_workers,
|
||||
process_instance,
|
||||
id_column,
|
||||
)
|
||||
logger.info(f'Using evaluation output directory: {eval_output_dir}')
|
||||
|
||||
metadata = {
|
||||
'agent_class': agent_class,
|
||||
'model_name': model_name,
|
||||
'max_iterations': max_iterations,
|
||||
'eval_output_dir': eval_output_dir,
|
||||
'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
# get the commit id of current repo for reproducibility
|
||||
'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
|
||||
.decode('utf-8')
|
||||
.strip(),
|
||||
}
|
||||
logger.info(f'Metadata: {metadata}')
|
||||
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
|
||||
# LIMIT EVALUATION
|
||||
eval_n_limit = args.eval_n_limit
|
||||
if eval_n_limit:
|
||||
env_ids = env_ids[:eval_n_limit]
|
||||
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
|
||||
|
||||
# OUTPUT FILE
|
||||
output_file = os.path.join(eval_output_dir, 'output.jsonl')
|
||||
logger.info(f'Writing evaluation output to {output_file}')
|
||||
finished_instance_ids = set()
|
||||
if os.path.exists(output_file):
|
||||
with open(output_file, 'r') as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
finished_instance_ids.add(data['instance_id'])
|
||||
logger.warning(
|
||||
f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
|
||||
)
|
||||
output_fp = open(output_file, 'a')
|
||||
|
||||
logger.info(
|
||||
f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# filter out finished instances
|
||||
new_env_ids = []
|
||||
for idx in env_ids:
|
||||
if idx in finished_instance_ids:
|
||||
logger.info(f'Skipping instance {idx} as it is already finished.')
|
||||
continue
|
||||
new_env_ids.append(idx)
|
||||
|
||||
env_ids = new_env_ids
|
||||
logger.info(
|
||||
f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
|
||||
)
|
||||
|
||||
# =============================================
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
docker_sandbox = DockerSSHBox()
|
||||
for env_id in tqdm(env_ids):
|
||||
try:
|
||||
output = process_instance(
|
||||
agent=agent,
|
||||
env_id=env_id,
|
||||
metadata=metadata,
|
||||
eval_output_dir=eval_output_dir,
|
||||
docker_sandbox=docker_sandbox,
|
||||
reset_logger=False,
|
||||
)
|
||||
output_fp.write(json.dumps(output) + '\n')
|
||||
output_fp.flush()
|
||||
except Exception as e:
|
||||
logger.error(f'Error processing instance {env_id}: {e}')
|
||||
|
||||
output_fp.close()
|
||||
logger.info('Evaluation finished.')
|