add cost metrics to evaluation outputs for all benchmarks (#2199)

Ryan H. Tran 2024-06-02 15:28:00 +07:00 committed by GitHub
parent 8d79c3edbc
commit 22e8fb39b1
7 changed files with 44 additions and 25 deletions

View File

@@ -141,6 +141,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
test_result = game.reward()
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = {
@@ -151,6 +152,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'metrics': metrics,
'error': state.error if state and state.error else None,
'test_result': {
'success': test_result,
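
Each file touched by this commit applies the same two-line pattern just shown: read the accumulated run metrics off the final agent State (guarding against the case where no metrics were collected) and write them into the per-instance output record. A minimal sketch of that pattern only, where build_output and its parameter names are illustrative rather than code from this commit:

def build_output(instance_id, instruction, state):
    # state.metrics may be unset if the agent never reached the LLM, so guard
    # before serializing; metrics.get() is assumed to return a plain,
    # JSON-serializable dict (e.g. accumulated LLM cost).
    metrics = state.metrics.get() if state.metrics else None
    return {
        'instance_id': instance_id,
        'instruction': instruction,
        'metrics': metrics,
        'error': state.error if state and state.error else None,
    }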

View File

@@ -20,7 +20,7 @@ from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.action import MessageAction, CmdRunAction
from opendevin.events.action import CmdRunAction, MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.runtime.docker.ssh_box import DockerSSHBox
@@ -82,7 +82,9 @@ def process_instance(
question = instance.description
# create a directory for the instance's workspace
instance_workspace = str(os.path.join(config.workspace_base, inst_id))
container_inst_workspace = str(os.path.join(config.workspace_mount_path_in_sandbox, inst_id))
container_inst_workspace = str(
os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
)
if os.path.exists(instance_workspace):
shutil.rmtree(instance_workspace)
os.makedirs(instance_workspace, exist_ok=True)
@@ -149,9 +151,7 @@ def process_instance(
state: State = asyncio.run(
main(
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent_class
),
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
sandbox=sandbox,
)
)
@@ -215,6 +215,7 @@ def process_instance(
histories = [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
]
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = {
@@ -223,6 +224,7 @@ def process_instance(
'instruction': instruction,
'metadata': metadata,
'history': histories,
'metrics': metrics,
'error': state.error if state and state.error else None,
'test_result': {
'agent_answer': agent_answer,
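
The diff does not show what metrics.get() returns; that object lives on the State in the opendevin package. Purely as an assumption for illustration, a stand-in with roughly the interface these benchmarks rely on might look like the sketch below (field names such as accumulated_cost are guesses, not taken from this commit):

class MetricsStandIn:
    """Hypothetical stand-in for state.metrics; the real class is in opendevin."""

    def __init__(self):
        self.accumulated_cost = 0.0  # total LLM spend for the run (assumed)
        self.costs = []              # per-call costs (assumed)

    def add_cost(self, value: float) -> None:
        self.accumulated_cost += value
        self.costs.append(value)

    def get(self) -> dict:
        # Return a plain dict so it can be dumped straight into the
        # per-instance JSON line written by each benchmark runner.
        return {'accumulated_cost': self.accumulated_cost, 'costs': self.costs}

Under this sketch, a run whose LLM calls cost 0.002 and 0.003 would serialize as {'accumulated_cost': 0.005, 'costs': [0.002, 0.003]}.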

View File

@@ -232,6 +232,7 @@ def process_instance(
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = {
@@ -241,6 +242,7 @@ def process_instance(
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'metrics': metrics,
'error': state.error if state and state.error else None,
'test_result': test_result,
}

View File

@@ -177,6 +177,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
'model_answer': model_answer,
'ground_truth': instance['Final answer'],
}
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = {
@@ -187,6 +188,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'metrics': metrics,
'error': state.error if state and state.error else None,
'test_result': test_result,
}

View File

@@ -221,6 +221,7 @@ def process_instance(
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = {
@@ -230,6 +231,7 @@ def process_instance(
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'metrics': metrics,
'error': state.error if state and state.error else None,
'test_result': test_result,
}

View File

@@ -204,8 +204,8 @@ def process_instance(
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
sandbox = DockerSSHBox()
exit_code, command_output = sandbox.execute(f'pip install scitools-pyke')
exit_code, command_output = sandbox.execute('pip install scitools-pyke')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
@@ -230,13 +230,16 @@ def process_instance(
if str(obs.content) in ["'A'", "'B'", "'C'"]:
final_message = obs.content
break
final_message = final_message.strip("'")
logger.info(f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}')
logger.info(
f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}'
)
test_result = get_test_result(
model_answer=final_message, ground_truth=instance['answer']
)
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = {
@@ -247,6 +250,7 @@ def process_instance(
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'metrics': metrics,
'final_message': final_message,
'messages': messages,
'error': state.error if state and state.error else None,
@@ -254,10 +258,10 @@ def process_instance(
}
config.workspace_mount_path = old_workspace_mount_path
config.workspace_base = old_workspace_base
# Close the sandbox
sandbox.close()
return output
@@ -272,7 +276,7 @@ if __name__ == '__main__':
parser.add_argument(
'--data_split',
type=str,
help='data split to evaluate on {validation}', # right now we only support validation split
help='data split to evaluate on {validation}', # right now we only support validation split
default='validation',
)
@@ -313,7 +317,7 @@ if __name__ == '__main__':
'logic_reasoning',
agent_class,
dataset_name,
model_name + '_maxiter_' + str(max_iterations) + eval_note
model_name + '_maxiter_' + str(max_iterations) + eval_note,
)
pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
@@ -414,23 +418,25 @@ if __name__ == '__main__':
cleanup()
output_fp.close()
with open(output_file, 'r') as f:
test_result = [(json.loads(line))["test_result"]["result"] for line in f]
test_result = [(json.loads(line))['test_result']['result'] for line in f]
metadata = {
"Dataset": dataset_name,
"Data split": data_split,
"Number of Samples": len(test_result),
'Dataset': dataset_name,
'Data split': data_split,
'Number of Samples': len(test_result),
'Agent class': agent_class,
'Model name': model_name,
'Start_time': start_time,
"End_time": time.strftime('%Y-%m-%d %H:%M:%S'),
"Final Accuracy": f"{sum(test_result)/len(test_result):.2f}",
}
'End_time': time.strftime('%Y-%m-%d %H:%M:%S'),
'Final Accuracy': f'{sum(test_result)/len(test_result):.2f}',
}
with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
json.dump(metadata, f, indent=4)
logger.info(f'Metadata: {json.dumps(metadata, indent=4)}')
logger.info(f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json')
logger.info(
f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json'
)
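
Because every output line now carries a metrics entry, the summary step above could also roll up cost next to accuracy. A hedged sketch of that aggregation (summarize_cost and the accumulated_cost key are assumptions layered on top of this commit, not part of it):

import json

def summarize_cost(output_file: str) -> float:
    # Sum the new per-instance 'metrics' field from the JSONL written above;
    # records with no metrics contribute nothing to the total.
    total_cost = 0.0
    with open(output_file, 'r') as f:
        for line in f:
            metrics = json.loads(line).get('metrics') or {}
            total_cost += metrics.get('accumulated_cost', 0.0)
    return total_cost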

View File

@@ -172,6 +172,8 @@ def process_instance(
task_state = state.task_state
logger.info('Task state: ' + str(task_state.to_dict()))
metrics = state.metrics.get() if state.metrics else None
# Save the output
output = {
'id': instance.task_id,
@@ -181,6 +183,7 @@ def process_instance(
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
'metrics': metrics,
'error': state.error if state and state.error else None,
'test_result': task_state.success if task_state else False,
}
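
With all seven benchmarks writing the same 'metrics' key, cost can be inspected uniformly across evaluation outputs. A small illustrative reader (the path is an example; 'id' appears in the last file above, while 'instance_id' is assumed as a fallback since the identifier key may differ per benchmark):

import json

with open('evaluation/outputs/output.jsonl', 'r') as f:  # example path, not from this diff
    for line in f:
        record = json.loads(line)
        # Print the identifier alongside the newly recorded metrics block.
        print(record.get('id') or record.get('instance_id'), record.get('metrics'))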