add cost metrics to evaluation outputs for all benchmarks (#2199)
This commit is contained in:
parent
8d79c3edbc
commit
22e8fb39b1
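The same two-line pattern is applied to every benchmark's process_instance below: read the LLM cost/usage metrics off the final State and serialize them next to the existing output fields. A minimal sketch of that recurring pattern (build_output and its parameters are placeholders for illustration; each benchmark keeps its own id, instruction, and test_result fields):

    # minimal sketch of the pattern this commit adds to each process_instance
    # (State comes from opendevin; names other than 'metrics', 'history' and
    #  'error' are placeholders, not any specific benchmark's fields)
    def build_output(state, instance_id, histories, test_result):
        metrics = state.metrics.get() if state.metrics else None  # cost/usage stats, or None
        return {
            'instance_id': instance_id,
            'history': histories,
            'metrics': metrics,  # new field: per-instance cost metrics
            'error': state.error if state and state.error else None,
            'test_result': test_result,
        }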
@@ -141,6 +141,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
     logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
     test_result = game.reward()
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -151,6 +152,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': {
             'success': test_result,
@@ -20,7 +20,7 @@ from opendevin.core.config import args, config, get_llm_config_arg
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import main
-from opendevin.events.action import MessageAction, CmdRunAction
+from opendevin.events.action import CmdRunAction, MessageAction
 from opendevin.events.serialization.event import event_to_dict
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
@@ -82,7 +82,9 @@ def process_instance(
     question = instance.description
     # create a directory for the instance's workspace
     instance_workspace = str(os.path.join(config.workspace_base, inst_id))
-    container_inst_workspace = str(os.path.join(config.workspace_mount_path_in_sandbox, inst_id))
+    container_inst_workspace = str(
+        os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
+    )
     if os.path.exists(instance_workspace):
         shutil.rmtree(instance_workspace)
     os.makedirs(instance_workspace, exist_ok=True)
@@ -149,9 +151,7 @@ def process_instance(
     state: State = asyncio.run(
         main(
             instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                agent_class
-            ),
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
             sandbox=sandbox,
         )
     )
@@ -215,6 +215,7 @@ def process_instance(
     histories = [
         (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
     ]
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -223,6 +224,7 @@ def process_instance(
         'instruction': instruction,
         'metadata': metadata,
         'history': histories,
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': {
             'agent_answer': agent_answer,
@@ -232,6 +232,7 @@ def process_instance(
     # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
     if state is None:
         raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -241,6 +242,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }
@@ -177,6 +177,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'model_answer': model_answer,
         'ground_truth': instance['Final answer'],
     }
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -187,6 +188,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }
@@ -221,6 +221,7 @@ def process_instance(
     # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
     if state is None:
         raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -230,6 +231,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }
@@ -204,8 +204,8 @@ def process_instance(
     instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

     sandbox = DockerSSHBox()
-    exit_code, command_output = sandbox.execute(f'pip install scitools-pyke')
+    exit_code, command_output = sandbox.execute('pip install scitools-pyke')

     # Here's how you can run the agent (similar to the `main` function) and get the final task state
     state: State = asyncio.run(
         main(
@@ -230,13 +230,16 @@ def process_instance(
         if str(obs.content) in ["'A'", "'B'", "'C'"]:
             final_message = obs.content
             break

     final_message = final_message.strip("'")
-    logger.info(f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}')
+    logger.info(
+        f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}'
+    )

     test_result = get_test_result(
         model_answer=final_message, ground_truth=instance['answer']
     )
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -247,6 +250,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'final_message': final_message,
         'messages': messages,
         'error': state.error if state and state.error else None,
@@ -254,10 +258,10 @@ def process_instance(
     }
     config.workspace_mount_path = old_workspace_mount_path
     config.workspace_base = old_workspace_base

     # Close the sandbox
     sandbox.close()

     return output
@@ -272,7 +276,7 @@ if __name__ == '__main__':
     parser.add_argument(
         '--data_split',
         type=str,
-        help='data split to evaluate on {validation}', # right now we only support validation split
+        help='data split to evaluate on {validation}',  # right now we only support validation split
         default='validation',
     )
@@ -313,7 +317,7 @@ if __name__ == '__main__':
         'logic_reasoning',
         agent_class,
         dataset_name,
-        model_name + '_maxiter_' + str(max_iterations) + eval_note
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
     )

     pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
@@ -414,23 +418,25 @@ if __name__ == '__main__':
     cleanup()

     output_fp.close()

     with open(output_file, 'r') as f:
-        test_result = [(json.loads(line))["test_result"]["result"] for line in f]
+        test_result = [(json.loads(line))['test_result']['result'] for line in f]

     metadata = {
-        "Dataset": dataset_name,
-        "Data split": data_split,
-        "Number of Samples": len(test_result),
+        'Dataset': dataset_name,
+        'Data split': data_split,
+        'Number of Samples': len(test_result),
         'Agent class': agent_class,
         'Model name': model_name,
         'Start_time': start_time,
-        "End_time": time.strftime('%Y-%m-%d %H:%M:%S'),
-        "Final Accuracy": f"{sum(test_result)/len(test_result):.2f}",
-    }
+        'End_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'Final Accuracy': f'{sum(test_result)/len(test_result):.2f}',
+    }

     with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
         json.dump(metadata, f, indent=4)

     logger.info(f'Metadata: {json.dumps(metadata, indent=4)}')
-    logger.info(f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json')
+    logger.info(
+        f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json'
+    )
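With 'metrics' now written for every instance, a run's total LLM cost can be summed from the output file in the same way the final accuracy is computed above. A minimal sketch, assuming the serialized metrics dict exposes an 'accumulated_cost' key (the exact key names depend on what state.metrics.get() returns):

    import json

    output_file = 'output.jsonl'  # hypothetical path; use the run's actual output file
    total_cost = 0.0
    with open(output_file, 'r') as f:
        for line in f:
            # 'accumulated_cost' is an assumed key name for the per-instance LLM cost
            metrics = json.loads(line).get('metrics') or {}
            total_cost += metrics.get('accumulated_cost', 0.0)
    print(f'Total accumulated cost across instances: {total_cost:.4f}')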
@@ -172,6 +172,8 @@ def process_instance(
     task_state = state.task_state
     logger.info('Task state: ' + str(task_state.to_dict()))
+
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
         'id': instance.task_id,
@@ -181,6 +183,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': task_state.success if task_state else False,
     }