add cost metrics to evaluation outputs for all benchmarks (#2199)
This commit is contained in:
parent
8d79c3edbc
commit
22e8fb39b1
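The same two-line pattern is applied to every benchmark's process_instance below: read the LLM cost/usage metrics off the final State and serialize them next to the existing output fields. A minimal sketch of that recurring pattern (build_output and its parameters are placeholders for illustration; each benchmark keeps its own id, instruction, and test_result fields):

    # minimal sketch of the pattern this commit adds to each process_instance
    # (State comes from opendevin; names other than 'metrics', 'history' and
    #  'error' are placeholders, not any specific benchmark's fields)
    def build_output(state, instance_id, histories, test_result):
        metrics = state.metrics.get() if state.metrics else None  # cost/usage stats, or None
        return {
            'instance_id': instance_id,
            'history': histories,
            'metrics': metrics,  # new field: per-instance cost metrics
            'error': state.error if state and state.error else None,
            'test_result': test_result,
        }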
@@ -141,6 +141,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
     logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
     test_result = game.reward()
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -151,6 +152,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': {
             'success': test_result,
@@ -20,7 +20,7 @@ from opendevin.core.config import args, config, get_llm_config_arg
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import main
-from opendevin.events.action import MessageAction, CmdRunAction
+from opendevin.events.action import CmdRunAction, MessageAction
 from opendevin.events.serialization.event import event_to_dict
 from opendevin.runtime.docker.ssh_box import DockerSSHBox
@@ -82,7 +82,9 @@ def process_instance(
     question = instance.description
     # create a directory for the instance's workspace
     instance_workspace = str(os.path.join(config.workspace_base, inst_id))
-    container_inst_workspace = str(os.path.join(config.workspace_mount_path_in_sandbox, inst_id))
+    container_inst_workspace = str(
+        os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
+    )
     if os.path.exists(instance_workspace):
         shutil.rmtree(instance_workspace)
     os.makedirs(instance_workspace, exist_ok=True)
@@ -149,9 +151,7 @@ def process_instance(
     state: State = asyncio.run(
         main(
             instruction,
-            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
-                agent_class
-            ),
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
             sandbox=sandbox,
         )
     )
@@ -215,6 +215,7 @@ def process_instance(
     histories = [
         (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
     ]
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -223,6 +224,7 @@ def process_instance(
         'instruction': instruction,
         'metadata': metadata,
         'history': histories,
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': {
             'agent_answer': agent_answer,
@@ -232,6 +232,7 @@ def process_instance(
     # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
     if state is None:
         raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -241,6 +242,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }
@@ -177,6 +177,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'model_answer': model_answer,
         'ground_truth': instance['Final answer'],
     }
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -187,6 +188,7 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }
@@ -221,6 +221,7 @@ def process_instance(
     # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
     if state is None:
         raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -230,6 +231,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': test_result,
     }
@@ -204,8 +204,8 @@ def process_instance(
     instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')

     sandbox = DockerSSHBox()
-    exit_code, command_output = sandbox.execute(f'pip install scitools-pyke')
+    exit_code, command_output = sandbox.execute('pip install scitools-pyke')

     # Here's how you can run the agent (similar to the `main` function) and get the final task state
     state: State = asyncio.run(
         main(
@@ -230,13 +230,16 @@ def process_instance(
         if str(obs.content) in ["'A'", "'B'", "'C'"]:
             final_message = obs.content
             break

     final_message = final_message.strip("'")
-    logger.info(f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}')
+    logger.info(
+        f'Predicted answer: {final_message}, Ground truth: {instance["answer"]}'
+    )

     test_result = get_test_result(
         model_answer=final_message, ground_truth=instance['answer']
     )
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
@@ -247,6 +250,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'final_message': final_message,
         'messages': messages,
         'error': state.error if state and state.error else None,
@@ -254,10 +258,10 @@ def process_instance(
     }
     config.workspace_mount_path = old_workspace_mount_path
     config.workspace_base = old_workspace_base

     # Close the sandbox
     sandbox.close()

     return output
@@ -272,7 +276,7 @@ if __name__ == '__main__':
     parser.add_argument(
         '--data_split',
         type=str,
-        help='data split to evaluate on {validation}', # right now we only support validation split
+        help='data split to evaluate on {validation}',  # right now we only support validation split
         default='validation',
     )
@@ -313,7 +317,7 @@ if __name__ == '__main__':
         'logic_reasoning',
         agent_class,
         dataset_name,
-        model_name + '_maxiter_' + str(max_iterations) + eval_note
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
     )

     pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
@@ -414,23 +418,25 @@ if __name__ == '__main__':
     cleanup()

     output_fp.close()

     with open(output_file, 'r') as f:
-        test_result = [(json.loads(line))["test_result"]["result"] for line in f]
+        test_result = [(json.loads(line))['test_result']['result'] for line in f]

     metadata = {
-        "Dataset": dataset_name,
-        "Data split": data_split,
-        "Number of Samples": len(test_result),
+        'Dataset': dataset_name,
+        'Data split': data_split,
+        'Number of Samples': len(test_result),
         'Agent class': agent_class,
         'Model name': model_name,
         'Start_time': start_time,
-        "End_time": time.strftime('%Y-%m-%d %H:%M:%S'),
-        "Final Accuracy": f"{sum(test_result)/len(test_result):.2f}",
-    }
+        'End_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        'Final Accuracy': f'{sum(test_result)/len(test_result):.2f}',
+    }

     with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
         json.dump(metadata, f, indent=4)

     logger.info(f'Metadata: {json.dumps(metadata, indent=4)}')
-    logger.info(f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json')
+    logger.info(
+        f'Evaluation finished. Metadata saved to {eval_output_dir}/metadata.json'
+    )
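With 'metrics' now written for every instance, a run's total LLM cost can be summed from the output file in the same way the final accuracy is computed above. A minimal sketch, assuming the serialized metrics dict exposes an 'accumulated_cost' key (the exact key names depend on what state.metrics.get() returns):

    import json

    output_file = 'output.jsonl'  # hypothetical path; use the run's actual output file
    total_cost = 0.0
    with open(output_file, 'r') as f:
        for line in f:
            # 'accumulated_cost' is an assumed key name for the per-instance LLM cost
            metrics = json.loads(line).get('metrics') or {}
            total_cost += metrics.get('accumulated_cost', 0.0)
    print(f'Total accumulated cost across instances: {total_cost:.4f}')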
@@ -172,6 +172,8 @@ def process_instance(
     task_state = state.task_state
     logger.info('Task state: ' + str(task_state.to_dict()))
+
+    metrics = state.metrics.get() if state.metrics else None

     # Save the output
     output = {
         'id': instance.task_id,
@@ -181,6 +183,7 @@ def process_instance(
         'history': [
             (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
         ],
+        'metrics': metrics,
         'error': state.error if state and state.error else None,
         'test_result': task_state.success if task_state else False,
     }