From 91d3d1d20aa22bb5e34320a8a3daf98862718c69 Mon Sep 17 00:00:00 2001
From: Engel Nyst
Date: Thu, 21 Aug 2025 17:43:09 +0200
Subject: [PATCH] Fix: expose aggregated LLM metrics in State for evaluation
 scripts (#10537)

Co-authored-by: openhands
---
 evaluation/benchmarks/EDA/run_infer.py        |   3 +-
 .../benchmarks/agent_bench/run_infer.py       |   3 +-
 .../benchmarks/aider_bench/run_infer.py       |   3 +-
 evaluation/benchmarks/biocoder/run_infer.py   |   3 +-
 evaluation/benchmarks/bird/run_infer.py       |   3 +-
 .../browsing_delegation/run_infer.py          |   3 +-
 evaluation/benchmarks/commit0/run_infer.py    |   3 +-
 .../benchmarks/discoverybench/run_infer.py    |   3 +-
 evaluation/benchmarks/gaia/run_infer.py       |   3 +-
 evaluation/benchmarks/gorilla/run_infer.py    |   3 +-
 evaluation/benchmarks/gpqa/run_infer.py       |   3 +-
 .../benchmarks/humanevalfix/run_infer.py      |   3 +-
 .../lca_ci_build_repair/run_infer.py          |   3 +-
 .../benchmarks/logic_reasoning/run_infer.py   |   3 +-
 evaluation/benchmarks/miniwob/run_infer.py    |   3 +-
 evaluation/benchmarks/mint/run_infer.py       |   3 +-
 evaluation/benchmarks/ml_bench/run_infer.py   |   3 +-
 .../benchmarks/scienceagentbench/run_infer.py |   3 +-
 .../swe_bench/run_infer_interact.py           |   3 +-
 evaluation/benchmarks/toolqa/run_infer.py     |   3 +-
 .../benchmarks/visualwebarena/run_infer.py    |   3 +-
 evaluation/benchmarks/webarena/run_infer.py   |   3 +-
 evaluation/integration_tests/run_infer.py     |   3 +-
 evaluation/utils/shared.py                    |  19 +-
 tests/unit/test_state_metrics_exposure.py     | 205 ++++++++++++++++++
 25 files changed, 268 insertions(+), 25 deletions(-)
 create mode 100644 tests/unit/test_state_metrics_exposure.py

diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py
index 649b72b11e..ff25172722 100644
--- a/evaluation/benchmarks/EDA/run_infer.py
+++ b/evaluation/benchmarks/EDA/run_infer.py
@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -146,7 +147,7 @@ def process_instance(
     logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')

     test_result = game.reward()
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
index 0452cbe2be..562df0024a 100644
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -18,6 +18,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -273,7 +274,7 @@ def process_instance(
     # remove when it becomes unnecessary
     histories = compatibility_for_eval_history_pairs(state.history)

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # Save the output
     output = EvalOutput(
diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
index 20f4cb99a4..338315747d 100644
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -246,7 +247,7 @@ def process_instance(
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
     histories = compatibility_for_eval_history_pairs(state.history)
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # Save the output
     output = EvalOutput(
diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py
index 4172e7d873..3dbc632a1b 100644
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -294,7 +295,7 @@ def process_instance(
         raise ValueError('State should not be None.')

     test_result = complete_runtime(runtime, instance)
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
index 82b8ae848e..71886a0406 100644
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@@ -18,6 +18,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -422,7 +423,7 @@ def process_instance(
     # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
     if state is None:
         raise ValueError('State should not be None.')
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py
index c3990f9e01..e6eb259d86 100644
--- a/evaluation/benchmarks/browsing_delegation/run_infer.py
+++ b/evaluation/benchmarks/browsing_delegation/run_infer.py
@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -88,7 +89,7 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
diff --git a/evaluation/benchmarks/commit0/run_infer.py b/evaluation/benchmarks/commit0/run_infer.py
index 11dcb274b4..bad4a735cd 100644
--- a/evaluation/benchmarks/commit0/run_infer.py
+++ b/evaluation/benchmarks/commit0/run_infer.py
@@ -16,6 +16,7 @@ from evaluation.utils.shared import (
     assert_and_raise,
     codeact_user_response,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -480,7 +481,7 @@ def process_instance(
     # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
     histories = [event_to_dict(event) for event in state.history]

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # Save the output
     output = EvalOutput(
diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py
index e783268e98..448bc18e7b 100644
--- a/evaluation/benchmarks/discoverybench/run_infer.py
+++ b/evaluation/benchmarks/discoverybench/run_infer.py
@@ -17,6 +17,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -294,7 +295,7 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)
     test_result = complete_runtime(state)

     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py
index f9841b9566..a7e69489c3 100644
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@@ -22,6 +22,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -269,7 +270,7 @@ Here is the task:
         'model_answer': model_answer,
         'ground_truth': instance['Final answer'],
     }
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py
index 79e5fffdc6..87a71b4f00 100644
--- a/evaluation/benchmarks/gorilla/run_infer.py
+++ b/evaluation/benchmarks/gorilla/run_infer.py
@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -108,7 +109,7 @@ def process_instance(
     # attempt to parse model_answer
     ast_eval_fn = instance['ast_eval']
     correct, hallucination = ast_eval_fn(instance_id, model_answer_raw)
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)
     logger.info(
         f'Final message: {model_answer_raw} | Correctness: {correct} | Hallucination: {hallucination}'
     )
diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py
index cb1bbd68b1..28ac39588a 100644
--- a/evaluation/benchmarks/gpqa/run_infer.py
+++ b/evaluation/benchmarks/gpqa/run_infer.py
@@ -30,6 +30,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -292,7 +293,7 @@ Ok now its time to start solving the question. Good luck!
     if state is None:
         raise ValueError('State should not be None.')

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # Save the output
     output = EvalOutput(
diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py
index 9d881fb29e..8cf8b7efa1 100644
--- a/evaluation/benchmarks/humanevalfix/run_infer.py
+++ b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -23,6 +23,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -248,7 +249,7 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)
     test_result = complete_runtime(runtime, instance)

     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
diff --git a/evaluation/benchmarks/lca_ci_build_repair/run_infer.py b/evaluation/benchmarks/lca_ci_build_repair/run_infer.py
index 1dba49413d..cbb53259ba 100644
--- a/evaluation/benchmarks/lca_ci_build_repair/run_infer.py
+++ b/evaluation/benchmarks/lca_ci_build_repair/run_infer.py
@@ -22,6 +22,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -335,7 +336,7 @@ Be thorough in your exploration, testing, and reasoning. It's fine if your think
         )
     )
     assert state is not None
-    metrics = state.metrics.get() if state.metrics else {}
+    metrics = get_metrics(state)

     test_result = complete_runtime(runtime, instance)

diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py
index eb4342f44a..23fa4e1d88 100644
--- a/evaluation/benchmarks/logic_reasoning/run_infer.py
+++ b/evaluation/benchmarks/logic_reasoning/run_infer.py
@@ -10,6 +10,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -247,7 +248,7 @@ def process_instance(
     )
     test_result['final_message'] = final_message

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
diff --git a/evaluation/benchmarks/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py
index a96043e6eb..ef2fdc1412 100644
--- a/evaluation/benchmarks/miniwob/run_infer.py
+++ b/evaluation/benchmarks/miniwob/run_infer.py
@@ -13,6 +13,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -174,7 +175,7 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # Instruction is the first message from the USER
     instruction = ''
diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
index b7fac0e44d..72031e8d7b 100644
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -15,6 +15,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -205,7 +206,7 @@ def process_instance(
         task_state = state.extra_data['task_state']
         logger.info('Task state: ' + str(task_state.to_dict()))

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py
index 1b4b094520..32a8f76ca9 100644
--- a/evaluation/benchmarks/ml_bench/run_infer.py
+++ b/evaluation/benchmarks/ml_bench/run_infer.py
@@ -26,6 +26,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -250,7 +251,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
         )
     )
     assert state is not None
-    metrics = state.metrics.get() if state.metrics else {}
+    metrics = get_metrics(state)

     test_result = complete_runtime(runtime)

diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py
index c346ac1da1..76b937308b 100644
--- a/evaluation/benchmarks/scienceagentbench/run_infer.py
+++ b/evaluation/benchmarks/scienceagentbench/run_infer.py
@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -218,7 +219,7 @@ If the program uses some packages that are incompatible, please figure out alter
     # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
     if state is None:
         raise ValueError('State should not be None.')
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
diff --git a/evaluation/benchmarks/swe_bench/run_infer_interact.py b/evaluation/benchmarks/swe_bench/run_infer_interact.py
index c97a2d6b3f..fee0c8e187 100755
--- a/evaluation/benchmarks/swe_bench/run_infer_interact.py
+++ b/evaluation/benchmarks/swe_bench/run_infer_interact.py
@@ -21,6 +21,7 @@ from evaluation.utils.shared import (
     EvalException,
     EvalMetadata,
     EvalOutput,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -179,7 +180,7 @@ def process_instance(
         raise ValueError('State should not be None.')

     histories = [event_to_dict(event) for event in state.history]
-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # Save the output
     instruction = message_action.content
diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py
index 4db988efa1..8353f76f77 100644
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -11,6 +11,7 @@ from evaluation.utils.shared import (
     codeact_user_response,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -134,7 +135,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     correct = eval_answer(str(model_answer_raw), str(answer))
     logger.info(f'Final message: {model_answer_raw} | Correctness: {correct}')

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
diff --git a/evaluation/benchmarks/visualwebarena/run_infer.py b/evaluation/benchmarks/visualwebarena/run_infer.py
index e88f76c1bd..ef45663e6a 100644
--- a/evaluation/benchmarks/visualwebarena/run_infer.py
+++ b/evaluation/benchmarks/visualwebarena/run_infer.py
@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -179,7 +180,7 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # Instruction obtained from the first message from the USER
     instruction = ''
diff --git a/evaluation/benchmarks/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py
index 717579ff26..316dba63de 100644
--- a/evaluation/benchmarks/webarena/run_infer.py
+++ b/evaluation/benchmarks/webarena/run_infer.py
@@ -12,6 +12,7 @@ from evaluation.utils.shared import (
     EvalOutput,
     compatibility_for_eval_history_pairs,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -163,7 +164,7 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')

-    metrics = state.metrics.get() if state.metrics else None
+    metrics = get_metrics(state)

     # Instruction is the first message from the USER
     instruction = ''
diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index 845ef1c728..c493cc173a 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -9,6 +9,7 @@ from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
     get_default_sandbox_config_for_eval,
+    get_metrics,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -135,7 +136,7 @@ def process_instance(
         assert len(histories) > 0, 'History should not be empty'

         test_result: TestResult = test_class.verify_result(runtime, histories)
-        metrics = state.metrics.get() if state.metrics else None
+        metrics = get_metrics(state)

     finally:
         runtime.close()
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index f88dc6c974..76ed563ec2 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -668,8 +668,23 @@ def is_fatal_runtime_error(error: str | None) -> bool:


 def get_metrics(state: State) -> dict[str, Any]:
-    """Extract metrics from the state."""
-    metrics = state.metrics.get() if state.metrics else {}
+    """Extract metrics for evaluations.
+
+    Prefer ConversationStats (source of truth) and fall back to state.metrics for
+    backward compatibility.
+    """
+    metrics: dict[str, Any]
+    try:
+        if getattr(state, 'conversation_stats', None):
+            combined = state.conversation_stats.get_combined_metrics()
+            metrics = combined.get()
+        elif getattr(state, 'metrics', None):
+            metrics = state.metrics.get()
+        else:
+            metrics = {}
+    except Exception:
+        metrics = state.metrics.get() if getattr(state, 'metrics', None) else {}
+
     metrics['condenser'] = get_condensation_metadata(state)
     return metrics

diff --git a/tests/unit/test_state_metrics_exposure.py b/tests/unit/test_state_metrics_exposure.py
new file mode 100644
index 0000000000..a6ec4b23e0
--- /dev/null
+++ b/tests/unit/test_state_metrics_exposure.py
@@ -0,0 +1,205 @@
+import asyncio
+from unittest.mock import patch
+
+import pytest
+
+from openhands.core.config import OpenHandsConfig
+from openhands.events.action import MessageAction
+from openhands.llm.metrics import Metrics
+
+
+class FakeEventStream:
+    def __init__(self):
+        self.sid = 'test-sid'
+        self.file_store = None
+        self.user_id = None
+
+    def add_event(self, *args, **kwargs):
+        pass
+
+    def subscribe(self, *args, **kwargs):
+        pass
+
+    def close(self):
+        pass
+
+
+class FakeRuntime:
+    def __init__(self):
+        self.event_stream = FakeEventStream()
+
+    async def connect(self):
+        return None
+
+    def close(self):
+        pass
+
+
+class DummyState:
+    def __init__(self, conversation_stats):
+        self.conversation_stats = conversation_stats
+        self.metrics = Metrics()
+        self.history = []
+        self.last_error = ''
+        self.extra_data = {}
+
+
+class FakeController:
+    def __init__(self, state):
+        self._state = state
+
+    def get_state(self):
+        return self._state
+
+    async def close(self, set_stop_state: bool = False):
+        return None
+
+    def get_trajectory(self, include_screenshots: bool = False):
+        return []
+
+
+class FakeConversationStats:
+    def __init__(self, cost: float = 1.23):
+        self._m = Metrics()
+        self._m.add_cost(cost)
+
+    def get_combined_metrics(self) -> Metrics:
+        return self._m
+
+
+def test_state_tracker_save_state_consolidates_metrics(tmp_path):
+    """Ensure StateTracker.save_state persists ConversationStats and does not touch State.metrics.
+
+    Eval scripts should read from state.conversation_stats via evaluation.utils.shared.get_metrics.
+    """
+    from openhands.controller.state.state_tracker import StateTracker
+    from openhands.server.services.conversation_stats import ConversationStats
+    from openhands.storage.memory import InMemoryFileStore
+
+    # Prepare conversation stats with one service metrics
+    store = InMemoryFileStore({})
+    conv_stats = ConversationStats(
+        file_store=store, conversation_id='cid', user_id=None
+    )
+    m = Metrics()
+    m.add_cost(0.5)
+    conv_stats.service_to_metrics['svc'] = m
+
+    # Create a new tracker and initialize state
+    tracker = StateTracker(sid='sid', file_store=store, user_id=None)
+    tracker.set_initial_state(
+        id='sid',
+        state=None,
+        conversation_stats=conv_stats,
+        max_iterations=1,
+        max_budget_per_task=None,
+        confirmation_mode=False,
+    )
+
+    # Preconditions
+    assert tracker.state.metrics.accumulated_cost == 0.0
+
+    # Act
+    tracker.save_state()
+
+    # Assert state.metrics unaffected (source of truth remains ConversationStats)
+    assert tracker.state.metrics.accumulated_cost == 0.0
+    # Persistence still called on ConversationStats (no exception)
+
+
+def test_run_controller_exposes_aggregated_metrics_in_state():
+    """Ensure get_metrics(state) reads from ConversationStats when available."""
+    from evaluation.utils.shared import get_metrics
+    from openhands.core.main import run_controller
+
+    cfg = OpenHandsConfig()
+    # Prevent run_controller from trying to persist state via DummyState
+    cfg.file_store = 'memory'
+
+    fake_conv_stats = FakeConversationStats(cost=2.5)
+
+    def fake_create_registry_and_conversation_stats(config, sid, _):
+        # return (llm_registry, conversation_stats, config)
+        return (None, fake_conv_stats, config)
+
+    def fake_create_agent(config, llm_registry):
+        class _AgentCfg:
+            enable_mcp = False
+
+        class _LLMCfg:
+            model = 'test-model'
+
+        class _LLM:
+            config = _LLMCfg()
+
+        class _Agent:
+            name = 'FakeAgent'
+            config = _AgentCfg()
+            llm = _LLM()
+
+        return _Agent()
+
+    def fake_create_runtime(
+        config,
+        llm_registry,
+        sid=None,
+        headless_mode=True,
+        agent=None,
+        git_provider_tokens=None,
+    ):
+        return FakeRuntime()
+
+    def fake_create_memory(
+        runtime,
+        event_stream,
+        sid,
+        selected_repository=None,
+        repo_directory=None,
+        status_callback=None,
+        conversation_instructions=None,
+        working_dir=None,
+    ):
+        return object()
+
+    def fake_create_controller(
+        agent,
+        runtime,
+        config,
+        conversation_stats,
+        headless_mode=True,
+        replay_events=None,
+    ):
+        # Return a controller that yields a DummyState with provided conversation_stats
+        state = DummyState(conversation_stats)
+        return (FakeController(state), None)
+
+    # Invoke run_controller under patch context
+    with (
+        patch(
+            'openhands.core.main.create_registry_and_conversation_stats',
+            side_effect=fake_create_registry_and_conversation_stats,
+        ),
+        patch('openhands.core.main.create_agent', side_effect=fake_create_agent),
+        patch('openhands.core.main.create_runtime', side_effect=fake_create_runtime),
+        patch('openhands.core.main.create_memory', side_effect=fake_create_memory),
+        patch(
+            'openhands.core.main.create_controller', side_effect=fake_create_controller
+        ),
+        patch(
+            'openhands.core.main.run_agent_until_done',
+            side_effect=lambda *args, **kwargs: None,
+        ),
+    ):
+        state = asyncio.run(
+            run_controller(
+                config=cfg,
+                initial_user_action=MessageAction(content='hi'),
+                sid='sid',
+                fake_user_response_fn=None,
+            )
+        )
+
+    assert state is not None
+    # get_metrics must prefer conversation_stats and reflect its values
+    m = get_metrics(state)
+    assert pytest.approx(m.get('accumulated_cost', 0.0), rel=1e-6) == 2.5
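
Note for reviewers (not part of the patch): a minimal sketch of how an evaluation
script is expected to consume the new helper after this change. The import of
EvalOutput and get_metrics from evaluation.utils.shared matches the diffs above;
the build_output helper and the EvalOutput field names other than metrics are
illustrative assumptions, not taken from this diff.

    from evaluation.utils.shared import EvalOutput, get_metrics

    def build_output(instance_id, state, test_result):
        # get_metrics() prefers state.conversation_stats (combined metrics across
        # all LLM services) and falls back to state.metrics; the returned dict
        # includes keys such as 'accumulated_cost' and 'condenser'.
        metrics = get_metrics(state)
        return EvalOutput(
            instance_id=instance_id,  # assumed field name
            test_result=test_result,  # assumed field name
            metrics=metrics,
        )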