From 714e46f29a1f8b649d4e93fce15415563a7982ba Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sat, 21 Sep 2024 23:39:13 -0500 Subject: [PATCH] [eval] save eventstream & llm completions for SWE-Bench run_infer (#3923) --- evaluation/swe_bench/run_infer.py | 7 +++---- evaluation/utils/shared.py | 7 ++++++- openhands/controller/agent_controller.py | 4 ++++ openhands/core/config.py | 2 ++ openhands/llm/llm.py | 18 ++++++++++++++++++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 189f234071..09fe720f83 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -30,6 +30,7 @@ from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction from openhands.events.observation import CmdOutputObservation, ErrorObservation +from openhands.events.serialization.event import event_to_dict from openhands.runtime.runtime import Runtime from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue @@ -383,10 +384,7 @@ def process_instance( if state is None: raise ValueError('State should not be None.') - # history is now available as a stream of events, rather than list of pairs of (Action, Observation) - # for compatibility with the existing output format, we can remake the pairs here - # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = [event_to_dict(event) for event in state.history.get_events()] metrics = state.metrics.get() if state.metrics else None # Save the output @@ -398,6 +396,7 @@ def process_instance( metadata=metadata, history=histories, metrics=metrics, + llm_completions=state.extra_data.get('llm_completions', []), error=state.last_error if state and state.last_error else None, ) return output diff --git a/evaluation/utils/shared.py 
b/evaluation/utils/shared.py index 6ed1833c2e..ed476580cb 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -58,7 +58,11 @@ class EvalOutput(BaseModel): # Interaction info metadata: EvalMetadata | None = None - history: list[tuple[dict[str, Any], dict[str, Any]]] | None = None + # list[tuple[dict[str, Any], dict[str, Any]]] - for compatibility with the old format + history: ( + list[dict[str, Any]] | list[tuple[dict[str, Any], dict[str, Any]]] | None + ) = None + llm_completions: list[dict[str, Any]] = [] metrics: dict[str, Any] | None = None error: str | None = None @@ -278,6 +282,7 @@ def _process_instance_wrapper( + '-' * 10 ) # Raise an error after all retries & stop the evaluation + logger.exception(e) raise RuntimeError( f'Maximum error retries reached for instance {instance.instance_id}' ) from e diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py index 25002879bf..724d2c36f3 100644 --- a/openhands/controller/agent_controller.py +++ b/openhands/controller/agent_controller.py @@ -132,6 +132,10 @@ class AgentController: async def update_state_after_step(self): # update metrics especially for cost self.state.local_metrics = self.agent.llm.metrics + if 'llm_completions' not in self.state.extra_data: + self.state.extra_data['llm_completions'] = [] + self.state.extra_data['llm_completions'].extend(self.agent.llm.llm_completions) + self.agent.llm.llm_completions.clear() async def report_error(self, message: str, exception: Exception | None = None): """Reports an error to the user and sends the exception to the LLM next step, in the hope it can self-correct. diff --git a/openhands/core/config.py b/openhands/core/config.py index b561945b5a..4f42b5cacb 100644 --- a/openhands/core/config.py +++ b/openhands/core/config.py @@ -53,6 +53,7 @@ class LLMConfig: drop_params: Drop any unmapped (unsupported) params without causing an exception. 
disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction). caching_prompt: Using the prompt caching feature provided by the LLM. + log_completions: Whether to log LLM completions to the state. """ model: str = 'gpt-4o' @@ -82,6 +83,7 @@ class LLMConfig: drop_params: bool | None = None disable_vision: bool | None = None caching_prompt: bool = False + log_completions: bool = False def defaults_to_dict(self) -> dict: """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index b2dc0e7e57..89b9d33e7d 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -1,7 +1,9 @@ import asyncio import copy +import time import warnings from functools import partial +from typing import Any from openhands.core.config import LLMConfig from openhands.runtime.utils.shutdown_listener import should_continue @@ -73,6 +75,11 @@ class LLM: self.cost_metric_supported = True self.config = copy.deepcopy(config) + # list of LLM completions (for logging purposes). 
Each completion is a dict with the following keys: + # - 'messages' / 'response': the request messages and the raw LLM response + # - 'timestamp' / 'cost': wall-clock time of the call and its computed cost + self.llm_completions: list[dict[str, Any]] = [] + # Set up config attributes with default values to prevent AttributeError LLMConfig.set_missing_attributes(self.config) @@ -257,6 +264,16 @@ class LLM: logger.debug('No completion messages!') resp = {'choices': [{'message': {'content': ''}}]} + if self.config.log_completions: + self.llm_completions.append( + { + 'messages': messages, + 'response': resp, + 'timestamp': time.time(), + 'cost': self.completion_cost(resp), + } + ) + # log the response message_back = resp['choices'][0]['message']['content'] if message_back: @@ -659,6 +676,7 @@ class LLM: def reset(self): self.metrics = Metrics() + self.llm_completions = [] def format_messages_for_llm(self, messages: Message | list[Message]) -> list[dict]: if isinstance(messages, Message):