Xingyao Wang ddda30d9b7
fix(eval): iterative evaluation improvements; SWE-Bench multimodal fixes (#7739)
Co-authored-by: Juan Michelini <juan@juan.com.uy>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: openhands <openhands@all-hands.dev>
2025-04-09 02:44:03 +08:00

34 lines
675 B
Python

import abc
from pydantic import BaseModel
from openhands.events import Event
class CriticResult(BaseModel):
    """Outcome of a critic's evaluation: a numeric score plus a message."""

    score: float
    message: str

    @property
    def success(self) -> bool:
        """Report whether the agent is considered successful.

        Success means the score is at least 0.5.
        """
        # Inclusive cutoff: a score of exactly 0.5 counts as a success.
        passing_score = 0.5
        return passing_score <= self.score
class BaseCritic(abc.ABC):
    """Abstract interface for critics.

    A critic takes a list of events and an optional git patch, and returns a
    ``CriticResult`` scoring the quality of those events.
    """

    @abc.abstractmethod
    def evaluate(
        self,
        events: list[Event],
        git_patch: str | None = None,
    ) -> CriticResult:
        """Score ``events`` (and ``git_patch`` when given).

        Concrete critics must override this method; the base implementation
        does nothing and returns ``None``.
        """
        ...