fix(eval): iterative evaluation improvements; SWE-Bench multimodal fixes (#7739)

Co-authored-by: Juan Michelini <juan@juan.com.uy>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
Xingyao Wang
2025-04-08 11:44:03 -07:00
committed by GitHub
parent d1851cc3ee
commit ddda30d9b7
5 changed files with 46 additions and 26 deletions

View File

@@ -23,9 +23,11 @@ class CriticResult(BaseModel):
class BaseCritic(abc.ABC):
    """Abstract interface for critics.

    A critic is a function that takes in a list of events and, optionally,
    a git patch, and returns a score about the quality of those events.
    """

    @abc.abstractmethod
    def evaluate(
        self, events: list[Event], git_patch: str | None = None
    ) -> CriticResult:
        """Score the given events.

        Args:
            events: The events to evaluate.
            git_patch: Optional git patch supplied alongside the events.
                Defaults to None, so callers that pass only ``events``
                keep working.

        Returns:
            A CriticResult for the given events.
        """
        pass

View File

@@ -5,16 +5,21 @@ from openhands.events.action import Action, AgentFinishAction
class AgentFinishedCritic(BaseCritic):
"""This is a simple rule-based critic that checks if the last event is an AgentFinishAction.
If not, it will return a score of 0 and a message indicating that the agent did not finish.
If the git patch is provided and is empty, it will return a score of 0 and a message indicating that the git patch is empty.
"""
def __init__(self):
pass
def evaluate(self, events: list[Event]) -> CriticResult:
def evaluate(
self, events: list[Event], git_patch: str | None = None
) -> CriticResult:
last_action = next((h for h in reversed(events) if isinstance(h, Action)), None)
if git_patch is not None and len(git_patch.strip()) == 0:
return CriticResult(score=0, message='Git patch is empty.')
if isinstance(last_action, AgentFinishAction):
return CriticResult(score=1, message='Agent finished.')
else: