mirror of
https://github.com/OpenHands/OpenHands.git
synced 2026-03-22 13:47:19 +08:00
fix(eval): iterative evaluation improvements; SWE-Bench multimodal fixes (#7739)
Co-authored-by: Juan Michelini <juan@juan.com.uy>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
@@ -23,9 +23,11 @@ class CriticResult(BaseModel):
|
||||
|
||||
class BaseCritic(abc.ABC):
|
||||
"""
|
||||
A critic is a function that takes in a list of events and returns a score about the quality of those events.
|
||||
A critic is a function that takes in a list of events, optional git patch, and returns a score about the quality of those events.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def evaluate(self, events: list[Event]) -> CriticResult:
|
||||
def evaluate(
|
||||
self, events: list[Event], git_patch: str | None = None
|
||||
) -> CriticResult:
|
||||
pass
|
||||
|
||||
@@ -5,16 +5,21 @@ from openhands.events.action import Action, AgentFinishAction
|
||||
|
||||
class AgentFinishedCritic(BaseCritic):
|
||||
"""This is a simple rule-based critic that checks if the last event is an AgentFinishAction.
|
||||
|
||||
If not, it will return a score of 0 and a message indicating that the agent did not finish.
|
||||
If the git patch is provided and is empty, it will return a score of 0 and a message indicating that the git patch is empty.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def evaluate(self, events: list[Event]) -> CriticResult:
|
||||
def evaluate(
|
||||
self, events: list[Event], git_patch: str | None = None
|
||||
) -> CriticResult:
|
||||
last_action = next((h for h in reversed(events) if isinstance(h, Action)), None)
|
||||
|
||||
if git_patch is not None and len(git_patch.strip()) == 0:
|
||||
return CriticResult(score=0, message='Git patch is empty.')
|
||||
|
||||
if isinstance(last_action, AgentFinishAction):
|
||||
return CriticResult(score=1, message='Agent finished.')
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user