fix(eval): iterative evaluation improvements; SWE-Bench multimodal fixes (#7739)

Co-authored-by: Juan Michelini <juan@juan.com.uy> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: openhands <openhands@all-hands.dev>
2026-03-22 13:47:19 +08:00 · 2025-04-08 11:44:03 -07:00
parent d1851cc3ee
commit ddda30d9b7
5 changed files with 46 additions and 26 deletions
--- a/openhands/critic/base.py
+++ b/openhands/critic/base.py
@@ -23,9 +23,11 @@ class CriticResult(BaseModel):

 class BaseCritic(abc.ABC):
    """
-    A critic is a function that takes in a list of events and returns a score about the quality of those events.
+    A critic is a function that takes in a list of events and an optional git patch, and returns a score about the quality of those events.
    """

    @abc.abstractmethod
-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
        pass
--- a/openhands/critic/finish_critic.py
+++ b/openhands/critic/finish_critic.py
@@ -5,16 +5,21 @@ from openhands.events.action import Action, AgentFinishAction

 class AgentFinishedCritic(BaseCritic):
    """This is a simple rule-based critic that checks if the last event is an AgentFinishAction.
-
    If not, it will return a score of 0 and a message indicating that the agent did not finish.
+    If the git patch is provided and is empty, it will return a score of 0 and a message indicating that the git patch is empty.
    """

    def __init__(self):
        pass

-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
        last_action = next((h for h in reversed(events) if isinstance(h, Action)), None)

+        if git_patch is not None and len(git_patch.strip()) == 0:
+            return CriticResult(score=0, message='Git patch is empty.')
+
        if isinstance(last_action, AgentFinishAction):
            return CriticResult(score=1, message='Agent finished.')
        else: