fix(eval): iterative evaluation improvements; SWE-Bench multimodal fixes (#7739)

Co-authored-by: Juan Michelini <juan@juan.com.uy>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: openhands <openhands@all-hands.dev>
Author: Xingyao Wang
Date: 2025-04-08 11:44:03 -07:00 (committed by GitHub)
Parent: d1851cc3ee
Commit: ddda30d9b7
5 changed files with 46 additions and 26 deletions

View File

@@ -28,7 +28,7 @@ def get_resource_mapping(dataset_name: str) -> dict[str, float]:
         with open(file_path, 'r') as f:
             _global_resource_mapping[dataset_name] = json.load(f)
-        logger.info(f'Loaded resource mapping for {dataset_name}')
+        logger.debug(f'Loaded resource mapping for {dataset_name}')
     return _global_resource_mapping[dataset_name]
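For context, `get_resource_mapping` lazy-loads a per-dataset JSON file into a module-level cache, so this message fires once per dataset and now only at debug level. A minimal sketch of that caching pattern (the file layout and surrounding code are assumptions, not the repository's exact code):

    import json
    import logging
    import os

    logger = logging.getLogger(__name__)

    # Cache mapping: dataset name -> {instance_id: resource factor}
    _global_resource_mapping: dict[str, dict[str, float]] = {}

    def get_resource_mapping(dataset_name: str) -> dict[str, float]:
        if dataset_name not in _global_resource_mapping:
            # Assumed layout: one JSON file per dataset, next to this module.
            file_path = os.path.join(os.path.dirname(__file__), f'{dataset_name}.json')
            with open(file_path, 'r') as f:
                _global_resource_mapping[dataset_name] = json.load(f)
            logger.debug(f'Loaded resource mapping for {dataset_name}')
        return _global_resource_mapping[dataset_name]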

View File

@@ -121,7 +121,7 @@ Be thorough in your exploration, testing, and reasoning. It's fine if your think
     )
     if 'image_assets' in instance:
-        assets = instance['image_assets']
+        assets = json.loads(instance['image_assets'])
         assert (
             'problem_statement' in assets
         ), 'problem_statement is required in image_assets'
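This fix matters for the multimodal split, where `image_assets` arrives as a JSON-encoded string rather than a dict, so subscripting the raw value would fail. A small illustration (the asset payload shown is hypothetical):

    import json

    # The dataset row is assumed to carry the assets as a serialized string:
    instance = {'image_assets': '{"problem_statement": ["https://example.com/bug.png"]}'}

    assets = json.loads(instance['image_assets'])  # str -> dict
    assert 'problem_statement' in assets, 'problem_statement is required in image_assets'
    print(assets['problem_statement'])  # ['https://example.com/bug.png']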
@@ -146,8 +146,8 @@ def get_instance_docker_image(
         # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
         docker_image_prefix = 'docker.io/swebench/'
         repo, name = instance_id.split('__')
-        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'
-        logger.info(f'Using official SWE-Bench image: {image_name}')
+        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
+        logger.debug(f'Using official SWE-Bench image: {image_name}')
         return image_name
     else:
         # OpenHands version of the image
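The added `.lower()` is what makes multimodal instances resolvable: Docker repository names must be lowercase, while multimodal instance IDs can contain uppercase letters (the Verified/Lite Python repo names are already lowercase, so this never surfaced before). A quick illustration with a made-up instance ID:

    instance_id = 'chartjs__Chart.js-10301'  # hypothetical multimodal-style ID
    repo, name = instance_id.split('__')
    image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
    print(image_name)
    # swebench/sweb.eval.x86_64.chartjs_1776_chart.js-10301:latest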
@@ -164,10 +164,7 @@ def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
     # We use a different instance image for each instance of swe-bench eval
-    use_swebench_official_image = bool(
-        ('verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower())
-        and 'swe-gym' not in metadata.dataset.lower()
-    )
+    use_swebench_official_image = 'swe-gym' not in metadata.dataset.lower()
     base_container_image = get_instance_docker_image(
         instance['instance_id'],
         swebench_official_image=use_swebench_official_image,
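The old predicate opted only the Verified and Lite datasets into the official images; the rewrite inverts the logic so that everything except SWE-Gym, including the multimodal split, uses them. A tiny check against plausible dataset names (the names are illustrative):

    def use_official_image(dataset: str) -> bool:
        return 'swe-gym' not in dataset.lower()

    # Illustrative dataset names:
    for ds in [
        'princeton-nlp/SWE-bench_Verified',
        'princeton-nlp/SWE-bench_Multimodal',
        'SWE-Gym/SWE-Gym',
    ]:
        print(f'{ds}: {use_official_image(ds)}')
    # Multimodal now resolves to True; the old 'verified'/'lite'
    # allow-list would have returned False for it.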
@@ -334,15 +331,18 @@ def initialize_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

-    action = CmdRunAction(command='which python')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0 and 'testbed' in obs.content,
-        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
-    )
+    if 'multimodal' not in metadata.dataset.lower():
+        # Only non-multimodal datasets need the testbed Python environment;
+        # SWE-Bench multimodal datasets do not use it.
+        action = CmdRunAction(command='which python')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert_and_raise(
+            obs.exit_code == 0 and 'testbed' in obs.content,
+            f'Expected to find python interpreter from testbed, but got: {str(obs)}',
+        )

     logger.info('-' * 30)
     logger.info('END Runtime Initialization Fn')
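`assert_and_raise` is the harness's guard helper; it is presumably a small wrapper along these lines (the `EvalException` name is an assumption):

    class EvalException(Exception):
        """Raised when a runtime sanity check fails during evaluation."""

    def assert_and_raise(condition: bool, msg: str) -> None:
        # Name and shape are assumptions; the real helper may differ.
        # Unlike a bare `assert`, this survives `python -O` and raises a
        # typed exception that per-instance retry logic can catch.
        if not condition:
            raise EvalException(msg)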
@@ -761,9 +761,19 @@ if __name__ == '__main__':
                 with open(cur_output_file, 'r') as f:
                     for line in f:
                         instance = json.loads(line)
-                        history = [event_from_dict(event) for event in instance['history']]
-                        critic_result = critic.evaluate(history)
-                        if not critic_result.success:
+                        try:
+                            history = [
+                                event_from_dict(event) for event in instance['history']
+                            ]
+                            critic_result = critic.evaluate(
+                                history, instance['test_result'].get('git_patch', '')
+                            )
+                            if not critic_result.success:
+                                instances_failed.append(instance['instance_id'])
+                        except Exception as e:
+                            logger.error(
+                                f'Error loading history for instance {instance["instance_id"]}: {e}'
+                            )
                             instances_failed.append(instance['instance_id'])
             logger.info(
                 f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
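The new try/except means one corrupt or half-written JSONL record no longer aborts the whole retry pass; the instance is simply queued for another attempt. The same pattern in isolation (file name, critic, and record shape are stand-ins):

    import json
    import logging

    logger = logging.getLogger(__name__)

    def collect_failed_instances(output_file: str, critic) -> list[str]:
        """Return the instance IDs that should be retried."""
        instances_failed: list[str] = []
        with open(output_file, 'r') as f:
            for line in f:
                instance = json.loads(line)
                try:
                    result = critic.evaluate(
                        instance['history'],  # stand-in: real code rebuilds Event objects
                        instance['test_result'].get('git_patch', ''),
                    )
                    if not result.success:
                        instances_failed.append(instance['instance_id'])
                except Exception as e:
                    # Unreadable records count as failures so they are retried too.
                    logger.error(f'Error evaluating {instance["instance_id"]}: {e}')
                    instances_failed.append(instance['instance_id'])
        return instances_failed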

View File

@@ -18,6 +18,7 @@ if [[ -z "$item" ]]; then
   exit 1
 fi

 WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')
+echo "WORKSPACE_NAME: $WORKSPACE_NAME"
@@ -36,5 +37,7 @@ mkdir -p /workspace
 cp -r /testbed /workspace/$WORKSPACE_NAME

 # Activate instance-specific environment
-. /opt/miniconda3/etc/profile.d/conda.sh
-conda activate testbed
+if [ -d /opt/miniconda3 ]; then
+    . /opt/miniconda3/etc/profile.d/conda.sh
+    conda activate testbed
+fi

View File

@@ -23,9 +23,11 @@ class CriticResult(BaseModel):
 class BaseCritic(abc.ABC):
     """
-    A critic is a function that takes in a list of events and returns a score about the quality of those events.
+    A critic is a function that takes in a list of events and an optional git patch, and returns a score about the quality of those events.
     """

     @abc.abstractmethod
-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
         pass
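Because `git_patch` defaults to `None`, existing critics keep working unchanged and new ones can opt in. A hypothetical subclass showing the widened contract (the class name, import path, and scoring rule are invented for illustration; `BaseCritic` and `CriticResult` are the classes defined above):

    from openhands.events.event import Event  # import path assumed

    class TrajectoryLengthCritic(BaseCritic):
        """Toy critic: flag trajectories that run suspiciously long."""

        def evaluate(
            self, events: list[Event], git_patch: str | None = None
        ) -> CriticResult:
            # git_patch is accepted per the base signature but unused here.
            score = 1 if len(events) <= 100 else 0
            return CriticResult(score=score, message=f'{len(events)} events')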

View File

@@ -5,16 +5,21 @@ from openhands.events.action import Action, AgentFinishAction
 class AgentFinishedCritic(BaseCritic):
     """This is a simple rule-based critic that checks if the last event is an AgentFinishAction.
     If not, it will return a score of 0 and a message indicating that the agent did not finish.
+    If the git patch is provided and is empty, it will return a score of 0 and a message indicating that the git patch is empty.
     """

     def __init__(self):
         pass

-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
         last_action = next((h for h in reversed(events) if isinstance(h, Action)), None)
+        if git_patch is not None and len(git_patch.strip()) == 0:
+            return CriticResult(score=0, message='Git patch is empty.')
         if isinstance(last_action, AgentFinishAction):
             return CriticResult(score=1, message='Agent finished.')
         else:
             return CriticResult(score=0, message='Agent did not finish.')
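At the call site, the empty-patch check returns before the finish check, so a run that "finished" without producing any diff is still scored as a failure. A usage sketch (event construction simplified):

    critic = AgentFinishedCritic()

    # An all-whitespace patch fails regardless of how the run ended:
    result = critic.evaluate(events=[], git_patch='   \n')
    print(result.score, result.message)  # 0 Git patch is empty.

    # With a real patch, the score depends on the final action:
    result = critic.evaluate(events=[AgentFinishAction()], git_patch='diff --git a/x b/x')
    print(result.score, result.message)  # 1 Agent finished.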