fix(eval): iterative evaluation improvements; SWE-Bench multimodal fixes (#7739)

Co-authored-by: Juan Michelini <juan@juan.com.uy>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: openhands <openhands@all-hands.dev>
Author: Xingyao Wang
Date: 2025-04-08 11:44:03 -07:00 (committed by GitHub)
Parent: d1851cc3ee
Commit: ddda30d9b7
5 changed files with 46 additions and 26 deletions

View File

@@ -28,7 +28,7 @@ def get_resource_mapping(dataset_name: str) -> dict[str, float]:
         with open(file_path, 'r') as f:
             _global_resource_mapping[dataset_name] = json.load(f)
-        logger.info(f'Loaded resource mapping for {dataset_name}')
+        logger.debug(f'Loaded resource mapping for {dataset_name}')
     return _global_resource_mapping[dataset_name]
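For context, `get_resource_mapping` lazy-loads a per-dataset JSON file into a module-level cache, so this message fires once per dataset and now only at debug level. A minimal sketch of that caching pattern (the file layout and surrounding code are assumptions, not the repository's exact code):

    import json
    import logging
    import os

    logger = logging.getLogger(__name__)

    # Cache mapping: dataset name -> {instance_id: resource factor}
    _global_resource_mapping: dict[str, dict[str, float]] = {}

    def get_resource_mapping(dataset_name: str) -> dict[str, float]:
        if dataset_name not in _global_resource_mapping:
            # Assumed layout: one JSON file per dataset, next to this module.
            file_path = os.path.join(os.path.dirname(__file__), f'{dataset_name}.json')
            with open(file_path, 'r') as f:
                _global_resource_mapping[dataset_name] = json.load(f)
            logger.debug(f'Loaded resource mapping for {dataset_name}')
        return _global_resource_mapping[dataset_name]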

View File

@@ -121,7 +121,7 @@ Be thorough in your exploration, testing, and reasoning. It's fine if your think
     )
     if 'image_assets' in instance:
-        assets = instance['image_assets']
+        assets = json.loads(instance['image_assets'])
         assert (
             'problem_statement' in assets
         ), 'problem_statement is required in image_assets'
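This fix matters for the multimodal split, where `image_assets` arrives as a JSON-encoded string rather than a dict, so subscripting the raw value would fail. A small illustration (the asset payload shown is hypothetical):

    import json

    # The dataset row is assumed to carry the assets as a serialized string:
    instance = {'image_assets': '{"problem_statement": ["https://example.com/bug.png"]}'}

    assets = json.loads(instance['image_assets'])  # str -> dict
    assert 'problem_statement' in assets, 'problem_statement is required in image_assets'
    print(assets['problem_statement'])  # ['https://example.com/bug.png']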
@@ -146,8 +146,8 @@ def get_instance_docker_image(
         # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
         docker_image_prefix = 'docker.io/swebench/'
         repo, name = instance_id.split('__')
-        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'
-        logger.info(f'Using official SWE-Bench image: {image_name}')
+        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
+        logger.debug(f'Using official SWE-Bench image: {image_name}')
         return image_name
     else:
         # OpenHands version of the image
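The added `.lower()` is what makes multimodal instances resolvable: Docker repository names must be lowercase, while multimodal instance IDs can contain uppercase letters (the Verified/Lite Python repo names are already lowercase, so this never surfaced before). A quick illustration with a made-up instance ID:

    instance_id = 'chartjs__Chart.js-10301'  # hypothetical multimodal-style ID
    repo, name = instance_id.split('__')
    image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
    print(image_name)
    # swebench/sweb.eval.x86_64.chartjs_1776_chart.js-10301:latest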
@@ -164,10 +164,7 @@ def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
     # We use a different instance image for each instance of swe-bench eval
-    use_swebench_official_image = bool(
-        ('verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower())
-        and 'swe-gym' not in metadata.dataset.lower()
-    )
+    use_swebench_official_image = 'swe-gym' not in metadata.dataset.lower()
     base_container_image = get_instance_docker_image(
         instance['instance_id'],
         swebench_official_image=use_swebench_official_image,
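The old predicate opted only the Verified and Lite datasets into the official images; the rewrite inverts the logic so that everything except SWE-Gym, including the multimodal split, uses them. A tiny check against plausible dataset names (the names are illustrative):

    def use_official_image(dataset: str) -> bool:
        return 'swe-gym' not in dataset.lower()

    # Illustrative dataset names:
    for ds in [
        'princeton-nlp/SWE-bench_Verified',
        'princeton-nlp/SWE-bench_Multimodal',
        'SWE-Gym/SWE-Gym',
    ]:
        print(f'{ds}: {use_official_image(ds)}')
    # Multimodal now resolves to True; the old 'verified'/'lite'
    # allow-list would have returned False for it.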
@@ -334,15 +331,18 @@ def initialize_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

-    action = CmdRunAction(command='which python')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0 and 'testbed' in obs.content,
-        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
-    )
+    if 'multimodal' not in metadata.dataset.lower():
+        # Only non-multimodal datasets need the testbed Python environment;
+        # SWE-Bench multimodal datasets do not use it.
+        action = CmdRunAction(command='which python')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert_and_raise(
+            obs.exit_code == 0 and 'testbed' in obs.content,
+            f'Expected to find python interpreter from testbed, but got: {str(obs)}',
+        )

     logger.info('-' * 30)
     logger.info('END Runtime Initialization Fn')
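`assert_and_raise` is the harness's guard helper; it is presumably a small wrapper along these lines (the `EvalException` name is an assumption):

    class EvalException(Exception):
        """Raised when a runtime sanity check fails during evaluation."""

    def assert_and_raise(condition: bool, msg: str) -> None:
        # Name and shape are assumptions; the real helper may differ.
        # Unlike a bare `assert`, this survives `python -O` and raises a
        # typed exception that per-instance retry logic can catch.
        if not condition:
            raise EvalException(msg)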
@@ -761,9 +761,19 @@ if __name__ == '__main__':
                 with open(cur_output_file, 'r') as f:
                     for line in f:
                         instance = json.loads(line)
-                        history = [event_from_dict(event) for event in instance['history']]
-                        critic_result = critic.evaluate(history)
-                        if not critic_result.success:
+                        try:
+                            history = [
+                                event_from_dict(event) for event in instance['history']
+                            ]
+                            critic_result = critic.evaluate(
+                                history, instance['test_result'].get('git_patch', '')
+                            )
+                            if not critic_result.success:
+                                instances_failed.append(instance['instance_id'])
+                        except Exception as e:
+                            logger.error(
+                                f'Error loading history for instance {instance["instance_id"]}: {e}'
+                            )
                             instances_failed.append(instance['instance_id'])
             logger.info(
                 f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
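The new try/except means one corrupt or half-written JSONL record no longer aborts the whole retry pass; the instance is simply queued for another attempt. The same pattern in isolation (file name, critic, and record shape are stand-ins):

    import json
    import logging

    logger = logging.getLogger(__name__)

    def collect_failed_instances(output_file: str, critic) -> list[str]:
        """Return the instance IDs that should be retried."""
        instances_failed: list[str] = []
        with open(output_file, 'r') as f:
            for line in f:
                instance = json.loads(line)
                try:
                    result = critic.evaluate(
                        instance['history'],  # stand-in: real code rebuilds Event objects
                        instance['test_result'].get('git_patch', ''),
                    )
                    if not result.success:
                        instances_failed.append(instance['instance_id'])
                except Exception as e:
                    # Unreadable records count as failures so they are retried too.
                    logger.error(f'Error evaluating {instance["instance_id"]}: {e}')
                    instances_failed.append(instance['instance_id'])
        return instances_failed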

View File

@@ -18,6 +18,7 @@ if [[ -z "$item" ]]; then
   exit 1
 fi

 WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')
+echo "WORKSPACE_NAME: $WORKSPACE_NAME"
@@ -36,5 +37,7 @@ mkdir -p /workspace
 cp -r /testbed /workspace/$WORKSPACE_NAME

 # Activate instance-specific environment
-. /opt/miniconda3/etc/profile.d/conda.sh
-conda activate testbed
+if [ -d /opt/miniconda3 ]; then
+    . /opt/miniconda3/etc/profile.d/conda.sh
+    conda activate testbed
+fi

View File

@@ -23,9 +23,11 @@ class CriticResult(BaseModel):
 class BaseCritic(abc.ABC):
     """
-    A critic is a function that takes in a list of events and returns a score about the quality of those events.
+    A critic is a function that takes in a list of events and an optional git patch, and returns a score about the quality of those events.
     """

     @abc.abstractmethod
-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
         pass
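Because `git_patch` defaults to `None`, existing critics keep working unchanged and new ones can opt in. A hypothetical subclass showing the widened contract (the class name, import path, and scoring rule are invented for illustration; `BaseCritic` and `CriticResult` are the classes defined above):

    from openhands.events.event import Event  # import path assumed

    class TrajectoryLengthCritic(BaseCritic):
        """Toy critic: flag trajectories that run suspiciously long."""

        def evaluate(
            self, events: list[Event], git_patch: str | None = None
        ) -> CriticResult:
            # git_patch is accepted per the base signature but unused here.
            score = 1 if len(events) <= 100 else 0
            return CriticResult(score=score, message=f'{len(events)} events')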

View File

@@ -5,16 +5,21 @@ from openhands.events.action import Action, AgentFinishAction
 class AgentFinishedCritic(BaseCritic):
     """This is a simple rule-based critic that checks if the last event is an AgentFinishAction.
     If not, it will return a score of 0 and a message indicating that the agent did not finish.
+    If the git patch is provided and is empty, it will return a score of 0 and a message indicating that the git patch is empty.
     """

     def __init__(self):
         pass

-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
         last_action = next((h for h in reversed(events) if isinstance(h, Action)), None)
+        if git_patch is not None and len(git_patch.strip()) == 0:
+            return CriticResult(score=0, message='Git patch is empty.')
         if isinstance(last_action, AgentFinishAction):
             return CriticResult(score=1, message='Agent finished.')
         else:
             return CriticResult(score=0, message='Agent did not finish.')
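At the call site, the empty-patch check returns before the finish check, so a run that "finished" without producing any diff is still scored as a failure. A usage sketch (event construction simplified):

    critic = AgentFinishedCritic()

    # An all-whitespace patch fails regardless of how the run ended:
    result = critic.evaluate(events=[], git_patch='   \n')
    print(result.score, result.message)  # 0 Git patch is empty.

    # With a real patch, the score depends on the final action:
    result = critic.evaluate(events=[AgentFinishAction()], git_patch='diff --git a/x b/x')
    print(result.score, result.message)  # 1 Agent finished.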