Mirror of https://github.com/OpenHands/OpenHands.git (synced 2025-12-26 05:48:36 +08:00)
fix(eval): iterative evaluation improvements; SWE-Bench multimodal fixes (#7739)
Co-authored-by: Juan Michelini <juan@juan.com.uy>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in: parent d1851cc3ee, commit ddda30d9b7
@@ -28,7 +28,7 @@ def get_resource_mapping(dataset_name: str) -> dict[str, float]:
 
         with open(file_path, 'r') as f:
             _global_resource_mapping[dataset_name] = json.load(f)
-        logger.info(f'Loaded resource mapping for {dataset_name}')
+        logger.debug(f'Loaded resource mapping for {dataset_name}')
     return _global_resource_mapping[dataset_name]
 
 
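This hunk only lowers the log level: the per-dataset resource mapping is loaded routinely during evaluation, so the message is demoted from info to debug. A minimal sketch of the cached loader around it, assuming a module-level cache and a JSON file location (both assumptions; only the cache access and the changed logger call come from the hunk):

# Sketch only: file path and cache initialization are assumptions.
import json
import logging
import os

logger = logging.getLogger(__name__)
_global_resource_mapping: dict[str, dict[str, float]] = {}


def get_resource_mapping(dataset_name: str) -> dict[str, float]:
    if dataset_name not in _global_resource_mapping:
        # Assumed location of the per-dataset mapping file.
        file_path = os.path.join(os.path.dirname(__file__), f'{dataset_name}.json')
        with open(file_path, 'r') as f:
            _global_resource_mapping[dataset_name] = json.load(f)
        logger.debug(f'Loaded resource mapping for {dataset_name}')
    return _global_resource_mapping[dataset_name]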
@@ -121,7 +121,7 @@ Be thorough in your exploration, testing, and reasoning. It's fine if your think
     )
 
     if 'image_assets' in instance:
-        assets = instance['image_assets']
+        assets = json.loads(instance['image_assets'])
         assert (
             'problem_statement' in assets
         ), 'problem_statement is required in image_assets'
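The multimodal dataset ships 'image_assets' as a JSON-encoded string, so indexing it directly yields a str and the membership check only looks at substrings; decoding first gives the dict the assertion expects. A small illustration (the sample value is made up):

import json

# Hypothetical row from the multimodal split; the real assets dict carries more keys.
instance = {'image_assets': '{"problem_statement": ["https://example.com/screenshot.png"]}'}

raw = instance['image_assets']                 # str: "'problem_statement' in raw" is a substring test
assets = json.loads(instance['image_assets'])  # dict: key lookup works as intended
assert 'problem_statement' in assets, 'problem_statement is required in image_assets'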
@@ -146,8 +146,8 @@ def get_instance_docker_image(
         # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
         docker_image_prefix = 'docker.io/swebench/'
         repo, name = instance_id.split('__')
-        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'
-        logger.info(f'Using official SWE-Bench image: {image_name}')
+        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
+        logger.debug(f'Using official SWE-Bench image: {image_name}')
         return image_name
     else:
         # OpenHands version of the image
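Two changes here: the image reference is lowercased and the log level drops to debug. Docker rejects uppercase characters in repository names, and SWE-Bench instance IDs can contain uppercase repository names, so the lowercasing avoids invalid-reference errors. A sketch of the naming rule as shown in the hunk (the helper name and example ID are illustrative):

def official_image_name(instance_id: str) -> str:
    repo, name = instance_id.split('__')
    # Normalize because Docker repository names must be lowercase.
    return f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()


print(official_image_name('django__django-11333'))
# swebench/sweb.eval.x86_64.django_1776_django-11333:latest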
@@ -164,10 +164,7 @@ def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
     # We use a different instance image for the each instance of swe-bench eval
-    use_swebench_official_image = bool(
-        ('verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower())
-        and 'swe-gym' not in metadata.dataset.lower()
-    )
+    use_swebench_official_image = 'swe-gym' not in metadata.dataset.lower()
     base_container_image = get_instance_docker_image(
         instance['instance_id'],
         swebench_official_image=use_swebench_official_image,
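Previously the official swebench/* images were only used for the Verified and Lite splits; now they are used for every dataset except SWE-Gym, which also covers the multimodal split. The rule, side by side, as a sketch with `dataset` standing in for metadata.dataset:

def use_official_image(dataset: str) -> bool:
    # Old rule (removed above): only Verified/Lite, and never SWE-Gym.
    # return ('verified' in dataset.lower() or 'lite' in dataset.lower()) \
    #     and 'swe-gym' not in dataset.lower()
    # New rule: everything except SWE-Gym uses the official images.
    return 'swe-gym' not in dataset.lower()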
@@ -334,15 +331,18 @@ def initialize_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
 
-    action = CmdRunAction(command='which python')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0 and 'testbed' in obs.content,
-        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
-    )
+    if 'multimodal' not in metadata.dataset.lower():
+        # Only for non-multimodal datasets, we need to activate the testbed environment for Python
+        # SWE-Bench multimodal datasets are not using the testbed environment
+        action = CmdRunAction(command='which python')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert_and_raise(
+            obs.exit_code == 0 and 'testbed' in obs.content,
+            f'Expected to find python interpreter from testbed, but got: {str(obs)}',
+        )
 
     logger.info('-' * 30)
     logger.info('END Runtime Initialization Fn')
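The 'which python' sanity check assumed a 'testbed' conda environment, which the multimodal images do not provide, so the whole block is now guarded by the dataset name. A condensed sketch of the resulting control flow (CmdRunAction, runtime.run_action and assert_and_raise are taken from the hunk; the helper itself is hypothetical and reuses the file's existing imports):

def check_testbed_python(runtime, metadata) -> None:
    if 'multimodal' in metadata.dataset.lower():
        return  # multimodal images do not use the 'testbed' environment
    action = CmdRunAction(command='which python')
    action.set_hard_timeout(600)
    obs = runtime.run_action(action)
    assert_and_raise(
        obs.exit_code == 0 and 'testbed' in obs.content,
        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
    )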
@@ -761,9 +761,19 @@ if __name__ == '__main__':
         with open(cur_output_file, 'r') as f:
             for line in f:
                 instance = json.loads(line)
-                history = [event_from_dict(event) for event in instance['history']]
-                critic_result = critic.evaluate(history)
-                if not critic_result.success:
+                try:
+                    history = [
+                        event_from_dict(event) for event in instance['history']
+                    ]
+                    critic_result = critic.evaluate(
+                        history, instance['test_result'].get('git_patch', '')
+                    )
+                    if not critic_result.success:
+                        instances_failed.append(instance['instance_id'])
+                except Exception as e:
+                    logger.error(
+                        f'Error loading history for instance {instance["instance_id"]}: {e}'
+                    )
                     instances_failed.append(instance['instance_id'])
         logger.info(
             f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
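Two things change in the retry loop: the critic now receives the instance's git patch (defaulting to an empty string when test_result has none), and a record that cannot be deserialized is counted as failed instead of aborting the whole attempt. A condensed sketch of the per-instance decision (the helper name is hypothetical; event_from_dict and the record layout are as in the hunk):

def instance_failed(instance: dict, critic) -> bool:
    try:
        history = [event_from_dict(event) for event in instance['history']]
        result = critic.evaluate(history, instance['test_result'].get('git_patch', ''))
        return not result.success
    except Exception:
        # Malformed history or missing fields: count the instance as failed
        # so it is re-attempted, rather than crashing the whole sweep.
        return True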
@@ -18,6 +18,7 @@ if [[ -z "$item" ]]; then
     exit 1
 fi
 
+
 WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')
 
 echo "WORKSPACE_NAME: $WORKSPACE_NAME"
@@ -36,5 +37,7 @@ mkdir -p /workspace
 cp -r /testbed /workspace/$WORKSPACE_NAME
 
 # Activate instance-specific environment
-. /opt/miniconda3/etc/profile.d/conda.sh
-conda activate testbed
+if [ -d /opt/miniconda3 ]; then
+    . /opt/miniconda3/etc/profile.d/conda.sh
+    conda activate testbed
+fi
@@ -23,9 +23,11 @@ class CriticResult(BaseModel):
 
 class BaseCritic(abc.ABC):
     """
-    A critic is a function that takes in a list of events and returns a score about the quality of those events.
+    A critic is a function that takes in a list of events, optional git patch, and returns a score about the quality of those events.
     """
 
     @abc.abstractmethod
-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
         pass
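The abstract interface gains an optional git_patch argument defaulting to None, so existing critics keep working unchanged while new ones can inspect the patch. A hypothetical subclass against the widened signature (imports as in the file above; this class is an illustration, not part of the commit):

class NonEmptyPatchCritic(BaseCritic):
    """Illustrative critic: fail any trajectory whose final patch is empty."""

    def evaluate(
        self, events: list[Event], git_patch: str | None = None
    ) -> CriticResult:
        if git_patch is not None and not git_patch.strip():
            return CriticResult(score=0, message='Git patch is empty.')
        return CriticResult(score=1, message='Patch present (or not checked).')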
@@ -5,16 +5,21 @@ from openhands.events.action import Action, AgentFinishAction
 
 class AgentFinishedCritic(BaseCritic):
     """This is a simple rule-based critic that checks if the last event is an AgentFinishAction.
 
     If not, it will return a score of 0 and a message indicating that the agent did not finish.
+    If the git patch is provided and is empty, it will return a score of 0 and a message indicating that the git patch is empty.
     """
 
     def __init__(self):
        pass
 
-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
         last_action = next((h for h in reversed(events) if isinstance(h, Action)), None)
 
+        if git_patch is not None and len(git_patch.strip()) == 0:
+            return CriticResult(score=0, message='Git patch is empty.')
+
         if isinstance(last_action, AgentFinishAction):
             return CriticResult(score=1, message='Agent finished.')
         else:
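With the empty-patch check placed before the AgentFinishAction check, a run that "finishes" without producing any diff is now scored as a failure. A hypothetical call site, assuming the history is replayed into a list of events and the patch comes from the instance's test_result:

critic = AgentFinishedCritic()
result = critic.evaluate([], git_patch='')  # empty history, empty patch
print(result.score, result.message)         # 0 Git patch is empty.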