mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
Two fixes to swe bench eval (#2831)
* Two fixes to swe bench eval
* Add error message
* Change dumping of metadata
This commit is contained in:
parent
3a3694ca17
commit
d0384cafdd
@ -163,7 +163,7 @@ def process_instance(
|
||||
'instance_id': instance['text'].strip(),
|
||||
'instance': instance,
|
||||
'instruction': instruction,
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
],
|
||||
|
||||
@ -189,7 +189,7 @@ def process_instance(
|
||||
'instance_id': inst_id,
|
||||
'instance': instance.to_dict(),
|
||||
'instruction': instruction,
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': histories,
|
||||
'metrics': metrics,
|
||||
'error': state.last_error if state and state.last_error else None,
|
||||
|
||||
@ -202,7 +202,7 @@ def process_instance(
|
||||
'biocoder_instance': instance.to_dict(),
|
||||
'instruction': instruction,
|
||||
'generated': test_result['metadata']['1_copy_change_code'],
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
],
|
||||
|
||||
@ -249,7 +249,7 @@ def process_instance(
|
||||
output = {
|
||||
'task_id': instance.task_id,
|
||||
'instruction': instruction,
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
],
|
||||
|
||||
@ -171,7 +171,7 @@ def process_instance(
|
||||
'instance_id': instance['task_id'],
|
||||
'instance': instance,
|
||||
'instruction': instance['Question'],
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs))
|
||||
for action, obs in state.history
|
||||
|
||||
@ -150,7 +150,7 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
|
||||
'hallucination': hallucination,
|
||||
'answer_id': 'None',
|
||||
'model_id': metadata['model_name'],
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs))
|
||||
for action, obs in state.history
|
||||
|
||||
@ -236,7 +236,7 @@ def process_instance(
|
||||
'task_id': instance.task_id,
|
||||
'instance_id': instance.instance_id,
|
||||
'instruction': instruction,
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs))
|
||||
for action, obs in state.history
|
||||
|
||||
@ -206,7 +206,7 @@ def process_instance(
|
||||
output = {
|
||||
'task_id': instance.task_id,
|
||||
'instruction': instruction,
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs))
|
||||
for action, obs in state.history
|
||||
|
||||
@ -222,7 +222,7 @@ def process_instance(
|
||||
'id': instance['id'],
|
||||
'instance': instance,
|
||||
'instruction': instruction,
|
||||
- # 'metadata': metadata,
|
||||
+ # 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs))
|
||||
for action, obs in state.history
|
||||
|
||||
@ -114,7 +114,7 @@ def process_instance(
|
||||
output = {
|
||||
'instance_id': env_id,
|
||||
'instruction': instruction,
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
],
|
||||
|
||||
@ -167,7 +167,7 @@ def process_instance(
|
||||
'id': instance.task_id,
|
||||
'instance': instance.to_dict(),
|
||||
'instruction': instruction,
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
],
|
||||
|
||||
@ -200,7 +200,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
|
||||
'instance_id': instance['id'],
|
||||
'repo': repo_url,
|
||||
'instruction': instruction,
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs))
|
||||
for action, obs in state.history
|
||||
|
||||
@ -176,9 +176,7 @@ def process_instance(
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
||||
|
||||
- workspace_mount_path = os.path.join(
|
||||
- metadata.config.workspace_mount_path, '_eval_workspace'
|
||||
- )
|
||||
+ workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
|
||||
# create process-specific workspace dir
|
||||
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
|
||||
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
|
||||
@ -318,7 +316,7 @@ IMPORTANT TIPS:
|
||||
'swe_instance': instance.to_dict(), # SWE Bench specific
|
||||
'instruction': instruction,
|
||||
'git_patch': git_patch, # SWE Bench specific
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
],
|
||||
@ -358,6 +356,8 @@ if __name__ == '__main__':
|
||||
|
||||
id_column = 'instance_id'
|
||||
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
|
||||
+ if args.llm_config and llm_config is None:
|
||||
+ raise ValueError(f'Could not find LLM config {args.llm_config}')
|
||||
logger.info(f'Config for evaluation: {config}')
|
||||
|
||||
details = {}
|
||||
@ -371,6 +371,7 @@ if __name__ == '__main__':
|
||||
llm_config,
|
||||
'swe-bench-lite',
|
||||
args.agent_cls,
|
||||
+ args.max_iterations,
|
||||
args.eval_note,
|
||||
args.eval_output_dir,
|
||||
details=details,
|
||||
|
||||
@ -112,7 +112,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
|
||||
'correct': correct,
|
||||
'answer_id': 'None',
|
||||
'model_id': metadata.model_name,
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
],
|
||||
|
||||
@ -115,7 +115,7 @@ def process_instance(
|
||||
output = {
|
||||
'instance_id': env_id,
|
||||
'instruction': instruction,
|
||||
- 'metadata': metadata,
|
||||
+ 'metadata': metadata.model_dump(),
|
||||
'history': [
|
||||
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
|
||||
],
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user