Two fixes to swe bench eval (#2831)

* Two fixes to swe bench eval

* Add error message

* Change dumping of metadata
This commit is contained in:
Graham Neubig 2024-07-07 16:21:50 +09:00 committed by GitHub
parent 3a3694ca17
commit d0384cafdd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 19 additions and 18 deletions

View File

@@ -163,7 +163,7 @@ def process_instance(
'instance_id': instance['text'].strip(),
'instance': instance,
'instruction': instruction,
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],

View File

@@ -189,7 +189,7 @@ def process_instance(
'instance_id': inst_id,
'instance': instance.to_dict(),
'instruction': instruction,
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': histories,
'metrics': metrics,
'error': state.last_error if state and state.last_error else None,

View File

@@ -202,7 +202,7 @@ def process_instance(
'biocoder_instance': instance.to_dict(),
'instruction': instruction,
'generated': test_result['metadata']['1_copy_change_code'],
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],

View File

@@ -249,7 +249,7 @@ def process_instance(
output = {
'task_id': instance.task_id,
'instruction': instruction,
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],

View File

@@ -171,7 +171,7 @@ def process_instance(
'instance_id': instance['task_id'],
'instance': instance,
'instruction': instance['Question'],
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs))
for action, obs in state.history

View File

@@ -150,7 +150,7 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
'hallucination': hallucination,
'answer_id': 'None',
'model_id': metadata['model_name'],
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs))
for action, obs in state.history

View File

@@ -236,7 +236,7 @@ def process_instance(
'task_id': instance.task_id,
'instance_id': instance.instance_id,
'instruction': instruction,
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs))
for action, obs in state.history

View File

@@ -206,7 +206,7 @@ def process_instance(
output = {
'task_id': instance.task_id,
'instruction': instruction,
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs))
for action, obs in state.history

View File

@@ -222,7 +222,7 @@ def process_instance(
'id': instance['id'],
'instance': instance,
'instruction': instruction,
# 'metadata': metadata,
# 'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs))
for action, obs in state.history

View File

@@ -114,7 +114,7 @@ def process_instance(
output = {
'instance_id': env_id,
'instruction': instruction,
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],

View File

@@ -167,7 +167,7 @@ def process_instance(
'id': instance.task_id,
'instance': instance.to_dict(),
'instruction': instruction,
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],

View File

@@ -200,7 +200,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
'instance_id': instance['id'],
'repo': repo_url,
'instruction': instruction,
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs))
for action, obs in state.history

View File

@@ -176,9 +176,7 @@ def process_instance(
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
workspace_mount_path = os.path.join(
metadata.config.workspace_mount_path, '_eval_workspace'
)
workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
# create process-specific workspace dir
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
@@ -318,7 +316,7 @@ IMPORTANT TIPS:
'swe_instance': instance.to_dict(), # SWE Bench specific
'instruction': instruction,
'git_patch': git_patch, # SWE Bench specific
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],
@@ -358,6 +356,8 @@ if __name__ == '__main__':
id_column = 'instance_id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
if args.llm_config and llm_config is None:
raise ValueError(f'Could not find LLM config {args.llm_config}')
logger.info(f'Config for evaluation: {config}')
details = {}
@@ -371,6 +371,7 @@ if __name__ == '__main__':
llm_config,
'swe-bench-lite',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
details=details,

View File

@@ -112,7 +112,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
'correct': correct,
'answer_id': 'None',
'model_id': metadata.model_name,
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],

View File

@@ -115,7 +115,7 @@ def process_instance(
output = {
'instance_id': env_id,
'instruction': instruction,
'metadata': metadata,
'metadata': metadata.model_dump(),
'history': [
(event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
],