From f6dc89b41a0cbaf5e05bd8bfddea388657b488ab Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Sat, 6 Jul 2024 07:18:46 +0800
Subject: [PATCH] [Evaluation] Simplify eval & and multi-processing related
 fixes (#2810)

* initialize agent inside process_instance_fn;
* remove dependency on `config.max_iterations`
* switch back to only include llm config to metadata
---
 evaluation/EDA/run_infer.py             |  9 ++++-----
 evaluation/agent_bench/run_infer.py     | 10 +++++++---
 evaluation/biocoder/run_infer.py        | 10 +++++++---
 evaluation/bird/run_infer.py            | 10 +++++++---
 evaluation/gaia/run_infer.py            |  8 ++++++--
 evaluation/gorilla/run_infer.py         |  1 +
 evaluation/gpqa/run_infer.py            | 10 ++++++----
 evaluation/humanevalfix/run_infer.py    | 10 +++++++---
 evaluation/logic_reasoning/run_infer.py |  8 +++++---
 evaluation/miniwob/run_infer.py         |  8 +++++---
 evaluation/mint/run_infer.py            | 11 ++++++-----
 evaluation/ml_bench/run_infer.py        | 15 ++++++++-------
 evaluation/swe_bench/run_infer.py       | 11 +++++++----
 evaluation/toolqa/run_infer.py          | 14 ++++++--------
 evaluation/utils/shared.py              |  7 ++-----
 evaluation/webarena/run_infer.py        |  7 +++++--
 16 files changed, 89 insertions(+), 60 deletions(-)

diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
index bf106f30ce..7bf8d62aa8 100644
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -134,6 +134,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
             agent.__class__.__name__
         ],
@@ -201,17 +202,15 @@ if __name__ == '__main__':
     )
     args, _ = parser.parse_known_args()
 
-    if args.llm_config:
-        specified_llm_config = get_llm_config_arg(args.llm_config)
-        if specified_llm_config:
-            config.llm = specified_llm_config
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
 
     eda_dataset = load_dataset(
         'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
     )
 
     metadata = make_metadata(
-        config.llm,
+        llm_config,
         f'eda-{args.dataset}',
         args.agent_cls,
         args.max_iterations,
diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py
index 60d2952df5..6c6a709cef 100644
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -22,7 +22,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -116,6 +116,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=FAKE_RESPONSES[agent.__class__.__name__],
         sandbox=sandbox,
         sid=inst_id,
@@ -216,7 +217,10 @@ if __name__ == '__main__':
     args = parse_arguments()
     dataset = load_dataset('iFurySt/AgentBench')
     agent_bench_tests = dataset['osbench'].to_pandas()
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     metadata = make_metadata(
         llm_config,
         args.dataset_name,
@@ -227,7 +231,7 @@ if __name__ == '__main__':
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
     instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
+
     run_evaluation(
         instances,
         metadata,
diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py
index 05be46a70b..47bb03eb0d 100644
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -20,7 +20,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -181,6 +181,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
             agent.__class__.__name__
         ],
@@ -220,7 +221,10 @@ if __name__ == '__main__':
     args = parse_arguments()
     dataset = load_dataset('lilbillbiscuit/biocoder_public')
     biocoder_tests = dataset['test'].to_pandas()
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     metadata = make_metadata(
         llm_config,
         args.dataset_name,
@@ -231,7 +235,7 @@ if __name__ == '__main__':
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
     instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
+
     run_evaluation(
         instances,
         metadata,
diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py
index 864329b102..8aa2291b55 100644
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -22,7 +22,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -228,6 +228,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
             agent.__class__.__name__
         ],
@@ -393,7 +394,10 @@ if __name__ == '__main__':
     args = parse_arguments()
     bird_dataset = load_bird()
     dataset = bird_dataset['test'].to_pandas()
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     metadata = make_metadata(
         llm_config,
         args.dataset_name,
@@ -404,7 +408,7 @@ if __name__ == '__main__':
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
     instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
+
     run_evaluation(
         instances,
         metadata,
diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py
index 72bc10c27d..51131107ea 100644
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -21,7 +21,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import config, get_parser
+from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -121,6 +121,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
             agent.__class__.__name__
         ],
@@ -199,8 +200,11 @@ if __name__ == '__main__':
         config.workspace_base = os.path.abspath(args.directory)
         logger.info(f'Setting workspace base to {config.workspace_base}')
 
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     metadata = make_metadata(
-        llm_config=config.llm,
+        llm_config=llm_config,
         dataset_name='gaia',
         agent_class=args.agent_cls,
         max_iterations=args.max_iterations,
diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
index c01b5e890f..7adb0bb3ff 100644
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -116,6 +116,7 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
             agent.__class__.__name__
         ),
diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py
index 9590b156f9..32a89d2662 100644
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -37,7 +37,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import config, get_parser
+from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -200,6 +200,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
             agent.__class__.__name__
         ),
@@ -266,6 +267,9 @@ if __name__ == '__main__':
     )
     args, _ = parser.parse_known_args()
 
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
     # so we don't need to manage file uploading to OpenDevin's repo
     dataset = load_dataset('Idavidrein/gpqa', args.data_split)
@@ -279,7 +283,7 @@ if __name__ == '__main__':
     # gpqa_dataset = dataset['train'].to_pandas().sort_values(by='id').reset_index(drop=True)
 
     metadata = make_metadata(
-        llm_config=config.llm,
+        llm_config=llm_config,
         dataset_name='gpqa',
         agent_class=args.agent_cls,
         max_iterations=args.max_iterations,
@@ -293,8 +297,6 @@ if __name__ == '__main__':
         gpqa_dataset, output_file, args.eval_n_limit, 'task_id'
     )
 
-    agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
-
     run_evaluation(
         dataset=prepared_dataset,
         metadata=metadata,
diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/humanevalfix/run_infer.py
index fa1de3edf0..40fcfecc8e 100644
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -28,7 +28,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -185,6 +185,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
             agent.__class__.__name__
         ),
@@ -234,7 +235,10 @@ if __name__ == '__main__':
     hefix_tests = dataset['test'].to_pandas()
     id_column = 'task_id'
 
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     metadata = make_metadata(
         llm_config,
         args.dataset_name,
@@ -245,7 +249,7 @@ if __name__ == '__main__':
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
     instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
+
     run_evaluation(
         instances,
         metadata,
diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py
index 33385f38b5..0febf0b4a5 100644
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -18,7 +18,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
+from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -182,6 +182,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
             agent.__class__.__name__
         ),
@@ -271,7 +272,9 @@ if __name__ == '__main__':
     logic_reasoning_tests = dataset[data_split]
     id_column = 'id'
 
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     metadata = make_metadata(
         llm_config,
         args.dataset_name,
@@ -282,7 +285,6 @@ if __name__ == '__main__':
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
     instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
     run_evaluation(
         instances,
         metadata,
diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
index fc84fe3b70..d15d597560 100644
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -15,7 +15,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, get_llm_config_arg, parse_arguments
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -81,6 +81,7 @@ def process_instance(
     run_agent_controller(
         agent,
         'PLACEHOLDER_GOAL',
+        max_iterations=metadata.max_iterations,
         runtime_tools_config=runtime_tools_config,
         sandbox=get_sandbox(),
         sid=env_id,
@@ -139,7 +140,9 @@ if __name__ == '__main__':
     )
     id_column = 'id'
 
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     metadata = make_metadata(
         llm_config,
         args.dataset_name,
@@ -150,7 +153,6 @@ if __name__ == '__main__':
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
     instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
     _ = get_sandbox()  # Initialize the sandbox
     run_evaluation(
         instances,
diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py
index e953b9c0f4..bc8b5b6d34 100644
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -17,7 +17,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
+from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -65,11 +65,11 @@ AGENT_CLS_TO_INST_SUFFIX = {
 
 
 def process_instance(
-    agent: Agent,
     instance: Any,
     metadata: EvalMetadata,
     reset_logger: bool = True,
 ):
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
     workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
     # create process-specific workspace dir
     workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
@@ -145,6 +145,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=fake_user_response_fn,
         sandbox=sandbox,
         sid=sid,
@@ -209,7 +210,9 @@ if __name__ == '__main__':
     mint_tests = mint_dataset.to_pandas()
     id_column = 'id'
 
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     metadata = make_metadata(
         llm_config,
         args.dataset_name,
@@ -221,9 +224,7 @@ if __name__ == '__main__':
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
     instances = prepare_dataset(mint_dataset, output_file, args.eval_n_limit, id_column)
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
     run_evaluation(
-        agent,
         instances,
         metadata,
         output_file,
diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py
index b6588c4fd7..c20ade226c 100644
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -32,7 +32,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
+from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -68,9 +68,8 @@ ID2CONDA = {
 }
 
 
-def process_instance(
-    agent: Agent, instance: Any, metadata: EvalMetadata, reset_logger: bool = True
-):
+def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
     old_workspace_mount_path = config.workspace_mount_path
     old_workspace_base = config.workspace_base
     try:
@@ -154,6 +153,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
             agent.__class__.__name__
         ),
@@ -242,7 +242,9 @@ if __name__ == '__main__':
     ml_bench = load_dataset('super-dainiu/ml-bench', split=data_split).to_pandas()
     id_column = 'instance_id'
 
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     metadata = make_metadata(
         llm_config,
         args.dataset_name,
@@ -253,9 +255,8 @@ if __name__ == '__main__':
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
     instances = prepare_dataset(ml_bench, output_file, args.eval_n_limit, id_column)
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
+
     run_evaluation(
-        agent,
         instances,
         metadata,
         output_file,
diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index 02c833456e..2c8b56e2f8 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -21,7 +21,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -176,7 +176,9 @@ def process_instance(
     # Create the agent
     agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
 
-    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
+    workspace_mount_path = os.path.join(
+        metadata.config.workspace_mount_path, '_eval_workspace'
+    )
     # create process-specific workspace dir
     workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
     pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
@@ -283,6 +285,7 @@ IMPORTANT TIPS:
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
             agent.__class__.__name__
         ],
@@ -354,7 +357,8 @@ if __name__ == '__main__':
     swe_bench_tests = filter_dataset(dataset['test'].to_pandas(), 'instance_id')
     id_column = 'instance_id'
 
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
 
     details = {}
     _agent_cls = agenthub.Agent.get_cls(args.agent_cls)
@@ -367,7 +371,6 @@ if __name__ == '__main__':
         llm_config,
         'swe-bench-lite',
         args.agent_cls,
-        args.max_iterations,
         args.eval_note,
         args.eval_output_dir,
         details=details,
diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py
index a35b48a03a..9146ed2373 100644
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -16,7 +16,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
+from opendevin.core.config import config, get_llm_config_arg, get_parser
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -36,9 +36,8 @@ AGENT_CLS_TO_INST_SUFFIX = {
 }
 
 
-def process_instance(
-    agent: Agent, instance: Any, metadata: EvalMetadata, reset_logger: bool = True
-):
+def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
     # create process-specific workspace dir
     # we will create a workspace directory for EACH process
     # so that different agent don't interfere with each other.
@@ -83,6 +82,7 @@ def process_instance(
     run_agent_controller(
         agent,
         instruction,
+        max_iterations=metadata.max_iterations,
         fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
             agent.__class__.__name__
         ],
@@ -143,6 +143,8 @@ if __name__ == '__main__':
         default='YOUR_WOLFRAMALPHA_APPID',
     )
     args, _ = parser.parse_known_args()
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
 
     dataset = ''
     hardness = ''
@@ -172,20 +174,16 @@ if __name__ == '__main__':
     toolqa_tool_path = download_tools(workspace_mount_path, args.wolfram_alpha_appid)
     id_column = 'qid'
 
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
     metadata = make_metadata(
         llm_config,
         f'toolqa-{args.dataset}-{args.hardness}',
         args.agent_cls,
-        args.max_iterations,
         args.eval_note,
         args.eval_output_dir,
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
     instances = prepare_dataset(toolqa_test, output_file, args.eval_n_limit, id_column)
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
     run_evaluation(
-        agent,
         instances,
         metadata,
         output_file,
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index 42413d14ff..0b6c36d13b 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -12,12 +12,10 @@ import pandas as pd
 from pydantic import BaseModel
 from tqdm import tqdm
 
-from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
 from opendevin.core.config import LLMConfig
 from opendevin.events.action import Action
 from opendevin.events.action.message import MessageAction
-from opendevin.llm.llm import LLM
 
 
 class EvalMetadata(BaseModel):
@@ -166,10 +164,9 @@ def run_evaluation(
     process_instance_func: Callable[[pd.Series, EvalMetadata, bool], Any],
     id_column: str,
 ):
-    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
     logger.info(
-        f'Evaluation started with Agent {agent.__class__.name}, '
-        f'model {agent.llm.model_name}, max iterations {metadata.max_iterations}.'
+        f'Evaluation started with Agent {metadata.agent_class}, '
+        f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.'
     )
     pbar = tqdm(total=len(dataset))
     output_fp = open(output_file, 'a')
diff --git a/evaluation/webarena/run_infer.py b/evaluation/webarena/run_infer.py
index 90e5c187dc..b7130e3f8e 100644
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -15,7 +15,7 @@ from evaluation.utils.shared import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.config import LLMConfig, get_llm_config_arg, parse_arguments
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
@@ -82,6 +82,7 @@ def process_instance(
     run_agent_controller(
         agent,
         'PLACEHOLDER_GOAL',
+        max_iterations=metadata.max_iterations,
         runtime_tools_config=runtime_tools_config,
         sandbox=get_sandbox(),
         sid=env_id,
@@ -144,7 +145,9 @@ if __name__ == '__main__':
     )
     id_column = 'id'
 
-    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
     metadata = make_metadata(
         llm_config,
         args.dataset_name,
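The patch repeats one pattern across every benchmark runner: `run_evaluation` no longer receives a pre-built agent; instead each `process_instance` call constructs its own agent from the `EvalMetadata` it is handed, and the iteration budget travels through `metadata.max_iterations` rather than through the global `config`. The sketch below is only an illustration of that pattern under stated assumptions: the `EvalMetadata`, `LLMConfig`, `Agent`, and `CodeActAgent` classes are simplified stand-ins for the real OpenDevin types, and the `multiprocessing.Pool` wiring is an assumed stand-in for the project's actual worker pool.

```python
# Illustrative sketch only -- these classes are simplified stand-ins for the
# OpenDevin Agent / LLM / EvalMetadata types, not the real implementations.
from dataclasses import dataclass
from multiprocessing import Pool


@dataclass
class LLMConfig:
    model: str = 'gpt-4o'  # placeholder model name


@dataclass
class EvalMetadata:
    agent_class: str
    llm_config: LLMConfig
    max_iterations: int


class Agent:
    registry: dict = {}

    def __init__(self, llm_config: LLMConfig):
        self.llm_config = llm_config

    @classmethod
    def get_cls(cls, name: str):
        return cls.registry[name]


class CodeActAgent(Agent):
    pass


Agent.registry['CodeActAgent'] = CodeActAgent


def process_instance(instance: str, metadata: EvalMetadata) -> str:
    # The agent is built *inside* the worker, so only the small, picklable
    # metadata object has to cross the process boundary.
    agent = Agent.get_cls(metadata.agent_class)(llm_config=metadata.llm_config)
    # The iteration budget comes from metadata, not from a global config.
    return (
        f'{instance}: {type(agent).__name__} on {metadata.llm_config.model}, '
        f'max {metadata.max_iterations} iterations'
    )


def run_evaluation(instances, metadata: EvalMetadata, num_workers: int = 2):
    # Each worker re-creates its own agent via process_instance.
    with Pool(num_workers) as pool:
        return pool.starmap(process_instance, [(i, metadata) for i in instances])


if __name__ == '__main__':
    metadata = EvalMetadata(
        agent_class='CodeActAgent', llm_config=LLMConfig(), max_iterations=10
    )
    for line in run_evaluation(['task-1', 'task-2'], metadata):
        print(line)
```

Keeping agent construction inside the worker is likely what makes the multi-processing path robust here: only plain metadata is pickled and sent to child processes, while anything hard to serialize (LLM clients, sandboxes) is created fresh in each worker.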