Mirror of https://github.com/OpenHands/OpenHands.git (synced 2025-12-26 05:48:36 +08:00)
[Evaluation] Simplify eval & multi-processing related fixes (#2810)
* Initialize the agent inside `process_instance_fn`
* Remove the dependency on `config.max_iterations`
* Switch back to including only the LLM config in the metadata
parent a47713ecb0
commit f6dc89b41a
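The core pattern behind these changes: each evaluation worker now constructs its own Agent from the EvalMetadata it receives, and the iteration budget comes from metadata.max_iterations rather than config.max_iterations. A minimal sketch of that shape follows; it is illustrative only, the instruction building is invented, and the extra keyword arguments the benchmark scripts pass to run_agent_controller (such as fake_user_response_fn, sandbox, sid) are elided.

# Illustrative sketch only -- not copied from any one benchmark script.
from evaluation.utils.shared import EvalMetadata
from opendevin.controller.agent import Agent
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM


def process_instance(instance, metadata: EvalMetadata, reset_logger: bool = True):
    # Build the Agent inside the worker process, so nothing unpicklable has to
    # cross the process boundary; only the small metadata model does.
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
    instruction = f'Please solve this task: {instance}'  # hypothetical prompt
    # The real scripts also pass fake_user_response_fn / sandbox / sid and may
    # wrap this call for asyncio; those details are omitted here.
    state = run_agent_controller(
        agent,
        instruction,
        max_iterations=metadata.max_iterations,  # from metadata, not config
    )
    return state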
@@ -134,6 +134,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@@ -201,17 +202,15 @@ if __name__ == '__main__':
)
args, _ = parser.parse_known_args()

- if args.llm_config:
- specified_llm_config = get_llm_config_arg(args.llm_config)
- if specified_llm_config:
- config.llm = specified_llm_config
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')

eda_dataset = load_dataset(
'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
)

metadata = make_metadata(
- config.llm,
+ llm_config,
f'eda-{args.dataset}',
args.agent_cls,
args.max_iterations,
@@ -22,7 +22,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
+ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -116,6 +116,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=FAKE_RESPONSES[agent.__class__.__name__],
sandbox=sandbox,
sid=inst_id,
@@ -216,7 +217,10 @@ if __name__ == '__main__':
args = parse_arguments()
dataset = load_dataset('iFurySt/AgentBench')
agent_bench_tests = dataset['osbench'].to_pandas()
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
metadata = make_metadata(
llm_config,
args.dataset_name,
@@ -227,7 +231,7 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
+
run_evaluation(
instances,
metadata,
@@ -20,7 +20,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
+ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -181,6 +181,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@@ -220,7 +221,10 @@ if __name__ == '__main__':
args = parse_arguments()
dataset = load_dataset('lilbillbiscuit/biocoder_public')
biocoder_tests = dataset['test'].to_pandas()
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
metadata = make_metadata(
llm_config,
args.dataset_name,
@@ -231,7 +235,7 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
+
run_evaluation(
instances,
metadata,
@@ -22,7 +22,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
+ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -228,6 +228,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@@ -393,7 +394,10 @@ if __name__ == '__main__':
args = parse_arguments()
bird_dataset = load_bird()
dataset = bird_dataset['test'].to_pandas()
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
metadata = make_metadata(
llm_config,
args.dataset_name,
@@ -404,7 +408,7 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
+
run_evaluation(
instances,
metadata,
@@ -21,7 +21,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import config, get_parser
+ from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -121,6 +121,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@@ -199,8 +200,11 @@ if __name__ == '__main__':
config.workspace_base = os.path.abspath(args.directory)
logger.info(f'Setting workspace base to {config.workspace_base}')

+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
metadata = make_metadata(
- llm_config=config.llm,
+ llm_config=llm_config,
dataset_name='gaia',
agent_class=args.agent_cls,
max_iterations=args.max_iterations,
@@ -116,6 +116,7 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
@@ -37,7 +37,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import config, get_parser
+ from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -200,6 +200,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
@@ -266,6 +267,9 @@ if __name__ == '__main__':
)
args, _ = parser.parse_known_args()

+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
dataset = load_dataset('Idavidrein/gpqa', args.data_split)
@@ -279,7 +283,7 @@ if __name__ == '__main__':
# gpqa_dataset = dataset['train'].to_pandas().sort_values(by='id').reset_index(drop=True)

metadata = make_metadata(
- llm_config=config.llm,
+ llm_config=llm_config,
dataset_name='gpqa',
agent_class=args.agent_cls,
max_iterations=args.max_iterations,
@@ -293,8 +297,6 @@ if __name__ == '__main__':
gpqa_dataset, output_file, args.eval_n_limit, 'task_id'
)

- agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
-
run_evaluation(
dataset=prepared_dataset,
metadata=metadata,
@@ -28,7 +28,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
+ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -185,6 +185,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
@@ -234,7 +235,10 @@ if __name__ == '__main__':
hefix_tests = dataset['test'].to_pandas()

id_column = 'task_id'
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
metadata = make_metadata(
llm_config,
args.dataset_name,
@@ -245,7 +249,7 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
+
run_evaluation(
instances,
metadata,
@@ -18,7 +18,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
+ from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -182,6 +182,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
@@ -271,7 +272,9 @@ if __name__ == '__main__':
logic_reasoning_tests = dataset[data_split]

id_column = 'id'
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
metadata = make_metadata(
llm_config,
args.dataset_name,
@@ -282,7 +285,6 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
instances,
metadata,
@@ -15,7 +15,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, get_llm_config_arg, parse_arguments
+ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -81,6 +81,7 @@ def process_instance(
run_agent_controller(
agent,
'PLACEHOLDER_GOAL',
+ max_iterations=metadata.max_iterations,
runtime_tools_config=runtime_tools_config,
sandbox=get_sandbox(),
sid=env_id,
@@ -139,7 +140,9 @@ if __name__ == '__main__':
)

id_column = 'id'
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
metadata = make_metadata(
llm_config,
args.dataset_name,
@@ -150,7 +153,6 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
_ = get_sandbox() # Initialize the sandbox
run_evaluation(
instances,
@@ -17,7 +17,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
+ from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -65,11 +65,11 @@ AGENT_CLS_TO_INST_SUFFIX = {


def process_instance(
- agent: Agent,
instance: Any,
metadata: EvalMetadata,
reset_logger: bool = True,
):
+ agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
# create process-specific workspace dir
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
@@ -145,6 +145,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=fake_user_response_fn,
sandbox=sandbox,
sid=sid,
@@ -209,7 +210,9 @@ if __name__ == '__main__':
mint_tests = mint_dataset.to_pandas()

id_column = 'id'
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
metadata = make_metadata(
llm_config,
args.dataset_name,
@@ -221,9 +224,7 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(mint_dataset, output_file, args.eval_n_limit, id_column)
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
- agent,
instances,
metadata,
output_file,
@@ -32,7 +32,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
+ from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -68,9 +68,8 @@ ID2CONDA = {
}


- def process_instance(
- agent: Agent, instance: Any, metadata: EvalMetadata, reset_logger: bool = True
- ):
+ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
+ agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base
try:
@@ -154,6 +153,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
@@ -242,7 +242,9 @@ if __name__ == '__main__':
ml_bench = load_dataset('super-dainiu/ml-bench', split=data_split).to_pandas()

id_column = 'instance_id'
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
metadata = make_metadata(
llm_config,
args.dataset_name,
@@ -253,9 +255,8 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(ml_bench, output_file, args.eval_n_limit, id_column)
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
+
run_evaluation(
- agent,
instances,
metadata,
output_file,
@@ -21,7 +21,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
+ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -176,7 +176,9 @@ def process_instance(
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))

workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
workspace_mount_path = os.path.join(
metadata.config.workspace_mount_path, '_eval_workspace'
)
# create process-specific workspace dir
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
@@ -283,6 +285,7 @@ IMPORTANT TIPS:
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@@ -354,7 +357,8 @@ if __name__ == '__main__':
swe_bench_tests = filter_dataset(dataset['test'].to_pandas(), 'instance_id')

id_column = 'instance_id'
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')

details = {}
_agent_cls = agenthub.Agent.get_cls(args.agent_cls)
@@ -367,7 +371,6 @@ if __name__ == '__main__':
llm_config,
'swe-bench-lite',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
details=details,
@@ -16,7 +16,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
+ from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -36,9 +36,8 @@ AGENT_CLS_TO_INST_SUFFIX = {
}


- def process_instance(
- agent: Agent, instance: Any, metadata: EvalMetadata, reset_logger: bool = True
- ):
+ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
+ agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
# create process-specific workspace dir
# we will create a workspace directory for EACH process
# so that different agent don't interfere with each other.
@@ -83,6 +82,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
+ max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@@ -143,6 +143,8 @@ if __name__ == '__main__':
default='YOUR_WOLFRAMALPHA_APPID',
)
args, _ = parser.parse_known_args()
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')

dataset = ''
hardness = ''
@@ -172,20 +174,16 @@ if __name__ == '__main__':
toolqa_tool_path = download_tools(workspace_mount_path, args.wolfram_alpha_appid)

id_column = 'qid'
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
metadata = make_metadata(
llm_config,
f'toolqa-{args.dataset}-{args.hardness}',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(toolqa_test, output_file, args.eval_n_limit, id_column)
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
- agent,
instances,
metadata,
output_file,
@@ -12,12 +12,10 @@ import pandas as pd
from pydantic import BaseModel
from tqdm import tqdm

- from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig
from opendevin.events.action import Action
from opendevin.events.action.message import MessageAction
- from opendevin.llm.llm import LLM


class EvalMetadata(BaseModel):
@@ -166,10 +164,9 @@ def run_evaluation(
process_instance_func: Callable[[pd.Series, EvalMetadata, bool], Any],
id_column: str,
):
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
logger.info(
- f'Evaluation started with Agent {agent.__class__.name}, '
- f'model {agent.llm.model_name}, max iterations {metadata.max_iterations}.'
+ f'Evaluation started with Agent {metadata.agent_class}, '
+ f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.'
)
pbar = tqdm(total=len(dataset))
output_fp = open(output_file, 'a')
@@ -15,7 +15,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.config import LLMConfig, get_llm_config_arg, parse_arguments
+ from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@@ -82,6 +82,7 @@ def process_instance(
run_agent_controller(
agent,
'PLACEHOLDER_GOAL',
+ max_iterations=metadata.max_iterations,
runtime_tools_config=runtime_tools_config,
sandbox=get_sandbox(),
sid=env_id,
@@ -144,7 +145,9 @@ if __name__ == '__main__':
)

id_column = 'id'
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+ logger.info(f'Config for evaluation: {config}')
+
metadata = make_metadata(
llm_config,
args.dataset_name,
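For the run_evaluation hunk above (the shared evaluation utilities), here is a hedged sketch of how a callable-based driver can fan instances out to worker processes. It is not the repository's implementation; the function name run_evaluation_sketch and the num_workers parameter are assumptions. The point it illustrates is that a module-level process_instance_func plus a small EvalMetadata object are picklable, so no Agent ever has to be shipped across the process boundary.

# Hedged sketch, not the repository's run_evaluation.
import json
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Any, Callable

import pandas as pd
from tqdm import tqdm


def run_evaluation_sketch(
    dataset: pd.DataFrame,
    metadata: Any,  # EvalMetadata in the real code
    output_file: str,
    num_workers: int,
    process_instance_func: Callable[[pd.Series, Any, bool], Any],
) -> None:
    pbar = tqdm(total=len(dataset))
    with open(output_file, 'a') as output_fp, ProcessPoolExecutor(num_workers) as pool:
        # Each submitted call rebuilds its own Agent inside the worker process.
        futures = [
            pool.submit(process_instance_func, instance, metadata, True)
            for _, instance in dataset.iterrows()
        ]
        for future in as_completed(futures):
            # Assumes the worker returns a JSON-serialisable record.
            output_fp.write(json.dumps(future.result(), default=str) + '\n')
            pbar.update(1)
    pbar.close()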