[Evaluation] Simplify eval and multi-processing related fixes (#2810)

* initialize agent inside process_instance_fn

* remove dependency on `config.max_iterations`

* switch back to only including the LLM config in metadata
Xingyao Wang 2024-07-06 07:18:46 +08:00 committed by GitHub
parent a47713ecb0
commit f6dc89b41a
16 changed files with 89 additions and 60 deletions
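
The per-benchmark diffs that follow all apply the same three-part pattern. Below is a minimal sketch of that pattern, condensed from the hunks rather than copied from any single file: the dataset name and instruction string are placeholders, the `evaluation.utils.shared` imports are assumed from the elided import blocks, and the real `run_agent_controller` calls also pass benchmark-specific arguments (fake_user_response_fn, sandbox, sid) that are omitted here.

from evaluation.utils.shared import EvalMetadata, make_metadata  # assumed exports
from opendevin.controller.agent import Agent
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM


def process_instance(instance, metadata: EvalMetadata, reset_logger: bool = True):
    # (1) The agent is now constructed inside the per-instance function, so only
    #     the picklable EvalMetadata has to cross the process boundary.
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
    instruction = 'benchmark-specific prompt built from `instance`'  # placeholder
    # (2) max_iterations is taken from metadata instead of config.max_iterations.
    return run_agent_controller(
        agent,
        instruction,
        max_iterations=metadata.max_iterations,
        # real callers also pass fake_user_response_fn, sandbox, sid, etc.
    )


if __name__ == '__main__':
    args = parse_arguments()
    # (3) Only the LLM config is stored in metadata; fall back to the global
    #     config's llm section when no --llm-config override is given.
    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    metadata = make_metadata(
        llm_config,
        'some-benchmark',  # placeholder dataset name
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )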

View File

@ -134,6 +134,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@ -201,17 +202,15 @@ if __name__ == '__main__':
)
args, _ = parser.parse_known_args()
if args.llm_config:
specified_llm_config = get_llm_config_arg(args.llm_config)
if specified_llm_config:
config.llm = specified_llm_config
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
eda_dataset = load_dataset(
'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
)
metadata = make_metadata(
config.llm,
llm_config,
f'eda-{args.dataset}',
args.agent_cls,
args.max_iterations,

View File

@ -22,7 +22,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -116,6 +116,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=FAKE_RESPONSES[agent.__class__.__name__],
sandbox=sandbox,
sid=inst_id,
@ -216,7 +217,10 @@ if __name__ == '__main__':
args = parse_arguments()
dataset = load_dataset('iFurySt/AgentBench')
agent_bench_tests = dataset['osbench'].to_pandas()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
@ -227,7 +231,7 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
instances,
metadata,

View File

@ -20,7 +20,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -181,6 +181,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@ -220,7 +221,10 @@ if __name__ == '__main__':
args = parse_arguments()
dataset = load_dataset('lilbillbiscuit/biocoder_public')
biocoder_tests = dataset['test'].to_pandas()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
@ -231,7 +235,7 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
instances,
metadata,

View File

@ -22,7 +22,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -228,6 +228,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@ -393,7 +394,10 @@ if __name__ == '__main__':
args = parse_arguments()
bird_dataset = load_bird()
dataset = bird_dataset['test'].to_pandas()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
@ -404,7 +408,7 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
instances,
metadata,

View File

@ -21,7 +21,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_parser
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -121,6 +121,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@ -199,8 +200,11 @@ if __name__ == '__main__':
config.workspace_base = os.path.abspath(args.directory)
logger.info(f'Setting workspace base to {config.workspace_base}')
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
metadata = make_metadata(
llm_config=config.llm,
llm_config=llm_config,
dataset_name='gaia',
agent_class=args.agent_cls,
max_iterations=args.max_iterations,

View File

@ -116,6 +116,7 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),

View File

@ -37,7 +37,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_parser
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -200,6 +200,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
@ -266,6 +267,9 @@ if __name__ == '__main__':
)
args, _ = parser.parse_known_args()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
dataset = load_dataset('Idavidrein/gpqa', args.data_split)
@ -279,7 +283,7 @@ if __name__ == '__main__':
# gpqa_dataset = dataset['train'].to_pandas().sort_values(by='id').reset_index(drop=True)
metadata = make_metadata(
llm_config=config.llm,
llm_config=llm_config,
dataset_name='gpqa',
agent_class=args.agent_cls,
max_iterations=args.max_iterations,
@ -293,8 +297,6 @@ if __name__ == '__main__':
gpqa_dataset, output_file, args.eval_n_limit, 'task_id'
)
agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
run_evaluation(
dataset=prepared_dataset,
metadata=metadata,

View File

@ -28,7 +28,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -185,6 +185,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
@ -234,7 +235,10 @@ if __name__ == '__main__':
hefix_tests = dataset['test'].to_pandas()
id_column = 'task_id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
@ -245,7 +249,7 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
instances,
metadata,

View File

@ -18,7 +18,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -182,6 +182,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
@ -271,7 +272,9 @@ if __name__ == '__main__':
logic_reasoning_tests = dataset[data_split]
id_column = 'id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
@ -282,7 +285,6 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
instances,
metadata,

View File

@ -15,7 +15,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, get_llm_config_arg, parse_arguments
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -81,6 +81,7 @@ def process_instance(
run_agent_controller(
agent,
'PLACEHOLDER_GOAL',
max_iterations=metadata.max_iterations,
runtime_tools_config=runtime_tools_config,
sandbox=get_sandbox(),
sid=env_id,
@ -139,7 +140,9 @@ if __name__ == '__main__':
)
id_column = 'id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
@ -150,7 +153,6 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
_ = get_sandbox() # Initialize the sandbox
run_evaluation(
instances,

View File

@ -17,7 +17,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -65,11 +65,11 @@ AGENT_CLS_TO_INST_SUFFIX = {
def process_instance(
agent: Agent,
instance: Any,
metadata: EvalMetadata,
reset_logger: bool = True,
):
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
# create process-specific workspace dir
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
@ -145,6 +145,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=fake_user_response_fn,
sandbox=sandbox,
sid=sid,
@ -209,7 +210,9 @@ if __name__ == '__main__':
mint_tests = mint_dataset.to_pandas()
id_column = 'id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
@ -221,9 +224,7 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(mint_dataset, output_file, args.eval_n_limit, id_column)
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
agent,
instances,
metadata,
output_file,

View File

@ -32,7 +32,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -68,9 +68,8 @@ ID2CONDA = {
}
def process_instance(
agent: Agent, instance: Any, metadata: EvalMetadata, reset_logger: bool = True
):
def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base
try:
@ -154,6 +153,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent.__class__.__name__
),
@ -242,7 +242,9 @@ if __name__ == '__main__':
ml_bench = load_dataset('super-dainiu/ml-bench', split=data_split).to_pandas()
id_column = 'instance_id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
metadata = make_metadata(
llm_config,
args.dataset_name,
@ -253,9 +255,8 @@ if __name__ == '__main__':
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(ml_bench, output_file, args.eval_n_limit, id_column)
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
agent,
instances,
metadata,
output_file,

View File

@ -21,7 +21,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, parse_arguments
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -176,7 +176,9 @@ def process_instance(
# Create the agent
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
workspace_mount_path = os.path.join(
metadata.config.workspace_mount_path, '_eval_workspace'
)
# create process-specific workspace dir
workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
@ -283,6 +285,7 @@ IMPORTANT TIPS:
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@ -354,7 +357,8 @@ if __name__ == '__main__':
swe_bench_tests = filter_dataset(dataset['test'].to_pandas(), 'instance_id')
id_column = 'instance_id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
details = {}
_agent_cls = agenthub.Agent.get_cls(args.agent_cls)
@ -367,7 +371,6 @@ if __name__ == '__main__':
llm_config,
'swe-bench-lite',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
details=details,

View File

@ -16,7 +16,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -36,9 +36,8 @@ AGENT_CLS_TO_INST_SUFFIX = {
}
def process_instance(
agent: Agent, instance: Any, metadata: EvalMetadata, reset_logger: bool = True
):
def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
# create process-specific workspace dir
# we will create a workspace directory for EACH process
# so that different agent don't interfere with each other.
@ -83,6 +82,7 @@ def process_instance(
run_agent_controller(
agent,
instruction,
max_iterations=metadata.max_iterations,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
@ -143,6 +143,8 @@ if __name__ == '__main__':
default='YOUR_WOLFRAMALPHA_APPID',
)
args, _ = parser.parse_known_args()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
dataset = ''
hardness = ''
@ -172,20 +174,16 @@ if __name__ == '__main__':
toolqa_tool_path = download_tools(workspace_mount_path, args.wolfram_alpha_appid)
id_column = 'qid'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
metadata = make_metadata(
llm_config,
f'toolqa-{args.dataset}-{args.hardness}',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
instances = prepare_dataset(toolqa_test, output_file, args.eval_n_limit, id_column)
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
run_evaluation(
agent,
instances,
metadata,
output_file,

View File

@ -12,12 +12,10 @@ import pandas as pd
from pydantic import BaseModel
from tqdm import tqdm
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig
from opendevin.events.action import Action
from opendevin.events.action.message import MessageAction
from opendevin.llm.llm import LLM
class EvalMetadata(BaseModel):
@ -166,10 +164,9 @@ def run_evaluation(
process_instance_func: Callable[[pd.Series, EvalMetadata, bool], Any],
id_column: str,
):
agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
logger.info(
f'Evaluation started with Agent {agent.__class__.name}, '
f'model {agent.llm.model_name}, max iterations {metadata.max_iterations}.'
f'Evaluation started with Agent {metadata.agent_class}, '
f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.'
)
pbar = tqdm(total=len(dataset))
output_fp = open(output_file, 'a')
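
This shared.py change is why each benchmark now builds its own agent: run_evaluation only logs from metadata and dispatches process_instance_func, so no live Agent or LLM object has to be handed to worker processes; those objects often do not pickle cleanly, while the Pydantic EvalMetadata does. The pool/driver code inside run_evaluation is not part of this commit, so the following self-contained toy (FakeMetadata and 'SomeAgent' are stand-ins, not project names) only illustrates why shipping picklable metadata and rebuilding the agent inside the worker is the workable shape for multiprocessing.

from dataclasses import dataclass
from functools import partial
from multiprocessing import Pool


@dataclass
class FakeMetadata:
    # Stand-in for EvalMetadata: plain data, so it pickles cleanly.
    agent_class: str
    max_iterations: int


def process_one(instance: int, metadata: FakeMetadata) -> str:
    # Mirrors the shape of process_instance(): the "agent" is (re)built here,
    # inside the child process, from metadata alone.
    agent = f'{metadata.agent_class}(instance={instance})'  # placeholder agent
    return f'{agent} ran for up to {metadata.max_iterations} iterations'


if __name__ == '__main__':
    metadata = FakeMetadata(agent_class='SomeAgent', max_iterations=10)
    with Pool(processes=2) as pool:
        results = pool.map(partial(process_one, metadata=metadata), range(4))
    print(results)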

View File

@ -15,7 +15,7 @@ from evaluation.utils.shared import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import LLMConfig, get_llm_config_arg, parse_arguments
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
@ -82,6 +82,7 @@ def process_instance(
run_agent_controller(
agent,
'PLACEHOLDER_GOAL',
max_iterations=metadata.max_iterations,
runtime_tools_config=runtime_tools_config,
sandbox=get_sandbox(),
sid=env_id,
@ -144,7 +145,9 @@ if __name__ == '__main__':
)
id_column = 'id'
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
logger.info(f'Config for evaluation: {config}')
metadata = make_metadata(
llm_config,
args.dataset_name,