Remove global args (#2760)

* Remove global args

* Remove global args

* Update files

* Update main

* Bug fixes

* Fix logging
This commit is contained in:
Graham Neubig 2024-07-03 20:07:52 +09:00 committed by GitHub
parent 0d0e6db1e3
commit ffd3c7144c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 346 additions and 182 deletions

View File

@ -13,15 +13,17 @@ from datasets import load_dataset
from tqdm import tqdm
from evaluation.EDA.game import Q20Game, Q20GameCelebrity
from opendevin.controller.agent import Agent
# from evaluation.EDA.scorer import question_scorer
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
game = None
@ -42,6 +44,7 @@ def codeact_user_response(state: State) -> str:
if isinstance(act, MessageAction) and act.source == 'agent':
model_guess = act.content
break
assert game is not None, 'Game is not initialized.'
msg = game.generate_user_response(model_guess)
game.curr_turn += 1
logger.info(f'Model guess: {model_guess}')
@ -66,7 +69,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
def process_instance(
instance, agent_class, metadata, openai_api_key, reset_logger: bool = True
agent: Agent, instance, metadata, openai_api_key, reset_logger: bool = True
):
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
eval_output_dir = metadata['eval_output_dir']
@ -118,14 +121,17 @@ def process_instance(
# instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sid=instance['text'].strip(),
)
)
@ -309,6 +315,9 @@ if __name__ == '__main__':
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
@ -316,8 +325,8 @@ if __name__ == '__main__':
for instance in eda_dataset:
future = executor.submit(
process_instance,
agent,
instance,
agent_class,
metadata,
args.OPENAI_API_KEY,
reset_logger=bool(num_workers > 1),

View File

@ -19,13 +19,15 @@ from evaluation.agent_bench.helper import (
create_sh_file,
try_parse_answer,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import CmdRunAction, MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
from opendevin.runtime.docker.ssh_box import DockerSSHBox
@ -78,8 +80,8 @@ AGENT_CLS_TO_INST_SUFFIX = {
def process_instance(
agent,
instance,
agent_class,
metadata,
eval_output_dir,
reset_logger: bool = True,
@ -138,7 +140,7 @@ def process_instance(
'to you AND NEVER ASK FOR HUMAN HELP.\n'
)
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# =============================================
# create sandbox and run the agent
@ -158,10 +160,13 @@ def process_instance(
logger.info(f'Init script result: {init_res}')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sandbox=sandbox,
sid=inst_id,
)
@ -257,10 +262,11 @@ def process_instance(
if __name__ == '__main__':
args = parse_arguments()
# =============================================
# load datasets
# =============================================
dataset = load_dataset('iFurySt/AgentBench')
agent_bench_tests = dataset['osbench'].to_pandas()
logger.info(f'Loaded {len(agent_bench_tests)} tests.')
@ -379,6 +385,9 @@ if __name__ == '__main__':
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
# Create the agent
agent = Agent.get_cls(agent_cls)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
@ -386,8 +395,8 @@ if __name__ == '__main__':
for inst in agent_bench_tests:
future = executor.submit(
process_instance,
agent,
inst,
agent_cls,
meta,
eval_op_dir,
reset_logger=bool(num_workers > 1),

View File

@ -12,15 +12,16 @@ import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
import agenthub
from evaluation.biocoder.biocoder_env_box import BiocoderData, BiocoderSSHBox
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
def cleanup():
@ -111,8 +112,8 @@ def get_test_result(instance, sandbox, workspace_dir_name):
def process_instance(
agent: Agent,
instance,
agent_class,
metadata,
skip_workspace_mount,
eval_output_dir,
@ -169,7 +170,7 @@ def process_instance(
workspace_dir_name,
skip_workspace_mount=False,
workspace_mount_path=workspace_mount_path,
sandbox_plugins=agenthub.Agent.get_cls(agent_class).sandbox_plugins,
sandbox_plugins=agent.sandbox_plugins,
)
sandbox.remove_code()
@ -211,16 +212,19 @@ def process_instance(
# )
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# use a session id for concurrent evaluation
sid = instance.test_case_id.replace('/', '__')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sandbox=sandbox,
sid=sid,
)
@ -253,6 +257,8 @@ def process_instance(
if __name__ == '__main__':
args = parse_arguments()
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
dataset = load_dataset('lilbillbiscuit/biocoder_public')
@ -369,6 +375,9 @@ if __name__ == '__main__':
skip_workspace_mount = agent_class == 'CodeActAgent'
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
@ -376,8 +385,8 @@ if __name__ == '__main__':
for row_idx, instance in biocoder_tests.iterrows():
future = executor.submit(
process_instance,
agent,
instance,
agent_class,
metadata,
skip_workspace_mount,
eval_output_dir,

View File

@ -16,13 +16,15 @@ from datasets import load_dataset
from func_timeout import FunctionTimedOut, func_timeout
from tqdm import tqdm
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
def cleanup():
@ -126,7 +128,7 @@ def get_test_result(instance, path, timeout=30):
def process_instance(
instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True
agent, instance, metadata, skip_workspace_mount, reset_logger: bool = True
):
workspace_mount_path = os.path.join(
config.workspace_mount_path, 'bird_eval_workspace'
@ -217,12 +219,15 @@ def process_instance(
'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
)
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sid=sid,
)
)
@ -381,6 +386,7 @@ def create_prompt(e, database_path):
if __name__ == '__main__':
args = parse_arguments()
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
# Due to the large size of the BIRD database, it cannot be hosted on huggingface datasets, so it needs to be downloaded
@ -492,6 +498,9 @@ if __name__ == '__main__':
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
@ -499,8 +508,8 @@ if __name__ == '__main__':
for row_idx, instance in bird_tests.iterrows():
future = executor.submit(
process_instance,
agent,
instance,
agent_class,
metadata,
skip_workspace_mount=False,
reset_logger=bool(num_workers > 1),

View File

@ -15,13 +15,15 @@ from datasets import load_dataset
from tqdm import tqdm
from evaluation.gaia.scorer import question_scorer
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import CmdRunAction, MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
DATASET_CACHE_DIR = '~/.cache/open-devin/evals/gaia'
DATASET_CACHE_DIR = os.path.expanduser(DATASET_CACHE_DIR)
@ -72,7 +74,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
}
def process_instance(instance, agent_class, metadata, reset_logger: bool = True):
def process_instance(agent, instance, metadata, reset_logger: bool = True):
# create process-specific workspace dir
# we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
@ -135,16 +137,17 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
'For example: The answer to the question is <solution> 42 </solution>.\n'
)
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent.__class__.__name__, '')
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent_class
),
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sid=instance['task_id'],
)
)
@ -344,6 +347,9 @@ if __name__ == '__main__':
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
@ -351,8 +357,8 @@ if __name__ == '__main__':
for instance in gaia_tests:
future = executor.submit(
process_instance,
agent,
instance,
agent_class,
metadata,
reset_logger=bool(num_workers > 1),
)

View File

@ -9,15 +9,18 @@ import time
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from utils import encode_question, get_data
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
from .utils import encode_question, get_data
def cleanup():
@ -63,9 +66,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
}
def process_instance(
question_id, question, agent_class, metadata, reset_logger: bool = True
):
def process_instance(agent, question_id, question, metadata, reset_logger: bool = True):
# create process-specific workspace dir
# we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
@ -107,15 +108,16 @@ def process_instance(
instruction = encode_question(question, metadata['hub'])
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent_class
agent.__class__.__name__
),
sid=question_id,
)
@ -295,6 +297,9 @@ if __name__ == '__main__':
output_fp.flush()
finished_task_ids.add(output['question_id'])
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
# This sets the multi-processing
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
@ -308,9 +313,9 @@ if __name__ == '__main__':
question = questions[i]
future = executor.submit(
process_instance,
agent,
question_id,
question,
agent_class,
metadata,
reset_logger=bool(num_workers > 1),
)

View File

@ -33,13 +33,15 @@ import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
def cleanup():
@ -154,8 +156,8 @@ def convert_instance_dict(instance):
def process_instance(
agent: Agent,
instance: dict,
agent_class: str,
metadata: dict,
skip_workspace_mount: bool,
eval_output_dir: str,
@ -242,18 +244,20 @@ def process_instance(
"""
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent_class
agent.__class__.__name__
),
sid=instance.instance_id,
)
)
assert state is not None, 'State should not be None.'
# ======= Attempt to evaluate the agent's edits =======
# get the final message from the state history (default to None if not found)
@ -441,6 +445,9 @@ if __name__ == '__main__':
skip_workspace_mount = agent_class == 'CodeActAgent'
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
@ -448,8 +455,8 @@ if __name__ == '__main__':
for row_idx, instance in gpqa_dataset.iterrows():
future = executor.submit(
process_instance,
agent,
instance,
agent_class,
metadata,
skip_workspace_mount,
eval_output_dir,

View File

@ -24,13 +24,15 @@ from datasets import load_dataset
from evaluate import load
from tqdm import tqdm
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
IMPORT_HELPER = {
'python': [
@ -136,7 +138,7 @@ def get_test_result(instance, path, language='python', timeout=10):
def process_instance(
instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True
agent: Agent, instance, metadata, skip_workspace_mount, reset_logger: bool = True
):
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base
@ -209,14 +211,15 @@ def process_instance(
'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
)
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent_class
agent.__class__.__name__
),
sid=sid,
)
@ -254,6 +257,8 @@ def process_instance(
if __name__ == '__main__':
args = parse_arguments()
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
dataset = load_dataset(
@ -366,6 +371,9 @@ if __name__ == '__main__':
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
@ -373,8 +381,8 @@ if __name__ == '__main__':
for row_idx, instance in hefix_tests.iterrows():
future = executor.submit(
process_instance,
agent,
instance,
agent_class,
metadata,
skip_workspace_mount=False,
reset_logger=bool(num_workers > 1),

View File

@ -12,13 +12,15 @@ from datasets import load_dataset
from tqdm import tqdm
from evaluation.swe_bench.swe_env_box import DockerSSHBox
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
def cleanup():
@ -103,7 +105,7 @@ def get_choice(answer_str):
def get_test_result(
model_answer: str,
ground_truth: str,
) -> bool:
) -> dict[str, bool]:
gold_answer = ground_truth.replace('(', '').replace(')', '').strip()
answer_str = model_answer if model_answer is not None else ''
prediction = get_choice(answer_str)
@ -128,9 +130,8 @@ def get_test_result(
def process_instance(
agent,
instance,
agent_class,
# metadata,
dataset_name,
skip_workspace_mount,
eval_output_dir,
@ -205,7 +206,7 @@ def process_instance(
)
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# use a session id for concurrent evaluation
sid = instance['id'] + '_' + str(os.getpid())
@ -213,11 +214,12 @@ def process_instance(
exit_code, command_output = sandbox.execute('pip install scitools-pyke')
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent_class
agent.__class__.__name__
),
sandbox=sandbox,
sid=sid,
@ -407,6 +409,9 @@ if __name__ == '__main__':
skip_workspace_mount = False
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
@ -414,8 +419,8 @@ if __name__ == '__main__':
for instance in logic_reasoning_tests:
future = executor.submit(
process_instance,
agent,
instance,
agent_class,
dataset_name,
skip_workspace_mount,
eval_output_dir,

View File

@ -10,12 +10,14 @@ import browsergym.miniwob # noqa F401 register miniwob tasks as gym environment
import gymnasium as gym
from tqdm import tqdm
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.tools import RuntimeTool
@ -23,6 +25,7 @@ SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
def process_instance(
agent: Agent,
env_id: str,
metadata: dict,
eval_output_dir: str,
@ -60,8 +63,9 @@ def process_instance(
}
}
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
'PLACEHOLDER_GOAL',
runtime_tools_config=runtime_tools_config,
sandbox=docker_sandbox,
@ -108,6 +112,8 @@ def process_instance(
if __name__ == '__main__':
args = parse_arguments()
env_ids = [
id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
]
@ -195,11 +201,14 @@ if __name__ == '__main__':
)
# =============================================
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
docker_sandbox = DockerSSHBox()
for env_id in tqdm(env_ids):
try:
output = process_instance(
agent=agent,
env_id=env_id,
metadata=metadata,
eval_output_dir=eval_output_dir,

View File

@ -11,21 +11,24 @@ from concurrent.futures import ProcessPoolExecutor
from typing import Dict
import tasks
from config_variables import TASK_INFO_MAP
from datasets import load_dataset
from datatypes import TaskState
from env import SimplifiedEnv
from prompts import ToolPromptTemplate
from tasks import Task
from tqdm import tqdm
from evaluation.swe_bench.swe_env_box import DockerSSHBox
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
from .config_variables import TASK_INFO_MAP
from .datatypes import TaskState
from .env import SimplifiedEnv
from .prompts import ToolPromptTemplate
from .tasks import Task
def cleanup():
@ -144,11 +147,11 @@ def process_instance(
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the concise RESULT inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# Here's how you can run the agent (similar to the `main` function) and get the final task state
fake_user_response_fn = functools.partial(
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent.__class__.__name__],
task=instance,
task_config={
'max_iterations': metadata['max_iterations'],
@ -156,8 +159,9 @@ def process_instance(
},
)
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=fake_user_response_fn,
sandbox=sandbox,
@ -337,6 +341,9 @@ if __name__ == '__main__':
skip_workspace_mount = agent_class == 'CodeActAgent'
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
@ -344,8 +351,8 @@ if __name__ == '__main__':
for instance in mint_dataset:
future = executor.submit(
process_instance,
agent,
instance,
agent_class,
metadata,
skip_workspace_mount,
eval_output_dir,

View File

@ -27,13 +27,15 @@ from concurrent.futures import ProcessPoolExecutor
from datasets import load_dataset
from tqdm import tqdm
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
from opendevin.runtime.docker.ssh_box import DockerSSHBox
@ -99,7 +101,7 @@ ID2CONDA = {
def process_instance(
instance, agent_class, metadata, eval_output_dir, reset_logger: bool = True
agent: Agent, instance, metadata, eval_output_dir, reset_logger: bool = True
):
old_workspace_mount_path = config.workspace_mount_path
old_workspace_base = config.workspace_base
@ -177,19 +179,21 @@ def process_instance(
)
+ 'You should terminate the subprocess after running the task (e.g., call subprocess.Popen(args).wait()).'
)
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# Run the agent
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
agent_class
agent.__class__.__name__
),
sandbox=sandbox,
sid=sid,
)
)
assert state is not None
metrics = state.metrics.get() if state.metrics else {}
# Evaluate the agent's script
@ -365,14 +369,17 @@ if __name__ == '__main__':
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
for _, instance in enumerate(new_instances):
future = executor.submit(
process_instance,
agent,
instance,
agent_class,
metadata,
eval_output_dir,
reset_logger=bool(num_workers > 1),

View File

@ -7,6 +7,7 @@ import pathlib
import subprocess
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Any
import pandas as pd
import toml
@ -16,13 +17,15 @@ from tqdm import tqdm
import agenthub
from evaluation.swe_bench.swe_env_box import SWEBenchSSHBox
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false') == 'true'
@ -190,8 +193,8 @@ def get_test_result(instance, sandbox, workspace_dir_name):
def process_instance(
instance: dict,
agent_class: str,
agent: Agent,
instance: Any,
metadata: dict,
skip_workspace_mount: bool,
eval_output_dir: str,
@ -302,13 +305,16 @@ IMPORTANT TIPS:
)
# NOTE: You can actually set slightly different instructions for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sandbox=sandbox,
sid=instance.instance_id,
)
@ -369,6 +375,8 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
if __name__ == '__main__':
args = parse_arguments()
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
@ -488,6 +496,9 @@ if __name__ == '__main__':
skip_workspace_mount = agent_class == 'CodeActAgent'
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
try:
with ProcessPoolExecutor(num_workers) as executor:
futures = []
@ -495,8 +506,8 @@ if __name__ == '__main__':
for row_idx, instance in swe_bench_tests.iterrows():
future = executor.submit(
process_instance,
agent,
instance,
agent_class,
metadata,
skip_workspace_mount,
eval_output_dir,

View File

@ -9,15 +9,18 @@ import time
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from utils import download_data, download_tools, encode_question, eval_answer, get_data
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, get_parser
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.action import MessageAction
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
from .utils import download_data, download_tools, encode_question, eval_answer, get_data
def cleanup():
@ -63,7 +66,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
}
def process_instance(task, agent_class, metadata, reset_logger: bool = True):
def process_instance(agent: Agent, task, metadata, reset_logger: bool = True):
# create process-specific workspace dir
# we will create a workspace directory for EACH process
# so that different agents don't interfere with each other.
@ -100,14 +103,17 @@ def process_instance(task, agent_class, metadata, reset_logger: bool = True):
instruction = encode_question(question)
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
# NOTE: You can actually set slightly different instruction for different agents
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
# logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
# Here's how you can run the agent (similar to the `__main__` block of opendevin/core/main.py) and get the final task state
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
instruction,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
agent.__class__.__name__
],
sid=qid,
)
)
@ -304,6 +310,9 @@ if __name__ == '__main__':
output_fp.flush()
finished_task_ids.add(output['qid'])
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
# This sets the multi-processing
num_workers = args.eval_num_workers
logger.info(f'Using {num_workers} workers for evaluation.')
@ -315,8 +324,8 @@ if __name__ == '__main__':
try:
future = executor.submit(
process_instance,
agent,
task,
agent_class,
metadata,
reset_logger=bool(num_workers > 1),
)

View File

@ -10,12 +10,14 @@ import browsergym.webarena # noqa F401 register webarena tasks as gym environme
import gymnasium as gym
from tqdm import tqdm
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.core.main import run_agent_controller
from opendevin.events.serialization.event import event_to_dict
from opendevin.llm.llm import LLM
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.tools import RuntimeTool
@ -23,6 +25,7 @@ SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
def process_instance(
agent: Agent,
env_id: str,
metadata: dict,
eval_output_dir: str,
@ -60,8 +63,9 @@ def process_instance(
}
}
state: State = asyncio.run(
main(
state: State | None = asyncio.run(
run_agent_controller(
agent,
'PLACEHOLDER_GOAL',
runtime_tools_config=runtime_tools_config,
sandbox=docker_sandbox,
@ -108,6 +112,8 @@ def process_instance(
if __name__ == '__main__':
args = parse_arguments()
env_ids = [
id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
]
@ -196,10 +202,14 @@ if __name__ == '__main__':
# =============================================
# Create the agent
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
docker_sandbox = DockerSSHBox()
for env_id in tqdm(env_ids):
try:
output = process_instance(
agent=agent,
env_id=env_id,
metadata=metadata,
eval_output_dir=eval_output_dir,

View File

@ -59,7 +59,7 @@ class AgentController:
agent: Agent,
event_stream: EventStream,
sid: str = 'default',
max_iterations: int = MAX_ITERATIONS,
max_iterations: int | None = MAX_ITERATIONS,
max_budget_per_task: float | None = MAX_BUDGET_PER_TASK,
initial_state: State | None = None,
is_delegate: bool = False,
@ -86,6 +86,9 @@ class AgentController:
)
# state from the previous session, state from a parent agent, or a fresh state
max_iterations = (
max_iterations if max_iterations is not None else MAX_ITERATIONS
)
self.set_initial_state(
state=initial_state,
max_iterations=max_iterations,

View File

@ -476,7 +476,7 @@ def get_llm_config_arg(llm_config_arg: str):
# Command line arguments
def get_parser():
def get_parser() -> argparse.ArgumentParser:
"""
Get the parser for the command line arguments.
"""
@ -559,7 +559,7 @@ def get_parser():
return parser
def parse_arguments():
def parse_arguments() -> argparse.Namespace:
"""
Parse the command line arguments.
"""
@ -569,6 +569,3 @@ def parse_arguments():
config.workspace_base = os.path.abspath(parsed_args.directory)
print(f'Setting workspace base to {config.workspace_base}')
return parsed_args
args = parse_arguments()

View File

@ -7,7 +7,7 @@ import agenthub # noqa F401 (we import this to get the agents registered)
from opendevin.controller import AgentController
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.schema import AgentState
from opendevin.events import EventSource, EventStream, EventStreamSubscriber
@ -30,8 +30,11 @@ def read_task_from_stdin() -> str:
return sys.stdin.read()
async def main(
task_str: str = '',
async def run_agent_controller(
agent: Agent,
task_str: str,
max_iterations: int | None = None,
max_budget_per_task: float | None = None,
exit_on_message: bool = False,
fake_user_response_fn: Callable[[State | None], str] | None = None,
sandbox: Sandbox | None = None,
@ -48,43 +51,10 @@ async def main(
sandbox: An optional sandbox to run the agent in.
"""
# Determine the task source
if task_str:
task = task_str
elif args.file:
task = read_task_from_file(args.file)
elif args.task:
task = args.task
elif not sys.stdin.isatty():
task = read_task_from_stdin()
else:
raise ValueError('No task provided. Please specify a task through -t, -f.')
# only one of model_name or llm_config is required
if args.llm_config:
# --llm_config
# llm_config can contain any of the attributes of LLMConfig
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Invalid toml file, cannot read {args.llm_config}')
logger.info(
f'Running agent {args.agent_cls} (model: {llm_config.model}, llm_config: {args.llm_config}) with task: "{task}"'
)
# create LLM instance with the given config
llm = LLM(llm_config=llm_config)
else:
# --model-name model_name
logger.info(
f'Running agent {args.agent_cls} (model: {args.model_name}), with task: "{task}"'
)
llm = LLM(args.model_name)
# set up the agent
AgentCls: Type[Agent] = Agent.get_cls(args.agent_cls)
agent = AgentCls(llm=llm)
# Logging
logger.info(
f'Running agent {type(agent)}, model {agent.llm.model_name}, with task: "{task_str}"'
)
# set up the event stream
cli_session = 'main' + ('_' + sid if sid else '')
@ -102,8 +72,8 @@ async def main(
# init controller with this initial state
controller = AgentController(
agent=agent,
max_iterations=args.max_iterations,
max_budget_per_task=args.max_budget_per_task,
max_iterations=max_iterations,
max_budget_per_task=max_budget_per_task,
event_stream=event_stream,
initial_state=initial_state,
)
@ -124,8 +94,8 @@ async def main(
with open(
os.path.join(runtime.browser.eval_dir, 'goal.txt'), 'r', encoding='utf-8'
) as f:
task = f.read()
logger.info(f'Dynamic Eval task: {task}')
task_str = f.read()
logger.info(f'Dynamic Eval task: {task_str}')
# start event is a MessageAction with the task, either resumed or new
if config.enable_cli_session and initial_state is not None:
@ -138,7 +108,7 @@ async def main(
)
elif initial_state is None:
# init with the provided task
await event_stream.add_event(MessageAction(content=task), EventSource.USER)
await event_stream.add_event(MessageAction(content=task_str), EventSource.USER)
async def on_event(event: Event):
if isinstance(event, AgentStateChangedObservation):
@ -174,4 +144,36 @@ async def main(
if __name__ == '__main__':
asyncio.run(main())
args = parse_arguments()
# Determine the task
if args.file:
task_str = read_task_from_file(args.file)
elif args.task:
task_str = args.task
elif not sys.stdin.isatty():
task_str = read_task_from_stdin()
else:
raise ValueError('No task provided. Please specify a task through -t, -f.')
# Figure out the LLM config
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
if llm_config is None:
raise ValueError(f'Invalid toml file, cannot read {args.llm_config}')
llm = LLM(llm_config=llm_config)
else:
llm = LLM(model=args.model_name)
# Create the agent
AgentCls: Type[Agent] = Agent.get_cls(args.agent_cls)
agent = AgentCls(llm=llm)
asyncio.run(
run_agent_controller(
agent=agent,
task_str=task_str,
max_iterations=args.max_iterations,
max_budget_per_task=args.max_budget_per_task,
)
)

View File

@ -5,13 +5,16 @@ import subprocess
import pytest
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.main import main
from opendevin.core.config import parse_arguments
from opendevin.core.main import run_agent_controller
from opendevin.core.schema import AgentState
from opendevin.events.action import (
AgentFinishAction,
AgentRejectAction,
)
from opendevin.llm.llm import LLM
workspace_base = os.getenv('WORKSPACE_BASE')
workspace_mount_path = os.getenv('WORKSPACE_MOUNT_PATH')
@ -29,7 +32,7 @@ print(f'workspace_mount_path_in_sandbox: {workspace_mount_path_in_sandbox}')
)
@pytest.mark.skipif(
(os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
and os.getenv('SANDBOX_TYPE').lower() != 'ssh',
and os.getenv('SANDBOX_TYPE', '').lower() != 'ssh',
reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
)
@pytest.mark.skipif(
@ -38,7 +41,14 @@ print(f'workspace_mount_path_in_sandbox: {workspace_mount_path_in_sandbox}')
)
def test_write_simple_script():
task = "Write a shell script 'hello.sh' that prints 'hello'. Do not ask me for confirmation at any point."
final_state: State = asyncio.run(main(task, exit_on_message=True))
args = parse_arguments()
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
final_state: State | None = asyncio.run(
run_agent_controller(agent, task, exit_on_message=True)
)
assert final_state.agent_state == AgentState.STOPPED
assert final_state.last_error is None
@ -61,7 +71,7 @@ def test_write_simple_script():
)
@pytest.mark.skipif(
(os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
and os.getenv('SANDBOX_TYPE').lower() != 'ssh',
and os.getenv('SANDBOX_TYPE', '').lower() != 'ssh',
reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
)
@pytest.mark.skipif(
@ -73,6 +83,7 @@ def test_write_simple_script():
reason='local sandbox shows environment-dependent absolute path for pwd command',
)
def test_edits():
args = parse_arguments()
# Copy workspace artifacts to workspace_base location
source_dir = os.path.join(os.path.dirname(__file__), 'workspace/test_edits/')
files = os.listdir(source_dir)
@ -82,9 +93,14 @@ def test_edits():
os.remove(dest_file)
shutil.copy(os.path.join(source_dir, file), dest_file)
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
# Execute the task
task = 'Fix typos in bad.txt. Do not ask me for confirmation at any point.'
final_state: State = asyncio.run(main(task, exit_on_message=True))
final_state: State | None = asyncio.run(
run_agent_controller(agent, task, exit_on_message=True)
)
assert final_state.agent_state == AgentState.STOPPED
assert final_state.last_error is None
@ -108,9 +124,16 @@ Enjoy!
reason='Currently, only ssh sandbox supports stateful tasks',
)
def test_ipython():
args = parse_arguments()
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
# Execute the task
task = "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'. Do not ask me for confirmation at any point."
final_state: State = asyncio.run(main(task, exit_on_message=True))
final_state: State | None = asyncio.run(
run_agent_controller(agent, task, exit_on_message=True)
)
assert final_state.agent_state == AgentState.STOPPED
assert final_state.last_error is None
@ -135,10 +158,15 @@ def test_ipython():
reason='FIXME: local sandbox does not capture stderr',
)
def test_simple_task_rejection():
args = parse_arguments()
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
# Give an impossible task to do: cannot write a commit message because
# the workspace is not a git repo
task = 'Write a git commit message for the current staging area. Do not ask me for confirmation at any point.'
final_state: State = asyncio.run(main(task))
final_state: State | None = asyncio.run(run_agent_controller(agent, task))
assert final_state.agent_state == AgentState.STOPPED
assert final_state.last_error is None
assert isinstance(final_state.history[-1][0], AgentRejectAction)
@ -153,9 +181,16 @@ def test_simple_task_rejection():
reason='Currently, only ssh sandbox supports stateful tasks',
)
def test_ipython_module():
args = parse_arguments()
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
# Execute the task
task = "Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt. Do not ask me for confirmation at any point."
final_state: State = asyncio.run(main(task, exit_on_message=True))
final_state: State | None = asyncio.run(
run_agent_controller(agent, task, exit_on_message=True)
)
assert final_state.agent_state == AgentState.STOPPED
assert final_state.last_error is None
@ -178,13 +213,20 @@ def test_ipython_module():
)
@pytest.mark.skipif(
(os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
and os.getenv('SANDBOX_TYPE').lower() != 'ssh',
and os.getenv('SANDBOX_TYPE', '').lower() != 'ssh',
reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
)
def test_browse_internet(http_server):
args = parse_arguments()
# Create the agent
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
# Execute the task
task = 'Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me for confirmation at any point.'
final_state: State = asyncio.run(main(task, exit_on_message=True))
final_state: State | None = asyncio.run(
run_agent_controller(agent, task, exit_on_message=True)
)
assert final_state.agent_state == AgentState.STOPPED
assert final_state.last_error is None
assert isinstance(final_state.history[-1][0], AgentFinishAction)