mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 13:52:43 +08:00
Remove global args (#2760)
* Remove global args
* Remove global args
* Update files
* Update main
* Bug fixes
* Fix logging
This commit is contained in:
parent
0d0e6db1e3
commit
ffd3c7144c
@ -13,15 +13,17 @@ from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.EDA.game import Q20Game, Q20GameCelebrity
|
||||
from opendevin.controller.agent import Agent
|
||||
|
||||
# from evaluation.EDA.scorer import question_scorer
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
game = None
|
||||
|
||||
@ -42,6 +44,7 @@ def codeact_user_response(state: State) -> str:
|
||||
if isinstance(act, MessageAction) and act.source == 'agent':
|
||||
model_guess = act.content
|
||||
break
|
||||
assert game is not None, 'Game is not initialized.'
|
||||
msg = game.generate_user_response(model_guess)
|
||||
game.curr_turn += 1
|
||||
logger.info(f'Model guess: {model_guess}')
|
||||
@ -66,7 +69,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
||||
|
||||
|
||||
def process_instance(
|
||||
instance, agent_class, metadata, openai_api_key, reset_logger: bool = True
|
||||
agent: Agent, instance, metadata, openai_api_key, reset_logger: bool = True
|
||||
):
|
||||
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
|
||||
eval_output_dir = metadata['eval_output_dir']
|
||||
@ -118,14 +121,17 @@ def process_instance(
|
||||
|
||||
# instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
sid=instance['text'].strip(),
|
||||
)
|
||||
)
|
||||
@ -309,6 +315,9 @@ if __name__ == '__main__':
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
@ -316,8 +325,8 @@ if __name__ == '__main__':
|
||||
for instance in eda_dataset:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
args.OPENAI_API_KEY,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
|
||||
@ -19,13 +19,15 @@ from evaluation.agent_bench.helper import (
|
||||
create_sh_file,
|
||||
try_parse_answer,
|
||||
)
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import args, config, get_llm_config_arg
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import CmdRunAction, MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
|
||||
|
||||
@ -78,8 +80,8 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
eval_output_dir,
|
||||
reset_logger: bool = True,
|
||||
@ -138,7 +140,7 @@ def process_instance(
|
||||
'to you AND NEVER ASK FOR HUMAN HELP.\n'
|
||||
)
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
|
||||
# =============================================
|
||||
# create sandbox and run the agent
|
||||
@ -158,10 +160,13 @@ def process_instance(
|
||||
logger.info(f'Init script result: {init_res}')
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
sandbox=sandbox,
|
||||
sid=inst_id,
|
||||
)
|
||||
@ -257,10 +262,11 @@ def process_instance(
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
|
||||
# =============================================
|
||||
# load datasets
|
||||
# =============================================
|
||||
|
||||
dataset = load_dataset('iFurySt/AgentBench')
|
||||
agent_bench_tests = dataset['osbench'].to_pandas()
|
||||
logger.info(f'Loaded {len(agent_bench_tests)} tests.')
|
||||
@ -379,6 +385,9 @@ if __name__ == '__main__':
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_cls)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
@ -386,8 +395,8 @@ if __name__ == '__main__':
|
||||
for inst in agent_bench_tests:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
inst,
|
||||
agent_cls,
|
||||
meta,
|
||||
eval_op_dir,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
|
||||
@ -12,15 +12,16 @@ import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
import agenthub
|
||||
from evaluation.biocoder.biocoder_env_box import BiocoderData, BiocoderSSHBox
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import args, config, get_llm_config_arg
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
|
||||
def cleanup():
|
||||
@ -111,8 +112,8 @@ def get_test_result(instance, sandbox, workspace_dir_name):
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
@ -169,7 +170,7 @@ def process_instance(
|
||||
workspace_dir_name,
|
||||
skip_workspace_mount=False,
|
||||
workspace_mount_path=workspace_mount_path,
|
||||
sandbox_plugins=agenthub.Agent.get_cls(agent_class).sandbox_plugins,
|
||||
sandbox_plugins=agent.sandbox_plugins,
|
||||
)
|
||||
|
||||
sandbox.remove_code()
|
||||
@ -211,16 +212,19 @@ def process_instance(
|
||||
# )
|
||||
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
|
||||
# use a session id for concurrent evaluation
|
||||
sid = instance.test_case_id.replace('/', '__')
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
sandbox=sandbox,
|
||||
sid=sid,
|
||||
)
|
||||
@ -253,6 +257,8 @@ def process_instance(
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenDevin's repo
|
||||
dataset = load_dataset('lilbillbiscuit/biocoder_public')
|
||||
@ -369,6 +375,9 @@ if __name__ == '__main__':
|
||||
skip_workspace_mount = agent_class == 'CodeActAgent'
|
||||
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
@ -376,8 +385,8 @@ if __name__ == '__main__':
|
||||
for row_idx, instance in biocoder_tests.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
|
||||
@ -16,13 +16,15 @@ from datasets import load_dataset
|
||||
from func_timeout import FunctionTimedOut, func_timeout
|
||||
from tqdm import tqdm
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import args, config, get_llm_config_arg
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
|
||||
def cleanup():
|
||||
@ -126,7 +128,7 @@ def get_test_result(instance, path, timeout=30):
|
||||
|
||||
|
||||
def process_instance(
|
||||
instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True
|
||||
agent, instance, metadata, skip_workspace_mount, reset_logger: bool = True
|
||||
):
|
||||
workspace_mount_path = os.path.join(
|
||||
config.workspace_mount_path, 'bird_eval_workspace'
|
||||
@ -217,12 +219,15 @@ def process_instance(
|
||||
'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
|
||||
)
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
sid=sid,
|
||||
)
|
||||
)
|
||||
@ -381,6 +386,7 @@ def create_prompt(e, database_path):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenDevin's repo
|
||||
# Due to the large size of the BIRD database, it cannot be hosted on huggingface datasets, so it needs to be downloaded
|
||||
@ -492,6 +498,9 @@ if __name__ == '__main__':
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
@ -499,8 +508,8 @@ if __name__ == '__main__':
|
||||
for row_idx, instance in bird_tests.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
skip_workspace_mount=False,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
|
||||
@ -15,13 +15,15 @@ from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.gaia.scorer import question_scorer
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import CmdRunAction, MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
DATASET_CACHE_DIR = '~/.cache/open-devin/evals/gaia'
|
||||
DATASET_CACHE_DIR = os.path.expanduser(DATASET_CACHE_DIR)
|
||||
@ -72,7 +74,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
||||
}
|
||||
|
||||
|
||||
def process_instance(instance, agent_class, metadata, reset_logger: bool = True):
|
||||
def process_instance(agent, instance, metadata, reset_logger: bool = True):
|
||||
# create process-specific workspace dir
|
||||
# we will create a workspace directory for EACH process
|
||||
# so that different agents don't interfere with each other.
|
||||
@ -135,16 +137,17 @@ def process_instance(instance, agent_class, metadata, reset_logger: bool = True)
|
||||
'For example: The answer to the question is <solution> 42 </solution>.\n'
|
||||
)
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent.__class__.__name__, '')
|
||||
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent_class
|
||||
),
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
sid=instance['task_id'],
|
||||
)
|
||||
)
|
||||
@ -344,6 +347,9 @@ if __name__ == '__main__':
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
@ -351,8 +357,8 @@ if __name__ == '__main__':
|
||||
for instance in gaia_tests:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
|
||||
@ -9,15 +9,18 @@ import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
|
||||
from tqdm import tqdm
|
||||
from utils import encode_question, get_data
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
from .utils import encode_question, get_data
|
||||
|
||||
|
||||
def cleanup():
|
||||
@ -63,9 +66,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
||||
}
|
||||
|
||||
|
||||
def process_instance(
|
||||
question_id, question, agent_class, metadata, reset_logger: bool = True
|
||||
):
|
||||
def process_instance(agent, question_id, question, metadata, reset_logger: bool = True):
|
||||
# create process-specific workspace dir
|
||||
# we will create a workspace directory for EACH process
|
||||
# so that different agents don't interfere with each other.
|
||||
@ -107,15 +108,16 @@ def process_instance(
|
||||
instruction = encode_question(question, metadata['hub'])
|
||||
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
# logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent_class
|
||||
agent.__class__.__name__
|
||||
),
|
||||
sid=question_id,
|
||||
)
|
||||
@ -295,6 +297,9 @@ if __name__ == '__main__':
|
||||
output_fp.flush()
|
||||
finished_task_ids.add(output['question_id'])
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
# This sets the multi-processing
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
@ -308,9 +313,9 @@ if __name__ == '__main__':
|
||||
question = questions[i]
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
question_id,
|
||||
question,
|
||||
agent_class,
|
||||
metadata,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
|
||||
@ -33,13 +33,15 @@ import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
|
||||
def cleanup():
|
||||
@ -154,8 +156,8 @@ def convert_instance_dict(instance):
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent,
|
||||
instance: dict,
|
||||
agent_class: str,
|
||||
metadata: dict,
|
||||
skip_workspace_mount: bool,
|
||||
eval_output_dir: str,
|
||||
@ -242,18 +244,20 @@ def process_instance(
|
||||
"""
|
||||
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent_class
|
||||
agent.__class__.__name__
|
||||
),
|
||||
sid=instance.instance_id,
|
||||
)
|
||||
)
|
||||
assert state is not None, 'State should not be None.'
|
||||
|
||||
# ======= Attempt to evaluate the agent's edits =======
|
||||
# get the final message from the state history (default to None if not found)
|
||||
@ -441,6 +445,9 @@ if __name__ == '__main__':
|
||||
skip_workspace_mount = agent_class == 'CodeActAgent'
|
||||
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
@ -448,8 +455,8 @@ if __name__ == '__main__':
|
||||
for row_idx, instance in gpqa_dataset.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
|
||||
@ -24,13 +24,15 @@ from datasets import load_dataset
|
||||
from evaluate import load
|
||||
from tqdm import tqdm
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import args, config, get_llm_config_arg
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
IMPORT_HELPER = {
|
||||
'python': [
|
||||
@ -136,7 +138,7 @@ def get_test_result(instance, path, language='python', timeout=10):
|
||||
|
||||
|
||||
def process_instance(
|
||||
instance, agent_class, metadata, skip_workspace_mount, reset_logger: bool = True
|
||||
agent: Agent, instance, metadata, skip_workspace_mount, reset_logger: bool = True
|
||||
):
|
||||
old_workspace_mount_path = config.workspace_mount_path
|
||||
old_workspace_base = config.workspace_base
|
||||
@ -209,14 +211,15 @@ def process_instance(
|
||||
'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n'
|
||||
)
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent_class
|
||||
agent.__class__.__name__
|
||||
),
|
||||
sid=sid,
|
||||
)
|
||||
@ -254,6 +257,8 @@ def process_instance(
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenDevin's repo
|
||||
dataset = load_dataset(
|
||||
@ -366,6 +371,9 @@ if __name__ == '__main__':
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
@ -373,8 +381,8 @@ if __name__ == '__main__':
|
||||
for row_idx, instance in hefix_tests.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
skip_workspace_mount=False,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
|
||||
@ -12,13 +12,15 @@ from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.swe_bench.swe_env_box import DockerSSHBox
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
|
||||
def cleanup():
|
||||
@ -103,7 +105,7 @@ def get_choice(answer_str):
|
||||
def get_test_result(
|
||||
model_answer: str,
|
||||
ground_truth: str,
|
||||
) -> bool:
|
||||
) -> dict[str, bool]:
|
||||
gold_answer = ground_truth.replace('(', '').replace(')', '').strip()
|
||||
answer_str = model_answer if model_answer is not None else ''
|
||||
prediction = get_choice(answer_str)
|
||||
@ -128,9 +130,8 @@ def get_test_result(
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
# metadata,
|
||||
dataset_name,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
@ -205,7 +206,7 @@ def process_instance(
|
||||
)
|
||||
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
|
||||
# use a session id for concurrent evaluation
|
||||
sid = instance['id'] + '_' + str(os.getpid())
|
||||
@ -213,11 +214,12 @@ def process_instance(
|
||||
exit_code, command_output = sandbox.execute('pip install scitools-pyke')
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent_class
|
||||
agent.__class__.__name__
|
||||
),
|
||||
sandbox=sandbox,
|
||||
sid=sid,
|
||||
@ -407,6 +409,9 @@ if __name__ == '__main__':
|
||||
skip_workspace_mount = False
|
||||
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
@ -414,8 +419,8 @@ if __name__ == '__main__':
|
||||
for instance in logic_reasoning_tests:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
dataset_name,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
|
||||
@ -10,12 +10,14 @@ import browsergym.miniwob # noqa F401 register miniwob tasks as gym environment
|
||||
import gymnasium as gym
|
||||
from tqdm import tqdm
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import args, config, get_llm_config_arg
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
from opendevin.runtime.tools import RuntimeTool
|
||||
|
||||
@ -23,6 +25,7 @@ SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent,
|
||||
env_id: str,
|
||||
metadata: dict,
|
||||
eval_output_dir: str,
|
||||
@ -60,8 +63,9 @@ def process_instance(
|
||||
}
|
||||
}
|
||||
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
'PLACEHOLDER_GOAL',
|
||||
runtime_tools_config=runtime_tools_config,
|
||||
sandbox=docker_sandbox,
|
||||
@ -108,6 +112,8 @@ def process_instance(
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
|
||||
env_ids = [
|
||||
id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
|
||||
]
|
||||
@ -195,11 +201,14 @@ if __name__ == '__main__':
|
||||
)
|
||||
|
||||
# =============================================
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
docker_sandbox = DockerSSHBox()
|
||||
for env_id in tqdm(env_ids):
|
||||
try:
|
||||
output = process_instance(
|
||||
agent=agent,
|
||||
env_id=env_id,
|
||||
metadata=metadata,
|
||||
eval_output_dir=eval_output_dir,
|
||||
|
||||
@ -11,21 +11,24 @@ from concurrent.futures import ProcessPoolExecutor
|
||||
from typing import Dict
|
||||
|
||||
import tasks
|
||||
from config_variables import TASK_INFO_MAP
|
||||
from datasets import load_dataset
|
||||
from datatypes import TaskState
|
||||
from env import SimplifiedEnv
|
||||
from prompts import ToolPromptTemplate
|
||||
from tasks import Task
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.swe_bench.swe_env_box import DockerSSHBox
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
from .config_variables import TASK_INFO_MAP
|
||||
from .datatypes import TaskState
|
||||
from .env import SimplifiedEnv
|
||||
from .prompts import ToolPromptTemplate
|
||||
from .tasks import Task
|
||||
|
||||
|
||||
def cleanup():
|
||||
@ -144,11 +147,11 @@ def process_instance(
|
||||
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the concise RESULT inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'
|
||||
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
fake_user_response_fn = functools.partial(
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent.__class__.__name__],
|
||||
task=instance,
|
||||
task_config={
|
||||
'max_iterations': metadata['max_iterations'],
|
||||
@ -156,8 +159,9 @@ def process_instance(
|
||||
},
|
||||
)
|
||||
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=fake_user_response_fn,
|
||||
sandbox=sandbox,
|
||||
@ -337,6 +341,9 @@ if __name__ == '__main__':
|
||||
skip_workspace_mount = agent_class == 'CodeActAgent'
|
||||
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
@ -344,8 +351,8 @@ if __name__ == '__main__':
|
||||
for instance in mint_dataset:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
|
||||
@ -27,13 +27,15 @@ from concurrent.futures import ProcessPoolExecutor
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
|
||||
|
||||
@ -99,7 +101,7 @@ ID2CONDA = {
|
||||
|
||||
|
||||
def process_instance(
|
||||
instance, agent_class, metadata, eval_output_dir, reset_logger: bool = True
|
||||
agent: Agent, instance, metadata, eval_output_dir, reset_logger: bool = True
|
||||
):
|
||||
old_workspace_mount_path = config.workspace_mount_path
|
||||
old_workspace_base = config.workspace_base
|
||||
@ -177,19 +179,21 @@ def process_instance(
|
||||
)
|
||||
+ 'You should terminate the subprocess after running the task (e.g., call subprocess.Popen(args).wait()).'
|
||||
)
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
|
||||
# Run the agent
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
|
||||
agent_class
|
||||
agent.__class__.__name__
|
||||
),
|
||||
sandbox=sandbox,
|
||||
sid=sid,
|
||||
)
|
||||
)
|
||||
assert state is not None
|
||||
metrics = state.metrics.get() if state.metrics else {}
|
||||
|
||||
# Evaluate the agent's script
|
||||
@ -365,14 +369,17 @@ if __name__ == '__main__':
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
for _, instance in enumerate(new_instances):
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
eval_output_dir,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
|
||||
@ -7,6 +7,7 @@ import pathlib
|
||||
import subprocess
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
import toml
|
||||
@ -16,13 +17,15 @@ from tqdm import tqdm
|
||||
|
||||
import agenthub
|
||||
from evaluation.swe_bench.swe_env_box import SWEBenchSSHBox
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import args, config, get_llm_config_arg
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false') == 'true'
|
||||
|
||||
@ -190,8 +193,8 @@ def get_test_result(instance, sandbox, workspace_dir_name):
|
||||
|
||||
|
||||
def process_instance(
|
||||
instance: dict,
|
||||
agent_class: str,
|
||||
agent: Agent,
|
||||
instance: Any,
|
||||
metadata: dict,
|
||||
skip_workspace_mount: bool,
|
||||
eval_output_dir: str,
|
||||
@ -302,13 +305,16 @@ IMPORTANT TIPS:
|
||||
)
|
||||
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
sandbox=sandbox,
|
||||
sid=instance.instance_id,
|
||||
)
|
||||
@ -369,6 +375,8 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
|
||||
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
||||
# so we don't need to manage file uploading to OpenDevin's repo
|
||||
dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
|
||||
@ -488,6 +496,9 @@ if __name__ == '__main__':
|
||||
skip_workspace_mount = agent_class == 'CodeActAgent'
|
||||
logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
try:
|
||||
with ProcessPoolExecutor(num_workers) as executor:
|
||||
futures = []
|
||||
@ -495,8 +506,8 @@ if __name__ == '__main__':
|
||||
for row_idx, instance in swe_bench_tests.iterrows():
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
instance,
|
||||
agent_class,
|
||||
metadata,
|
||||
skip_workspace_mount,
|
||||
eval_output_dir,
|
||||
|
||||
@ -9,15 +9,18 @@ import time
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
|
||||
from tqdm import tqdm
|
||||
from utils import download_data, download_tools, encode_question, eval_answer, get_data
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import config, get_llm_config_arg, get_parser
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.action import MessageAction
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
from .utils import download_data, download_tools, encode_question, eval_answer, get_data
|
||||
|
||||
|
||||
def cleanup():
|
||||
@ -63,7 +66,7 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
||||
}
|
||||
|
||||
|
||||
def process_instance(task, agent_class, metadata, reset_logger: bool = True):
|
||||
def process_instance(agent: Agent, task, metadata, reset_logger: bool = True):
|
||||
# create process-specific workspace dir
|
||||
# we will create a workspace directory for EACH process
|
||||
# so that different agents don't interfere with each other.
|
||||
@ -100,14 +103,17 @@ def process_instance(task, agent_class, metadata, reset_logger: bool = True):
|
||||
instruction = encode_question(question)
|
||||
instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
|
||||
# NOTE: You can actually set slightly different instruction for different agents
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
|
||||
instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
|
||||
# logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
|
||||
|
||||
# Here's how you can run the agent (similar to the `main` function) and get the final task state
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
instruction,
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
|
||||
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
||||
agent.__class__.__name__
|
||||
],
|
||||
sid=qid,
|
||||
)
|
||||
)
|
||||
@ -304,6 +310,9 @@ if __name__ == '__main__':
|
||||
output_fp.flush()
|
||||
finished_task_ids.add(output['qid'])
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
# This sets the multi-processing
|
||||
num_workers = args.eval_num_workers
|
||||
logger.info(f'Using {num_workers} workers for evaluation.')
|
||||
@ -315,8 +324,8 @@ if __name__ == '__main__':
|
||||
try:
|
||||
future = executor.submit(
|
||||
process_instance,
|
||||
agent,
|
||||
task,
|
||||
agent_class,
|
||||
metadata,
|
||||
reset_logger=bool(num_workers > 1),
|
||||
)
|
||||
|
||||
@ -10,12 +10,14 @@ import browsergym.webarena # noqa F401 register webarena tasks as gym environme
|
||||
import gymnasium as gym
|
||||
from tqdm import tqdm
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import args, config, get_llm_config_arg
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import get_console_handler
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.events.serialization.event import event_to_dict
|
||||
from opendevin.llm.llm import LLM
|
||||
from opendevin.runtime.docker.ssh_box import DockerSSHBox
|
||||
from opendevin.runtime.tools import RuntimeTool
|
||||
|
||||
@ -23,6 +25,7 @@ SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
|
||||
|
||||
|
||||
def process_instance(
|
||||
agent: Agent,
|
||||
env_id: str,
|
||||
metadata: dict,
|
||||
eval_output_dir: str,
|
||||
@ -60,8 +63,9 @@ def process_instance(
|
||||
}
|
||||
}
|
||||
|
||||
state: State = asyncio.run(
|
||||
main(
|
||||
state: State | None = asyncio.run(
|
||||
run_agent_controller(
|
||||
agent,
|
||||
'PLACEHOLDER_GOAL',
|
||||
runtime_tools_config=runtime_tools_config,
|
||||
sandbox=docker_sandbox,
|
||||
@ -108,6 +112,8 @@ def process_instance(
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
|
||||
env_ids = [
|
||||
id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
|
||||
]
|
||||
@ -196,10 +202,14 @@ if __name__ == '__main__':
|
||||
|
||||
# =============================================
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
|
||||
|
||||
docker_sandbox = DockerSSHBox()
|
||||
for env_id in tqdm(env_ids):
|
||||
try:
|
||||
output = process_instance(
|
||||
agent=agent,
|
||||
env_id=env_id,
|
||||
metadata=metadata,
|
||||
eval_output_dir=eval_output_dir,
|
||||
|
||||
@ -59,7 +59,7 @@ class AgentController:
|
||||
agent: Agent,
|
||||
event_stream: EventStream,
|
||||
sid: str = 'default',
|
||||
max_iterations: int = MAX_ITERATIONS,
|
||||
max_iterations: int | None = MAX_ITERATIONS,
|
||||
max_budget_per_task: float | None = MAX_BUDGET_PER_TASK,
|
||||
initial_state: State | None = None,
|
||||
is_delegate: bool = False,
|
||||
@ -86,6 +86,9 @@ class AgentController:
|
||||
)
|
||||
|
||||
# state from the previous session, state from a parent agent, or a fresh state
|
||||
max_iterations = (
|
||||
max_iterations if max_iterations is not None else MAX_ITERATIONS
|
||||
)
|
||||
self.set_initial_state(
|
||||
state=initial_state,
|
||||
max_iterations=max_iterations,
|
||||
|
||||
@ -476,7 +476,7 @@ def get_llm_config_arg(llm_config_arg: str):
|
||||
|
||||
|
||||
# Command line arguments
|
||||
def get_parser():
|
||||
def get_parser() -> argparse.ArgumentParser:
|
||||
"""
|
||||
Get the parser for the command line arguments.
|
||||
"""
|
||||
@ -559,7 +559,7 @@ def get_parser():
|
||||
return parser
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
def parse_arguments() -> argparse.Namespace:
|
||||
"""
|
||||
Parse the command line arguments.
|
||||
"""
|
||||
@ -569,6 +569,3 @@ def parse_arguments():
|
||||
config.workspace_base = os.path.abspath(parsed_args.directory)
|
||||
print(f'Setting workspace base to {config.workspace_base}')
|
||||
return parsed_args
|
||||
|
||||
|
||||
args = parse_arguments()
|
||||
|
||||
@ -7,7 +7,7 @@ import agenthub # noqa F401 (we import this to get the agents registered)
|
||||
from opendevin.controller import AgentController
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.config import args, config, get_llm_config_arg
|
||||
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
|
||||
from opendevin.core.logger import opendevin_logger as logger
|
||||
from opendevin.core.schema import AgentState
|
||||
from opendevin.events import EventSource, EventStream, EventStreamSubscriber
|
||||
@ -30,8 +30,11 @@ def read_task_from_stdin() -> str:
|
||||
return sys.stdin.read()
|
||||
|
||||
|
||||
async def main(
|
||||
task_str: str = '',
|
||||
async def run_agent_controller(
|
||||
agent: Agent,
|
||||
task_str: str,
|
||||
max_iterations: int | None = None,
|
||||
max_budget_per_task: float | None = None,
|
||||
exit_on_message: bool = False,
|
||||
fake_user_response_fn: Callable[[State | None], str] | None = None,
|
||||
sandbox: Sandbox | None = None,
|
||||
@ -48,43 +51,10 @@ async def main(
|
||||
sandbox: An optional sandbox to run the agent in.
|
||||
"""
|
||||
|
||||
# Determine the task source
|
||||
if task_str:
|
||||
task = task_str
|
||||
elif args.file:
|
||||
task = read_task_from_file(args.file)
|
||||
elif args.task:
|
||||
task = args.task
|
||||
elif not sys.stdin.isatty():
|
||||
task = read_task_from_stdin()
|
||||
else:
|
||||
raise ValueError('No task provided. Please specify a task through -t, -f.')
|
||||
|
||||
# only one of model_name or llm_config is required
|
||||
if args.llm_config:
|
||||
# --llm_config
|
||||
# llm_config can contain any of the attributes of LLMConfig
|
||||
llm_config = get_llm_config_arg(args.llm_config)
|
||||
|
||||
if llm_config is None:
|
||||
raise ValueError(f'Invalid toml file, cannot read {args.llm_config}')
|
||||
|
||||
logger.info(
|
||||
f'Running agent {args.agent_cls} (model: {llm_config.model}, llm_config: {args.llm_config}) with task: "{task}"'
|
||||
)
|
||||
|
||||
# create LLM instance with the given config
|
||||
llm = LLM(llm_config=llm_config)
|
||||
else:
|
||||
# --model-name model_name
|
||||
logger.info(
|
||||
f'Running agent {args.agent_cls} (model: {args.model_name}), with task: "{task}"'
|
||||
)
|
||||
llm = LLM(args.model_name)
|
||||
|
||||
# set up the agent
|
||||
AgentCls: Type[Agent] = Agent.get_cls(args.agent_cls)
|
||||
agent = AgentCls(llm=llm)
|
||||
# Logging
|
||||
logger.info(
|
||||
f'Running agent {type(agent)}, model {agent.llm.model_name}, with task: "{task_str}"'
|
||||
)
|
||||
|
||||
# set up the event stream
|
||||
cli_session = 'main' + ('_' + sid if sid else '')
|
||||
@ -102,8 +72,8 @@ async def main(
|
||||
# init controller with this initial state
|
||||
controller = AgentController(
|
||||
agent=agent,
|
||||
max_iterations=args.max_iterations,
|
||||
max_budget_per_task=args.max_budget_per_task,
|
||||
max_iterations=max_iterations,
|
||||
max_budget_per_task=max_budget_per_task,
|
||||
event_stream=event_stream,
|
||||
initial_state=initial_state,
|
||||
)
|
||||
@ -124,8 +94,8 @@ async def main(
|
||||
with open(
|
||||
os.path.join(runtime.browser.eval_dir, 'goal.txt'), 'r', encoding='utf-8'
|
||||
) as f:
|
||||
task = f.read()
|
||||
logger.info(f'Dynamic Eval task: {task}')
|
||||
task_str = f.read()
|
||||
logger.info(f'Dynamic Eval task: {task_str}')
|
||||
|
||||
# start event is a MessageAction with the task, either resumed or new
|
||||
if config.enable_cli_session and initial_state is not None:
|
||||
@ -138,7 +108,7 @@ async def main(
|
||||
)
|
||||
elif initial_state is None:
|
||||
# init with the provided task
|
||||
await event_stream.add_event(MessageAction(content=task), EventSource.USER)
|
||||
await event_stream.add_event(MessageAction(content=task_str), EventSource.USER)
|
||||
|
||||
async def on_event(event: Event):
|
||||
if isinstance(event, AgentStateChangedObservation):
|
||||
@ -174,4 +144,36 @@ async def main(
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.run(main())
|
||||
args = parse_arguments()
|
||||
|
||||
# Determine the task
|
||||
if args.file:
|
||||
task_str = read_task_from_file(args.file)
|
||||
elif args.task:
|
||||
task_str = args.task
|
||||
elif not sys.stdin.isatty():
|
||||
task_str = read_task_from_stdin()
|
||||
else:
|
||||
raise ValueError('No task provided. Please specify a task through -t, -f.')
|
||||
|
||||
# Figure out the LLM config
|
||||
if args.llm_config:
|
||||
llm_config = get_llm_config_arg(args.llm_config)
|
||||
if llm_config is None:
|
||||
raise ValueError(f'Invalid toml file, cannot read {args.llm_config}')
|
||||
llm = LLM(llm_config=llm_config)
|
||||
else:
|
||||
llm = LLM(model=args.model_name)
|
||||
|
||||
# Create the agent
|
||||
AgentCls: Type[Agent] = Agent.get_cls(args.agent_cls)
|
||||
agent = AgentCls(llm=llm)
|
||||
|
||||
asyncio.run(
|
||||
run_agent_controller(
|
||||
agent=agent,
|
||||
task_str=task_str,
|
||||
max_iterations=args.max_iterations,
|
||||
max_budget_per_task=args.max_budget_per_task,
|
||||
)
|
||||
)
|
||||
|
||||
@ -5,13 +5,16 @@ import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
from opendevin.controller.agent import Agent
|
||||
from opendevin.controller.state.state import State
|
||||
from opendevin.core.main import main
|
||||
from opendevin.core.config import parse_arguments
|
||||
from opendevin.core.main import run_agent_controller
|
||||
from opendevin.core.schema import AgentState
|
||||
from opendevin.events.action import (
|
||||
AgentFinishAction,
|
||||
AgentRejectAction,
|
||||
)
|
||||
from opendevin.llm.llm import LLM
|
||||
|
||||
workspace_base = os.getenv('WORKSPACE_BASE')
|
||||
workspace_mount_path = os.getenv('WORKSPACE_MOUNT_PATH')
|
||||
@ -29,7 +32,7 @@ print(f'workspace_mount_path_in_sandbox: {workspace_mount_path_in_sandbox}')
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
(os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
|
||||
and os.getenv('SANDBOX_TYPE').lower() != 'ssh',
|
||||
and os.getenv('SANDBOX_TYPE', '').lower() != 'ssh',
|
||||
reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
@ -38,7 +41,14 @@ print(f'workspace_mount_path_in_sandbox: {workspace_mount_path_in_sandbox}')
|
||||
)
|
||||
def test_write_simple_script():
|
||||
task = "Write a shell script 'hello.sh' that prints 'hello'. Do not ask me for confirmation at any point."
|
||||
final_state: State = asyncio.run(main(task, exit_on_message=True))
|
||||
args = parse_arguments()
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
|
||||
|
||||
final_state: State | None = asyncio.run(
|
||||
run_agent_controller(agent, task, exit_on_message=True)
|
||||
)
|
||||
assert final_state.agent_state == AgentState.STOPPED
|
||||
assert final_state.last_error is None
|
||||
|
||||
@ -61,7 +71,7 @@ def test_write_simple_script():
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
(os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
|
||||
and os.getenv('SANDBOX_TYPE').lower() != 'ssh',
|
||||
and os.getenv('SANDBOX_TYPE', '').lower() != 'ssh',
|
||||
reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
@ -73,6 +83,7 @@ def test_write_simple_script():
|
||||
reason='local sandbox shows environment-dependent absolute path for pwd command',
|
||||
)
|
||||
def test_edits():
|
||||
args = parse_arguments()
|
||||
# Copy workspace artifacts to workspace_base location
|
||||
source_dir = os.path.join(os.path.dirname(__file__), 'workspace/test_edits/')
|
||||
files = os.listdir(source_dir)
|
||||
@ -82,9 +93,14 @@ def test_edits():
|
||||
os.remove(dest_file)
|
||||
shutil.copy(os.path.join(source_dir, file), dest_file)
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
|
||||
|
||||
# Execute the task
|
||||
task = 'Fix typos in bad.txt. Do not ask me for confirmation at any point.'
|
||||
final_state: State = asyncio.run(main(task, exit_on_message=True))
|
||||
final_state: State | None = asyncio.run(
|
||||
run_agent_controller(agent, task, exit_on_message=True)
|
||||
)
|
||||
assert final_state.agent_state == AgentState.STOPPED
|
||||
assert final_state.last_error is None
|
||||
|
||||
@ -108,9 +124,16 @@ Enjoy!
|
||||
reason='Currently, only ssh sandbox supports stateful tasks',
|
||||
)
|
||||
def test_ipython():
|
||||
args = parse_arguments()
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
|
||||
|
||||
# Execute the task
|
||||
task = "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'. Do not ask me for confirmation at any point."
|
||||
final_state: State = asyncio.run(main(task, exit_on_message=True))
|
||||
final_state: State | None = asyncio.run(
|
||||
run_agent_controller(agent, task, exit_on_message=True)
|
||||
)
|
||||
assert final_state.agent_state == AgentState.STOPPED
|
||||
assert final_state.last_error is None
|
||||
|
||||
@ -135,10 +158,15 @@ def test_ipython():
|
||||
reason='FIXME: local sandbox does not capture stderr',
|
||||
)
|
||||
def test_simple_task_rejection():
|
||||
args = parse_arguments()
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
|
||||
|
||||
# Give an impossible task to do: cannot write a commit message because
|
||||
# the workspace is not a git repo
|
||||
task = 'Write a git commit message for the current staging area. Do not ask me for confirmation at any point.'
|
||||
final_state: State = asyncio.run(main(task))
|
||||
final_state: State | None = asyncio.run(run_agent_controller(agent, task))
|
||||
assert final_state.agent_state == AgentState.STOPPED
|
||||
assert final_state.last_error is None
|
||||
assert isinstance(final_state.history[-1][0], AgentRejectAction)
|
||||
@ -153,9 +181,16 @@ def test_simple_task_rejection():
|
||||
reason='Currently, only ssh sandbox supports stateful tasks',
|
||||
)
|
||||
def test_ipython_module():
|
||||
args = parse_arguments()
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
|
||||
|
||||
# Execute the task
|
||||
task = "Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt. Do not ask me for confirmation at any point."
|
||||
final_state: State = asyncio.run(main(task, exit_on_message=True))
|
||||
final_state: State | None = asyncio.run(
|
||||
run_agent_controller(agent, task, exit_on_message=True)
|
||||
)
|
||||
assert final_state.agent_state == AgentState.STOPPED
|
||||
assert final_state.last_error is None
|
||||
|
||||
@ -178,13 +213,20 @@ def test_ipython_module():
|
||||
)
|
||||
@pytest.mark.skipif(
|
||||
(os.getenv('AGENT') == 'CodeActAgent' or os.getenv('AGENT') == 'CodeActSWEAgent')
|
||||
and os.getenv('SANDBOX_TYPE').lower() != 'ssh',
|
||||
and os.getenv('SANDBOX_TYPE', '').lower() != 'ssh',
|
||||
reason='CodeActAgent/CodeActSWEAgent only supports ssh sandbox which is stateful',
|
||||
)
|
||||
def test_browse_internet(http_server):
|
||||
args = parse_arguments()
|
||||
|
||||
# Create the agent
|
||||
agent = Agent.get_cls(args.agent_cls)(llm=LLM(args.model_name))
|
||||
|
||||
# Execute the task
|
||||
task = 'Browse localhost:8000, and tell me the ultimate answer to life. Do not ask me for confirmation at any point.'
|
||||
final_state: State = asyncio.run(main(task, exit_on_message=True))
|
||||
final_state: State | None = asyncio.run(
|
||||
run_agent_controller(agent, task, exit_on_message=True)
|
||||
)
|
||||
assert final_state.agent_state == AgentState.STOPPED
|
||||
assert final_state.last_error is None
|
||||
assert isinstance(final_state.history[-1][0], AgentFinishAction)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user