mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 13:52:43 +08:00
* move multi-line bash tests to test_runtime; support multi-line bash for esruntime; * add testcase to handle PS2 prompt * use bashlex for bash parsing to handle multi-line commands; add testcases for multi-line commands * revert ghcr runtime change * Apply stash * fix run as other user; make test async; * fix test runtime for run as od * add run-as-devin to all the runtime tests * handle the case when username is root * move all run-as-devin tests from sandbox; only tests a few cases on different user to save time; * move over multi-line echo related tests to test_runtime * fix user-specific jupyter by fixing the pypoetry virtualenv folder * make plugin's init async; chdir at initialization of jupyter plugin; move ipy simple testcase to test runtime; * support agentskills import in move tests for jupyter pwd tests; overload `add_env_vars` for EventStreamRuntime to update env var also in Jupyter; make agentskills read env var lazily, in case env var is updated; * fix ServerRuntime agentskills issue * move agnostic image test to test_runtime * merge runtime tests in CI * fix enable auto lint as env var * update warning message * update warning message * test for different container images * change parsing output as debug * add exception handling for update_pwd_decorator * fix unit test indentation * add plugins as default input to Runtime class; remove init_sandbox_plugins; implement add_env_var (include jupyter) in the base class; * fix server runtime auto lint * Revert "add exception handling for update_pwd_decorator" This reverts commit 2b668b1506e02145cb8f87e321aad62febca3d50. * tries to print debugging info for agentskills * explictly setting uid (try fix permission issue) * Revert "tries to print debugging info for agentskills" This reverts commit 8be4c86756f0e3fc62957b327ba2ac4999c419de. 
* set sandbox user id during testing to hopefully fix the permission issue * add browser tools for server runtime * try to debug for old pwd * update debug cmd * only test agnostic runtime when TEST_RUNTIME is Server * fix temp dir mkdir * load TEST_RUNTIME at the beginning * remove ipython tests * only log to file when DEBUG * default logging to project root * temporarily remove log to file * fix LLM logger dir * fix logger * make set pwd an optional aux action * fix prev pwd * fix infinity recursion * simplify * do not import the whole od library to avoid logger folder by jupyter * fix browsing * increase timeout * attempt to fix agentskills yet again * clean up in testcases, since CI maybe run as non-root * add _cause attribute for event.id * remove parent * add a bunch of debugging statement again for CI :( * fix temp_dir fixture * change all temp dir to follow pytest's tmp_path_factory * remove extra bracket * clean up error printing a bit * jupyter chdir to self.config.workspace_mount_path_in_sandbox on initialization * jupyter chdir to self.config.workspace_mount_path_in_sandbox on initialization * add typing for tmp dir fixture * clear the directory before running the test to avoid weird CI temp dir * remove agnostic test case for server runtime * Revert "remove agnostic test case for server runtime" This reverts commit 30e2181c3fc1410e69596c2dcd06be01f1d016b3. 
* disable agnostic tests in CI * fix test * make sure plugin arg is not passed when no plugin is specified; remove redundant on_event function; * move mock prompt * rename runtime * remove extra logging * refactor run_controller's interface; support multiple runtime for integration test; filter out hostname for prompt * uncomment other tests * pass the right runtime to controller * log runtime when start * uncomment tests * improve symbol filters * add intergration test prompts that seemd ok * add integration test workflow * add python3 to default ubuntu image * symlink python and fix permission to jupyter pip * add retry for jupyter execute server * fix jupyter pip install; add post-process for jupyter pip install; simplify init by add agent_skills path to PYTHONPATH; add testcase to tests jupyter pip install; * fix bug * use ubuntu:22.04 for eventstream integration tests * add todo * update testcase * remove redundant code * fix unit test * reduce dependency for runtime * try making llama-index an optional dependency that's not installed by default * remove pip install since it seemd not needed * log ipython execution; await write message since it returns a future * update ipy testcase * do not install llama-index in CI * do not install llama-index in the app docker as well * set sandbox container image in the integration test script * log plugins & env var for runtime * update conftest for sha256 * add git * remove all non-alphanumeric chalracters * add working ipy module tests! 
* default to use host network * remove is_async from browser to make thing a little more reliable; retry loading browser when error; * add sleep to wait a bit for http server * kill http server before regenerate browsing tests * fix browsing * only set sandbox container image if undefined * skip empty config value * update evaluation to use the latest run_controller * revert logger in execute_server to be compatible with server runtime * revert logging level to fix jupyter * set logger level * revert the logging * chmod for workspace to fix permission * support getting timeout from action * update test for server runtime * try to fix file permission * fix test_cmd_run_action_serialization_deserialization test (added timeout) * poetry: pip 24.2, torch 2.2.2 * revert adding pip to pyproject.toml * add build to dependencies in pyproject.toml * forgot poetry lock --no-update * fix a DelegatorAgent prompt_002.log (timeout) * fix a DelegatorAgent prompt_003.log (timeout) * couple more timeout attribs in prompt files * some more prompt files * prompts galore * add clarification comment for timeout * default timeout to config * add assert * update integraton tests for eventstream * update integration tests * fix timeout for action<->dict * remove redundant on_event * default to use instance image * update run_controller interface * add logging for copy * refactor swe_bench for the new design * fix action execution timeout * updatelock * remove build sandbox locally * fix runtime * use plain for-loop for single process * remove extra print * get swebench inference working * print whole `test_result` dict * got swebench patch post-process working * update swe-bench evaluation readme * refactor using shared reset_logger function * move messy swebench prompt to a different file * support the ability to specify whether to keep prompt * support the ability to specify whether to keep prompt * fix dockerfile * fix import and remove unnecessary strip logic * fix action 
serialization * get agentbench running * remove extra ls for agent bench * fix agentbench metric * factor out common documentation for eval * update biocoder doc * remove swe_env_box since it is no longer needed * get biocoder working * add func timeout for bird * fix jupyter pwd with ~ as user name * fix jupyter pwd with ~ as user name * get bird working * get browsing evaluation working * make eda runnable * fix id column * fix eda run_infer * unify eval output using a structured format; make swebench coompatible with that format; update client source code for every swebench run; do not inject testcmd for swebench * standardize existing benchs for the new eval output * set update source code = true * get gaia standardized * fix gaia * gorilla refactored but stuck at language.so to test * refactor and make gpqa work * refactor humanevalfix and get it working * refactor logic reasoning and get it working * refactor browser env so it works with eventstream runtime for eval * add initial version of miniwob refactor * fix browsergym environment * get miniwob working!! 
* allowing injecting additional dependency to OD runtime docker image * allowing injecting additional dependency to OD runtime docker image * support logic reasoning with pre-injected dependency * get mint working * update runtime build * fix mint docker * add test for keep_prompt; add missing await close for some tests * update integration tests for eventstream runtime * fix integration tests for server runtime * refactor ml bench and toolqa * refactor webarena * fix default factory * Update run_infer.py * add APIError to retry * increase timeout for swebench * make sure to hide api key when dump eval output * update the behavior of put source code to put files instead of tarball * add dishash to dependency * sendintr when timeout * fix dockerfile copy * reduce timeout * use dirhash to avoid repeat building for update source * fix runtime_build testcase * add dir_hash to docker build pipeline * revert api error * update poetry lock * add retries for swebench run infer * fix git patch * update poetry lock * adjust config order * fix mount volumns * enforce all eval to use "instance_id" * remove file store from runtime * make file_store public inside eventstream * move the runtime logic inside `main` out * support using async function for process_instance_fn * refactor run_infer with the create_time * fix file store * Update evaluation/toolqa/utils.py Co-authored-by: Graham Neubig <neubig@gmail.com> * fix typo --------- Co-authored-by: tobitege <tobitege@gmx.de> Co-authored-by: super-dainiu <78588128+super-dainiu@users.noreply.github.com> Co-authored-by: Graham Neubig <neubig@gmail.com>
186 lines
6.7 KiB
Python
186 lines
6.7 KiB
Python
import base64
|
|
import pickle
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from typing import Any
|
|
|
|
from opendevin.controller.state.task import RootTask
|
|
from opendevin.core.logger import opendevin_logger as logger
|
|
from opendevin.core.metrics import Metrics
|
|
from opendevin.core.schema import AgentState
|
|
from opendevin.events.action import (
|
|
MessageAction,
|
|
)
|
|
from opendevin.events.action.agent import AgentFinishAction
|
|
from opendevin.memory.history import ShortTermHistory
|
|
from opendevin.storage.files import FileStore
|
|
|
|
|
|
class TrafficControlState(str, Enum):
    """Traffic-control (rate limiting) status tracked for a task.

    Members are plain strings as well as enum members, so they compare equal
    to their string values.
    """

    NORMAL = 'normal'  # default state, no rate limiting
    THROTTLING = 'throttling'  # task paused due to traffic control
    PAUSED = 'paused'  # traffic control is temporarily paused
|
|
|
|
|
|
# Agent states that may be recorded as `resume_state` when a session is
# restored (see `State.restore_from_session`); any other saved agent state
# results in `resume_state` being reset to None.
RESUMABLE_STATES = [
    AgentState.RUNNING,
    AgentState.PAUSED,
    AgentState.AWAITING_USER_INPUT,
    AgentState.FINISHED,
]
|
|
|
|
|
|
@dataclass
class State:
    """
    OpenDevin is a multi-agentic system.

    A `task` is an end-to-end conversation between OpenDevin (the whole system) and the
    user, which might involve one or more inputs from the user. It starts with
    an initial input (typically a task statement) from the user, and ends with either
    an `AgentFinishAction` initiated by the agent, or an error.

    A `subtask` is an end-to-end conversation between an agent and the user, or
    another agent. If a `task` is conducted by a single agent, then it's also a `subtask`
    itself. Otherwise, a `task` consists of multiple `subtasks`, each executed by
    one agent.

    A `State` is a mutable object associated with a `subtask`. It includes several
    mutable and immutable fields, among which `iteration` is shared across
    subtasks.

    For example, considering a task from the user: `tell me how many GitHub stars
    OpenDevin repo has`. Let's assume the default agent is CodeActAgent.

    -- TASK STARTS (SUBTASK 0 STARTS) --

    DELEGATE_LEVEL 0, ITERATION 0, LOCAL_ITERATION 0
    CodeActAgent: I should request help from BrowsingAgent

    -- DELEGATE STARTS (SUBTASK 1 STARTS) --

    DELEGATE_LEVEL 1, ITERATION 1, LOCAL_ITERATION 0
    BrowsingAgent: Let me find the answer on GitHub

    DELEGATE_LEVEL 1, ITERATION 2, LOCAL_ITERATION 1
    BrowsingAgent: I found the answer, let me convey the result and finish

    -- DELEGATE ENDS (SUBTASK 1 ENDS) --

    DELEGATE_LEVEL 0, ITERATION 3, LOCAL_ITERATION 1
    CodeActAgent: I got the answer from BrowsingAgent, let me convey the result
    and finish

    -- TASK ENDS (SUBTASK 0 ENDS) --

    Note how ITERATION counter is shared across agents, while LOCAL_ITERATION
    is local to each subtask.
    """

    root_task: RootTask = field(default_factory=RootTask)
    # global iteration for the current task
    iteration: int = 0
    # local iteration for the current subtask
    local_iteration: int = 0
    # max number of iterations for the current task
    max_iterations: int = 100
    confirmation_mode: bool = False
    # NOTE: the history object is excluded from pickling (see __getstate__);
    # only its start_id/end_id are persisted and restored.
    history: ShortTermHistory = field(default_factory=ShortTermHistory)
    inputs: dict = field(default_factory=dict)
    outputs: dict = field(default_factory=dict)
    last_error: str | None = None
    agent_state: AgentState = AgentState.LOADING
    # set by restore_from_session to the saved agent_state when it is resumable
    resume_state: AgentState | None = None
    traffic_control_state: TrafficControlState = TrafficControlState.NORMAL
    # global metrics for the current task
    metrics: Metrics = field(default_factory=Metrics)
    # local metrics for the current subtask
    local_metrics: Metrics = field(default_factory=Metrics)
    # root agent has level 0, and every delegate increases the level by one
    delegate_level: int = 0
    # start_id and end_id track the range of events in history
    start_id: int = -1
    end_id: int = -1
    almost_stuck: int = 0
    # NOTE: This will never be used by the controller, but it can be used by different
    # evaluation tasks to store extra data needed to track the progress/state of the task.
    extra_data: dict[str, Any] = field(default_factory=dict)

    def save_to_session(self, sid: str, file_store: FileStore):
        """Pickle this state and write it, base64-encoded, to the file store.

        The payload is written to `sessions/{sid}/agent_state.pkl`.

        Args:
            sid: Session id used to build the storage path.
            file_store: Store that receives the encoded payload.

        Raises:
            Exception: Any error raised by `file_store.write` is logged and
                re-raised unchanged.
        """
        pickled = pickle.dumps(self)
        logger.debug(f'Saving state to session {sid}:{self.agent_state}')
        encoded = base64.b64encode(pickled).decode('utf-8')
        try:
            file_store.write(f'sessions/{sid}/agent_state.pkl', encoded)
        except Exception as e:
            logger.error(f'Failed to save state to session: {e}')
            raise

    @staticmethod
    def restore_from_session(sid: str, file_store: FileStore) -> 'State':
        """Load the state previously saved for session `sid`.

        After loading, `resume_state` is set to the saved `agent_state` when
        that state is one of RESUMABLE_STATES (otherwise None), `last_error`
        is cleared, and `agent_state` is reset to `AgentState.LOADING`.

        Args:
            sid: Session id used to build the storage path.
            file_store: Store to read the encoded payload from.

        Returns:
            The restored `State`.

        Raises:
            Exception: Any error while reading, decoding or unpickling is
                logged and re-raised unchanged.
        """
        try:
            encoded = file_store.read(f'sessions/{sid}/agent_state.pkl')
            pickled = base64.b64decode(encoded)
            # SECURITY NOTE(review): pickle.loads can execute arbitrary code.
            # This is only safe as long as the file store content is trusted.
            state = pickle.loads(pickled)
        except Exception as e:
            logger.error(f'Failed to restore state from session: {e}')
            raise

        # remember the state to resume into, if the saved one is resumable
        if state.agent_state in RESUMABLE_STATES:
            state.resume_state = state.agent_state
        else:
            state.resume_state = None

        # don't carry last_error anymore after restore
        state.last_error = None

        # first state after restore
        state.agent_state = AgentState.LOADING
        return state

    def __getstate__(self):
        """Return the picklable state: all attributes except `history`.

        The history's `start_id` and `end_id` are copied into the returned
        dict so they can be restored by `__setstate__`.
        """
        state = self.__dict__.copy()

        # save the relevant data from recent history
        # so that we can restore it when the state is restored
        if 'history' in state:
            state['start_id'] = state['history'].start_id
            state['end_id'] = state['history'].end_id

        # don't save history object itself
        state.pop('history', None)
        return state

    def __setstate__(self, state):
        """Restore attributes from `state` and rebuild the history object."""
        self.__dict__.update(state)

        # recreate the history object
        if not hasattr(self, 'history'):
            self.history = ShortTermHistory()

        # restore the relevant data in history from the state
        # (start_id/end_id are also kept on the state itself)
        self.history.start_id = self.start_id
        self.history.end_id = self.end_id

    def get_current_user_intent(self) -> tuple[str | None, list[str] | None]:
        """Return the current user intent as `(message, image_urls)`.

        History is walked from newest to oldest. Every user `MessageAction`
        seen overwrites the candidate, and the walk stops at the first
        `AgentFinishAction` reached after a candidate has been found. The
        result is therefore the earliest user message that follows the most
        recent finished exchange, or the first user message overall (the task)
        if nothing was finished yet.

        Returns:
            A 2-tuple of the message content (None if no user message exists)
            and the image URLs attached to that message.
        """
        last_user_message = None
        last_user_message_image_urls: list[str] | None = []
        for event in self.history.get_events(reverse=True):
            if isinstance(event, MessageAction) and event.source == 'user':
                last_user_message = event.content
                last_user_message_image_urls = event.images_urls
            elif isinstance(event, AgentFinishAction):
                if last_user_message is not None:
                    # Return the same (message, images) shape as the
                    # fall-through path so callers can always unpack it.
                    return last_user_message, last_user_message_image_urls

        return last_user_message, last_user_message_image_urls
|