Support microagents in CLI and Headless (#5971)

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2025-12-26 05:48:36 +08:00 · 2025-01-02 16:52:45 -05:00 · 2025-01-02 16:52:45 -05:00 · 8983d719bd
commit 8983d719bd
parent 9dd5463e06
3 changed files with 132 additions and 129 deletions
--- a/openhands/core/cli.py
+++ b/openhands/core/cli.py
@ -1,15 +1,12 @@
 import asyncio
 import logging
 import sys
-from typing import Type
 from uuid import uuid4

 from termcolor import colored

 import openhands.agenthub  # noqa F401 (we import this to get the agents registered)
 from openhands import __version__
-from openhands.controller import AgentController
-from openhands.controller.agent import Agent
 from openhands.core.config import (
    AppConfig,
    get_parser,
@ -18,7 +15,8 @@ from openhands.core.config import (
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.loop import run_agent_until_done
 from openhands.core.schema import AgentState
-from openhands.events import EventSource, EventStream, EventStreamSubscriber
+from openhands.core.setup import create_agent, create_controller, create_runtime
+from openhands.events import EventSource, EventStreamSubscriber
 from openhands.events.action import (
    Action,
    ActionConfirmationStatus,
@ -34,11 +32,6 @@ from openhands.events.observation import (
    FileEditObservation,
    NullObservation,
 )
-from openhands.llm.llm import LLM
-from openhands.runtime import get_runtime_cls
-from openhands.runtime.base import Runtime
-from openhands.security import SecurityAnalyzer, options
-from openhands.storage import get_file_store


 def display_message(message: str):
@ -114,39 +107,12 @@ async def main(loop):
    config = load_app_config(config_file=args.config_file)
    sid = str(uuid4())

-    agent_cls: Type[Agent] = Agent.get_cls(config.default_agent)
-    agent_config = config.get_agent_config(config.default_agent)
-    llm_config = config.get_llm_config_from_agent(config.default_agent)
-    agent = agent_cls(
-        llm=LLM(config=llm_config),
-        config=agent_config,
-    )
+    runtime = create_runtime(config, sid=sid, headless_mode=True)
+    await runtime.connect()
+    agent = create_agent(runtime, config)
+    controller, _ = create_controller(agent, runtime, config)

-    file_store = get_file_store(config.file_store, config.file_store_path)
-    event_stream = EventStream(sid, file_store)
-
-    runtime_cls = get_runtime_cls(config.runtime)
-    runtime: Runtime = runtime_cls(  # noqa: F841
-        config=config,
-        event_stream=event_stream,
-        sid=sid,
-        plugins=agent_cls.sandbox_plugins,
-        headless_mode=True,
-    )
-
-    if config.security.security_analyzer:
-        options.SecurityAnalyzers.get(
-            config.security.security_analyzer, SecurityAnalyzer
-        )(event_stream)
-
-    controller = AgentController(
-        agent=agent,
-        max_iterations=config.max_iterations,
-        max_budget_per_task=config.max_budget_per_task,
-        agent_to_llm_config=config.get_agent_to_llm_config_map(),
-        event_stream=event_stream,
-        confirmation_mode=config.security.confirmation_mode,
-    )
+    event_stream = runtime.event_stream

    async def prompt_for_next_task():
        # Run input() in a thread pool to avoid blocking the event loop
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@ -1,13 +1,10 @@
 import asyncio
-import hashlib
 import json
 import os
 import sys
-import uuid
-from typing import Callable, Protocol, Type
+from typing import Callable, Protocol

 import openhands.agenthub  # noqa F401 (we import this to get the agents registered)
-from openhands.controller import AgentController
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import (
@ -19,16 +16,19 @@ from openhands.core.config import (
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.loop import run_agent_until_done
 from openhands.core.schema import AgentState
-from openhands.events import EventSource, EventStream, EventStreamSubscriber
+from openhands.core.setup import (
+    create_agent,
+    create_controller,
+    create_runtime,
+    generate_sid,
+)
+from openhands.events import EventSource, EventStreamSubscriber
 from openhands.events.action import MessageAction
 from openhands.events.action.action import Action
 from openhands.events.event import Event
 from openhands.events.observation import AgentStateChangedObservation
 from openhands.events.serialization.event import event_to_trajectory
-from openhands.llm.llm import LLM
-from openhands.runtime import get_runtime_cls
 from openhands.runtime.base import Runtime
-from openhands.storage import get_file_store


 class FakeUserResponseFunc(Protocol):
@ -51,45 +51,6 @@ def read_task_from_stdin() -> str:
    return sys.stdin.read()


-def create_runtime(
-    config: AppConfig,
-    sid: str | None = None,
-    headless_mode: bool = True,
-) -> Runtime:
-    """Create a runtime for the agent to run on.
-
-    config: The app config.
-    sid: (optional) The session id. IMPORTANT: please don't set this unless you know what you're doing.
-        Set it to incompatible value will cause unexpected behavior on RemoteRuntime.
-    headless_mode: Whether the agent is run in headless mode. `create_runtime` is typically called within evaluation scripts,
-        where we don't want to have the VSCode UI open, so it defaults to True.
-    """
-    # if sid is provided on the command line, use it as the name of the event stream
-    # otherwise generate it on the basis of the configured jwt_secret
-    # we can do this better, this is just so that the sid is retrieved when we want to restore the session
-    session_id = sid or generate_sid(config)
-
-    # set up the event stream
-    file_store = get_file_store(config.file_store, config.file_store_path)
-    event_stream = EventStream(session_id, file_store)
-
-    # agent class
-    agent_cls = openhands.agenthub.Agent.get_cls(config.default_agent)
-
-    # runtime and tools
-    runtime_cls = get_runtime_cls(config.runtime)
-    logger.debug(f'Initializing runtime: {runtime_cls.__name__}')
-    runtime: Runtime = runtime_cls(
-        config=config,
-        event_stream=event_stream,
-        sid=session_id,
-        plugins=agent_cls.sandbox_plugins,
-        headless_mode=headless_mode,
-    )
-
-    return runtime
-
-
 async def run_controller(
    config: AppConfig,
    initial_user_action: Action,
@ -115,17 +76,6 @@ async def run_controller(
            (could be None) and returns a fake user response.
        headless_mode: Whether the agent is run in headless mode.
    """
-    # Create the agent
-    if agent is None:
-        agent_cls: Type[Agent] = Agent.get_cls(config.default_agent)
-        agent_config = config.get_agent_config(config.default_agent)
-        llm_config = config.get_llm_config_from_agent(config.default_agent)
-        agent = agent_cls(
-            llm=LLM(config=llm_config),
-            config=agent_config,
-        )
-
-    # make sure the session id is set
    sid = sid or generate_sid(config)

    if runtime is None:
@ -134,28 +84,10 @@ async def run_controller(

    event_stream = runtime.event_stream

-    # restore cli session if available
-    initial_state = None
-    try:
-        logger.debug(
-            f'Trying to restore agent state from cli session {event_stream.sid} if available'
-        )
-        initial_state = State.restore_from_session(
-            event_stream.sid, event_stream.file_store
-        )
-    except Exception as e:
-        logger.debug(f'Cannot restore agent state: {e}')
+    if agent is None:
+        agent = create_agent(runtime, config)

-    # init controller with this initial state
-    controller = AgentController(
-        agent=agent,
-        max_iterations=config.max_iterations,
-        max_budget_per_task=config.max_budget_per_task,
-        agent_to_llm_config=config.get_agent_to_llm_config_map(),
-        event_stream=event_stream,
-        initial_state=initial_state,
-        headless_mode=headless_mode,
-    )
+    controller, initial_state = create_controller(agent, runtime, config)

    assert isinstance(
        initial_user_action, Action
@ -234,15 +166,6 @@ async def run_controller(
    return state


-def generate_sid(config: AppConfig, session_name: str | None = None) -> str:
-    """Generate a session id based on the session name and the jwt secret."""
-    session_name = session_name or str(uuid.uuid4())
-    jwt_secret = config.jwt_secret
-
-    hash_str = hashlib.sha256(f'{session_name}{jwt_secret}'.encode('utf-8')).hexdigest()
-    return f'{session_name}-{hash_str[:16]}'
-
-
 def auto_continue_response(
    state: State,
    encapsulate_solution: bool = False,
--- a/openhands/core/setup.py
+++ b/openhands/core/setup.py
@ -0,0 +1,114 @@
+import hashlib
+import uuid
+from typing import Tuple, Type
+
+import openhands.agenthub  # noqa F401 (we import this to get the agents registered)
+from openhands.controller import AgentController
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.events import EventStream
+from openhands.llm.llm import LLM
+from openhands.runtime import get_runtime_cls
+from openhands.runtime.base import Runtime
+from openhands.security import SecurityAnalyzer, options
+from openhands.storage import get_file_store
+
+
+def create_runtime(
+    config: AppConfig,
+    sid: str | None = None,
+    headless_mode: bool = True,
+) -> Runtime:
+    """Create a runtime for the agent to run on.
+
+    Builds the file store and event stream for the session, then instantiates
+    the runtime class selected by `config.runtime`. This function does not call
+    `connect()` on the returned runtime.
+
+    config: The app config.
+    sid: (optional) The session id. IMPORTANT: please don't set this unless you know what you're doing.
+        Setting it to an incompatible value will cause unexpected behavior on RemoteRuntime.
+    headless_mode: Whether the agent is run in headless mode. `create_runtime` is typically called within evaluation scripts,
+        where we don't want to have the VSCode UI open, so it defaults to True.
+    """
+    # Use the caller-provided sid as the name of the event stream when given;
+    # otherwise generate one from a random session name and the configured jwt_secret.
+    # This could be done better; it exists so the sid can be used to look the session up again on restore.
+    session_id = sid or generate_sid(config)
+
+    # set up the event stream, backed by the configured file store
+    file_store = get_file_store(config.file_store, config.file_store_path)
+    event_stream = EventStream(session_id, file_store)
+
+    # agent class: only needed here for its `sandbox_plugins`
+    agent_cls = openhands.agenthub.Agent.get_cls(config.default_agent)
+
+    # runtime and tools
+    runtime_cls = get_runtime_cls(config.runtime)
+    logger.debug(f'Initializing runtime: {runtime_cls.__name__}')
+    runtime: Runtime = runtime_cls(
+        config=config,
+        event_stream=event_stream,
+        sid=session_id,
+        plugins=agent_cls.sandbox_plugins,
+        headless_mode=headless_mode,
+    )
+
+    return runtime
+
+
+def create_agent(runtime: Runtime, config: AppConfig) -> Agent:
+    """Instantiate the configured default agent and hook it up to the runtime.
+
+    The agent class, agent config and LLM config are all looked up by
+    `config.default_agent`, and the agent is built with a new `LLM` instance.
+
+    runtime: The runtime that supplies the microagents and the event stream.
+    config: The app config.
+
+    Returns the newly created agent.
+    """
+    agent_cls: Type[Agent] = Agent.get_cls(config.default_agent)
+    agent_config = config.get_agent_config(config.default_agent)
+    llm_config = config.get_llm_config_from_agent(config.default_agent)
+    agent = agent_cls(
+        llm=LLM(config=llm_config),
+        config=agent_config,
+    )
+    # Only agents that have a prompt manager get microagents loaded.
+    # NOTE(review): the meaning of the `None` argument is not visible here;
+    # presumably "no selected repository" - confirm against Runtime.
+    if agent.prompt_manager:
+        microagents = runtime.get_custom_microagents(None)
+        agent.prompt_manager.load_microagent_files(microagents)
+
+    # Instantiate the configured security analyzer on the runtime's event stream.
+    # The instance is not stored; presumably it attaches itself to the stream
+    # in its constructor - TODO confirm. `SecurityAnalyzer` is passed as the
+    # default to `.get()` for names missing from `options.SecurityAnalyzers`.
+    if config.security.security_analyzer:
+        options.SecurityAnalyzers.get(
+            config.security.security_analyzer, SecurityAnalyzer
+        )(runtime.event_stream)
+
+    return agent
+
+
+def create_controller(
+    agent: Agent, runtime: Runtime, config: AppConfig, headless_mode: bool = True
+) -> Tuple[AgentController, State | None]:
+    """Create the agent controller, restoring a previous session state if possible.
+
+    agent: The agent the controller will drive.
+    runtime: The runtime whose event stream the controller is attached to.
+    config: The app config; supplies the iteration and budget limits, the
+        agent-to-LLM config map and the confirmation mode.
+    headless_mode: Passed through to the controller. It is not read from
+        `config`, so callers with their own setting must pass it explicitly;
+        otherwise the default of True is used.
+
+    Returns a `(controller, initial_state)` tuple, where `initial_state` is the
+    restored state or None when nothing could be restored.
+    """
+    event_stream = runtime.event_stream
+    initial_state = None
+    # Best-effort restore: any failure is logged at debug level and the
+    # controller is created with `initial_state=None`.
+    try:
+        logger.debug(
+            f'Trying to restore agent state from session {event_stream.sid} if available'
+        )
+        initial_state = State.restore_from_session(
+            event_stream.sid, event_stream.file_store
+        )
+    except Exception as e:
+        logger.debug(f'Cannot restore agent state: {e}')
+
+    # init controller with this initial state
+    controller = AgentController(
+        agent=agent,
+        max_iterations=config.max_iterations,
+        max_budget_per_task=config.max_budget_per_task,
+        agent_to_llm_config=config.get_agent_to_llm_config_map(),
+        event_stream=event_stream,
+        initial_state=initial_state,
+        headless_mode=headless_mode,
+        confirmation_mode=config.security.confirmation_mode,
+    )
+    return (controller, initial_state)
+
+
+def generate_sid(config: AppConfig, session_name: str | None = None) -> str:
+    """Generate a session id based on the session name and the jwt secret.
+
+    config: The app config; only `config.jwt_secret` is read.
+    session_name: (optional) Name to base the id on. When None or empty, a
+        random UUID4 string is used instead, so the result differs per call.
+
+    Returns `'<session_name>-<h>'`, where `<h>` is the first 16 hex characters
+    of the SHA-256 digest of `session_name` concatenated with the jwt secret.
+    """
+    session_name = session_name or str(uuid.uuid4())
+    jwt_secret = config.jwt_secret
+
+    # Hash name + secret together; only a 16-character prefix of the digest is kept.
+    hash_str = hashlib.sha256(f'{session_name}{jwt_secret}'.encode('utf-8')).hexdigest()
+    return f'{session_name}-{hash_str[:16]}'