[agent] Add "thinking" tool only (#6977)

2026-03-22 13:47:19 +08:00 · 2025-02-27 16:47:39 -05:00
parent 6e3e9e397e
commit 42332294a1
22 changed files with 177 additions and 4 deletions
--- a/frontend/src/i18n/translation.json
+++ b/frontend/src/i18n/translation.json
@@ -4178,6 +4178,21 @@
        "es": "Navegando en la web",
        "tr": "Web'de geziniyor"
    },
+    "ACTION_MESSAGE$THINK": {
+        "en": "Thinking",
+        "zh-CN": "思考",
+        "zh-TW": "思考",
+        "ko-KR": "생각",
+        "ja": "考える",
+        "no": "Tenker",
+        "ar": "يفكر",
+        "de": "Denkt",
+        "fr": "Réflexion en cours",
+        "it": "Pensando",
+        "pt": "Pensando",
+        "es": "Pensando",
+        "tr": "Düşünüyor"
+    },
    "OBSERVATION_MESSAGE$RUN": {
        "en": "Ran a bash command",
        "zh-CN": "运行",
--- a/frontend/src/services/observations.ts
+++ b/frontend/src/services/observations.ts
@@ -48,6 +48,8 @@ export function handleObservationMessage(message: ObservationMessage) {
      break;
    case ObservationType.READ:
    case ObservationType.EDIT:
+    case ObservationType.THINK:
+    case ObservationType.NULL:
      break; // We don't display the default message for these observations
    default:
      store.dispatch(addAssistantMessage(message.message));
--- a/frontend/src/state/chat-slice.ts
+++ b/frontend/src/state/chat-slice.ts
@@ -115,6 +115,8 @@ export const chatSlice = createSlice({
        ) {
          text += `\n\n${getRiskText(action.payload.args.security_risk as unknown as ActionSecurityRisk)}`;
        }
+      } else if (actionID === "think") {
+        text = action.payload.args.thought;
      }
      const message: Message = {
        type: "action",
--- a/frontend/src/types/action-type.tsx
+++ b/frontend/src/types/action-type.tsx
@@ -26,6 +26,9 @@ enum ActionType {
  // Delegate a (sub)task to another agent.
  DELEGATE = "delegate",

+  // Logs a thought.
+  THINK = "think",
+
  // If you're absolutely certain that you've completed your task and have tested your work,
  // use the finish action to stop working.
  FINISH = "finish",
--- a/frontend/src/types/core/actions.ts
+++ b/frontend/src/types/core/actions.ts
@@ -41,6 +41,13 @@ export interface IPythonAction extends OpenHandsActionEvent<"run_ipython"> {
  };
 }

+export interface ThinkAction extends OpenHandsActionEvent<"think"> {
+  source: "agent";
+  args: {
+    thought: string;
+  };
+}
+
 export interface FinishAction extends OpenHandsActionEvent<"finish"> {
  source: "agent";
  args: {
@@ -129,6 +136,7 @@ export type OpenHandsAction =
  | AssistantMessageAction
  | CommandAction
  | IPythonAction
+  | ThinkAction
  | FinishAction
  | DelegateAction
  | BrowseAction
--- a/frontend/src/types/core/base.ts
+++ b/frontend/src/types/core/base.ts
@@ -10,6 +10,7 @@ export type OpenHandsEventType =
  | "browse"
  | "browse_interactive"
  | "reject"
+  | "think"
  | "finish"
  | "error";

--- a/frontend/src/types/core/observations.ts
+++ b/frontend/src/types/core/observations.ts
@@ -80,8 +80,17 @@ export interface ErrorObservation extends OpenHandsObservationEvent<"error"> {
  };
 }

+export interface AgentThinkObservation
+  extends OpenHandsObservationEvent<"think"> {
+  source: "agent";
+  extras: {
+    thought: string;
+  };
+}
+
 export type OpenHandsObservation =
  | AgentStateChangeObservation
+  | AgentThinkObservation
  | CommandObservation
  | IPythonObservation
  | DelegateObservation
--- a/frontend/src/types/observation-type.tsx
+++ b/frontend/src/types/observation-type.tsx
@@ -22,6 +22,12 @@ enum ObservationType {

  // Delegate result
  DELEGATE = "delegate",
+
+  // A response to the agent's thought (usually a static message)
+  THINK = "think",
+
+  // A no-op observation
+  NULL = "null",
 }

 export default ObservationType;
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -17,6 +17,7 @@ from openhands.agenthub.codeact_agent.tools import (
    IPythonTool,
    LLMBasedFileEditTool,
    StrReplaceEditorTool,
+    ThinkTool,
    WebReadTool,
 )
 from openhands.core.exceptions import (
@@ -27,6 +28,7 @@ from openhands.events.action import (
    Action,
    AgentDelegateAction,
    AgentFinishAction,
+    AgentThinkAction,
    BrowseInteractiveAction,
    BrowseURLAction,
    CmdRunAction,
@@ -42,7 +44,9 @@ from openhands.events.tool import ToolCallMetadata
 def combine_thought(action: Action, thought: str) -> Action:
    if not hasattr(action, 'thought'):
        return action
-    if thought:
+    if thought and action.thought:
+        action.thought = f'{thought}\n{action.thought}'
+    elif thought:
        action.thought = thought
    return action

@@ -71,6 +75,11 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                raise RuntimeError(
                    f'Failed to parse tool call arguments: {tool_call.function.arguments}'
                ) from e
+
+            # ================================================
+            # CmdRunTool (Bash)
+            # ================================================
+
            if tool_call.function.name == CmdRunTool['function']['name']:
                if 'command' not in arguments:
                    raise FunctionCallValidationError(
@@ -79,6 +88,10 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                # convert is_input to boolean
                is_input = arguments.get('is_input', 'false') == 'true'
                action = CmdRunAction(command=arguments['command'], is_input=is_input)
+
+            # ================================================
+            # IPythonTool (Jupyter)
+            # ================================================
            elif tool_call.function.name == IPythonTool['function']['name']:
                if 'code' not in arguments:
                    raise FunctionCallValidationError(
@@ -90,8 +103,16 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                    agent='BrowsingAgent',
                    inputs=arguments,
                )
+
+            # ================================================
+            # AgentFinishAction
+            # ================================================
            elif tool_call.function.name == FinishTool['function']['name']:
                action = AgentFinishAction()
+
+            # ================================================
+            # LLMBasedFileEditTool (LLM-based file editor, deprecated)
+            # ================================================
            elif tool_call.function.name == LLMBasedFileEditTool['function']['name']:
                if 'path' not in arguments:
                    raise FunctionCallValidationError(
@@ -138,12 +159,25 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                        impl_source=FileEditSource.OH_ACI,
                        **other_kwargs,
                    )
+            # ================================================
+            # AgentThinkAction
+            # ================================================
+            elif tool_call.function.name == ThinkTool['function']['name']:
+                action = AgentThinkAction(thought=arguments.get('thought', ''))
+
+            # ================================================
+            # BrowserTool
+            # ================================================
            elif tool_call.function.name == BrowserTool['function']['name']:
                if 'code' not in arguments:
                    raise FunctionCallValidationError(
                        f'Missing required argument "code" in tool call {tool_call.function.name}'
                    )
                action = BrowseInteractiveAction(browser_actions=arguments['code'])
+
+            # ================================================
+            # WebReadTool (simplified browsing)
+            # ================================================
            elif tool_call.function.name == WebReadTool['function']['name']:
                if 'url' not in arguments:
                    raise FunctionCallValidationError(
@@ -183,7 +217,7 @@ def get_tools(
    codeact_enable_llm_editor: bool = False,
    codeact_enable_jupyter: bool = False,
 ) -> list[ChatCompletionToolParam]:
-    tools = [CmdRunTool, FinishTool]
+    tools = [CmdRunTool, ThinkTool, FinishTool]
    if codeact_enable_browsing:
        tools.append(WebReadTool)
        tools.append(BrowserTool)
--- a/openhands/agenthub/codeact_agent/tools/__init__.py
+++ b/openhands/agenthub/codeact_agent/tools/__init__.py
@@ -4,6 +4,7 @@ from .finish import FinishTool
 from .ipython import IPythonTool
 from .llm_based_edit import LLMBasedFileEditTool
 from .str_replace_editor import StrReplaceEditorTool
+from .think import ThinkTool
 from .web_read import WebReadTool

 __all__ = [
@@ -14,4 +15,5 @@ __all__ = [
    'LLMBasedFileEditTool',
    'StrReplaceEditorTool',
    'WebReadTool',
+    'ThinkTool',
 ]
--- a/openhands/agenthub/codeact_agent/tools/think.py
+++ b/openhands/agenthub/codeact_agent/tools/think.py
@@ -0,0 +1,27 @@
+from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
+
+_THINK_DESCRIPTION = """Use the tool to think about something. It will not obtain new information or make any changes to the repository, but just log the thought. Use it when complex reasoning or brainstorming is needed.
+
+Common use cases:
+1. When exploring a repository and discovering the source of a bug, call this tool to brainstorm several unique ways of fixing the bug, and assess which change(s) are likely to be simplest and most effective.
+2. After receiving test results, use this tool to brainstorm ways to fix failing tests.
+3. When planning a complex refactoring, use this tool to outline different approaches and their tradeoffs.
+4. When designing a new feature, use this tool to think through architecture decisions and implementation details.
+5. When debugging a complex issue, use this tool to organize your thoughts and hypotheses.
+
+The tool simply logs your thought process for better transparency and does not execute any code or make changes."""
+
+ThinkTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='think',
+        description=_THINK_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'thought': {'type': 'string', 'description': 'The thought to log.'},
+            },
+            'required': ['thought'],
+        },
+    ),
+)
--- a/openhands/core/message_utils.py
+++ b/openhands/core/message_utils.py
@@ -7,6 +7,7 @@ from openhands.events.action import (
    Action,
    AgentDelegateAction,
    AgentFinishAction,
+    AgentThinkAction,
    BrowseInteractiveAction,
    BrowseURLAction,
    CmdRunAction,
@@ -19,6 +20,7 @@ from openhands.events.event import Event
 from openhands.events.observation import (
    AgentCondensationObservation,
    AgentDelegateObservation,
+    AgentThinkObservation,
    BrowserOutputObservation,
    CmdOutputObservation,
    FileEditObservation,
@@ -151,6 +153,7 @@ def get_action_message(
            FileReadAction,
            BrowseInteractiveAction,
            BrowseURLAction,
+            AgentThinkAction,
        ),
    ) or (isinstance(action, CmdRunAction) and action.source == 'agent'):
        tool_metadata = action.tool_call_metadata
@@ -323,6 +326,9 @@ def get_observation_message(
            max_message_chars,
        )
        message = Message(role='user', content=[TextContent(text=text)])
+    elif isinstance(obs, AgentThinkObservation):
+        text = truncate_content(obs.content, max_message_chars)
+        message = Message(role='user', content=[TextContent(text=text)])
    elif isinstance(obs, ErrorObservation):
        text = truncate_content(obs.content, max_message_chars)
        text += '\n[Error occurred in processing last action]'
--- a/openhands/core/schema/action.py
+++ b/openhands/core/schema/action.py
@@ -44,6 +44,10 @@ class ActionTypeSchema(BaseModel):
    """Delegates a task to another agent.
    """

+    THINK: str = Field(default='think')
+    """Logs a thought.
+    """
+
    FINISH: str = Field(default='finish')
    """If you're absolutely certain that you've completed your task and have tested your work,
    use the finish action to stop working.
--- a/openhands/core/schema/observation.py
+++ b/openhands/core/schema/observation.py
@@ -40,6 +40,8 @@ class ObservationTypeSchema(BaseModel):

    NULL: str = Field(default='null')

+    THINK: str = Field(default='think')
+
    AGENT_STATE_CHANGED: str = Field(default='agent_state_changed')

    USER_REJECTED: str = Field(default='user_rejected')
--- a/openhands/events/action/__init__.py
+++ b/openhands/events/action/__init__.py
@@ -4,6 +4,7 @@ from openhands.events.action.agent import (
    AgentFinishAction,
    AgentRejectAction,
    AgentSummarizeAction,
+    AgentThinkAction,
    ChangeAgentStateAction,
 )
 from openhands.events.action.browse import BrowseInteractiveAction, BrowseURLAction
@@ -33,4 +34,5 @@ __all__ = [
    'IPythonRunCellAction',
    'MessageAction',
    'ActionConfirmationStatus',
+    'AgentThinkAction',
 ]
--- a/openhands/events/action/agent.py
+++ b/openhands/events/action/agent.py
@@ -54,6 +54,23 @@ class AgentFinishAction(Action):
        return "All done! What's next on the agenda?"


+@dataclass
+class AgentThinkAction(Action):
+    """An action where the agent logs a thought.
+
+    Attributes:
+        thought (str): The thought the agent wants to log.
+        action (str): The action type, namely ActionType.THINK.
+    """
+
+    thought: str = ''
+    action: str = ActionType.THINK
+
+    @property
+    def message(self) -> str:
+        return f'I am thinking...: {self.thought}'
+
+
@dataclass
 class AgentRejectAction(Action):
    outputs: dict = field(default_factory=dict)
--- a/openhands/events/observation/__init__.py
+++ b/openhands/events/observation/__init__.py
@@ -1,6 +1,7 @@
 from openhands.events.observation.agent import (
    AgentCondensationObservation,
    AgentStateChangedObservation,
+    AgentThinkObservation,
 )
 from openhands.events.observation.browse import BrowserOutputObservation
 from openhands.events.observation.commands import (
@@ -9,7 +10,9 @@ from openhands.events.observation.commands import (
    IPythonRunCellObservation,
 )
 from openhands.events.observation.delegate import AgentDelegateObservation
-from openhands.events.observation.empty import NullObservation
+from openhands.events.observation.empty import (
+    NullObservation,
+)
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.files import (
    FileEditObservation,
@@ -23,6 +26,7 @@ from openhands.events.observation.success import SuccessObservation
 __all__ = [
    'Observation',
    'NullObservation',
+    'AgentThinkObservation',
    'CmdOutputObservation',
    'CmdOutputMetadata',
    'IPythonRunCellObservation',
--- a/openhands/events/observation/agent.py
+++ b/openhands/events/observation/agent.py
@@ -25,3 +25,18 @@ class AgentCondensationObservation(Observation):
    @property
    def message(self) -> str:
        return self.content
+
+
+@dataclass
+class AgentThinkObservation(Observation):
+    """The output of a think action.
+
+    In practice, this is a no-op: it just replies to the agent with a static message
+    acknowledging that the thought has been logged.
+    """
+
+    observation: str = ObservationType.THINK
+
+    @property
+    def message(self) -> str:
+        return self.content
--- a/openhands/events/serialization/action.py
+++ b/openhands/events/serialization/action.py
@@ -6,6 +6,7 @@ from openhands.events.action.agent import (
    AgentDelegateAction,
    AgentFinishAction,
    AgentRejectAction,
+    AgentThinkAction,
    ChangeAgentStateAction,
 )
 from openhands.events.action.browse import BrowseInteractiveAction, BrowseURLAction
@@ -30,6 +31,7 @@ actions = (
    FileReadAction,
    FileWriteAction,
    FileEditAction,
+    AgentThinkAction,
    AgentFinishAction,
    AgentRejectAction,
    AgentDelegateAction,
--- a/openhands/events/serialization/observation.py
+++ b/openhands/events/serialization/observation.py
@@ -3,6 +3,7 @@ import copy
 from openhands.events.observation.agent import (
    AgentCondensationObservation,
    AgentStateChangedObservation,
+    AgentThinkObservation,
 )
 from openhands.events.observation.browse import BrowserOutputObservation
 from openhands.events.observation.commands import (
@@ -11,7 +12,9 @@ from openhands.events.observation.commands import (
    IPythonRunCellObservation,
 )
 from openhands.events.observation.delegate import AgentDelegateObservation
-from openhands.events.observation.empty import NullObservation
+from openhands.events.observation.empty import (
+    NullObservation,
+)
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.files import (
    FileEditObservation,
@@ -36,6 +39,7 @@ observations = (
    AgentStateChangedObservation,
    UserRejectObservation,
    AgentCondensationObservation,
+    AgentThinkObservation,
 )

 OBSERVATION_TYPE_TO_CLASS = {
--- a/openhands/runtime/base.py
+++ b/openhands/runtime/base.py
@@ -22,6 +22,7 @@ from openhands.events import EventSource, EventStream, EventStreamSubscriber
 from openhands.events.action import (
    Action,
    ActionConfirmationStatus,
+    AgentThinkAction,
    BrowseInteractiveAction,
    BrowseURLAction,
    CmdRunAction,
@@ -31,6 +32,7 @@ from openhands.events.action import (
 )
 from openhands.events.event import Event
 from openhands.events.observation import (
+    AgentThinkObservation,
    CmdOutputObservation,
    ErrorObservation,
    FileReadObservation,
@@ -381,6 +383,8 @@ class Runtime(FileEditRuntimeMixin):
        If the action is not supported by the current runtime, an ErrorObservation is returned.
        """
        if not action.runnable:
+            if isinstance(action, AgentThinkAction):
+                return AgentThinkObservation('Your thought has been logged.')
            return NullObservation('')
        if (
            hasattr(action, 'confirmation_state')
--- a/openhands/runtime/impl/action_execution/action_execution_client.py
+++ b/openhands/runtime/impl/action_execution/action_execution_client.py
@@ -16,6 +16,7 @@ from openhands.core.exceptions import (
 from openhands.events import EventStream
 from openhands.events.action import (
    ActionConfirmationStatus,
+    AgentThinkAction,
    BrowseInteractiveAction,
    BrowseURLAction,
    CmdRunAction,
@@ -27,6 +28,7 @@ from openhands.events.action import (
 from openhands.events.action.action import Action
 from openhands.events.action.files import FileEditSource
 from openhands.events.observation import (
+    AgentThinkObservation,
    ErrorObservation,
    NullObservation,
    Observation,
@@ -230,6 +232,8 @@ class ActionExecutionClient(Runtime):

        with self.action_semaphore:
            if not action.runnable:
+                if isinstance(action, AgentThinkAction):
+                    return AgentThinkObservation('Your thought has been logged.')
                return NullObservation('')
            if (
                hasattr(action, 'confirmation_state')