[agent] Add "thinking" tool only (#6977)

This commit is contained in:
Xingyao Wang
2025-02-27 16:47:39 -05:00
committed by GitHub
parent 6e3e9e397e
commit 42332294a1
22 changed files with 177 additions and 4 deletions

View File

@@ -4178,6 +4178,21 @@
"es": "Navegando en la web",
"tr": "Web'de geziniyor"
},
"ACTION_MESSAGE$THINK": {
"en": "Thinking",
"zh-CN": "思考",
"zh-TW": "思考",
"ko-KR": "생각",
"ja": "考える",
"no": "Tenker",
"ar": "يفكر",
"de": "Denkt",
"fr": "Réflexion en cours",
"it": "Pensando",
"pt": "Pensando",
"es": "Pensando",
"tr": "Düşünüyor"
},
"OBSERVATION_MESSAGE$RUN": {
"en": "Ran a bash command",
"zh-CN": "运行",

View File

@@ -48,6 +48,8 @@ export function handleObservationMessage(message: ObservationMessage) {
break;
case ObservationType.READ:
case ObservationType.EDIT:
case ObservationType.THINK:
case ObservationType.NULL:
break; // We don't display the default message for these observations
default:
store.dispatch(addAssistantMessage(message.message));

View File

@@ -115,6 +115,8 @@ export const chatSlice = createSlice({
) {
text += `\n\n${getRiskText(action.payload.args.security_risk as unknown as ActionSecurityRisk)}`;
}
} else if (actionID === "think") {
text = action.payload.args.thought;
}
const message: Message = {
type: "action",

View File

@@ -26,6 +26,9 @@ enum ActionType {
// Delegate a (sub)task to another agent.
DELEGATE = "delegate",
// Logs a thought.
THINK = "think",
// If you're absolutely certain that you've completed your task and have tested your work,
// use the finish action to stop working.
FINISH = "finish",

View File

@@ -41,6 +41,13 @@ export interface IPythonAction extends OpenHandsActionEvent<"run_ipython"> {
};
}
/**
 * Action event of type "think": the agent logging a thought.
 *
 * The event always originates from the agent and carries the thought text
 * in `args.thought`.
 */
export interface ThinkAction extends OpenHandsActionEvent<"think"> {
source: "agent";
args: {
// Text of the thought being logged.
thought: string;
};
}
export interface FinishAction extends OpenHandsActionEvent<"finish"> {
source: "agent";
args: {
@@ -129,6 +136,7 @@ export type OpenHandsAction =
| AssistantMessageAction
| CommandAction
| IPythonAction
| ThinkAction
| FinishAction
| DelegateAction
| BrowseAction

View File

@@ -10,6 +10,7 @@ export type OpenHandsEventType =
| "browse"
| "browse_interactive"
| "reject"
| "think"
| "finish"
| "error";

View File

@@ -80,8 +80,17 @@ export interface ErrorObservation extends OpenHandsObservationEvent<"error"> {
};
}
/**
 * Observation event of type "think", sourced from the agent.
 *
 * NOTE(review): `extras.thought` is declared here as a required string, but
 * the backend `AgentThinkObservation` class visible in this change only
 * defines `observation` and a content-based `message` — confirm that the
 * serialized event actually includes `extras.thought` before relying on it.
 */
export interface AgentThinkObservation
extends OpenHandsObservationEvent<"think"> {
source: "agent";
extras: {
thought: string;
};
}
export type OpenHandsObservation =
| AgentStateChangeObservation
| AgentThinkObservation
| CommandObservation
| IPythonObservation
| DelegateObservation

View File

@@ -22,6 +22,12 @@ enum ObservationType {
// Delegate result
DELEGATE = "delegate",
// A response to the agent's thought (usually a static message)
THINK = "think",
// A no-op observation
NULL = "null",
}
export default ObservationType;

View File

@@ -17,6 +17,7 @@ from openhands.agenthub.codeact_agent.tools import (
IPythonTool,
LLMBasedFileEditTool,
StrReplaceEditorTool,
ThinkTool,
WebReadTool,
)
from openhands.core.exceptions import (
@@ -27,6 +28,7 @@ from openhands.events.action import (
Action,
AgentDelegateAction,
AgentFinishAction,
AgentThinkAction,
BrowseInteractiveAction,
BrowseURLAction,
CmdRunAction,
@@ -42,7 +44,9 @@ from openhands.events.tool import ToolCallMetadata
def combine_thought(action: Action, thought: str) -> Action:
if not hasattr(action, 'thought'):
return action
if thought:
if thought and action.thought:
action.thought = f'{thought}\n{action.thought}'
elif thought:
action.thought = thought
return action
@@ -71,6 +75,11 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
raise RuntimeError(
f'Failed to parse tool call arguments: {tool_call.function.arguments}'
) from e
# ================================================
# CmdRunTool (Bash)
# ================================================
if tool_call.function.name == CmdRunTool['function']['name']:
if 'command' not in arguments:
raise FunctionCallValidationError(
@@ -79,6 +88,10 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
# convert is_input to boolean
is_input = arguments.get('is_input', 'false') == 'true'
action = CmdRunAction(command=arguments['command'], is_input=is_input)
# ================================================
# IPythonTool (Jupyter)
# ================================================
elif tool_call.function.name == IPythonTool['function']['name']:
if 'code' not in arguments:
raise FunctionCallValidationError(
@@ -90,8 +103,16 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
agent='BrowsingAgent',
inputs=arguments,
)
# ================================================
# AgentFinishAction
# ================================================
elif tool_call.function.name == FinishTool['function']['name']:
action = AgentFinishAction()
# ================================================
# LLMBasedFileEditTool (LLM-based file editor, deprecated)
# ================================================
elif tool_call.function.name == LLMBasedFileEditTool['function']['name']:
if 'path' not in arguments:
raise FunctionCallValidationError(
@@ -138,12 +159,25 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
impl_source=FileEditSource.OH_ACI,
**other_kwargs,
)
# ================================================
# AgentThinkAction
# ================================================
elif tool_call.function.name == ThinkTool['function']['name']:
action = AgentThinkAction(thought=arguments.get('thought', ''))
# ================================================
# BrowserTool
# ================================================
elif tool_call.function.name == BrowserTool['function']['name']:
if 'code' not in arguments:
raise FunctionCallValidationError(
f'Missing required argument "code" in tool call {tool_call.function.name}'
)
action = BrowseInteractiveAction(browser_actions=arguments['code'])
# ================================================
# WebReadTool (simplified browsing)
# ================================================
elif tool_call.function.name == WebReadTool['function']['name']:
if 'url' not in arguments:
raise FunctionCallValidationError(
@@ -183,7 +217,7 @@ def get_tools(
codeact_enable_llm_editor: bool = False,
codeact_enable_jupyter: bool = False,
) -> list[ChatCompletionToolParam]:
tools = [CmdRunTool, FinishTool]
tools = [CmdRunTool, ThinkTool, FinishTool]
if codeact_enable_browsing:
tools.append(WebReadTool)
tools.append(BrowserTool)

View File

@@ -4,6 +4,7 @@ from .finish import FinishTool
from .ipython import IPythonTool
from .llm_based_edit import LLMBasedFileEditTool
from .str_replace_editor import StrReplaceEditorTool
from .think import ThinkTool
from .web_read import WebReadTool
__all__ = [
@@ -14,4 +15,5 @@ __all__ = [
'LLMBasedFileEditTool',
'StrReplaceEditorTool',
'WebReadTool',
'ThinkTool',
]

View File

@@ -0,0 +1,27 @@
from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
# Description attached to the `think` tool definition below. This is prompt
# text shown to the model, so any wording change alters model-visible input.
_THINK_DESCRIPTION = """Use the tool to think about something. It will not obtain new information or make any changes to the repository, but just log the thought. Use it when complex reasoning or brainstorming is needed.
Common use cases:
1. When exploring a repository and discovering the source of a bug, call this tool to brainstorm several unique ways of fixing the bug, and assess which change(s) are likely to be simplest and most effective.
2. After receiving test results, use this tool to brainstorm ways to fix failing tests.
3. When planning a complex refactoring, use this tool to outline different approaches and their tradeoffs.
4. When designing a new feature, use this tool to think through architecture decisions and implementation details.
5. When debugging a complex issue, use this tool to organize your thoughts and hypotheses.
The tool simply logs your thought process for better transparency and does not execute any code or make changes."""
# Function-calling definition of the `think` tool: a function named 'think'
# that takes a single required string argument, `thought`.
ThinkTool = ChatCompletionToolParam(
type='function',
function=ChatCompletionToolParamFunctionChunk(
name='think',
description=_THINK_DESCRIPTION,
# JSON-Schema-style description of the tool's arguments.
parameters={
'type': 'object',
'properties': {
'thought': {'type': 'string', 'description': 'The thought to log.'},
},
'required': ['thought'],
},
),
)

View File

@@ -7,6 +7,7 @@ from openhands.events.action import (
Action,
AgentDelegateAction,
AgentFinishAction,
AgentThinkAction,
BrowseInteractiveAction,
BrowseURLAction,
CmdRunAction,
@@ -19,6 +20,7 @@ from openhands.events.event import Event
from openhands.events.observation import (
AgentCondensationObservation,
AgentDelegateObservation,
AgentThinkObservation,
BrowserOutputObservation,
CmdOutputObservation,
FileEditObservation,
@@ -151,6 +153,7 @@ def get_action_message(
FileReadAction,
BrowseInteractiveAction,
BrowseURLAction,
AgentThinkAction,
),
) or (isinstance(action, CmdRunAction) and action.source == 'agent'):
tool_metadata = action.tool_call_metadata
@@ -323,6 +326,9 @@ def get_observation_message(
max_message_chars,
)
message = Message(role='user', content=[TextContent(text=text)])
elif isinstance(obs, AgentThinkObservation):
text = truncate_content(obs.content, max_message_chars)
message = Message(role='user', content=[TextContent(text=text)])
elif isinstance(obs, ErrorObservation):
text = truncate_content(obs.content, max_message_chars)
text += '\n[Error occurred in processing last action]'

View File

@@ -44,6 +44,10 @@ class ActionTypeSchema(BaseModel):
"""Delegates a task to another agent.
"""
THINK: str = Field(default='think')
"""Logs a thought.
"""
FINISH: str = Field(default='finish')
"""If you're absolutely certain that you've completed your task and have tested your work,
use the finish action to stop working.

View File

@@ -40,6 +40,8 @@ class ObservationTypeSchema(BaseModel):
NULL: str = Field(default='null')
THINK: str = Field(default='think')
AGENT_STATE_CHANGED: str = Field(default='agent_state_changed')
USER_REJECTED: str = Field(default='user_rejected')

View File

@@ -4,6 +4,7 @@ from openhands.events.action.agent import (
AgentFinishAction,
AgentRejectAction,
AgentSummarizeAction,
AgentThinkAction,
ChangeAgentStateAction,
)
from openhands.events.action.browse import BrowseInteractiveAction, BrowseURLAction
@@ -33,4 +34,5 @@ __all__ = [
'IPythonRunCellAction',
'MessageAction',
'ActionConfirmationStatus',
'AgentThinkAction',
]

View File

@@ -54,6 +54,23 @@ class AgentFinishAction(Action):
return "All done! What's next on the agenda?"
@dataclass
class AgentThinkAction(Action):
"""An action where the agent logs a thought.
Attributes:
thought (str): The thought to log. Defaults to an empty string.
action (str): The action type, namely ActionType.THINK.
"""
thought: str = ''
action: str = ActionType.THINK
@property
def message(self) -> str:
# Human-readable rendering: a fixed prefix followed by the logged thought.
return f'I am thinking...: {self.thought}'
@dataclass
class AgentRejectAction(Action):
outputs: dict = field(default_factory=dict)

View File

@@ -1,6 +1,7 @@
from openhands.events.observation.agent import (
AgentCondensationObservation,
AgentStateChangedObservation,
AgentThinkObservation,
)
from openhands.events.observation.browse import BrowserOutputObservation
from openhands.events.observation.commands import (
@@ -9,7 +10,9 @@ from openhands.events.observation.commands import (
IPythonRunCellObservation,
)
from openhands.events.observation.delegate import AgentDelegateObservation
from openhands.events.observation.empty import NullObservation
from openhands.events.observation.empty import (
NullObservation,
)
from openhands.events.observation.error import ErrorObservation
from openhands.events.observation.files import (
FileEditObservation,
@@ -23,6 +26,7 @@ from openhands.events.observation.success import SuccessObservation
__all__ = [
'Observation',
'NullObservation',
'AgentThinkObservation',
'CmdOutputObservation',
'CmdOutputMetadata',
'IPythonRunCellObservation',

View File

@@ -25,3 +25,18 @@ class AgentCondensationObservation(Observation):
@property
def message(self) -> str:
return self.content
@dataclass
class AgentThinkObservation(Observation):
"""The output of a think action.
In practice, this is a no-op: it just replies to the agent with a static message
acknowledging that the thought has been logged.
"""
observation: str = ObservationType.THINK
@property
def message(self) -> str:
# The display message is the observation's content, returned unchanged.
return self.content

View File

@@ -6,6 +6,7 @@ from openhands.events.action.agent import (
AgentDelegateAction,
AgentFinishAction,
AgentRejectAction,
AgentThinkAction,
ChangeAgentStateAction,
)
from openhands.events.action.browse import BrowseInteractiveAction, BrowseURLAction
@@ -30,6 +31,7 @@ actions = (
FileReadAction,
FileWriteAction,
FileEditAction,
AgentThinkAction,
AgentFinishAction,
AgentRejectAction,
AgentDelegateAction,

View File

@@ -3,6 +3,7 @@ import copy
from openhands.events.observation.agent import (
AgentCondensationObservation,
AgentStateChangedObservation,
AgentThinkObservation,
)
from openhands.events.observation.browse import BrowserOutputObservation
from openhands.events.observation.commands import (
@@ -11,7 +12,9 @@ from openhands.events.observation.commands import (
IPythonRunCellObservation,
)
from openhands.events.observation.delegate import AgentDelegateObservation
from openhands.events.observation.empty import NullObservation
from openhands.events.observation.empty import (
NullObservation,
)
from openhands.events.observation.error import ErrorObservation
from openhands.events.observation.files import (
FileEditObservation,
@@ -36,6 +39,7 @@ observations = (
AgentStateChangedObservation,
UserRejectObservation,
AgentCondensationObservation,
AgentThinkObservation,
)
OBSERVATION_TYPE_TO_CLASS = {

View File

@@ -22,6 +22,7 @@ from openhands.events import EventSource, EventStream, EventStreamSubscriber
from openhands.events.action import (
Action,
ActionConfirmationStatus,
AgentThinkAction,
BrowseInteractiveAction,
BrowseURLAction,
CmdRunAction,
@@ -31,6 +32,7 @@ from openhands.events.action import (
)
from openhands.events.event import Event
from openhands.events.observation import (
AgentThinkObservation,
CmdOutputObservation,
ErrorObservation,
FileReadObservation,
@@ -381,6 +383,8 @@ class Runtime(FileEditRuntimeMixin):
If the action is not supported by the current runtime, an ErrorObservation is returned.
"""
if not action.runnable:
if isinstance(action, AgentThinkAction):
return AgentThinkObservation('Your thought has been logged.')
return NullObservation('')
if (
hasattr(action, 'confirmation_state')

View File

@@ -16,6 +16,7 @@ from openhands.core.exceptions import (
from openhands.events import EventStream
from openhands.events.action import (
ActionConfirmationStatus,
AgentThinkAction,
BrowseInteractiveAction,
BrowseURLAction,
CmdRunAction,
@@ -27,6 +28,7 @@ from openhands.events.action import (
from openhands.events.action.action import Action
from openhands.events.action.files import FileEditSource
from openhands.events.observation import (
AgentThinkObservation,
ErrorObservation,
NullObservation,
Observation,
@@ -230,6 +232,8 @@ class ActionExecutionClient(Runtime):
with self.action_semaphore:
if not action.runnable:
if isinstance(action, AgentThinkAction):
return AgentThinkObservation('Your thought has been logged.')
return NullObservation('')
if (
hasattr(action, 'confirmation_state')