Files
OpenHands/agenthub/codeact_agent/codeact_agent.py
2024-06-27 04:18:26 +02:00

292 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from litellm.exceptions import ContextWindowExceededError
from agenthub.codeact_agent.action_parser import CodeActResponseParser
from agenthub.codeact_agent.prompt import (
COMMAND_DOCS,
EXAMPLES,
GITHUB_MESSAGE,
SYSTEM_PREFIX,
SYSTEM_SUFFIX,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.exceptions import (
ContextWindowLimitExceededError,
TokenLimitExceedError,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.events.action import (
Action,
AgentFinishAction,
AgentSummarizeAction,
BrowseInteractiveAction,
CmdRunAction,
IPythonRunCellAction,
MessageAction,
)
from opendevin.events.observation import (
AgentDelegateObservation,
BrowserOutputObservation,
CmdOutputObservation,
IPythonRunCellObservation,
)
from opendevin.events.serialization.event import truncate_content
from opendevin.llm.llm import LLM
from opendevin.memory.condenser import MemoryCondenser
from opendevin.runtime.plugins import (
AgentSkillsRequirement,
JupyterRequirement,
PluginRequirement,
)
from opendevin.runtime.tools import RuntimeTool
ENABLE_GITHUB = True
def action_to_str(action: Action) -> str:
if isinstance(action, CmdRunAction):
return f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
elif isinstance(action, IPythonRunCellAction):
return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
elif isinstance(action, BrowseInteractiveAction):
return f'{action.thought}\n<execute_browse>\n{action.browser_actions}\n</execute_browse>'
elif isinstance(action, MessageAction):
return action.content
elif isinstance(action, AgentSummarizeAction):
return action.summary
return ''
def get_action_message(action: Action) -> dict[str, str] | None:
if (
isinstance(action, BrowseInteractiveAction)
or isinstance(action, CmdRunAction)
or isinstance(action, IPythonRunCellAction)
or isinstance(action, MessageAction)
or isinstance(action, AgentSummarizeAction)
):
return {
'role': 'user' if action.source == 'user' else 'assistant',
'content': action_to_str(action),
}
return None
def get_observation_message(obs) -> dict[str, str] | None:
if isinstance(obs, CmdOutputObservation):
content = 'OBSERVATION:\n' + truncate_content(obs.content)
content += (
f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
)
return {'role': 'user', 'content': content}
elif isinstance(obs, IPythonRunCellObservation):
content = 'OBSERVATION:\n' + obs.content
# replace base64 images with a placeholder
splitted = content.split('\n')
for i, line in enumerate(splitted):
if '![image](data:image/png;base64,' in line:
splitted[i] = (
'![image](data:image/png;base64, ...) already displayed to user'
)
content = '\n'.join(splitted)
content = truncate_content(content)
return {'role': 'user', 'content': content}
elif isinstance(obs, BrowserOutputObservation):
content = 'OBSERVATION:\n' + truncate_content(obs.content)
return {'role': 'user', 'content': content}
elif isinstance(obs, AgentDelegateObservation):
content = 'OBSERVATION:\n' + truncate_content(str(obs.outputs))
return {'role': 'user', 'content': content}
return None
# FIXME: We can tweak these two settings to create MicroAgents specialized toward different area
def get_system_message() -> str:
if ENABLE_GITHUB:
return f'{SYSTEM_PREFIX}\n{GITHUB_MESSAGE}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
else:
return f'{SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}'
def get_in_context_example() -> str:
return EXAMPLES
class CodeActAgent(Agent):
VERSION = '1.7'
"""
The Code Act Agent is a minimalist agent.
The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
### Overview
This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.13463), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
The conceptual idea is illustrated below. At each turn, the agent can:
1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
2. **CodeAct**: Choose to perform the task by executing code
- Execute any valid Linux `bash` command
- Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
![image](https://github.com/OpenDevin/OpenDevin/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)
### Plugin System
To make the CodeAct agent more powerful with only access to `bash` action space, CodeAct agent leverages OpenDevin's plugin system:
- [Jupyter plugin](https://github.com/OpenDevin/OpenDevin/tree/main/opendevin/runtime/plugins/jupyter): for IPython execution via bash command
- [SWE-agent tool plugin](https://github.com/OpenDevin/OpenDevin/tree/main/opendevin/runtime/plugins/swe_agent_commands): Powerful bash command line tools for software development tasks introduced by [swe-agent](https://github.com/princeton-nlp/swe-agent).
### Demo
https://github.com/OpenDevin/OpenDevin/assets/38853559/f592a192-e86c-4f48-ad31-d69282d5f6ac
*Example of CodeActAgent with `gpt-4-turbo-2024-04-09` performing a data science task (linear regression)*
### Work-in-progress & Next step
[] Support web-browsing
[] Complete the workflow for CodeAct agent to submit Github PRs
"""
sandbox_plugins: list[PluginRequirement] = [
# NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
# AgentSkillsRequirement provides a lot of Python functions
# and it need to be initialized before Jupyter for Jupyter to use those functions.
AgentSkillsRequirement(),
JupyterRequirement(),
]
runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
system_message: str = get_system_message()
in_context_example: str = f"Here is an example of how you can interact with the environment for task solving:\n{get_in_context_example()}\n\nNOW, LET'S START!"
attempts_to_condense: int = 10
action_parser = CodeActResponseParser()
def __init__(
self,
llm: LLM,
attempts_to_condense: int = 2,
) -> None:
"""Initializes a new instance of the CodeActAgent class.
Parameters:
- llm (LLM): The llm to be used by this agent
"""
super().__init__(llm)
self.memory_condenser = MemoryCondenser(llm)
self.attempts_to_condense = attempts_to_condense
self.reset()
def reset(self) -> None:
"""
Resets the CodeAct Agent.
"""
super().reset()
def step(self, state: State) -> Action:
"""
Run the agent for one step.
Parameters:
- state: The current state of the environment.
Returns:
- CmdRunAction(command) - bash command to run
- IPythonRunCellAction(code) - IPython code to run
- AgentDelegateAction(agent, inputs) - delegate action for (sub)task
- MessageAction(content) - Message action to run (e.g. ask for clarification)
- AgentFinishAction() - end the interaction
"""
logger.info(f'Running CodeActAgent v{self.VERSION}')
# if we're done, go back
latest_user_message = state.history.get_latest_user_message()
if latest_user_message and latest_user_message.strip() == '/exit':
return AgentFinishAction()
# prepare what we want to send to the LLM
messages: list[dict[str, str]] = self._get_messages(state)
response = None
# give it multiple chances to get a response
# if it fails, we'll try to condense memory
attempt = 0
while not response and attempt < self.attempts_to_condense:
try:
response = self.llm.completion(
messages=messages,
stop=[
'</execute_ipython>',
'</execute_bash>',
'</execute_browse>',
],
temperature=0.0,
)
except (ContextWindowExceededError, TokenLimitExceedError):
logger.warning(
'Context window exceeded or token limit exceeded. Condensing memory, attempt %d',
attempt,
)
# Retry processing events with condensed memory
summary_action = self.memory_condenser.condense(state.history)
attempt += 1
if summary_action:
# update the messages, so we get the new summary
messages = self._get_messages(state)
continue
if not response:
raise ContextWindowLimitExceededError(
'Context window limit exceeded. Unable to condense memory.'
)
state.num_of_chars += sum(
len(message['content']) for message in messages
) + len(response.choices[0].message.content)
return self.action_parser.parse(response)
def search_memory(self, query: str) -> list[str]:
raise NotImplementedError('Implement this abstract method')
def _get_messages(self, state: State) -> list[dict[str, str]]:
messages = [
{'role': 'system', 'content': self.system_message},
{
'role': 'user',
'content': self.in_context_example,
},
]
for event in state.history.get_events():
message = (
get_action_message(event)
if isinstance(event, Action)
else get_observation_message(event)
)
if message:
messages.append(message)
latest_user_message = next(
(m for m in reversed(messages) if m['role'] == 'user'), None
)
if latest_user_message:
if state.almost_stuck == 1:
latest_user_message['content'] += (
'\n\nENVIRONMENT REMINDER: You are almost stuck repeating the same action. You have only 1 iteration left and you must change your approach. Now.'
)
elif state.almost_stuck == 2:
latest_user_message['content'] += (
'\n\nENVIRONMENT REMINDER: You are almost stuck repeating the same action. You have only 2 iterations left and you must change your approach.'
)
else:
latest_user_message['content'] += (
f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
)
return messages