OpenHands/agenthub/planner_agent/agent.py
Kaushik Deka 415843476c Feat: Add Vision Input Support for LLM with Vision Capabilities (#2848)

Co-authored-by: tobitege <tobitege@gmx.de>
Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Co-authored-by: sp.wack <83104063+amanape@users.noreply.github.com>
2024-08-04 02:26:22 +08:00


from agenthub.planner_agent.response_parser import PlannerResponseParser
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.message import ImageContent, Message, TextContent
from opendevin.events.action import Action, AgentFinishAction
from opendevin.llm.llm import LLM
from opendevin.runtime.tools import RuntimeTool

from .prompt import get_prompt_and_images


class PlannerAgent(Agent):
    VERSION = '1.0'
    """
    The planner agent utilizes a special prompting strategy to create long-term plans for solving problems.
    At every step, the agent is given its previous action-observation pairs, the current task, and a hint based on the last action taken.
    """

    runtime_tools: list[RuntimeTool] = [RuntimeTool.BROWSER]
    response_parser = PlannerResponseParser()

    def __init__(self, llm: LLM):
        """Initializes the Planner Agent with an LLM.

        Parameters:
        - llm (LLM): The LLM to be used by this agent
        """
        super().__init__(llm)

    def step(self, state: State) -> Action:
        """Checks whether the current step is completed, and returns an AgentFinishAction if so.
        Otherwise, creates a plan prompt, sends it to the model for inference, and returns the result as the next action.

        Parameters:
        - state (State): The current state given the previous actions and observations

        Returns:
        - AgentFinishAction: If the root task's state is 'completed', 'verified', or 'abandoned'
        - Action: The next action to take based on the LLM response
        """
        # The root task has reached a terminal state; there is nothing left to plan.
        if state.root_task.state in [
            'completed',
            'verified',
            'abandoned',
        ]:
            return AgentFinishAction()

        # Build the plan prompt and any accompanying image URLs from the state,
        # truncating long messages to the configured character limit.
        prompt, image_urls = get_prompt_and_images(
            state, self.llm.config.max_message_chars
        )

        # Assemble multimodal content: the text prompt, plus images only if present.
        content = [TextContent(text=prompt)]
        if image_urls:
            content.append(ImageContent(image_urls=image_urls))
        message = Message(role='user', content=content)

        # Serialize the pydantic Message into a plain dict for the LLM call.
        resp = self.llm.completion(messages=[message.model_dump()])
        return self.response_parser.parse(resp)
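
For context on the vision-input path this commit introduces, here is a minimal sketch (not part of the repository) of how step() assembles a multimodal message. The prompt text and image URL are placeholder values, and the exact dict produced by model_dump() depends on how the pydantic Message model in opendevin.core.message is defined.

from opendevin.core.message import ImageContent, Message, TextContent

# Placeholder inputs standing in for what get_prompt_and_images() returns.
prompt = 'Here is your task history; produce the next planned action.'
image_urls = ['https://example.com/screenshot.png']

# Mirror the assembly in PlannerAgent.step(): text first, images only if present.
content = [TextContent(text=prompt)]
if image_urls:
    content.append(ImageContent(image_urls=image_urls))

message = Message(role='user', content=content)

# model_dump() is standard pydantic v2 serialization; the resulting dict is
# what LLM.completion() receives in its messages list.
payload = message.model_dump()
print(payload['role'])  # 'user'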