feat(agent): CodeAct with function calling (#4537)

Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: tobitege <10787084+tobitege@users.noreply.github.com> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: tofarr <tofarr@gmail.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-03-22 13:47:19 +08:00 · 2024-10-28 22:06:33 -05:00
parent 421b4c108a
commit ae13171194
34 changed files with 1834 additions and 235 deletions
--- a/openhands/core/message.py
+++ b/openhands/core/message.py
@@ -1,6 +1,7 @@
 from enum import Enum
 from typing import Literal

+from litellm import ChatCompletionMessageToolCall
 from pydantic import BaseModel, Field, model_serializer


@@ -48,10 +49,16 @@ class ImageContent(Content):


 class Message(BaseModel):
-    role: Literal['user', 'system', 'assistant']
-    content: list[TextContent | ImageContent] = Field(default=list)
+    role: Literal['user', 'system', 'assistant', 'tool']
+    content: list[TextContent | ImageContent] = Field(default_factory=list)
    cache_enabled: bool = False
    vision_enabled: bool = False
+    # function calling
+    # - tool calls (from LLM)
+    tool_calls: list[ChatCompletionMessageToolCall] | None = None
+    # - tool execution result (to LLM)
+    tool_call_id: str | None = None
+    name: str | None = None  # name of the tool

    @property
    def contains_image(self) -> bool:
@@ -59,23 +66,31 @@ class Message(BaseModel):

    @model_serializer
    def serialize_model(self) -> dict:
-        content: list[dict] | str
-        # two kinds of serializer:
-        # 1. vision serializer: when prompt caching or vision is enabled
-        # 2. single text serializer: for other cases
-        # remove this when liteLLM or providers support this format translation
-        if self.cache_enabled or self.vision_enabled:
-            # when prompt caching or vision is enabled, use vision serializer
-            content = []
-            for item in self.content:
-                if isinstance(item, TextContent):
-                    content.append(item.model_dump())
-                elif isinstance(item, ImageContent):
-                    content.extend(item.model_dump())
-        else:
-            # for other cases, concatenate all text content
-            # into a single string per message
-            content = '\n'.join(
-                item.text for item in self.content if isinstance(item, TextContent)
-            )
-        return {'content': content, 'role': self.role}
+        content: list[dict] = []
+        role_tool_with_prompt_caching = False
+        for item in self.content:
+            d = item.model_dump()
+            # For tool messages, the per-item cache_control entry must be removed and set once at the message level instead.
+            # See discussion here for details: https://github.com/BerriAI/litellm/issues/6422#issuecomment-2438765472
+            if self.role == 'tool' and item.cache_prompt:
+                role_tool_with_prompt_caching = True
+                d.pop('cache_control')
+            if isinstance(item, TextContent):
+                content.append(d)
+            elif isinstance(item, ImageContent) and self.vision_enabled:
+                content.extend(d)
+
+        ret: dict = {'content': content, 'role': self.role}
+
+        if role_tool_with_prompt_caching:
+            ret['cache_control'] = {'type': 'ephemeral'}
+
+        if self.tool_call_id is not None:
+            assert (
+                self.name is not None
+            ), 'name is required when tool_call_id is not None'
+            ret['tool_call_id'] = self.tool_call_id
+            ret['name'] = self.name
+        if self.tool_calls:
+            ret['tool_calls'] = self.tool_calls
+        return ret