(fix) CodeActAgent: fix issues with vision support in prompts (#3665)

* CodeActAgent: fix message prep if prompt caching is not supported * fix python version in regen tests workflow * fix in conftest "mock_completion" method * add disable_vision to LLMConfig; revert change in message parsing in llm.py * format messages in several files for completion * refactored message(s) formatting (llm.py); added vision_is_active() * fix a unit test * regenerate: added LOG_TO_FILE and FORCE_REGENERATE env flags * try to fix path to logs folder in workflow * llm: prevent index error * try FORCE_USE_LLM in regenerate * tweaks everywhere... * fix 2 random unit test errors :( * added FORCE_REGENERATE_TESTS=true to regenerate CLI * fix test_lint_file_fail_typescript again * double-quotes for env vars in workflow; llm logger set to debug * fix typo in regenerate * regenerate iterations now 20; applied iteration counter fix by Li * regenerate: pass FORCE_REGENERATE flag into env * fixes for integration tests; several mock files updated * browsing_agent: fix response_parser.py adding ) to empty response * test_browse_internet: fix skipif and revert obsolete mock files * regenerate: fi bracketing for http server start/kill conditions * disable test_browse_internet for CodeAct*Agents; mock files updated after merge * include more mock files that were missed earlier * reverts after review feedback from Li * forgot one * browsing agent test, partial fixes and updated mock files * test_browse_internet works in my WSL now! * adapt unit test test_prompt_caching.py * add DEBUG to regenerate workflow command * convert regenerate workflow params to inputs * more integration test mock files updated * more files * test_prompt_caching: restored test_prompt_caching_headers purpose * file_ops: fix potential exception, like "cross device copy"; fixed mock files accordingly * reverts/changes wrt feedback from xingyao * updated docs and config template * code cleanup wrt review feedback
2026-03-22 13:47:19 +08:00 · 2024-09-04 17:58:30 +02:00
parent 1b66f2e777
commit bc31fb15fe
106 changed files with 8858 additions and 2144 deletions
--- a/openhands/core/message.py
+++ b/openhands/core/message.py
@@ -1,8 +1,11 @@
 from enum import Enum
+from typing import Union

 from pydantic import BaseModel, Field, model_serializer
 from typing_extensions import Literal

+from openhands.core.logger import openhands_logger as logger
+

 class ContentType(Enum):
    TEXT = 'text'
@@ -10,7 +13,7 @@ class ContentType(Enum):


 class Content(BaseModel):
-    type: ContentType
+    type: str
    cache_prompt: bool = False

    @model_serializer
@@ -19,13 +22,13 @@ class Content(BaseModel):


 class TextContent(Content):
-    type: ContentType = ContentType.TEXT
+    type: str = ContentType.TEXT.value
    text: str

    @model_serializer
    def serialize_model(self):
        data: dict[str, str | dict[str, str]] = {
-            'type': self.type.value,
+            'type': self.type,
            'text': self.text,
        }
        if self.cache_prompt:
@@ -34,14 +37,14 @@ class TextContent(Content):


 class ImageContent(Content):
-    type: ContentType = ContentType.IMAGE_URL
+    type: str = ContentType.IMAGE_URL.value
    image_urls: list[str]

    @model_serializer
    def serialize_model(self):
        images: list[dict[str, str | dict[str, str]]] = []
        for url in self.image_urls:
-            images.append({'type': self.type.value, 'image_url': {'url': url}})
+            images.append({'type': self.type, 'image_url': {'url': url}})
        if self.cache_prompt and images:
            images[-1]['cache_control'] = {'type': 'ephemeral'}
        return images
@@ -65,4 +68,50 @@ class Message(BaseModel):
            elif isinstance(item, ImageContent):
                content.extend(item.model_dump())

-        return {'role': self.role, 'content': content}
+        return {'content': content, 'role': self.role}
+
+
+def format_messages(
+    messages: Union[Message, list[Message]], with_images: bool
+) -> list[dict]:
+    if not isinstance(messages, list):
+        messages = [messages]
+
+    if with_images:
+        return [message.model_dump() for message in messages]
+
+    converted_messages = []
+    for message in messages:
+        content_str = ''
+        role = 'user'
+        if 'role' in message:
+            role = message['role']
+        if isinstance(message, str):
+            content_str = content_str + message + '\n'
+            continue
+
+        if isinstance(message, dict):
+            if 'content' in message:
+                content_str = content_str + message['content'] + '\n'
+        elif isinstance(message, Message):
+            role = message.role
+            for content in message.content:
+                if isinstance(content, list):
+                    for item in content:
+                        if isinstance(item, TextContent):
+                            content_str = content_str + item.text + '\n'
+                elif isinstance(content, TextContent):
+                    content_str = content_str + content.text + '\n'
+        else:
+            logger.error(
+                f'>>> `message` is not a string, dict, or Message: {type(message)}'
+            )
+
+        if content_str:
+            converted_messages.append(
+                {
+                    'role': role,
+                    'content': content_str,
+                }
+            )
+    return converted_messages