更新agent和支持的模型

2026-03-22 13:07:17 +08:00 · 2025-03-14 16:35:48 +08:00
parent 800e2f33cb
commit 95b8ad3407
5 changed files with 17 additions and 15 deletions
--- a/SUPPORT_MODEL.md
+++ b/SUPPORT_MODEL.md
@@ -1,3 +1,4 @@
 | Vendor-en | Vendor-ch | Model | base-url |
 | --- | --- | --- | --- |
 | openainext | openainext | gpt-4o-2024-11-20 | https://api.openai-next.com/v1 |
+| openainext | openainext | gpt-4.5-preview-2025-02-27 | https://api.openai-next.com/v1 |
--- a/gradio_ui/agent/task_run_agent.py
+++ b/gradio_ui/agent/task_run_agent.py
@@ -1,3 +1,4 @@
+
 import json
 import uuid
 from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, BetaMessageParam, BetaUsage
@@ -6,6 +7,8 @@ from gradio_ui.agent.base_agent import BaseAgent
 from xbrain.core.chat import run
 import platform
 import re
+
+from gradio_ui.tools.computer import Action
 class TaskRunAgent(BaseAgent):
    def __init__(self):
        self.OUTPUT_DIR = "./tmp/outputs"
@@ -90,8 +93,7 @@ class TaskRunAgentResponse(BaseModel):
    next_action: str = Field(
        description="选择一个操作类型，如果找不到合适的操作，请选择None",
        json_schema_extra={
-            "enum": ["type", "left_click", "right_click", "double_click", 
-                    "hover", "scroll_up", "scroll_down", "wait", "None"]
+            "enum": list(Action.__args__)  # JSON Schema "enum" must be a plain list; Action itself is a typing.Literal
        }
    )
    box_id: int = Field(description="要操作的框ID，当next_action为left_click、right_click、double_click、hover时提供，否则为None", default=None)
@@ -133,16 +135,6 @@ system_prompt = """
 }}
 ```

-【next_action】仅包括下面之一：
- type：输入一串文本。
- left_click：将鼠标移动到框ID并左键单击。
- right_click：将鼠标移动到框ID并右键单击。
- double_click：将鼠标移动到框ID并双击。
- hover：将鼠标移动到框ID。
- scroll_up：向上滚动屏幕以查看之前的内容。
- scroll_down：当所需按钮不可见或您需要查看更多内容时，向下滚动屏幕。
- wait：等待1秒钟让设备加载或响应。
-
 ##########
 ### 案例 ###
 一个例子：
--- a/gradio_ui/agent/verification_agent.py
+++ b/gradio_ui/agent/verification_agent.py
@@ -3,6 +3,8 @@ from pydantic import Field,BaseModel
 from gradio_ui.agent.base_agent import BaseAgent
 from xbrain.core.chat import run

+from gradio_ui.tools.computer import Action
+
 class VerificationAgent(BaseAgent):
    def __call__(self, messages, parsed_screen_result):
        messages.append(
@@ -17,7 +19,7 @@ class VerificationAgent(BaseAgent):
             })
        response = run(
            messages, 
-            user_prompt=prompt.format(screen_info=str(parsed_screen_result['parsed_content_list'])), 
+            user_prompt=prompt.format(screen_info=str(parsed_screen_result['parsed_content_list']), action_list=str(Action)),  # action_list fills the {action_list} prompt field; it is a format() kwarg, not a str() kwarg
            response_format=VerificationResponse
        )
        return json.loads(response)
@@ -74,6 +76,12 @@ prompt = """
 - **模糊匹配**：允许近似匹配而非精确匹配
 - **超时设置**：指定验证的最长等待时间

+### 补救措施 ###
+补救措施建议如下：
+- 【推荐】可以再等待一段时间看看效果，因为上一个操作还没执行完成就开始了验证
+- 再一次操作
+- 检查是否存在其他验证方法，但是仅限于以下几个动作：
+{action_list}
 ### 例子 ###
 操作：点击"登录"按钮
 预期结果：登录成功并显示首页
--- a/gradio_ui/loop.py
+++ b/gradio_ui/loop.py
@@ -37,7 +37,7 @@ def sampling_loop_sync(
    for plan in plan_list:      
        execute_task_plan(plan, vision_agent, task_run_agent, executor, messages)
        yield
-        sleep(2)
+        sleep(5)
        yield from verification_loop(vision_agent, verification_agent, executor, task_run_agent, messages)
        

--- a/gradio_ui/tools/computer.py
+++ b/gradio_ui/tools/computer.py
@@ -28,7 +28,8 @@ Action = Literal[
    "hover",
    "wait",
    "scroll_up",
-    "scroll_down"
+    "scroll_down",
+    "None"
 ]

 class Resolution(TypedDict):