From 73a5c991c8dbba2e6f2e193ee52892872c9486f7 Mon Sep 17 00:00:00 2001
From: yuruo <yuruotong1@163.com>
Date: Wed, 12 Mar 2025 10:38:34 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=B8=AD=E6=AD=A2=E9=80=BB?=
 =?UTF-8?q?=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 gradio_ui/agent/task_run_agent.py | 31 ++++++++++++++++---------------
 gradio_ui/loop.py                 |  2 +-
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/gradio_ui/agent/task_run_agent.py b/gradio_ui/agent/task_run_agent.py
index 60094f9..95461f3 100644
--- a/gradio_ui/agent/task_run_agent.py
+++ b/gradio_ui/agent/task_run_agent.py
@@ -11,7 +11,8 @@ from xbrain.core.chat import run
 import platform
 import re
 class TaskRunAgent(BaseAgent):
-    def __init__(self):
+    def __init__(self, output_callback):
+        self.output_callback = output_callback
         self.OUTPUT_DIR = "./tmp/outputs"
        
     def __call__(self, task_plan, parsed_screen):
@@ -23,11 +24,12 @@ class TaskRunAgent(BaseAgent):
         screen_width, screen_height = parsed_screen['width'], parsed_screen['height']
         vlm_response = run([{"role": "user", "content": "next"}], user_prompt=self.SYSTEM_PROMPT, response_format=TaskRunAgentResponse)
         vlm_response_json = json.loads(vlm_response)
-        img_to_show_base64 = parsed_screen["image"]
         if "box_id" in vlm_response_json:
             try:
-                bbox = parsed_screen["parsed_content_list"][int(vlm_response_json["box_id"])]["bbox"]
-                vlm_response_json["box_centroid_coordinate"] = [int((bbox[0] + bbox[2]) / 2 * screen_width), int((bbox[1] + bbox[3]) / 2 * screen_height)]
+                bbox = parsed_screen["parsed_content_list"][int(vlm_response_json["box_id"])].coordinates
+                # vlm_response_json["box_centroid_coordinate"] = [int((bbox[0] + bbox[2]) / 2 * screen_width), int((bbox[1] + bbox[3]) / 2 * screen_height)]
+                vlm_response_json["box_centroid_coordinate"] = [int((bbox[0] + bbox[2]) / 2 ), int((bbox[1] + bbox[3]) / 2 )]
+
                 # img_to_show_data = base64.b64decode(img_to_show_base64)
                 # img_to_show = Image.open(BytesIO(img_to_show_data))
                 img_to_show = parsed_screen["image"]
@@ -98,16 +100,15 @@ class TaskRunAgent(BaseAgent):
 
 class TaskRunAgentResponse(BaseModel):
     reasoning: str = Field(description="描述当前屏幕上的内容，考虑历史记录，然后描述您如何实现任务的逐步思考，一次从可用操作中选择一个操作。")
-    next_action: str = Field(description="一次一个操作，简短精确地描述它。")
-    action_type: str = Field(
-        description="选择一个操作类型",
+    next_action: str = Field(
+        description="选择一个操作类型，如果找不到合适的操作，请选择None",
         json_schema_extra={
             "enum": ["type", "left_click", "right_click", "double_click", 
-                    "hover", "scroll_up", "scroll_down", "wait"]
+                    "hover", "scroll_up", "scroll_down", "wait", "None"]
         }
     )
-    box_id: int = Field(description="要操作的框ID，当action_type为left_click、right_click、double_click、hover时提供，否则为None", default=None)
-    value: str = Field(description="仅当action_type为type时提供，否则为None", default=None)
+    box_id: int = Field(description="要操作的框ID，当next_action为left_click、right_click、double_click、hover时提供，否则为None", default=None)
+    value: str = Field(description="仅当next_action为type时提供，否则为None", default=None)
 
 system_prompt = """
 ### 目标 ###
@@ -134,20 +135,20 @@ system_prompt = """
 8. 如果您收到登录信息页面或验证码页面的提示，或者您认为下一步操作需要用户许可，您应该在json字段中说"Next Action": "None"。
 9. 你只能使用鼠标和键盘与计算机进行交互。
 10. 你只能与桌面图形用户界面交互（无法访问终端或应用程序菜单）。
-
+11. 如果当前屏幕没有显示任何可操作的元素，并且当前屏幕不能下滑，请选择None，退出操作。
 
 ##########
 ### 输出格式 ###
 ```json
 {{
-    "Reasoning": str, # 描述当前屏幕上的内容，考虑历史记录，然后描述您如何实现任务的逐步思考，一次从可用操作中选择一个操作。
-    "Next Action": "action_type, action description" | "None" # 一次一个操作，简短精确地描述它。
-    "Box ID": n,
+    "reasoning": str, # 描述当前屏幕上的内容，考虑历史记录，然后描述您如何实现任务的逐步思考，一次从可用操作中选择一个操作。
+    "next_action": "action_type, action description" | "None" # 一次一个操作，简短精确地描述它。
+    "box_id": n,
     "value": "xxx" # 仅当操作为type时提供value字段，否则不包括value键
 }}
 ```
 
-【Next Action】仅包括下面之一：
+【next_action】仅包括下面之一：
 - type：输入一串文本。
 - left_click：将鼠标移动到框ID并左键单击。
 - right_click：将鼠标移动到框ID并右键单击。
diff --git a/gradio_ui/loop.py b/gradio_ui/loop.py
index c958e7d..1e2f1a0 100644
--- a/gradio_ui/loop.py
+++ b/gradio_ui/loop.py
@@ -42,7 +42,7 @@ def sampling_loop_sync(
     )
     tool_result_content = None
     plan = task_plan_agent(user_task = messages[-1]["content"][0].text)
-    task_run_agent = TaskRunAgent()
+    task_run_agent = TaskRunAgent(output_callback=output_callback)
 
 
     while True: