From 73a5c991c8dbba2e6f2e193ee52892872c9486f7 Mon Sep 17 00:00:00 2001 From: yuruo Date: Wed, 12 Mar 2025 10:38:34 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=B8=AD=E6=AD=A2=E9=80=BB?= =?UTF-8?q?=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- gradio_ui/agent/task_run_agent.py | 31 ++++++++++++++++--------------- gradio_ui/loop.py | 2 +- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/gradio_ui/agent/task_run_agent.py b/gradio_ui/agent/task_run_agent.py index 60094f9..95461f3 100644 --- a/gradio_ui/agent/task_run_agent.py +++ b/gradio_ui/agent/task_run_agent.py @@ -11,7 +11,8 @@ from xbrain.core.chat import run import platform import re class TaskRunAgent(BaseAgent): - def __init__(self): + def __init__(self, output_callback): + self.output_callback = output_callback self.OUTPUT_DIR = "./tmp/outputs" def __call__(self, task_plan, parsed_screen): @@ -23,11 +24,12 @@ class TaskRunAgent(BaseAgent): screen_width, screen_height = parsed_screen['width'], parsed_screen['height'] vlm_response = run([{"role": "user", "content": "next"}], user_prompt=self.SYSTEM_PROMPT, response_format=TaskRunAgentResponse) vlm_response_json = json.loads(vlm_response) - img_to_show_base64 = parsed_screen["image"] if "box_id" in vlm_response_json: try: - bbox = parsed_screen["parsed_content_list"][int(vlm_response_json["box_id"])]["bbox"] - vlm_response_json["box_centroid_coordinate"] = [int((bbox[0] + bbox[2]) / 2 * screen_width), int((bbox[1] + bbox[3]) / 2 * screen_height)] + bbox = parsed_screen["parsed_content_list"][int(vlm_response_json["box_id"])].coordinates + # vlm_response_json["box_centroid_coordinate"] = [int((bbox[0] + bbox[2]) / 2 * screen_width), int((bbox[1] + bbox[3]) / 2 * screen_height)] + vlm_response_json["box_centroid_coordinate"] = [int((bbox[0] + bbox[2]) / 2 ), int((bbox[1] + bbox[3]) / 2 )] + # img_to_show_data = base64.b64decode(img_to_show_base64) # img_to_show = Image.open(BytesIO(img_to_show_data)) img_to_show = parsed_screen["image"] @@ -98,16 +100,15 @@ class TaskRunAgent(BaseAgent): class TaskRunAgentResponse(BaseModel): reasoning: str = Field(description="描述当前屏幕上的内容,考虑历史记录,然后描述您如何实现任务的逐步思考,一次从可用操作中选择一个操作。") - next_action: str = Field(description="一次一个操作,简短精确地描述它。") - action_type: str = Field( - description="选择一个操作类型", + next_action: str = Field( + description="选择一个操作类型,如果找不到合适的操作,请选择None", json_schema_extra={ "enum": ["type", "left_click", "right_click", "double_click", - "hover", "scroll_up", "scroll_down", "wait"] + "hover", "scroll_up", "scroll_down", "wait", "None"] } ) - box_id: int = Field(description="要操作的框ID,当action_type为left_click、right_click、double_click、hover时提供,否则为None", default=None) - value: str = Field(description="仅当action_type为type时提供,否则为None", default=None) + box_id: int = Field(description="要操作的框ID,当next_action为left_click、right_click、double_click、hover时提供,否则为None", default=None) + value: str = Field(description="仅当next_action为type时提供,否则为None", default=None) system_prompt = """ ### 目标 ### @@ -134,20 +135,20 @@ system_prompt = """ 8. 如果您收到登录信息页面或验证码页面的提示,或者您认为下一步操作需要用户许可,您应该在json字段中说"Next Action": "None"。 9. 你只能使用鼠标和键盘与计算机进行交互。 10. 你只能与桌面图形用户界面交互(无法访问终端或应用程序菜单)。 - +11. 如果当前屏幕没有显示任何可操作的元素,并且当前屏幕不能下滑,请选择None,退出操作。 ########## ### 输出格式 ### ```json {{ - "Reasoning": str, # 描述当前屏幕上的内容,考虑历史记录,然后描述您如何实现任务的逐步思考,一次从可用操作中选择一个操作。 - "Next Action": "action_type, action description" | "None" # 一次一个操作,简短精确地描述它。 - "Box ID": n, + "reasoning": str, # 描述当前屏幕上的内容,考虑历史记录,然后描述您如何实现任务的逐步思考,一次从可用操作中选择一个操作。 + "next_action": "action_type, action description" | "None" # 一次一个操作,简短精确地描述它。 + "box_id": n, "value": "xxx" # 仅当操作为type时提供value字段,否则不包括value键 }} ``` -【Next Action】仅包括下面之一: +【next_action】仅包括下面之一: - type:输入一串文本。 - left_click:将鼠标移动到框ID并左键单击。 - right_click:将鼠标移动到框ID并右键单击。 diff --git a/gradio_ui/loop.py b/gradio_ui/loop.py index c958e7d..1e2f1a0 100644 --- a/gradio_ui/loop.py +++ b/gradio_ui/loop.py @@ -42,7 +42,7 @@ def sampling_loop_sync( ) tool_result_content = None plan = task_plan_agent(user_task = messages[-1]["content"][0].text) - task_run_agent = TaskRunAgent() + task_run_agent = TaskRunAgent(output_callback=output_callback) while True: