autoMate/auto_control/agent/task_run_agent.py
2025-03-24 17:31:53 +08:00

188 lines
8.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import uuid
from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, BetaMessageParam, BetaUsage
from pydantic import Field, create_model
from auto_control.agent.base_agent import BaseAgent
from xbrain.core.chat import run
from auto_control.tools.computer import Action
class TaskRunAgent(BaseAgent):
    """Agent that asks the vision-language model for the next UI action.

    Calling an instance sends the current screenshot together with the task
    list to the model and converts the structured reply into a
    ``BetaMessage`` whose content is a reasoning text block followed by zero
    or more ``computer`` tool-use blocks.
    """

    # Actions for which no preliminary ``mouse_move`` onto a detected box is emitted.
    _ACTIONS_WITHOUT_CURSOR = (
        "None", "key", "type", "scroll_down", "scroll_up", "cursor_position", "wait",
    )

    def __init__(self):
        self.OUTPUT_DIR = "./tmp/outputs"

    def __call__(self, parsed_screen_result, messages):
        """Query the model for the next step and wrap it as an assistant message.

        Args:
            parsed_screen_result: Mapping that provides ``'base64_image'``
                (embedded in a PNG data URL) and ``'parsed_content_list'``
                (elements exposing ``element_id`` and ``coordinates``).
            messages: Conversation history. It is mutated in place: a user
                message carrying the screenshot is appended.
                ``messages[1]['content']`` must be a JSON string containing a
                ``'task_list'`` key.

        Returns:
            A ``(response_message, vlm_response_json)`` tuple: the synthesized
            ``BetaMessage`` and the decoded model reply as a dict.

        Raises:
            ValueError: If the model selects a ``box_id`` that is not present
                in ``parsed_screen_result['parsed_content_list']`` for an
                action that needs the cursor moved onto that box.
        """
        messages.append(
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Image is the screenshot of the current screen"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{parsed_screen_result['base64_image']}"},
                    },
                ],
            }
        )
        task_list = json.loads(messages[1]['content'])['task_list']
        # Number the tasks from 0 so the model can report current_task_id.
        formatted_task_list = "\n".join(f"{i}.{task}" for i, task in enumerate(task_list))
        system_prompt = prompt.format(task_list=formatted_task_list)
        vlm_response = run(
            messages,
            user_prompt=system_prompt,
            response_format=create_dynamic_response_model(parsed_screen_result),
        )
        vlm_response_json = json.loads(vlm_response)
        next_action = vlm_response_json["next_action"]
        response_content = [BetaTextBlock(text=vlm_response_json["reasoning"], type='text')]

        # Position the cursor first when the reply identifies a target.
        if "box_id" in vlm_response_json:
            target = self._resolve_cursor_target(parsed_screen_result, vlm_response_json)
            if target is not None:
                response_content.append(
                    self._computer_tool_block({'action': 'mouse_move', 'coordinate': target})
                )

        if next_action == "None":
            print("Task paused/completed.")
        else:
            action_input = {'action': next_action}
            if next_action == "type":
                action_input['text'] = vlm_response_json["value"]
            # NOTE(review): a "key" action is forwarded without any text payload;
            # confirm the computer tool accepts that.
            response_content.append(self._computer_tool_block(action_input))

        response_message = BetaMessage(
            id=f'toolu_{uuid.uuid4()}',
            content=response_content,
            model='',
            role='assistant',
            type='message',
            stop_reason='tool_use',
            usage=BetaUsage(input_tokens=0, output_tokens=0),
        )
        return response_message, vlm_response_json

    def _resolve_cursor_target(self, parsed_screen_result, vlm_response_json):
        """Return the ``[x, y]`` point to move the cursor to, or ``None`` for no move."""
        box_id = vlm_response_json["box_id"]
        if box_id != -1:
            if vlm_response_json["next_action"] in self._ACTIONS_WITHOUT_CURSOR:
                return None
            element = self.find_element_by_id(parsed_screen_result, box_id)
            if element is None:
                # Fail with a clear message instead of an AttributeError on None;
                # acting at the current cursor position would hit the wrong target.
                raise ValueError(
                    f"box_id {box_id!r} returned by the model does not match any parsed screen element"
                )
            bbox = element.coordinates
            # Centre of the box; assumes bbox is (x1, y1, x2, y2) - TODO confirm.
            return [
                int((bbox[0] + bbox[2]) / 2),
                int((bbox[1] + bbox[3]) / 2),
            ]
        # box_id == -1: use the explicit coordinates, but only a well-formed x,y pair.
        coordinates = vlm_response_json.get("coordinates")
        if coordinates is not None and len(coordinates) == 2:
            return coordinates
        return None

    @staticmethod
    def _computer_tool_block(tool_input):
        """Wrap ``tool_input`` in a ``computer`` tool-use block with a fresh id."""
        return BetaToolUseBlock(
            id=f'toolu_{uuid.uuid4()}',
            input=tool_input,
            name='computer',
            type='tool_use',
        )

    def find_element_by_id(self, parsed_screen_result, box_id):
        """Return the parsed element whose ``element_id`` equals ``box_id``, else ``None``."""
        for element in parsed_screen_result["parsed_content_list"]:
            if element.element_id == box_id:
                return element
        return None
def create_dynamic_response_model(parsed_screen_result):
    """Build the pydantic model describing the structured reply for this screen.

    The ``box_id`` field's JSON-schema ``enum`` is generated from the element
    IDs in ``parsed_screen_result['parsed_content_list']`` plus ``-1``, and the
    ``next_action`` field's ``enum`` is the imported ``Action`` object.

    Args:
        parsed_screen_result: Mapping whose ``'parsed_content_list'`` entries
            expose an ``element_id`` attribute.

    Returns:
        The model class created by ``create_model`` under the name
        ``'TaskRunAgentResponse'``.
    """
    # Every detected element ID is selectable; -1 is the extra "no box" choice.
    box_id_choices = [
        *(entry.element_id for entry in parsed_screen_result['parsed_content_list']),
        -1,
    ]

    # Field name -> (type, Field) pairs, in the order they appear in the schema.
    field_definitions = {
        "reasoning": (
            str,
            Field(description="描述当前屏幕上的内容,考虑历史记录,然后说出你要这么做的理由。"),
        ),
        "next_action": (
            str,
            Field(
                description="选择一个操作类型如果找不到合适的操作请选择None",
                json_schema_extra={"enum": Action},
            ),
        ),
        "box_id": (
            int,
            Field(
                description="要操作的框ID如果框ID不存在就返回-1",
                json_schema_extra={"enum": box_id_choices},
            ),
        ),
        "coordinates": (
            list[int],
            Field(description="当 box_id 为-1时直接返回要操作对象的坐标只返回x,y这2个整数"),
        ),
        "value": (
            str,
            Field(description="仅当next_action为type时提供否则为None"),
        ),
        "current_task_id": (
            int,
            Field(description="请判断一下你正在完成第几个任务第一个任务是0"),
        ),
    }
    return create_model('TaskRunAgentResponse', **field_definitions)
# System-prompt template for TaskRunAgent. It is rendered with
# ``prompt.format(task_list=...)``, so ``{task_list}`` is the only substitution
# field and literal braces in the JSON samples are doubled (``{{`` / ``}}``).
# Every ```json fence is closed, and the output-format entries are
# comma-separated, so the model is shown a well-formed template.
prompt = """
### 目标 ###
你是一个任务执行者。请你根据屏幕截图和【所有元素】确定接下来要做什么如果任务完成把next_action设置为None
请根据以下任务列表判断一下你正在执行第几个任务current_task_id第一个任务是0任务列表如下
{task_list}
##########
### 注意 ###
- 要结合用户传入的屏幕图片观察其中的 box_id 框框和标号确定要操作哪一个box_id如果没有合适的请返回-1然后通过coordinates给出要操作对象的坐标。
- 每次应该只给出一个操作告诉我要对哪个box_id进行操作、输入什么内容或者滚动或者其他操作。
- 应该对当前屏幕进行分析,通过查看历史记录反思已完成的工作,然后描述您如何实现任务的逐步思考。
- 避免连续多次选择相同的操作/元素,如果发生这种情况,反思自己,可能出了什么问题,并预测不同的操作。
- 任务不是连续的上一次是1下一次不一定是2你要根据next_action进行判断。
- current_task_id 要在任务列表中找到,不要随便写。
- 当你觉得任务已经完成时请一定把next_action设置为'None',不然会重复执行。
- 涉及到输入type、key操作时其上一步操作一定是点击输入框操作。
##########
### 输出格式 ###
```json
{{
"reasoning": str, # 综合当前屏幕上的内容和历史记录,描述您是如何思考的。
"next_action": str, # 要执行的动作。
"box_id": int, # 要操作的框ID当next_action为left_click、right_click、double_click、hover时提供否则为None
"value": "xxx", # 仅当操作为type时提供value字段否则不包括value键
"current_task_id": int, # 当前正在执行第几个任务第一个任务是0
"coordinates": list[int] # 仅当box_id为-1时提供返回要操作对象的坐标只返回x,y这2个整数
}}
```
##########
### 案例 ###
任务列表:
0. 打开浏览器
1. 搜索亚马逊
2. 点击第一个搜索结果
一个例子:
```json
{{
"reasoning": "当前屏幕显示亚马逊的谷歌搜索结果在之前的操作中我已经在谷歌上搜索了亚马逊。然后我需要点击第一个搜索结果以转到amazon.com。",
"next_action": "left_click",
"box_id": 35,
"current_task_id": 0
}}
```
另一个例子:
```json
{{
"reasoning": "当前屏幕显示亚马逊的首页。没有之前的操作。因此,我需要在搜索栏中输入"Apple watch"",
"next_action": "type",
"box_id": 27,
"value": "Apple watch",
"current_task_id": 1
}}
```
另一个例子:
```json
{{
"reasoning": "当前屏幕没有显示'提交'按钮,我需要向下滚动以查看按钮是否可用。",
"next_action": "scroll_down",
"current_task_id": 2
}}
```
"""