mirror of
https://github.com/yuruotong1/autoMate.git
synced 2026-03-22 13:07:17 +08:00
169 lines
7.8 KiB
Python
169 lines
7.8 KiB
Python
import json
import platform
import re
import uuid
from typing import Optional

from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, BetaMessageParam, BetaUsage
from pydantic import BaseModel, Field
from xbrain.core.chat import run

from gradio_ui.agent.base_agent import BaseAgent
|
||
class TaskRunAgent(BaseAgent):
    """Agent that asks a vision-language model for the next GUI action.

    Each call sends the task plan, the detected screen elements and a
    screenshot to the model, then converts the JSON reply into an Anthropic
    ``BetaMessage`` whose content blocks describe ``computer`` tool calls.
    """

    def __init__(self):
        # NOTE(review): BaseAgent.__init__ is not invoked here; its signature is
        # not visible from this file, so the original behaviour is preserved.
        self.OUTPUT_DIR = "./tmp/outputs"

    def __call__(self, task_plan, parsed_screen_result, messages):
        """Ask the model for the next action and wrap it as tool-use blocks.

        Args:
            task_plan: The current task description; converted with ``str()``
                before being inserted into the system prompt.
            parsed_screen_result: Mapping that provides
                ``'parsed_content_list'`` (stringified into the prompt) and
                ``'base64_image'`` (base64 PNG data sent as a data URL).
            messages: Conversation history list. It is mutated in place: the
                screenshot message and the raw model reply are appended.

        Returns:
            A ``(response_message, vlm_response_json)`` tuple: the synthesized
            ``BetaMessage`` and the decoded JSON reply.

        Raises:
            json.JSONDecodeError: If the reply is not JSON, even after trying
                to unwrap a fenced ``json`` block.
            KeyError: If the reply lacks ``reasoning``/``next_action``, or
                lacks ``value`` when the action is ``type``.
        """
        screen_info = str(parsed_screen_result['parsed_content_list'])
        # Stored on the instance so the last rendered prompt can be inspected.
        self.SYSTEM_PROMPT = system_prompt.format(
            task_plan=str(task_plan),
            device=self.get_device(),
            screen_info=screen_info,
        )
        messages.append(
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Image is the screenshot of the current screen"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{parsed_screen_result['base64_image']}"},
                    },
                ],
            }
        )
        vlm_response = run(
            messages,
            user_prompt=self.SYSTEM_PROMPT,
            response_format=TaskRunAgentResponse,
        )
        messages.append({"role": "assistant", "content": vlm_response})

        try:
            vlm_response_json = json.loads(vlm_response)
        except json.JSONDecodeError:
            # The prompt shows the reply wrapped in a ```json fence, so a model
            # may echo that fence; unwrap it and retry before giving up.
            vlm_response_json = json.loads(self.extract_data(vlm_response, "json"))

        response_content = [BetaTextBlock(text=vlm_response_json["reasoning"], type='text')]

        # NOTE(review): nothing in this class sets 'box_centroid_coordinate'
        # (the model is asked for 'box_id'), so this branch only fires if the
        # reply happens to contain that key - confirm where the box id is
        # meant to be translated into screen coordinates.
        if 'box_centroid_coordinate' in vlm_response_json:
            response_content.append(
                self._tool_use_block(
                    {'action': 'mouse_move', 'coordinate': vlm_response_json["box_centroid_coordinate"]}
                )
            )

        next_action = vlm_response_json["next_action"]
        if next_action == "None":
            print("Task paused/completed.")
        else:
            tool_input = {'action': next_action}
            if next_action == "type":
                # Only the "type" action carries a payload.
                tool_input['text'] = vlm_response_json["value"]
            response_content.append(self._tool_use_block(tool_input))

        response_message = BetaMessage(
            id=f'toolu_{uuid.uuid4()}',
            content=response_content,
            model='',
            role='assistant',
            type='message',
            stop_reason='tool_use',
            usage=BetaUsage(input_tokens=0, output_tokens=0),
        )
        return response_message, vlm_response_json

    @staticmethod
    def _tool_use_block(tool_input):
        """Build a ``computer`` tool-use block with a fresh random id."""
        return BetaToolUseBlock(
            id=f'toolu_{uuid.uuid4()}',
            input=tool_input,
            name='computer',
            type='tool_use',
        )

    def get_device(self):
        """Return a short description of the host operating system.

        Returns:
            ``"Windows <release>"``, ``"Mac OS <version>"`` or
            ``"Linux <release>"``; for any other platform, the bare value of
            ``platform.system()``.
        """
        # Query the current operating system.
        system = platform.system()
        if system == "Windows":
            return f"Windows {platform.release()}"
        if system == "Darwin":
            return f"Mac OS {platform.mac_ver()[0]}"
        if system == "Linux":
            return f"Linux {platform.release()}"
        return system

    def extract_data(self, input_string, data_type):
        """Extract the body of the first fenced block tagged ``data_type``.

        Args:
            input_string: Text that may contain a fenced block such as
                a ``json`` or ``python`` code fence.
            data_type: The fence tag to look for. It is matched literally.

        Returns:
            The whitespace-stripped content of the first matching block. If
            the closing fence is missing, everything up to the end of the
            text is used. If no opening fence is found, ``input_string`` is
            returned unchanged.
        """
        # Escape the tag so names like "c++" are matched literally instead of
        # being interpreted as regex syntax.
        pattern = "```" + re.escape(data_type) + r"(.*?)(```|$)"
        # re.DOTALL lets '.' span newlines so multi-line blocks are captured.
        matches = re.findall(pattern, input_string, re.DOTALL)
        # Each match is (content, terminator); only the content is wanted.
        return matches[0][0].strip() if matches else input_string
|
||
|
||
class TaskRunAgentResponse(BaseModel):
    """Schema of the structured reply requested from the model.

    ``box_id`` and ``value`` are optional: their descriptions tell the model
    to send None when they do not apply, so the annotations must accept
    ``None`` as well as the concrete type.
    """

    # Free-text explanation of the screen state and the chosen step.
    reasoning: str = Field(description="描述当前屏幕上的内容,考虑历史记录,然后描述您如何实现任务的逐步思考,一次从可用操作中选择一个操作。")
    # Action name; the allowed values are advertised through the JSON schema.
    next_action: str = Field(
        description="选择一个操作类型,如果找不到合适的操作,请选择None",
        json_schema_extra={
            "enum": ["type", "left_click", "right_click", "double_click",
                     "hover", "scroll_up", "scroll_down", "wait", "None"]
        }
    )
    # Target element id for pointer actions; None for the other actions.
    box_id: Optional[int] = Field(description="要操作的框ID,当next_action为left_click、right_click、double_click、hover时提供,否则为None", default=None)
    # Text to enter for the "type" action; None for the other actions.
    value: Optional[str] = Field(description="仅当next_action为type时提供,否则为None", default=None)
|
||
|
||
# Prompt template rendered with str.format(). It consumes the placeholders
# {task_plan} and {screen_info}; literal braces in the JSON samples are
# doubled so that format() emits single braces. The template has no {device}
# placeholder, so a `device` keyword passed to format() is silently ignored.
# The key name "next_action" and the bare action names must stay in sync with
# the code that compares the reply's next_action against "type" / "None".
system_prompt = """
### 目标 ###
你是一个自动化规划师,需要完成用户的任务。请你根据屏幕信息确定【下一步操作】,以完成任务:

你当前的任务是:
{task_plan}

以下是用yolo检测的当前屏幕上的所有元素:

{screen_info}
##########

### 注意 ###
1. 每次应该只给出一个操作。
2. 应该对当前屏幕进行分析,通过查看历史记录反思已完成的工作,然后描述您如何实现任务的逐步思考。
3. 在"next_action"中附上下一步操作预测。
4. 不应包括其他操作,例如键盘快捷键。
5. 当任务完成时,不要完成额外的操作。你应该在json字段中说"next_action": "None"。
6. 任务涉及购买多个产品或浏览多个页面。你应该将其分解为子目标,并按照说明的顺序一个一个地完成每个子目标。
7. 避免连续多次选择相同的操作/元素,如果发生这种情况,反思自己,可能出了什么问题,并预测不同的操作。
8. 如果您收到登录信息页面或验证码页面的提示,或者您认为下一步操作需要用户许可,您应该在json字段中说"next_action": "None"。
9. 你只能使用鼠标和键盘与计算机进行交互。
10. 你只能与桌面图形用户界面交互(无法访问终端或应用程序菜单)。
11. 如果当前屏幕没有显示任何可操作的元素,并且当前屏幕不能下滑,请返回None。

##########
### 输出格式 ###
```json
{{
"reasoning": str, # 描述当前屏幕上的内容,考虑历史记录,然后描述您如何实现任务的逐步思考,一次从可用操作中选择一个操作。
"next_action": "action_type" | "None", # 一次一个操作,只填写下面列出的操作类型之一。
"box_id": n,
"value": "xxx" # 仅当操作为type时提供value字段,否则不包括value键
}}
```

【next_action】仅包括下面之一:
- type:输入一串文本。
- left_click:将鼠标移动到框ID并左键单击。
- right_click:将鼠标移动到框ID并右键单击。
- double_click:将鼠标移动到框ID并双击。
- hover:将鼠标移动到框ID。
- scroll_up:向上滚动屏幕以查看之前的内容。
- scroll_down:当所需按钮不可见或您需要查看更多内容时,向下滚动屏幕。
- wait:等待1秒钟让设备加载或响应。

##########
### 案例 ###
一个例子:
```json
{{
"reasoning": "当前屏幕显示亚马逊的谷歌搜索结果,在之前的操作中,我已经在谷歌上搜索了亚马逊。然后我需要点击第一个搜索结果以转到amazon.com。",
"next_action": "left_click",
"box_id": m
}}
```

另一个例子:
```json
{{
"reasoning": "当前屏幕显示亚马逊的首页。没有之前的操作。因此,我需要在搜索栏中输入'Apple watch'。",
"next_action": "type",
"box_id": n,
"value": "Apple watch"
}}
```

另一个例子:
```json
{{
"reasoning": "当前屏幕没有显示'提交'按钮,我需要向下滚动以查看按钮是否可用。",
"next_action": "scroll_down"
}}
```
"""
|
||
|