mirror of
https://github.com/yuruotong1/autoMate.git
synced 2026-03-22 13:07:17 +08:00
@@ -198,10 +198,10 @@ class VLMAgent:
|
||||
You are using a Windows device.
|
||||
You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
|
||||
You can only interact with the desktop GUI (no terminal or application menu access).
|
||||
|
||||
You may be given some history plan and actions, this is the response from the previous loop.
|
||||
You should carefully consider your plan base on the task, screenshot, and history actions.
|
||||
|
||||
|
||||
Here is the list of all detected bounding boxes by IDs on the screen and their description:{screen_info}
|
||||
|
||||
Your available "Next Action" only include:
|
||||
|
||||
@@ -18,8 +18,6 @@ from gradio_ui.loop import (
|
||||
sampling_loop_sync,
|
||||
)
|
||||
from gradio_ui.tools import ToolResult
|
||||
import requests
|
||||
from requests.exceptions import RequestException
|
||||
import base64
|
||||
|
||||
CONFIG_DIR = Path("~/.anthropic").expanduser()
|
||||
|
||||
@@ -9,6 +9,7 @@ from .base import BaseAnthropicTool, ToolError, ToolResult
|
||||
from .screen_capture import get_screenshot
|
||||
import requests
|
||||
import re
|
||||
import pyautogui
|
||||
|
||||
OUTPUT_DIR = "./tmp/outputs"
|
||||
TYPING_DELAY_MS = 12
|
||||
@@ -40,11 +41,6 @@ MAX_SCALING_TARGETS: dict[str, Resolution] = {
|
||||
"FWXGA": Resolution(width=1366, height=768), # ~16:9
|
||||
}
|
||||
|
||||
|
||||
class ScalingSource(StrEnum):
|
||||
COMPUTER = "computer"
|
||||
API = "api"
|
||||
|
||||
class ComputerToolOptions(TypedDict):
|
||||
display_height_px: int
|
||||
display_width_px: int
|
||||
@@ -65,17 +61,13 @@ class ComputerTool(BaseAnthropicTool):
|
||||
height: int
|
||||
display_num: int | None
|
||||
_screenshot_delay = 2.0
|
||||
_scaling_enabled = True
|
||||
|
||||
|
||||
@property
|
||||
def options(self) -> ComputerToolOptions:
|
||||
width, height = self.scale_coordinates(
|
||||
ScalingSource.COMPUTER, self.width, self.height
|
||||
)
|
||||
# 直接使用原始尺寸,不进行缩放
|
||||
return {
|
||||
"display_width_px": width,
|
||||
"display_height_px": height,
|
||||
"display_width_px": self.width,
|
||||
"display_height_px": self.height,
|
||||
"display_number": self.display_num,
|
||||
}
|
||||
|
||||
@@ -89,7 +81,6 @@ class ComputerTool(BaseAnthropicTool):
|
||||
self.display_num = None
|
||||
self.offset_x = 0
|
||||
self.offset_y = 0
|
||||
self.is_scaling = is_scaling
|
||||
self.width, self.height = self.get_screen_size()
|
||||
print(f"screen size: {self.width}, {self.height}")
|
||||
self.key_conversion = {"Page_Down": "pagedown",
|
||||
@@ -104,7 +95,7 @@ class ComputerTool(BaseAnthropicTool):
|
||||
coordinate: tuple[int, int] | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
print(f"action: {action}, text: {text}, coordinate: {coordinate}, is_scaling: {self.is_scaling}")
|
||||
print(f"action: {action}, text: {text}, coordinate: {coordinate},")
|
||||
if action in ("mouse_move", "left_click_drag"):
|
||||
if coordinate is None:
|
||||
raise ToolError(f"coordinate is required for {action}")
|
||||
@@ -115,23 +106,15 @@ class ComputerTool(BaseAnthropicTool):
|
||||
# if not all(isinstance(i, int) and i >= 0 for i in coordinate):
|
||||
if not all(isinstance(i, int) for i in coordinate):
|
||||
raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
|
||||
if self.is_scaling:
|
||||
x, y = self.scale_coordinates(
|
||||
ScalingSource.API, coordinate[0], coordinate[1]
|
||||
)
|
||||
else:
|
||||
x, y = coordinate
|
||||
# print(f"scaled_coordinates: {x}, {y}")
|
||||
# print(f"offset: {self.offset_x}, {self.offset_y}")
|
||||
# x += self.offset_x # TODO - check if this is needed
|
||||
# y += self.offset_y
|
||||
|
||||
x, y = coordinate
|
||||
print(f"mouse move to {x}, {y}")
|
||||
if action == "mouse_move":
|
||||
self.run_command(f"pyautogui.moveTo({x}, {y})")
|
||||
pyautogui.moveTo(x, y)
|
||||
return ToolResult(output=f"Moved mouse to ({x}, {y})")
|
||||
elif action == "left_click_drag":
|
||||
current_x, current_y = self.run_command("pyautogui.position()")
|
||||
self.run_command(f"pyautogui.dragTo({x}, {y}, duration=0.5)")
|
||||
current_x, current_y = pyautogui.position()
|
||||
pyautogui.dragTo(x, y, duration=0.5)
|
||||
return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
|
||||
if action in ("key", "type"):
|
||||
if text is None:
|
||||
@@ -146,17 +129,17 @@ class ComputerTool(BaseAnthropicTool):
|
||||
for key in keys:
|
||||
key = self.key_conversion.get(key.strip(), key.strip())
|
||||
key = key.lower()
|
||||
self.run_command(f"pyautogui.keyDown('{key}')") # Press down each key
|
||||
pyautogui.keyDown(key)
|
||||
for key in reversed(keys):
|
||||
key = self.key_conversion.get(key.strip(), key.strip())
|
||||
key = key.lower()
|
||||
self.run_command(f"pyautogui.keyUp('{key}')") # Release each key in reverse order
|
||||
pyautogui.keyUp(key)
|
||||
return ToolResult(output=f"Pressed keys: {text}")
|
||||
elif action == "type":
|
||||
# default click before type TODO: check if this is needed
|
||||
self.run_command("pyautogui.click()")
|
||||
self.run_command(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")
|
||||
self.run_command("pyautogui.press('enter')")
|
||||
pyautogui.click()
|
||||
pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000)
|
||||
pyautogui.press('enter')
|
||||
screenshot_base64 = (await self.screenshot()).base64_image
|
||||
return ToolResult(output=text, base64_image=screenshot_base64)
|
||||
if action in (
|
||||
@@ -175,28 +158,28 @@ class ComputerTool(BaseAnthropicTool):
|
||||
if action == "screenshot":
|
||||
return await self.screenshot()
|
||||
elif action == "cursor_position":
|
||||
x, y = self.run_command("pyautogui.position()")
|
||||
x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
|
||||
x, y = pyautogui.position()
|
||||
# 直接返回原始坐标,不进行缩放
|
||||
return ToolResult(output=f"X={x},Y={y}")
|
||||
else:
|
||||
if action == "left_click":
|
||||
self.run_command("pyautogui.click()")
|
||||
pyautogui.click()
|
||||
elif action == "right_click":
|
||||
self.run_command("pyautogui.rightClick()")
|
||||
pyautogui.rightClick()
|
||||
elif action == "middle_click":
|
||||
self.run_command("pyautogui.middleClick()")
|
||||
pyautogui.middleClick()
|
||||
elif action == "double_click":
|
||||
self.run_command("pyautogui.doubleClick()")
|
||||
pyautogui.doubleClick()
|
||||
elif action == "left_press":
|
||||
self.run_command("pyautogui.mouseDown()")
|
||||
pyautogui.mouseDown()
|
||||
time.sleep(1)
|
||||
self.run_command("pyautogui.mouseUp()")
|
||||
pyautogui.mouseUp()
|
||||
return ToolResult(output=f"Performed {action}")
|
||||
if action in ("scroll_up", "scroll_down"):
|
||||
if action == "scroll_up":
|
||||
self.run_command("pyautogui.scroll(100)")
|
||||
pyautogui.scroll(100)
|
||||
elif action == "scroll_down":
|
||||
self.run_command("pyautogui.scroll(-100)")
|
||||
pyautogui.scroll(-100)
|
||||
return ToolResult(output=f"Performed {action}")
|
||||
if action == "hover":
|
||||
return ToolResult(output=f"Performed {action}")
|
||||
@@ -204,31 +187,6 @@ class ComputerTool(BaseAnthropicTool):
|
||||
time.sleep(1)
|
||||
return ToolResult(output=f"Performed {action}")
|
||||
raise ToolError(f"Invalid action: {action}")
|
||||
def run_command(self, action: str):
|
||||
"""
|
||||
|
||||
Executes a python command on the server. Only return tuple of x,y when action is "pyautogui.position()"
|
||||
"""
|
||||
prefix = "import pyautogui; pyautogui.FAILSAFE = False;"
|
||||
command_list = ["python", "-c", f"{prefix} {action}"]
|
||||
parse = action == "pyautogui.position()"
|
||||
if parse:
|
||||
command_list[-1] = f"{prefix} print({action})"
|
||||
try:
|
||||
print(f"run command: {command_list}")
|
||||
# 使用 tool.execute_command 替代 requests.post
|
||||
response = tool.execute_command(command_list)
|
||||
time.sleep(0.7) # avoid async error as actions take time to complete
|
||||
print(f"action executed")
|
||||
if parse:
|
||||
output = response['output'].strip()
|
||||
match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output)
|
||||
if not match:
|
||||
raise ToolError(f"Could not parse coordinates from output: {output}")
|
||||
x, y = map(int, match.groups())
|
||||
return x, y
|
||||
except requests.exceptions.RequestException as e:
|
||||
raise ToolError(f"An error occurred while trying to execute the command: {str(e)}")
|
||||
|
||||
async def screenshot(self):
|
||||
if not hasattr(self, 'target_dimension'):
|
||||
@@ -249,35 +207,7 @@ class ComputerTool(BaseAnthropicTool):
|
||||
padding_image.paste(screenshot, (0, 0))
|
||||
return padding_image
|
||||
|
||||
def scale_coordinates(self, source: ScalingSource, x: int, y: int):
|
||||
"""Scale coordinates to a target maximum resolution."""
|
||||
if not self._scaling_enabled:
|
||||
return x, y
|
||||
ratio = self.width / self.height
|
||||
target_dimension = None
|
||||
for target_name, dimension in MAX_SCALING_TARGETS.items():
|
||||
# allow some error in the aspect ratio - not ratios are exactly 16:9
|
||||
if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
|
||||
if dimension["width"] < self.width:
|
||||
target_dimension = dimension
|
||||
self.target_dimension = target_dimension
|
||||
# print(f"target_dimension: {target_dimension}")
|
||||
break
|
||||
if target_dimension is None:
|
||||
# TODO: currently we force the target to be WXGA (16:10), when it cannot find a match
|
||||
target_dimension = MAX_SCALING_TARGETS["WXGA"]
|
||||
self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
|
||||
# should be less than 1
|
||||
x_scaling_factor = target_dimension["width"] / self.width
|
||||
y_scaling_factor = target_dimension["height"] / self.height
|
||||
if source == ScalingSource.API:
|
||||
if x > self.width or y > self.height:
|
||||
raise ToolError(f"Coordinates {x}, {y} are out of bounds")
|
||||
# scale up
|
||||
return round(x / x_scaling_factor), round(y / y_scaling_factor)
|
||||
# scale down
|
||||
return round(x * x_scaling_factor), round(y * y_scaling_factor)
|
||||
|
||||
|
||||
def get_screen_size(self):
|
||||
"""Return width and height of the screen"""
|
||||
try:
|
||||
|
||||
70
main.py
70
main.py
@@ -1,12 +1,10 @@
|
||||
import argparse
|
||||
import subprocess
|
||||
import signal
|
||||
import sys
|
||||
import platform
|
||||
from threading import Thread
|
||||
import requests
|
||||
from gradio_ui import app
|
||||
from util import download_weights
|
||||
import time
|
||||
import torch
|
||||
import socket
|
||||
|
||||
def run():
|
||||
try:
|
||||
@@ -17,45 +15,59 @@ def run():
|
||||
except Exception:
|
||||
print("显卡驱动不适配,请根据readme安装合适版本的 torch!")
|
||||
|
||||
# 启动 server.py 子进程,并捕获其输出
|
||||
# Windows:
|
||||
if platform.system() == 'Windows':
|
||||
server_process = subprocess.Popen(
|
||||
["python", "./server.py"],
|
||||
stdout=subprocess.PIPE, # 捕获标准输出
|
||||
stderr=subprocess.PIPE,
|
||||
creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
|
||||
text=True
|
||||
)
|
||||
else:
|
||||
server_process = subprocess.Popen(
|
||||
["python", "./server.py"],
|
||||
stdout=subprocess.PIPE, # 捕获标准输出
|
||||
stderr=subprocess.PIPE,
|
||||
start_new_session=True,
|
||||
text=True
|
||||
)
|
||||
|
||||
server_process = subprocess.Popen(
|
||||
["python", "./omniserver.py"],
|
||||
stdout=subprocess.PIPE, # 捕获标准输出
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
# 下载权重文件
|
||||
download_weights.download()
|
||||
print("启动Omniserver服务中,约40s左右,请耐心等待!")
|
||||
print("启动Omniserver服务中,约5分钟左右,因为加载模型真的超级慢,请耐心等待!")
|
||||
# 启动 Gradio UI
|
||||
# 等待 server_process 打印出 "Started server process"
|
||||
while True:
|
||||
output = server_process.stdout.readline()
|
||||
if "Omniparser initialized" in output:
|
||||
print("Omniparseer服务启动成功...")
|
||||
res = requests.get("http://127.0.0.1:8000/probe/")
|
||||
if res.status_code == 200 and res.json().get("message", None):
|
||||
print("Omniparser服务启动成功...")
|
||||
break
|
||||
if server_process.poll() is not None:
|
||||
raise RuntimeError("Server process terminated unexpectedly")
|
||||
|
||||
stdout_thread = Thread(
|
||||
target=stream_reader,
|
||||
args=(server_process.stdout, "SERVER-OUT")
|
||||
)
|
||||
|
||||
stderr_thread = Thread(
|
||||
target=stream_reader,
|
||||
args=(server_process.stderr, "SERVER-ERR")
|
||||
)
|
||||
stdout_thread.daemon = True
|
||||
stderr_thread.daemon = True
|
||||
stdout_thread.start()
|
||||
stderr_thread.start()
|
||||
app.run()
|
||||
finally:
|
||||
# 确保在主进程退出时终止子进程
|
||||
if server_process.poll() is None: # 如果进程还在运行
|
||||
server_process.terminate() # 发送终止信号
|
||||
server_process.wait(timeout=5) # 等待进程结束
|
||||
server_process.wait(timeout=8) # 等待进程结束
|
||||
|
||||
def stream_reader(pipe, prefix):
|
||||
for line in pipe:
|
||||
print(f"[{prefix}]", line, end="", flush=True)
|
||||
|
||||
def is_port_occupied(port):
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
return s.connect_ex(('localhost', port)) == 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 检测8000端口是否被占用
|
||||
if is_port_occupied(8000):
|
||||
print("8000端口被占用,请先关闭占用该端口的进程")
|
||||
exit()
|
||||
run()
|
||||
@@ -1,5 +1,5 @@
|
||||
'''
|
||||
python -m server --som_model_path ../../weights/icon_detect/model.pt --caption_model_name florence2 --caption_model_path ../../weights/icon_caption_florence --device cuda --BOX_TRESHOLD 0.05
|
||||
python -m omniparserserver --som_model_path ../../weights/icon_detect/model.pt --caption_model_name florence2 --caption_model_path ../../weights/icon_caption_florence --device cuda --BOX_TRESHOLD 0.05
|
||||
'''
|
||||
|
||||
import sys
|
||||
@@ -14,7 +14,7 @@ sys.path.append(root_dir)
|
||||
from util.omniparser import Omniparser
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(description='autoMate API')
|
||||
parser = argparse.ArgumentParser(description='Omniparser API')
|
||||
parser.add_argument('--som_model_path', type=str, default='./weights/icon_detect/model.pt', help='Path to the som model')
|
||||
parser.add_argument('--caption_model_name', type=str, default='florence2', help='Name of the caption model')
|
||||
parser.add_argument('--caption_model_path', type=str, default='./weights/icon_caption', help='Path to the caption model')
|
||||
@@ -45,7 +45,10 @@ async def parse(parse_request: ParseRequest):
|
||||
|
||||
@app.get("/probe/")
|
||||
async def root():
|
||||
return {"message": "API ready"}
|
||||
return {"message": "Omniparser API ready"}
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
uvicorn.run("server:app", host=args.host, port=args.port, reload=True)
|
||||
uvicorn.run("omniserver:app", host=args.host, port=args.port, reload=True)
|
||||
@@ -10,7 +10,7 @@ class Omniparser(object):
|
||||
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||
self.som_model = get_yolo_model(model_path=config['som_model_path'])
|
||||
self.caption_model_processor = get_caption_model_processor(model_name=config['caption_model_name'], model_name_or_path=config['caption_model_path'], device=device)
|
||||
print('Omniparser initialized!')
|
||||
print('Server initialized!')
|
||||
|
||||
def parse(self, image_base64: str):
|
||||
image_bytes = base64.b64decode(image_base64)
|
||||
|
||||
@@ -96,7 +96,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
|
||||
|
||||
model, processor = caption_model_processor['model'], caption_model_processor['processor']
|
||||
if not prompt:
|
||||
if 'florence' in model.config.name_or_path:
|
||||
if 'florence' in model.config.model_type:
|
||||
prompt = "<CAPTION>"
|
||||
else:
|
||||
prompt = "The image shows"
|
||||
@@ -111,7 +111,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
|
||||
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt", do_resize=False).to(device=device, dtype=torch.float16)
|
||||
else:
|
||||
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device)
|
||||
if 'florence' in model.config.name_or_path:
|
||||
if 'florence' in model.config.model_type:
|
||||
generated_ids = model.generate(input_ids=inputs["input_ids"],pixel_values=inputs["pixel_values"],max_new_tokens=20,num_beams=1, do_sample=False)
|
||||
else:
|
||||
generated_ids = model.generate(**inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, num_return_sequences=1) # temperature=0.01, do_sample=True,
|
||||
|
||||
Reference in New Issue
Block a user