Merge pull request #55 from yuruotong1/dev

Dev
This commit is contained in:
Dongle
2025-03-07 08:26:24 +08:00
committed by GitHub
7 changed files with 78 additions and 135 deletions

View File

@@ -198,10 +198,10 @@ class VLMAgent:
You are using a Windows device.
You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
You can only interact with the desktop GUI (no terminal or application menu access).
You may be given some history of plans and actions; this is the response from the previous loop.
You should carefully consider your plan based on the task, screenshot, and history actions.
Here is the list of all detected bounding boxes by IDs on the screen and their description:{screen_info}
Your available "Next Action" only include:

View File

@@ -18,8 +18,6 @@ from gradio_ui.loop import (
sampling_loop_sync,
)
from gradio_ui.tools import ToolResult
import requests
from requests.exceptions import RequestException
import base64
CONFIG_DIR = Path("~/.anthropic").expanduser()

View File

@@ -9,6 +9,7 @@ from .base import BaseAnthropicTool, ToolError, ToolResult
from .screen_capture import get_screenshot
import requests
import re
import pyautogui
OUTPUT_DIR = "./tmp/outputs"
TYPING_DELAY_MS = 12
@@ -40,11 +41,6 @@ MAX_SCALING_TARGETS: dict[str, Resolution] = {
"FWXGA": Resolution(width=1366, height=768), # ~16:9
}
class ScalingSource(StrEnum):
COMPUTER = "computer"
API = "api"
class ComputerToolOptions(TypedDict):
display_height_px: int
display_width_px: int
@@ -65,17 +61,13 @@ class ComputerTool(BaseAnthropicTool):
height: int
display_num: int | None
_screenshot_delay = 2.0
_scaling_enabled = True
@property
def options(self) -> ComputerToolOptions:
width, height = self.scale_coordinates(
ScalingSource.COMPUTER, self.width, self.height
)
# 直接使用原始尺寸,不进行缩放
return {
"display_width_px": width,
"display_height_px": height,
"display_width_px": self.width,
"display_height_px": self.height,
"display_number": self.display_num,
}
@@ -89,7 +81,6 @@ class ComputerTool(BaseAnthropicTool):
self.display_num = None
self.offset_x = 0
self.offset_y = 0
self.is_scaling = is_scaling
self.width, self.height = self.get_screen_size()
print(f"screen size: {self.width}, {self.height}")
self.key_conversion = {"Page_Down": "pagedown",
@@ -104,7 +95,7 @@ class ComputerTool(BaseAnthropicTool):
coordinate: tuple[int, int] | None = None,
**kwargs,
):
print(f"action: {action}, text: {text}, coordinate: {coordinate}, is_scaling: {self.is_scaling}")
print(f"action: {action}, text: {text}, coordinate: {coordinate},")
if action in ("mouse_move", "left_click_drag"):
if coordinate is None:
raise ToolError(f"coordinate is required for {action}")
@@ -115,23 +106,15 @@ class ComputerTool(BaseAnthropicTool):
# if not all(isinstance(i, int) and i >= 0 for i in coordinate):
if not all(isinstance(i, int) for i in coordinate):
raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
if self.is_scaling:
x, y = self.scale_coordinates(
ScalingSource.API, coordinate[0], coordinate[1]
)
else:
x, y = coordinate
# print(f"scaled_coordinates: {x}, {y}")
# print(f"offset: {self.offset_x}, {self.offset_y}")
# x += self.offset_x # TODO - check if this is needed
# y += self.offset_y
x, y = coordinate
print(f"mouse move to {x}, {y}")
if action == "mouse_move":
self.run_command(f"pyautogui.moveTo({x}, {y})")
pyautogui.moveTo(x, y)
return ToolResult(output=f"Moved mouse to ({x}, {y})")
elif action == "left_click_drag":
current_x, current_y = self.run_command("pyautogui.position()")
self.run_command(f"pyautogui.dragTo({x}, {y}, duration=0.5)")
current_x, current_y = pyautogui.position()
pyautogui.dragTo(x, y, duration=0.5)
return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
if action in ("key", "type"):
if text is None:
@@ -146,17 +129,17 @@ class ComputerTool(BaseAnthropicTool):
for key in keys:
key = self.key_conversion.get(key.strip(), key.strip())
key = key.lower()
self.run_command(f"pyautogui.keyDown('{key}')") # Press down each key
pyautogui.keyDown(key)
for key in reversed(keys):
key = self.key_conversion.get(key.strip(), key.strip())
key = key.lower()
self.run_command(f"pyautogui.keyUp('{key}')") # Release each key in reverse order
pyautogui.keyUp(key)
return ToolResult(output=f"Pressed keys: {text}")
elif action == "type":
# default click before type TODO: check if this is needed
self.run_command("pyautogui.click()")
self.run_command(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")
self.run_command("pyautogui.press('enter')")
pyautogui.click()
pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000)
pyautogui.press('enter')
screenshot_base64 = (await self.screenshot()).base64_image
return ToolResult(output=text, base64_image=screenshot_base64)
if action in (
@@ -175,28 +158,28 @@ class ComputerTool(BaseAnthropicTool):
if action == "screenshot":
return await self.screenshot()
elif action == "cursor_position":
x, y = self.run_command("pyautogui.position()")
x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
x, y = pyautogui.position()
# 直接返回原始坐标,不进行缩放
return ToolResult(output=f"X={x},Y={y}")
else:
if action == "left_click":
self.run_command("pyautogui.click()")
pyautogui.click()
elif action == "right_click":
self.run_command("pyautogui.rightClick()")
pyautogui.rightClick()
elif action == "middle_click":
self.run_command("pyautogui.middleClick()")
pyautogui.middleClick()
elif action == "double_click":
self.run_command("pyautogui.doubleClick()")
pyautogui.doubleClick()
elif action == "left_press":
self.run_command("pyautogui.mouseDown()")
pyautogui.mouseDown()
time.sleep(1)
self.run_command("pyautogui.mouseUp()")
pyautogui.mouseUp()
return ToolResult(output=f"Performed {action}")
if action in ("scroll_up", "scroll_down"):
if action == "scroll_up":
self.run_command("pyautogui.scroll(100)")
pyautogui.scroll(100)
elif action == "scroll_down":
self.run_command("pyautogui.scroll(-100)")
pyautogui.scroll(-100)
return ToolResult(output=f"Performed {action}")
if action == "hover":
return ToolResult(output=f"Performed {action}")
@@ -204,31 +187,6 @@ class ComputerTool(BaseAnthropicTool):
time.sleep(1)
return ToolResult(output=f"Performed {action}")
raise ToolError(f"Invalid action: {action}")
# NOTE(review): this diff extract lost its indentation; code lines are kept verbatim.
def run_command(self, action: str):
"""
Run a pyautogui expression in a separate Python process via tool.execute_command.

Returns an (x, y) int tuple only when *action* is exactly "pyautogui.position()";
every other action implicitly returns None. Raises ToolError when the position
output cannot be parsed or when the command transport fails.
"""
# FAILSAFE=False keeps pyautogui from aborting when the cursor hits a screen corner
prefix = "import pyautogui; pyautogui.FAILSAFE = False;"
command_list = ["python", "-c", f"{prefix} {action}"]
# only the position query produces output that must be parsed back
parse = action == "pyautogui.position()"
if parse:
# wrap in print() so the child process writes the Point(...) repr to stdout
command_list[-1] = f"{prefix} print({action})"
try:
print(f"run command: {command_list}")
# use tool.execute_command instead of requests.post
response = tool.execute_command(command_list)
time.sleep(0.7) # avoid async error as actions take time to complete
print(f"action executed")
if parse:
# pyautogui.position() prints e.g. "Point(x=123, y=456)"
output = response['output'].strip()
match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output)
if not match:
raise ToolError(f"Could not parse coordinates from output: {output}")
x, y = map(int, match.groups())
return x, y
# NOTE(review): tool.execute_command likely never raises RequestException — confirm;
# any other exception type propagates uncaught.
except requests.exceptions.RequestException as e:
raise ToolError(f"An error occurred while trying to execute the command: {str(e)}")
async def screenshot(self):
if not hasattr(self, 'target_dimension'):
@@ -249,35 +207,7 @@ class ComputerTool(BaseAnthropicTool):
padding_image.paste(screenshot, (0, 0))
return padding_image
def scale_coordinates(self, source: ScalingSource, x: int, y: int):
"""Scale coordinates to a target maximum resolution.

COMPUTER-sourced coordinates (real screen space) are scaled DOWN into the
chosen target resolution; API-sourced coordinates (model/target space) are
scaled UP to real screen space. Also caches the chosen target resolution
in self.target_dimension as a side effect.
"""
if not self._scaling_enabled:
return x, y
ratio = self.width / self.height
target_dimension = None
for target_name, dimension in MAX_SCALING_TARGETS.items():
# allow some error in the aspect ratio - not all ratios are exactly 16:9
if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
if dimension["width"] < self.width:
target_dimension = dimension
self.target_dimension = target_dimension
# print(f"target_dimension: {target_dimension}")
break
if target_dimension is None:
# TODO: currently we force the target to be WXGA (16:10), when it cannot find a match
target_dimension = MAX_SCALING_TARGETS["WXGA"]
self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
# should be less than 1
x_scaling_factor = target_dimension["width"] / self.width
y_scaling_factor = target_dimension["height"] / self.height
if source == ScalingSource.API:
# NOTE(review): bounds are checked against the real screen size, yet API
# coordinates live in target space — it looks like this should compare
# against target_dimension; confirm intended behavior.
if x > self.width or y > self.height:
raise ToolError(f"Coordinates {x}, {y} are out of bounds")
# scale up
return round(x / x_scaling_factor), round(y / y_scaling_factor)
# scale down
return round(x * x_scaling_factor), round(y * y_scaling_factor)
def get_screen_size(self):
"""Return width and height of the screen"""
try:

70
main.py
View File

@@ -1,12 +1,10 @@
import argparse
import subprocess
import signal
import sys
import platform
from threading import Thread
import requests
from gradio_ui import app
from util import download_weights
import time
import torch
import socket
def run():
try:
@@ -17,45 +15,59 @@ def run():
except Exception:
print("显卡驱动不适配请根据readme安装合适版本的 torch")
# 启动 server.py 子进程,并捕获其输出
# Windows:
if platform.system() == 'Windows':
server_process = subprocess.Popen(
["python", "./server.py"],
stdout=subprocess.PIPE, # 捕获标准输出
stderr=subprocess.PIPE,
creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
text=True
)
else:
server_process = subprocess.Popen(
["python", "./server.py"],
stdout=subprocess.PIPE, # 捕获标准输出
stderr=subprocess.PIPE,
start_new_session=True,
text=True
)
server_process = subprocess.Popen(
["python", "./omniserver.py"],
stdout=subprocess.PIPE, # 捕获标准输出
stderr=subprocess.PIPE,
text=True
)
try:
# 下载权重文件
download_weights.download()
print("启动Omniserver服务中40s左右,请耐心等待!")
print("启动Omniserver服务中5分钟左右因为加载模型真的超级慢,请耐心等待!")
# 启动 Gradio UI
# 等待 server_process 打印出 "Started server process"
while True:
output = server_process.stdout.readline()
if "Omniparser initialized" in output:
print("Omniparseer服务启动成功...")
res = requests.get("http://127.0.0.1:8000/probe/")
if res.status_code == 200 and res.json().get("message", None):
print("Omniparser服务启动成功...")
break
if server_process.poll() is not None:
raise RuntimeError("Server process terminated unexpectedly")
stdout_thread = Thread(
target=stream_reader,
args=(server_process.stdout, "SERVER-OUT")
)
stderr_thread = Thread(
target=stream_reader,
args=(server_process.stderr, "SERVER-ERR")
)
stdout_thread.daemon = True
stderr_thread.daemon = True
stdout_thread.start()
stderr_thread.start()
app.run()
finally:
# 确保在主进程退出时终止子进程
if server_process.poll() is None: # 如果进程还在运行
server_process.terminate() # 发送终止信号
server_process.wait(timeout=5) # 等待进程结束
server_process.wait(timeout=8) # 等待进程结束
def stream_reader(pipe, prefix):
    """Relay every line read from *pipe* to stdout, tagged with "[<prefix>]".

    Each line keeps its own trailing newline (end=""), and output is flushed
    immediately so child-process logs interleave promptly with ours.
    """
    tag = f"[{prefix}]"
    for raw_line in pipe:
        print(tag, raw_line, end="", flush=True)
def is_port_occupied(port):
    """Return True when something on localhost already accepts TCP on *port*."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 on a successful connection instead of raising
        return probe.connect_ex(('localhost', port)) == 0
    finally:
        probe.close()
if __name__ == '__main__':
# 检测8000端口是否被占用
if is_port_occupied(8000):
print("8000端口被占用请先关闭占用该端口的进程")
exit()
run()

View File

@@ -1,5 +1,5 @@
'''
python -m server --som_model_path ../../weights/icon_detect/model.pt --caption_model_name florence2 --caption_model_path ../../weights/icon_caption_florence --device cuda --BOX_TRESHOLD 0.05
python -m omniparserserver --som_model_path ../../weights/icon_detect/model.pt --caption_model_name florence2 --caption_model_path ../../weights/icon_caption_florence --device cuda --BOX_TRESHOLD 0.05
'''
import sys
@@ -14,7 +14,7 @@ sys.path.append(root_dir)
from util.omniparser import Omniparser
def parse_arguments():
parser = argparse.ArgumentParser(description='autoMate API')
parser = argparse.ArgumentParser(description='Omniparser API')
parser.add_argument('--som_model_path', type=str, default='./weights/icon_detect/model.pt', help='Path to the som model')
parser.add_argument('--caption_model_name', type=str, default='florence2', help='Name of the caption model')
parser.add_argument('--caption_model_path', type=str, default='./weights/icon_caption', help='Path to the caption model')
@@ -45,7 +45,10 @@ async def parse(parse_request: ParseRequest):
@app.get("/probe/")
async def root():
return {"message": "API ready"}
return {"message": "Omniparser API ready"}
if __name__ == "__main__":
uvicorn.run("server:app", host=args.host, port=args.port, reload=True)
uvicorn.run("omniserver:app", host=args.host, port=args.port, reload=True)

View File

@@ -10,7 +10,7 @@ class Omniparser(object):
device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.som_model = get_yolo_model(model_path=config['som_model_path'])
self.caption_model_processor = get_caption_model_processor(model_name=config['caption_model_name'], model_name_or_path=config['caption_model_path'], device=device)
print('Omniparser initialized!')
print('Server initialized!')
def parse(self, image_base64: str):
image_bytes = base64.b64decode(image_base64)

View File

@@ -96,7 +96,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
model, processor = caption_model_processor['model'], caption_model_processor['processor']
if not prompt:
if 'florence' in model.config.name_or_path:
if 'florence' in model.config.model_type:
prompt = "<CAPTION>"
else:
prompt = "The image shows"
@@ -111,7 +111,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt", do_resize=False).to(device=device, dtype=torch.float16)
else:
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device)
if 'florence' in model.config.name_or_path:
if 'florence' in model.config.model_type:
generated_ids = model.generate(input_ids=inputs["input_ids"],pixel_values=inputs["pixel_values"],max_new_tokens=20,num_beams=1, do_sample=False)
else:
generated_ids = model.generate(**inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, num_return_sequences=1) # temperature=0.01, do_sample=True,