Merge pull request #55 from yuruotong1/dev

Dev
This commit is contained in:
Dongle
2025-03-07 08:26:24 +08:00
committed by GitHub
7 changed files with 78 additions and 135 deletions

View File

@@ -198,10 +198,10 @@ class VLMAgent:
You are using a Windows device.
You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
You can only interact with the desktop GUI (no terminal or application menu access).
You may be given some history of plans and actions; this is the response from the previous loop.
You should carefully consider your plan based on the task, screenshot, and history actions.
Here is the list of all detected bounding boxes by IDs on the screen and their description:{screen_info}
Your available "Next Action" only include:

View File

@@ -18,8 +18,6 @@ from gradio_ui.loop import (
sampling_loop_sync,
)
from gradio_ui.tools import ToolResult
import requests
from requests.exceptions import RequestException
import base64
CONFIG_DIR = Path("~/.anthropic").expanduser()

View File

@@ -9,6 +9,7 @@ from .base import BaseAnthropicTool, ToolError, ToolResult
from .screen_capture import get_screenshot
import requests
import re
import pyautogui
OUTPUT_DIR = "./tmp/outputs"
TYPING_DELAY_MS = 12
@@ -40,11 +41,6 @@ MAX_SCALING_TARGETS: dict[str, Resolution] = {
"FWXGA": Resolution(width=1366, height=768), # ~16:9
}
class ScalingSource(StrEnum):
COMPUTER = "computer"
API = "api"
class ComputerToolOptions(TypedDict):
display_height_px: int
display_width_px: int
@@ -65,17 +61,13 @@ class ComputerTool(BaseAnthropicTool):
height: int
display_num: int | None
_screenshot_delay = 2.0
_scaling_enabled = True
@property
def options(self) -> ComputerToolOptions:
width, height = self.scale_coordinates(
ScalingSource.COMPUTER, self.width, self.height
)
# 直接使用原始尺寸,不进行缩放
return {
"display_width_px": width,
"display_height_px": height,
"display_width_px": self.width,
"display_height_px": self.height,
"display_number": self.display_num,
}
@@ -89,7 +81,6 @@ class ComputerTool(BaseAnthropicTool):
self.display_num = None
self.offset_x = 0
self.offset_y = 0
self.is_scaling = is_scaling
self.width, self.height = self.get_screen_size()
print(f"screen size: {self.width}, {self.height}")
self.key_conversion = {"Page_Down": "pagedown",
@@ -104,7 +95,7 @@ class ComputerTool(BaseAnthropicTool):
coordinate: tuple[int, int] | None = None,
**kwargs,
):
print(f"action: {action}, text: {text}, coordinate: {coordinate}, is_scaling: {self.is_scaling}")
print(f"action: {action}, text: {text}, coordinate: {coordinate},")
if action in ("mouse_move", "left_click_drag"):
if coordinate is None:
raise ToolError(f"coordinate is required for {action}")
@@ -115,23 +106,15 @@ class ComputerTool(BaseAnthropicTool):
# if not all(isinstance(i, int) and i >= 0 for i in coordinate):
if not all(isinstance(i, int) for i in coordinate):
raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
if self.is_scaling:
x, y = self.scale_coordinates(
ScalingSource.API, coordinate[0], coordinate[1]
)
else:
x, y = coordinate
# print(f"scaled_coordinates: {x}, {y}")
# print(f"offset: {self.offset_x}, {self.offset_y}")
# x += self.offset_x # TODO - check if this is needed
# y += self.offset_y
x, y = coordinate
print(f"mouse move to {x}, {y}")
if action == "mouse_move":
self.run_command(f"pyautogui.moveTo({x}, {y})")
pyautogui.moveTo(x, y)
return ToolResult(output=f"Moved mouse to ({x}, {y})")
elif action == "left_click_drag":
current_x, current_y = self.run_command("pyautogui.position()")
self.run_command(f"pyautogui.dragTo({x}, {y}, duration=0.5)")
current_x, current_y = pyautogui.position()
pyautogui.dragTo(x, y, duration=0.5)
return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
if action in ("key", "type"):
if text is None:
@@ -146,17 +129,17 @@ class ComputerTool(BaseAnthropicTool):
for key in keys:
key = self.key_conversion.get(key.strip(), key.strip())
key = key.lower()
self.run_command(f"pyautogui.keyDown('{key}')") # Press down each key
pyautogui.keyDown(key)
for key in reversed(keys):
key = self.key_conversion.get(key.strip(), key.strip())
key = key.lower()
self.run_command(f"pyautogui.keyUp('{key}')") # Release each key in reverse order
pyautogui.keyUp(key)
return ToolResult(output=f"Pressed keys: {text}")
elif action == "type":
# default click before type TODO: check if this is needed
self.run_command("pyautogui.click()")
self.run_command(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")
self.run_command("pyautogui.press('enter')")
pyautogui.click()
pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000)
pyautogui.press('enter')
screenshot_base64 = (await self.screenshot()).base64_image
return ToolResult(output=text, base64_image=screenshot_base64)
if action in (
@@ -175,28 +158,28 @@ class ComputerTool(BaseAnthropicTool):
if action == "screenshot":
return await self.screenshot()
elif action == "cursor_position":
x, y = self.run_command("pyautogui.position()")
x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
x, y = pyautogui.position()
# 直接返回原始坐标,不进行缩放
return ToolResult(output=f"X={x},Y={y}")
else:
if action == "left_click":
self.run_command("pyautogui.click()")
pyautogui.click()
elif action == "right_click":
self.run_command("pyautogui.rightClick()")
pyautogui.rightClick()
elif action == "middle_click":
self.run_command("pyautogui.middleClick()")
pyautogui.middleClick()
elif action == "double_click":
self.run_command("pyautogui.doubleClick()")
pyautogui.doubleClick()
elif action == "left_press":
self.run_command("pyautogui.mouseDown()")
pyautogui.mouseDown()
time.sleep(1)
self.run_command("pyautogui.mouseUp()")
pyautogui.mouseUp()
return ToolResult(output=f"Performed {action}")
if action in ("scroll_up", "scroll_down"):
if action == "scroll_up":
self.run_command("pyautogui.scroll(100)")
pyautogui.scroll(100)
elif action == "scroll_down":
self.run_command("pyautogui.scroll(-100)")
pyautogui.scroll(-100)
return ToolResult(output=f"Performed {action}")
if action == "hover":
return ToolResult(output=f"Performed {action}")
@@ -204,31 +187,6 @@ class ComputerTool(BaseAnthropicTool):
time.sleep(1)
return ToolResult(output=f"Performed {action}")
raise ToolError(f"Invalid action: {action}")
# NOTE(review): this diff extract lost its indentation; code lines are kept verbatim.
def run_command(self, action: str):
"""
Run a pyautogui expression in a separate Python process via tool.execute_command.

Returns an (x, y) int tuple only when *action* is exactly "pyautogui.position()";
every other action implicitly returns None. Raises ToolError when the position
output cannot be parsed or when the command transport fails.
"""
# FAILSAFE=False keeps pyautogui from aborting when the cursor hits a screen corner
prefix = "import pyautogui; pyautogui.FAILSAFE = False;"
command_list = ["python", "-c", f"{prefix} {action}"]
# only the position query produces output that must be parsed back
parse = action == "pyautogui.position()"
if parse:
# wrap in print() so the child process writes the Point(...) repr to stdout
command_list[-1] = f"{prefix} print({action})"
try:
print(f"run command: {command_list}")
# use tool.execute_command instead of requests.post
response = tool.execute_command(command_list)
time.sleep(0.7) # avoid async error as actions take time to complete
print(f"action executed")
if parse:
# pyautogui.position() prints e.g. "Point(x=123, y=456)"
output = response['output'].strip()
match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output)
if not match:
raise ToolError(f"Could not parse coordinates from output: {output}")
x, y = map(int, match.groups())
return x, y
# NOTE(review): tool.execute_command likely never raises RequestException — confirm;
# any other exception type propagates uncaught.
except requests.exceptions.RequestException as e:
raise ToolError(f"An error occurred while trying to execute the command: {str(e)}")
async def screenshot(self):
if not hasattr(self, 'target_dimension'):
@@ -249,35 +207,7 @@ class ComputerTool(BaseAnthropicTool):
padding_image.paste(screenshot, (0, 0))
return padding_image
def scale_coordinates(self, source: ScalingSource, x: int, y: int):
"""Scale coordinates to a target maximum resolution.

COMPUTER-sourced coordinates (real screen space) are scaled DOWN into the
chosen target resolution; API-sourced coordinates (model/target space) are
scaled UP to real screen space. Also caches the chosen target resolution
in self.target_dimension as a side effect.
"""
if not self._scaling_enabled:
return x, y
ratio = self.width / self.height
target_dimension = None
for target_name, dimension in MAX_SCALING_TARGETS.items():
# allow some error in the aspect ratio - not all ratios are exactly 16:9
if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
if dimension["width"] < self.width:
target_dimension = dimension
self.target_dimension = target_dimension
# print(f"target_dimension: {target_dimension}")
break
if target_dimension is None:
# TODO: currently we force the target to be WXGA (16:10), when it cannot find a match
target_dimension = MAX_SCALING_TARGETS["WXGA"]
self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
# should be less than 1
x_scaling_factor = target_dimension["width"] / self.width
y_scaling_factor = target_dimension["height"] / self.height
if source == ScalingSource.API:
# NOTE(review): bounds are checked against the real screen size, yet API
# coordinates live in target space — it looks like this should compare
# against target_dimension; confirm intended behavior.
if x > self.width or y > self.height:
raise ToolError(f"Coordinates {x}, {y} are out of bounds")
# scale up
return round(x / x_scaling_factor), round(y / y_scaling_factor)
# scale down
return round(x * x_scaling_factor), round(y * y_scaling_factor)
def get_screen_size(self):
"""Return width and height of the screen"""
try:

70
main.py
View File

@@ -1,12 +1,10 @@
import argparse
import subprocess
import signal
import sys
import platform
from threading import Thread
import requests
from gradio_ui import app
from util import download_weights
import time
import torch
import socket
def run():
try:
@@ -17,45 +15,59 @@ def run():
except Exception:
print("显卡驱动不适配请根据readme安装合适版本的 torch")
# 启动 server.py 子进程,并捕获其输出
# Windows:
if platform.system() == 'Windows':
server_process = subprocess.Popen(
["python", "./server.py"],
stdout=subprocess.PIPE, # 捕获标准输出
stderr=subprocess.PIPE,
creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
text=True
)
else:
server_process = subprocess.Popen(
["python", "./server.py"],
stdout=subprocess.PIPE, # 捕获标准输出
stderr=subprocess.PIPE,
start_new_session=True,
text=True
)
server_process = subprocess.Popen(
["python", "./omniserver.py"],
stdout=subprocess.PIPE, # 捕获标准输出
stderr=subprocess.PIPE,
text=True
)
try:
# 下载权重文件
download_weights.download()
print("启动Omniserver服务中40s左右,请耐心等待!")
print("启动Omniserver服务中5分钟左右因为加载模型真的超级慢,请耐心等待!")
# 启动 Gradio UI
# 等待 server_process 打印出 "Started server process"
while True:
output = server_process.stdout.readline()
if "Omniparser initialized" in output:
print("Omniparseer服务启动成功...")
res = requests.get("http://127.0.0.1:8000/probe/")
if res.status_code == 200 and res.json().get("message", None):
print("Omniparser服务启动成功...")
break
if server_process.poll() is not None:
raise RuntimeError("Server process terminated unexpectedly")
stdout_thread = Thread(
target=stream_reader,
args=(server_process.stdout, "SERVER-OUT")
)
stderr_thread = Thread(
target=stream_reader,
args=(server_process.stderr, "SERVER-ERR")
)
stdout_thread.daemon = True
stderr_thread.daemon = True
stdout_thread.start()
stderr_thread.start()
app.run()
finally:
# 确保在主进程退出时终止子进程
if server_process.poll() is None: # 如果进程还在运行
server_process.terminate() # 发送终止信号
server_process.wait(timeout=5) # 等待进程结束
server_process.wait(timeout=8) # 等待进程结束
def stream_reader(pipe, prefix):
    """Relay every line read from *pipe* to stdout, tagged with "[<prefix>]".

    Each line keeps its own trailing newline (end=""), and output is flushed
    immediately so child-process logs interleave promptly with ours.
    """
    tag = f"[{prefix}]"
    for raw_line in pipe:
        print(tag, raw_line, end="", flush=True)
def is_port_occupied(port):
    """Return True when something on localhost already accepts TCP on *port*."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 on a successful connection instead of raising
        return probe.connect_ex(('localhost', port)) == 0
    finally:
        probe.close()
if __name__ == '__main__':
# 检测8000端口是否被占用
if is_port_occupied(8000):
print("8000端口被占用请先关闭占用该端口的进程")
exit()
run()

View File

@@ -1,5 +1,5 @@
'''
python -m server --som_model_path ../../weights/icon_detect/model.pt --caption_model_name florence2 --caption_model_path ../../weights/icon_caption_florence --device cuda --BOX_TRESHOLD 0.05
python -m omniparserserver --som_model_path ../../weights/icon_detect/model.pt --caption_model_name florence2 --caption_model_path ../../weights/icon_caption_florence --device cuda --BOX_TRESHOLD 0.05
'''
import sys
@@ -14,7 +14,7 @@ sys.path.append(root_dir)
from util.omniparser import Omniparser
def parse_arguments():
parser = argparse.ArgumentParser(description='autoMate API')
parser = argparse.ArgumentParser(description='Omniparser API')
parser.add_argument('--som_model_path', type=str, default='./weights/icon_detect/model.pt', help='Path to the som model')
parser.add_argument('--caption_model_name', type=str, default='florence2', help='Name of the caption model')
parser.add_argument('--caption_model_path', type=str, default='./weights/icon_caption', help='Path to the caption model')
@@ -45,7 +45,10 @@ async def parse(parse_request: ParseRequest):
@app.get("/probe/")
async def root():
return {"message": "API ready"}
return {"message": "Omniparser API ready"}
if __name__ == "__main__":
uvicorn.run("server:app", host=args.host, port=args.port, reload=True)
uvicorn.run("omniserver:app", host=args.host, port=args.port, reload=True)

View File

@@ -10,7 +10,7 @@ class Omniparser(object):
device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.som_model = get_yolo_model(model_path=config['som_model_path'])
self.caption_model_processor = get_caption_model_processor(model_name=config['caption_model_name'], model_name_or_path=config['caption_model_path'], device=device)
print('Omniparser initialized!')
print('Server initialized!')
def parse(self, image_base64: str):
image_bytes = base64.b64decode(image_base64)

View File

@@ -96,7 +96,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
model, processor = caption_model_processor['model'], caption_model_processor['processor']
if not prompt:
if 'florence' in model.config.name_or_path:
if 'florence' in model.config.model_type:
prompt = "<CAPTION>"
else:
prompt = "The image shows"
@@ -111,7 +111,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt", do_resize=False).to(device=device, dtype=torch.float16)
else:
inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device)
if 'florence' in model.config.name_or_path:
if 'florence' in model.config.model_type:
generated_ids = model.generate(input_ids=inputs["input_ids"],pixel_values=inputs["pixel_values"],max_new_tokens=20,num_beams=1, do_sample=False)
else:
generated_ids = model.generate(**inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, num_return_sequences=1) # temperature=0.01, do_sample=True,