优化代码逻辑

This commit is contained in:
yuruo 2025-03-06 18:17:22 +08:00
parent 7d500a3b0e
commit dbea699336
3 changed files with 28 additions and 97 deletions

View File

@ -198,10 +198,10 @@ class VLMAgent:
You are using a Windows device.
You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
You can only interact with the desktop GUI (no terminal or application menu access).
You may be given some history plan and actions; this is the response from the previous loop.
You should carefully consider your plan based on the task, screenshot, and history actions.
Here is the list of all detected bounding boxes by IDs on the screen and their description:{screen_info}
Your available "Next Action" only include:

View File

@ -159,6 +159,7 @@ def chatbot_output_callback(message, chatbot_state, hide_images=False, sender="b
def process_input(user_input, state):
# Reset the stop flag
if state["stop"]:
state["stop"] = False

View File

@ -9,6 +9,7 @@ from .base import BaseAnthropicTool, ToolError, ToolResult
from .screen_capture import get_screenshot
import requests
import re
import pyautogui
OUTPUT_DIR = "./tmp/outputs"
TYPING_DELAY_MS = 12
@ -40,11 +41,6 @@ MAX_SCALING_TARGETS: dict[str, Resolution] = {
"FWXGA": Resolution(width=1366, height=768), # ~16:9
}
class ScalingSource(StrEnum):
COMPUTER = "computer"
API = "api"
class ComputerToolOptions(TypedDict):
display_height_px: int
display_width_px: int
@ -65,17 +61,13 @@ class ComputerTool(BaseAnthropicTool):
height: int
display_num: int | None
_screenshot_delay = 2.0
_scaling_enabled = True
@property
def options(self) -> ComputerToolOptions:
width, height = self.scale_coordinates(
ScalingSource.COMPUTER, self.width, self.height
)
# 直接使用原始尺寸,不进行缩放
return {
"display_width_px": width,
"display_height_px": height,
"display_width_px": self.width,
"display_height_px": self.height,
"display_number": self.display_num,
}
@ -89,7 +81,6 @@ class ComputerTool(BaseAnthropicTool):
self.display_num = None
self.offset_x = 0
self.offset_y = 0
self.is_scaling = is_scaling
self.width, self.height = self.get_screen_size()
print(f"screen size: {self.width}, {self.height}")
self.key_conversion = {"Page_Down": "pagedown",
@ -104,7 +95,7 @@ class ComputerTool(BaseAnthropicTool):
coordinate: tuple[int, int] | None = None,
**kwargs,
):
print(f"action: {action}, text: {text}, coordinate: {coordinate}, is_scaling: {self.is_scaling}")
print(f"action: {action}, text: {text}, coordinate: {coordinate},")
if action in ("mouse_move", "left_click_drag"):
if coordinate is None:
raise ToolError(f"coordinate is required for {action}")
@ -115,23 +106,15 @@ class ComputerTool(BaseAnthropicTool):
# if not all(isinstance(i, int) and i >= 0 for i in coordinate):
if not all(isinstance(i, int) for i in coordinate):
raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
if self.is_scaling:
x, y = self.scale_coordinates(
ScalingSource.API, coordinate[0], coordinate[1]
)
else:
x, y = coordinate
# print(f"scaled_coordinates: {x}, {y}")
# print(f"offset: {self.offset_x}, {self.offset_y}")
# x += self.offset_x # TODO - check if this is needed
# y += self.offset_y
x, y = coordinate
print(f"mouse move to {x}, {y}")
if action == "mouse_move":
self.run_command(f"pyautogui.moveTo({x}, {y})")
pyautogui.moveTo(x, y)
return ToolResult(output=f"Moved mouse to ({x}, {y})")
elif action == "left_click_drag":
current_x, current_y = self.run_command("pyautogui.position()")
self.run_command(f"pyautogui.dragTo({x}, {y}, duration=0.5)")
current_x, current_y = pyautogui.position()
pyautogui.dragTo(x, y, duration=0.5)
return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
if action in ("key", "type"):
if text is None:
@ -146,17 +129,17 @@ class ComputerTool(BaseAnthropicTool):
for key in keys:
key = self.key_conversion.get(key.strip(), key.strip())
key = key.lower()
self.run_command(f"pyautogui.keyDown('{key}')") # Press down each key
pyautogui.keyDown(key)
for key in reversed(keys):
key = self.key_conversion.get(key.strip(), key.strip())
key = key.lower()
self.run_command(f"pyautogui.keyUp('{key}')") # Release each key in reverse order
pyautogui.keyUp(key)
return ToolResult(output=f"Pressed keys: {text}")
elif action == "type":
# default click before type TODO: check if this is needed
self.run_command("pyautogui.click()")
self.run_command(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")
self.run_command("pyautogui.press('enter')")
pyautogui.click()
pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000)
pyautogui.press('enter')
screenshot_base64 = (await self.screenshot()).base64_image
return ToolResult(output=text, base64_image=screenshot_base64)
if action in (
@ -175,28 +158,28 @@ class ComputerTool(BaseAnthropicTool):
if action == "screenshot":
return await self.screenshot()
elif action == "cursor_position":
x, y = self.run_command("pyautogui.position()")
x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
x, y = pyautogui.position()
# 直接返回原始坐标,不进行缩放
return ToolResult(output=f"X={x},Y={y}")
else:
if action == "left_click":
self.run_command("pyautogui.click()")
pyautogui.click()
elif action == "right_click":
self.run_command("pyautogui.rightClick()")
pyautogui.rightClick()
elif action == "middle_click":
self.run_command("pyautogui.middleClick()")
pyautogui.middleClick()
elif action == "double_click":
self.run_command("pyautogui.doubleClick()")
pyautogui.doubleClick()
elif action == "left_press":
self.run_command("pyautogui.mouseDown()")
pyautogui.mouseDown()
time.sleep(1)
self.run_command("pyautogui.mouseUp()")
pyautogui.mouseUp()
return ToolResult(output=f"Performed {action}")
if action in ("scroll_up", "scroll_down"):
if action == "scroll_up":
self.run_command("pyautogui.scroll(100)")
pyautogui.scroll(100)
elif action == "scroll_down":
self.run_command("pyautogui.scroll(-100)")
pyautogui.scroll(-100)
return ToolResult(output=f"Performed {action}")
if action == "hover":
return ToolResult(output=f"Performed {action}")
@ -204,31 +187,6 @@ class ComputerTool(BaseAnthropicTool):
time.sleep(1)
return ToolResult(output=f"Performed {action}")
raise ToolError(f"Invalid action: {action}")
def run_command(self, action: str):
"""
Executes a python command on the server. Only return tuple of x,y when action is "pyautogui.position()"
"""
prefix = "import pyautogui; pyautogui.FAILSAFE = False;"
command_list = ["python", "-c", f"{prefix} {action}"]
parse = action == "pyautogui.position()"
if parse:
command_list[-1] = f"{prefix} print({action})"
try:
print(f"run command: {command_list}")
# 使用 tool.execute_command 替代 requests.post
response = tool.execute_command(command_list)
time.sleep(0.7) # avoid async error as actions take time to complete
print(f"action executed")
if parse:
output = response['output'].strip()
match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output)
if not match:
raise ToolError(f"Could not parse coordinates from output: {output}")
x, y = map(int, match.groups())
return x, y
except requests.exceptions.RequestException as e:
raise ToolError(f"An error occurred while trying to execute the command: {str(e)}")
async def screenshot(self):
if not hasattr(self, 'target_dimension'):
@ -249,35 +207,7 @@ class ComputerTool(BaseAnthropicTool):
padding_image.paste(screenshot, (0, 0))
return padding_image
def scale_coordinates(self, source: ScalingSource, x: int, y: int):
"""Scale coordinates to a target maximum resolution."""
if not self._scaling_enabled:
return x, y
ratio = self.width / self.height
target_dimension = None
for target_name, dimension in MAX_SCALING_TARGETS.items():
# allow some error in the aspect ratio - not all ratios are exactly 16:9
if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
if dimension["width"] < self.width:
target_dimension = dimension
self.target_dimension = target_dimension
# print(f"target_dimension: {target_dimension}")
break
if target_dimension is None:
# TODO: currently we force the target to be WXGA (16:10), when it cannot find a match
target_dimension = MAX_SCALING_TARGETS["WXGA"]
self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
# should be less than 1
x_scaling_factor = target_dimension["width"] / self.width
y_scaling_factor = target_dimension["height"] / self.height
if source == ScalingSource.API:
if x > self.width or y > self.height:
raise ToolError(f"Coordinates {x}, {y} are out of bounds")
# scale up
return round(x / x_scaling_factor), round(y / y_scaling_factor)
# scale down
return round(x * x_scaling_factor), round(y * y_scaling_factor)
def get_screen_size(self):
"""Return width and height of the screen"""
try: