From dbea699336972794a09fda9ea44fb040c3892893 Mon Sep 17 00:00:00 2001
From: yuruo
Date: Thu, 6 Mar 2025 18:17:22 +0800
Subject: [PATCH] Optimize code logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Call pyautogui directly in-process instead of spawning a Python
subprocess per action, and drop the coordinate-scaling layer
(ScalingSource, scale_coordinates, is_scaling): the tool now reports
the raw screen size and works in raw screen coordinates.

---
 gradio_ui/agent/vlm_agent.py |   2 +-
 gradio_ui/app.py             |   1 +
 gradio_ui/tools/computer.py  | 122 ++++++++---------------------------
 3 files changed, 28 insertions(+), 97 deletions(-)

diff --git a/gradio_ui/agent/vlm_agent.py b/gradio_ui/agent/vlm_agent.py
index 1218d00..c9435bf 100644
--- a/gradio_ui/agent/vlm_agent.py
+++ b/gradio_ui/agent/vlm_agent.py
@@ -198,10 +198,10 @@ class VLMAgent:
 You are using a Windows device.
 You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
 You can only interact with the desktop GUI (no terminal or application menu access).
-You may be given some history plan and actions, this is the response from the previous loop. You should carefully consider your plan base on the task, screenshot, and history actions.
+
 Here is the list of all detected bounding boxes by IDs on the screen and their description:{screen_info}
 
 Your available "Next Action" only include:
diff --git a/gradio_ui/app.py b/gradio_ui/app.py
index 27f95ca..e9f64bd 100644
--- a/gradio_ui/app.py
+++ b/gradio_ui/app.py
@@ -159,6 +159,7 @@ def chatbot_output_callback(message, chatbot_state, hide_images=False, sender="b
 
 
 def process_input(user_input, state):
+    # Reset the stop flag
     if state["stop"]:
         state["stop"] = False
 
diff --git a/gradio_ui/tools/computer.py b/gradio_ui/tools/computer.py
index 9ae6d8f..202e92d 100644
--- a/gradio_ui/tools/computer.py
+++ b/gradio_ui/tools/computer.py
@@ -9,6 +9,7 @@ from .base import BaseAnthropicTool, ToolError, ToolResult
 from .screen_capture import get_screenshot
 import requests
 import re
+import pyautogui
 
 OUTPUT_DIR = "./tmp/outputs"
 TYPING_DELAY_MS = 12
@@ -40,11 +41,6 @@ MAX_SCALING_TARGETS: dict[str, Resolution] = {
     "FWXGA": Resolution(width=1366, height=768),  # ~16:9
 }
 
-
-class ScalingSource(StrEnum):
-    COMPUTER = "computer"
-    API = "api"
-
 class ComputerToolOptions(TypedDict):
     display_height_px: int
     display_width_px: int
@@ -65,17 +61,13 @@ class ComputerTool(BaseAnthropicTool):
     height: int
     display_num: int | None
     _screenshot_delay = 2.0
-    _scaling_enabled = True
-
     @property
     def options(self) -> ComputerToolOptions:
-        width, height = self.scale_coordinates(
-            ScalingSource.COMPUTER, self.width, self.height
-        )
+        # Use the raw screen size directly; no scaling
         return {
-            "display_width_px": width,
-            "display_height_px": height,
+            "display_width_px": self.width,
+            "display_height_px": self.height,
             "display_number": self.display_num,
         }
 
@@ -89,7 +81,6 @@ class ComputerTool(BaseAnthropicTool):
         self.display_num = None
         self.offset_x = 0
         self.offset_y = 0
-        self.is_scaling = is_scaling
         self.width, self.height = self.get_screen_size()
         print(f"screen size: {self.width}, {self.height}")
         self.key_conversion = {"Page_Down": "pagedown",
@@ -104,7 +95,7 @@
         coordinate: tuple[int, int] | None = None,
         **kwargs,
     ):
-        print(f"action: {action}, text: {text}, coordinate: {coordinate}, is_scaling: {self.is_scaling}")
+        print(f"action: {action}, text: {text}, coordinate: {coordinate}")
         if action in ("mouse_move", "left_click_drag"):
             if coordinate is None:
                 raise ToolError(f"coordinate is required for {action}")
@@ -115,23 +106,15 @@
             # if not all(isinstance(i, int) and i >= 0 for i in coordinate):
             if not all(isinstance(i, int) for i in coordinate):
                 raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
-            if self.is_scaling:
-                x, y = self.scale_coordinates(
-                    ScalingSource.API, coordinate[0], coordinate[1]
-                )
-            else:
-                x, y = coordinate
-            # print(f"scaled_coordinates: {x}, {y}")
-            # print(f"offset: {self.offset_x}, {self.offset_y}")
-            # x += self.offset_x  # TODO - check if this is needed
-            # y += self.offset_y
+
+            x, y = coordinate
             print(f"mouse move to {x}, {y}")
             if action == "mouse_move":
-                self.run_command(f"pyautogui.moveTo({x}, {y})")
+                pyautogui.moveTo(x, y)
                 return ToolResult(output=f"Moved mouse to ({x}, {y})")
             elif action == "left_click_drag":
-                current_x, current_y = self.run_command("pyautogui.position()")
-                self.run_command(f"pyautogui.dragTo({x}, {y}, duration=0.5)")
+                current_x, current_y = pyautogui.position()
+                pyautogui.dragTo(x, y, duration=0.5)
                 return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
         if action in ("key", "type"):
             if text is None:
@@ -146,17 +129,17 @@
                 for key in keys:
                     key = self.key_conversion.get(key.strip(), key.strip())
                     key = key.lower()
-                    self.run_command(f"pyautogui.keyDown('{key}')")  # Press down each key
+                    pyautogui.keyDown(key)
                 for key in reversed(keys):
                     key = self.key_conversion.get(key.strip(), key.strip())
                     key = key.lower()
-                    self.run_command(f"pyautogui.keyUp('{key}')")  # Release each key in reverse order
+                    pyautogui.keyUp(key)
                 return ToolResult(output=f"Pressed keys: {text}")
             elif action == "type":
                 # default click before type  TODO: check if this is needed
-                self.run_command("pyautogui.click()")
-                self.run_command(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")
-                self.run_command("pyautogui.press('enter')")
+                pyautogui.click()
+                pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000)
+                pyautogui.press('enter')
                 screenshot_base64 = (await self.screenshot()).base64_image
                 return ToolResult(output=text, base64_image=screenshot_base64)
         if action in (
@@ -175,28 +158,28 @@
             if action == "screenshot":
                 return await self.screenshot()
             elif action == "cursor_position":
-                x, y = self.run_command("pyautogui.position()")
-                x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
+                x, y = pyautogui.position()
+                # Return the raw coordinates directly; no scaling
                 return ToolResult(output=f"X={x},Y={y}")
             else:
                 if action == "left_click":
-                    self.run_command("pyautogui.click()")
+                    pyautogui.click()
                 elif action == "right_click":
-                    self.run_command("pyautogui.rightClick()")
+                    pyautogui.rightClick()
                 elif action == "middle_click":
-                    self.run_command("pyautogui.middleClick()")
+                    pyautogui.middleClick()
                 elif action == "double_click":
-                    self.run_command("pyautogui.doubleClick()")
+                    pyautogui.doubleClick()
                 elif action == "left_press":
-                    self.run_command("pyautogui.mouseDown()")
+                    pyautogui.mouseDown()
                     time.sleep(1)
-                    self.run_command("pyautogui.mouseUp()")
+                    pyautogui.mouseUp()
                 return ToolResult(output=f"Performed {action}")
         if action in ("scroll_up", "scroll_down"):
             if action == "scroll_up":
-                self.run_command("pyautogui.scroll(100)")
+                pyautogui.scroll(100)
             elif action == "scroll_down":
-                self.run_command("pyautogui.scroll(-100)")
+                pyautogui.scroll(-100)
             return ToolResult(output=f"Performed {action}")
         if action == "hover":
             return ToolResult(output=f"Performed {action}")
@@ -204,31 +187,6 @@
             time.sleep(1)
             return ToolResult(output=f"Performed {action}")
         raise ToolError(f"Invalid action: {action}")
-    def run_command(self, action: str):
-        """
-
-        Executes a python command on the server. Only return tuple of x,y when action is "pyautogui.position()"
-        """
-        prefix = "import pyautogui; pyautogui.FAILSAFE = False;"
-        command_list = ["python", "-c", f"{prefix} {action}"]
-        parse = action == "pyautogui.position()"
-        if parse:
-            command_list[-1] = f"{prefix} print({action})"
-        try:
-            print(f"run command: {command_list}")
-            # Use tool.execute_command instead of requests.post
-            response = tool.execute_command(command_list)
-            time.sleep(0.7)  # avoid async error as actions take time to complete
-            print(f"action executed")
-            if parse:
-                output = response['output'].strip()
-                match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output)
-                if not match:
-                    raise ToolError(f"Could not parse coordinates from output: {output}")
-                x, y = map(int, match.groups())
-                return x, y
-        except requests.exceptions.RequestException as e:
-            raise ToolError(f"An error occurred while trying to execute the command: {str(e)}")
 
     async def screenshot(self):
         if not hasattr(self, 'target_dimension'):
@@ -249,35 +207,7 @@
             padding_image.paste(screenshot, (0, 0))
         return padding_image
 
-    def scale_coordinates(self, source: ScalingSource, x: int, y: int):
-        """Scale coordinates to a target maximum resolution."""
-        if not self._scaling_enabled:
-            return x, y
-        ratio = self.width / self.height
-        target_dimension = None
-        for target_name, dimension in MAX_SCALING_TARGETS.items():
-            # allow some error in the aspect ratio - not ratios are exactly 16:9
-            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
-                if dimension["width"] < self.width:
-                    target_dimension = dimension
-                    self.target_dimension = target_dimension
-                    # print(f"target_dimension: {target_dimension}")
-                    break
-        if target_dimension is None:
-            # TODO: currently we force the target to be WXGA (16:10), when it cannot find a match
-            target_dimension = MAX_SCALING_TARGETS["WXGA"]
-            self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
-        # should be less than 1
-        x_scaling_factor = target_dimension["width"] / self.width
-        y_scaling_factor = target_dimension["height"] / self.height
-        if source == ScalingSource.API:
-            if x > self.width or y > self.height:
-                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
-            # scale up
-            return round(x / x_scaling_factor), round(y / y_scaling_factor)
-        # scale down
-        return round(x * x_scaling_factor), round(y * y_scaling_factor)
-
+
     def get_screen_size(self):
         """Return width and height of the screen"""
         try: