From af4a5bf3e4db8393d01ef36da0f774c505070871 Mon Sep 17 00:00:00 2001 From: yuruo Date: Thu, 6 Mar 2025 16:39:52 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BB=A3=E7=A0=81=E7=BB=93?= =?UTF-8?q?=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README_CN.md | 98 --------------- gradio_ui/tools/computer.py | 238 ------------------------------------ util/download_weights.py | 1 - util/omniparser.py | 1 - util/tool.py | 3 - 5 files changed, 341 deletions(-) delete mode 100644 README_CN.md diff --git a/README_CN.md b/README_CN.md deleted file mode 100644 index 32f18ef..0000000 --- a/README_CN.md +++ /dev/null @@ -1,98 +0,0 @@ -
- -autoMate logo -

autoMate

-

🤖 AI驱动的本地自动化工具 | 让电脑自己会干活

- ->"让繁琐自动化,把时间还给生活" - -![](./resources/autoMate.png) - - -
- -## 💫 重新定义你与电脑的关系 - -深夜加班处理重复性工作让你疲惫不堪?琐碎任务占用了你的创造力和宝贵时间? - -autoMate,不仅仅是一款普通工具,它是AGI第三阶段的智能助手,你的数字同事,始终高效运转,帮你重获工作与生活的平衡。 - - -**让自动化为你的生活创造更多可能。** - - -## 💡 项目简介 -autoMate 是一款革命性的AI+RPA自动化工具,基于OmniParser构建,让AI成为你的"数字员工",它能够 - -- 📊 自动操作您的电脑界面,完成复杂的工作流程 -- 🔍 智能理解屏幕内容,模拟人类视觉和操作 -- 🧠 自主决策,根据任务需求进行判断并采取行动 -- 💻 支持本地化部署,保护您的数据安全和隐私 - -不同于传统RPA工具的繁琐规则设置,autoMate借助大模型的能力,只需用自然语言描述任务,AI就能完成复杂的自动化流程。从此告别重复性工作,专注于真正创造价值的事情! - -## 🌟 为什么autoMate会改变你的工作方式 - -> "在我使用autoMate之前,我每天花费3小时处理报表;现在,我只需10分钟设置任务,然后去做真正重要的事。"一位yy出来的财务经理的反馈。 - -当你第一次看到autoMate自动完成那些曾经占用你数小时的工作时,你会有一种难以描述的释然。这不仅仅是效率的提升,更是对创造力的解放。 - -想象一下:每天早上醒来,发现昨晚安排的数据整理、报表生成、邮件回复都已完成,等待你的只有真正需要你智慧和创造力的工作。这就是autoMate带给你的未来。 - -## ✨ 功能特点 - -- 🔮 无代码自动化 - 使用自然语言描述任务,无需编程知识 -- 🖥️ 全界面操控 - 支持任何可视化界面的操作,不限于特定软件 -- 🚅 简化安装 - 比官方版本更简洁的安装流程,支持中文环境,一键部署 -- 🔒 本地运行 - 保护数据安全,无需担心隐私泄露 -- 🌐 多模型支持 - 兼容主流大型语言模型 -- 💎 持续成长 - 随着你的使用,它会越来越了解你的工作习惯和需求 - -## 🚀 快速开始 - -### 📦 安装 -Clone项目,然后安装环境: - -```bash -git clone https://github.com/yuruotong1/autoMate.git -cd autoMate -conda create -n "automate" python==3.12 -conda activate automate -pip install -r requirements.txt -``` -### 🎮 启动应用 - -```bash -python main.py -``` -然后在浏览器中打开`http://localhost:7888/`,配置您的API密钥和基本设置。 - - -## 📝常见问题 - -### 🔧CUDA版本不匹配问题 -如果启动时报:“显卡驱动不适配,请根据readme安装合适版本的 torch”,说明当前显卡驱动不适配。你可以不用管这条信息,只用CPU运行,但是会非常慢。你也可以: - -1. 运行`pip list`查看torch版本; -2. 从[官网](https://pytorch.org/get-started/locally/)查看支持的cuda版本; -3. 重新安装Nvidia驱动。 - - -## 🤝 参与共建 - -每一个优秀的开源项目都凝聚着集体的智慧。autoMate的成长离不开你的参与和贡献。无论是修复bug、添加功能,还是改进文档,你的每一份付出都将帮助成千上万的人摆脱重复性工作的束缚。 - -加入我们,一起创造更加智能的未来。 - -> 强烈推荐阅读 [《提问的智慧》](https://github.com/ryanhanwu/How-To-Ask-Questions-The-Smart-Way)、[《如何向开源社区提问题》](https://github.com/seajs/seajs/issues/545) 和 [《如何有效地报告 Bug》](http://www.chiark.greenend.org.uk/%7Esgtatham/bugs-cn.html)、[《如何向开源项目提交无法解答的问题》](https://zhuanlan.zhihu.com/p/25795393),更好的问题更容易获得帮助。 - - - - - ---- - -
-⭐ 每一个Star都是对创作者的鼓励,也是让更多人发现并受益于autoMate的机会 ⭐ -今天你的支持,就是我们明天前进的动力 -
\ No newline at end of file diff --git a/gradio_ui/tools/computer.py b/gradio_ui/tools/computer.py index 8f0331a..31d89c4 100644 --- a/gradio_ui/tools/computer.py +++ b/gradio_ui/tools/computer.py @@ -1,439 +1,233 @@ import base64 import time - from enum import StrEnum - from typing import Literal, TypedDict - - from PIL import Image from util import tool - from anthropic.types.beta import BetaToolComputerUse20241022Param - - from .base import BaseAnthropicTool, ToolError, ToolResult - from .screen_capture import get_screenshot - import requests import re - OUTPUT_DIR = "./tmp/outputs" - - TYPING_DELAY_MS = 12 - TYPING_GROUP_SIZE = 50 Action = Literal[ - "key", - "type", - "mouse_move", - "left_click", - "left_click_drag", - "right_click", - "middle_click", - "double_click", - "screenshot", - "cursor_position", - "hover", - "wait" - ] - - class Resolution(TypedDict): - width: int - height: int - - MAX_SCALING_TARGETS: dict[str, Resolution] = { - "XGA": Resolution(width=1024, height=768), # 4:3 - "WXGA": Resolution(width=1280, height=800), # 16:10 - "FWXGA": Resolution(width=1366, height=768), # ~16:9 - } - class ScalingSource(StrEnum): - COMPUTER = "computer" - API = "api" - - class ComputerToolOptions(TypedDict): - display_height_px: int - display_width_px: int - display_number: int | None - - def chunks(s: str, chunk_size: int) -> list[str]: - return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)] - class ComputerTool(BaseAnthropicTool): """ - A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer. Adapted for Windows using 'pyautogui'. """ - - name: Literal["computer"] = "computer" - api_type: Literal["computer_20241022"] = "computer_20241022" - width: int - height: int - display_num: int | None - - _screenshot_delay = 2.0 - _scaling_enabled = True @property - def options(self) -> ComputerToolOptions: - width, height = self.scale_coordinates( - ScalingSource.COMPUTER, self.width, self.height ) - return { - "display_width_px": width, - "display_height_px": height, - "display_number": self.display_num, - } - def to_params(self) -> BetaToolComputerUse20241022Param: - return {"name": self.name, "type": self.api_type, **self.options} def __init__(self, is_scaling: bool = False): super().__init__() - - # Get screen width and height using Windows command - self.display_num = None - self.offset_x = 0 - self.offset_y = 0 - self.is_scaling = is_scaling - self.width, self.height = self.get_screen_size() - print(f"screen size: {self.width}, {self.height}") - - self.key_conversion = {"Page_Down": "pagedown", - "Page_Up": "pageup", - "Super_L": "win", - "Escape": "esc"} - - - async def __call__( - self, - *, - action: Action, - text: str | None = None, - coordinate: tuple[int, int] | None = None, - **kwargs, - ): - print(f"action: {action}, text: {text}, coordinate: {coordinate}, is_scaling: {self.is_scaling}") - if action in ("mouse_move", "left_click_drag"): - if coordinate is None: - raise ToolError(f"coordinate is required for {action}") - if text is not None: - raise ToolError(f"text is not accepted for {action}") - if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2: - raise ToolError(f"{coordinate} must be a tuple of length 2") - # if not all(isinstance(i, int) and i >= 0 for i in coordinate): - if not all(isinstance(i, int) for i in coordinate): - raise ToolError(f"{coordinate} must be a tuple of non-negative ints") - - if self.is_scaling: - x, y = self.scale_coordinates( - ScalingSource.API, coordinate[0], coordinate[1] ) - else: - x, y = coordinate - - # print(f"scaled_coordinates: {x}, {y}") - # print(f"offset: {self.offset_x}, {self.offset_y}") - - # x += self.offset_x # TODO - check if this is needed - # y += self.offset_y - - print(f"mouse move to {x}, {y}") - - if action == "mouse_move": self.run_command(f"pyautogui.moveTo({x}, {y})") - return ToolResult(output=f"Moved mouse to ({x}, {y})") - elif action == "left_click_drag": - current_x, current_y = self.run_command("pyautogui.position()") self.run_command(f"pyautogui.dragTo({x}, {y}, duration=0.5)") - return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})") - - if action in ("key", "type"): - if text is None: - raise ToolError(f"text is required for {action}") - if coordinate is not None: - raise ToolError(f"coordinate is not accepted for {action}") - if not isinstance(text, str): - raise ToolError(output=f"{text} must be a string") - - if action == "key": - # Handle key combinations - keys = text.split('+') - for key in keys: - key = self.key_conversion.get(key.strip(), key.strip()) - key = key.lower() self.run_command(f"pyautogui.keyDown('{key}')") # Press down each key - for key in reversed(keys): - key = self.key_conversion.get(key.strip(), key.strip()) - key = key.lower() self.run_command(f"pyautogui.keyUp('{key}')") # Release each key in reverse order - return ToolResult(output=f"Pressed keys: {text}") - - elif action == "type": - # default click before type TODO: check if this is needed self.run_command("pyautogui.click()") self.run_command(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})") self.run_command("pyautogui.press('enter')") - screenshot_base64 = (await self.screenshot()).base64_image - return ToolResult(output=text, base64_image=screenshot_base64) - if action in ( - "left_click", - "right_click", - "double_click", - "middle_click", - "screenshot", - "cursor_position", - "left_press", - ): - if text is not None: - raise ToolError(f"text is not accepted for {action}") - if coordinate is not None: - raise ToolError(f"coordinate is not accepted for {action}") - - if action == "screenshot": - return await self.screenshot() - elif action == "cursor_position": - x, y = self.run_command("pyautogui.position()") - x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y) - return ToolResult(output=f"X={x},Y={y}") - else: - if action == "left_click": self.run_command("pyautogui.click()") - elif action == "right_click": self.run_command("pyautogui.rightClick()") - elif action == "middle_click": self.run_command("pyautogui.middleClick()") - elif action == "double_click": self.run_command("pyautogui.doubleClick()") - elif action == "left_press": self.run_command("pyautogui.mouseDown()") - time.sleep(1) self.run_command("pyautogui.mouseUp()") - return ToolResult(output=f"Performed {action}") - if action in ("scroll_up", "scroll_down"): - if action == "scroll_up": self.run_command("pyautogui.scroll(100)") - elif action == "scroll_down": self.run_command("pyautogui.scroll(-100)") - return ToolResult(output=f"Performed {action}") - if action == "hover": - return ToolResult(output=f"Performed {action}") - if action == "wait": - time.sleep(1) - return ToolResult(output=f"Performed {action}") - raise ToolError(f"Invalid action: {action}") - def run_command(self, action: str): """ Executes a python command on the server. Only return tuple of x,y when action is "pyautogui.position()" """ - prefix = "import pyautogui; pyautogui.FAILSAFE = False;" - command_list = ["python", "-c", f"{prefix} {action}"] - parse = action == "pyautogui.position()" - if parse: - command_list[-1] = f"{prefix} print({action})" - - try: - print(f"run command: {command_list}") - # 使用 tool.execute_command 替代 requests.post - response = tool.execute_command(command_list) - time.sleep(0.7) # avoid async error as actions take time to complete - print(f"action executed") - if parse: - output = response['output'].strip() - match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output) - if not match: - raise ToolError(f"Could not parse coordinates from output: {output}") - x, y = map(int, match.groups()) - return x, y - except requests.exceptions.RequestException as e: - raise ToolError(f"An error occurred while trying to execute the command: {str(e)}") async def screenshot(self): if not hasattr(self, 'target_dimension'): @@ -457,71 +251,39 @@ class ComputerTool(BaseAnthropicTool): def scale_coordinates(self, source: ScalingSource, x: int, y: int): """Scale coordinates to a target maximum resolution.""" - if not self._scaling_enabled: - return x, y - ratio = self.width / self.height - target_dimension = None - - for target_name, dimension in MAX_SCALING_TARGETS.items(): - # allow some error in the aspect ratio - not ratios are exactly 16:9 - if abs(dimension["width"] / dimension["height"] - ratio) < 0.02: - if dimension["width"] < self.width: - target_dimension = dimension - self.target_dimension = target_dimension - # print(f"target_dimension: {target_dimension}") - break - - if target_dimension is None: - # TODO: currently we force the target to be WXGA (16:10), when it cannot find a match - target_dimension = MAX_SCALING_TARGETS["WXGA"] - self.target_dimension = MAX_SCALING_TARGETS["WXGA"] - - # should be less than 1 - x_scaling_factor = target_dimension["width"] / self.width - y_scaling_factor = target_dimension["height"] / self.height - if source == ScalingSource.API: - if x > self.width or y > self.height: - raise ToolError(f"Coordinates {x}, {y} are out of bounds") - # scale up - return round(x / x_scaling_factor), round(y / y_scaling_factor) - # scale down - return round(x * x_scaling_factor), round(y * y_scaling_factor) - def get_screen_size(self): - """Return width and height of the screen""" try: response = tool.execute_command( ["python", "-c", "import pyautogui; print(pyautogui.size())"] ) - output = response['output'].strip() match = re.search(r'Size\(width=(\d+),\s*height=(\d+)\)', output) if not match: diff --git a/util/download_weights.py b/util/download_weights.py index 5e5daf0..ad3797e 100644 --- a/util/download_weights.py +++ b/util/download_weights.py @@ -1,4 +1,3 @@ -import os import subprocess from pathlib import Path diff --git a/util/omniparser.py b/util/omniparser.py index 6f646d8..2e40d28 100644 --- a/util/omniparser.py +++ b/util/omniparser.py @@ -8,7 +8,6 @@ class Omniparser(object): def __init__(self, config: Dict): self.config = config device = 'cuda' if torch.cuda.is_available() else 'cpu' - self.som_model = get_yolo_model(model_path=config['som_model_path']) self.caption_model_processor = get_caption_model_processor(model_name=config['caption_model_name'], model_name_or_path=config['caption_model_path'], device=device) print('Omniparser initialized!') diff --git a/util/tool.py b/util/tool.py index 0aef45d..1dd973a 100644 --- a/util/tool.py +++ b/util/tool.py @@ -1,6 +1,4 @@ import os -import logging -import argparse import shlex import subprocess import threading @@ -31,7 +29,6 @@ def execute_command(command, shell=False): 'returncode': result.returncode } except Exception as e: - logger.error("\n" + traceback.format_exc() + "\n") return { 'status': 'error', 'message': str(e)