From af4a5bf3e4db8393d01ef36da0f774c505070871 Mon Sep 17 00:00:00 2001
From: yuruo <yuruotong1@163.com>
Date: Thu, 6 Mar 2025 16:39:52 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BB=A3=E7=A0=81=E7=BB=93?=
 =?UTF-8?q?=E6=9E=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README_CN.md                |  98 ---------------
 gradio_ui/tools/computer.py | 238 ------------------------------------
 util/download_weights.py    |   1 -
 util/omniparser.py          |   1 -
 util/tool.py                |   3 -
 5 files changed, 341 deletions(-)
 delete mode 100644 README_CN.md
diff --git a/README_CN.md b/README_CN.md
deleted file mode 100644
index 32f18ef..0000000
--- a/README_CN.md
+++ /dev/null
@@ -1,98 +0,0 @@
-<div align="center"><a name="readme-top"></a>
-
-<img src="./resources/logo.png" width="120" height="120" alt="autoMate logo">
-<h1>autoMate</h1>
-<p><b>🤖 AI驱动的本地自动化工具 | 让电脑自己会干活</b></p>
-
->"让繁琐自动化，把时间还给生活"
-
-![](./resources/autoMate.png)
-
-
-</div>
-
-## 💫 重新定义你与电脑的关系
-
-深夜加班处理重复性工作让你疲惫不堪？琐碎任务占用了你的创造力和宝贵时间？
-
-autoMate，不仅仅是一款普通工具，它是AGI第三阶段的智能助手，你的数字同事，始终高效运转，帮你重获工作与生活的平衡。
-
-
-**让自动化为你的生活创造更多可能。**
-
-
-## 💡 项目简介
-autoMate 是一款革命性的AI+RPA自动化工具，基于OmniParser构建，让AI成为你的"数字员工"，它能够
-
-- 📊 自动操作您的电脑界面，完成复杂的工作流程
-- 🔍 智能理解屏幕内容，模拟人类视觉和操作
-- 🧠 自主决策，根据任务需求进行判断并采取行动
-- 💻 支持本地化部署，保护您的数据安全和隐私
-
-不同于传统RPA工具的繁琐规则设置，autoMate借助大模型的能力，只需用自然语言描述任务，AI就能完成复杂的自动化流程。从此告别重复性工作，专注于真正创造价值的事情！
-
-## 🌟 为什么autoMate会改变你的工作方式
-
-> "在我使用autoMate之前，我每天花费3小时处理报表；现在，我只需10分钟设置任务，然后去做真正重要的事。"一位yy出来的财务经理的反馈。
-
-当你第一次看到autoMate自动完成那些曾经占用你数小时的工作时，你会有一种难以描述的释然。这不仅仅是效率的提升，更是对创造力的解放。
-
-想象一下：每天早上醒来，发现昨晚安排的数据整理、报表生成、邮件回复都已完成，等待你的只有真正需要你智慧和创造力的工作。这就是autoMate带给你的未来。
-
-## ✨ 功能特点
-
-- 🔮 无代码自动化 - 使用自然语言描述任务，无需编程知识
-- 🖥️ 全界面操控 - 支持任何可视化界面的操作，不限于特定软件
-- 🚅 简化安装 - 比官方版本更简洁的安装流程，支持中文环境，一键部署
-- 🔒 本地运行 - 保护数据安全，无需担心隐私泄露
-- 🌐 多模型支持 - 兼容主流大型语言模型
-- 💎 持续成长 - 随着你的使用，它会越来越了解你的工作习惯和需求
-
-## 🚀 快速开始
-
-### 📦 安装
-Clone项目，然后安装环境：
-
-```bash
-git clone https://github.com/yuruotong1/autoMate.git
-cd autoMate
-conda create -n "automate" python==3.12
-conda activate automate
-pip install -r requirements.txt
-```
-### 🎮 启动应用
-
-```bash
-python main.py
-```
-然后在浏览器中打开`http://localhost:7888/`，配置您的API密钥和基本设置。
-
-
-## 📝常见问题
-
-### 🔧CUDA版本不匹配问题
-如果启动时报：“显卡驱动不适配，请根据readme安装合适版本的 torch”，说明当前显卡驱动不适配。你可以不用管这条信息，只用CPU运行，但是会非常慢。你也可以：
-
-1. 运行`pip list`查看torch版本；
-2. 从[官网](https://pytorch.org/get-started/locally/)查看支持的cuda版本；
-3. 重新安装Nvidia驱动。
-
-
-## 🤝 参与共建
-
-每一个优秀的开源项目都凝聚着集体的智慧。autoMate的成长离不开你的参与和贡献。无论是修复bug、添加功能，还是改进文档，你的每一份付出都将帮助成千上万的人摆脱重复性工作的束缚。
-
-加入我们，一起创造更加智能的未来。
-
-> 强烈推荐阅读 [《提问的智慧》](https://github.com/ryanhanwu/How-To-Ask-Questions-The-Smart-Way)、[《如何向开源社区提问题》](https://github.com/seajs/seajs/issues/545) 和 [《如何有效地报告 Bug》](http://www.chiark.greenend.org.uk/%7Esgtatham/bugs-cn.html)、[《如何向开源项目提交无法解答的问题》](https://zhuanlan.zhihu.com/p/25795393)，更好的问题更容易获得帮助。
-
-<a href="https://github.com/yuruotong1/autoMate/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=yuruotong1/autoMate" />
-</a>
-
----
-
-<div align="center">
-⭐ 每一个Star都是对创作者的鼓励，也是让更多人发现并受益于autoMate的机会 ⭐
-今天你的支持，就是我们明天前进的动力
-</div>
\ No newline at end of file
diff --git a/gradio_ui/tools/computer.py b/gradio_ui/tools/computer.py
index 8f0331a..31d89c4 100644
--- a/gradio_ui/tools/computer.py
+++ b/gradio_ui/tools/computer.py
@@ -1,439 +1,233 @@
 import base64
 import time
-
 from enum import StrEnum
-
 from typing import Literal, TypedDict
-
-
 from PIL import Image
 from util import tool
-
 from anthropic.types.beta import BetaToolComputerUse20241022Param
-
-
 from .base import BaseAnthropicTool, ToolError, ToolResult
-
 from .screen_capture import get_screenshot
-
 import requests
 import re
 
-
 OUTPUT_DIR = "./tmp/outputs"
-
-
 TYPING_DELAY_MS = 12
-
 TYPING_GROUP_SIZE = 50
 
 
 Action = Literal[
-
     "key",
-
     "type",
-
     "mouse_move",
-
     "left_click",
-
     "left_click_drag",
-
     "right_click",
-
     "middle_click",
-
     "double_click",
-
     "screenshot",
-
     "cursor_position",
-
     "hover",
-
     "wait"
-
 ]
 
-
-
 class Resolution(TypedDict):
-
     width: int
-
     height: int
 
-
-
 MAX_SCALING_TARGETS: dict[str, Resolution] = {
-
     "XGA": Resolution(width=1024, height=768),  # 4:3
-
     "WXGA": Resolution(width=1280, height=800),  # 16:10
-
     "FWXGA": Resolution(width=1366, height=768),  # ~16:9
-
 }
 
 
-
 class ScalingSource(StrEnum):
-
     COMPUTER = "computer"
-
     API = "api"
 
-
-
 class ComputerToolOptions(TypedDict):
-
     display_height_px: int
-
     display_width_px: int
-
     display_number: int | None
 
-
-
 def chunks(s: str, chunk_size: int) -> list[str]:
-
     return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)]
 
-
 class ComputerTool(BaseAnthropicTool):
     """
-
     A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
 
     Adapted for Windows using 'pyautogui'.
     """
-
-
     name: Literal["computer"] = "computer"
-
     api_type: Literal["computer_20241022"] = "computer_20241022"
-
     width: int
-
     height: int
-
     display_num: int | None
-
-
     _screenshot_delay = 2.0
-
     _scaling_enabled = True
 
 
     @property
-
     def options(self) -> ComputerToolOptions:
-
         width, height = self.scale_coordinates(
-
             ScalingSource.COMPUTER, self.width, self.height
         )
-
         return {
-
             "display_width_px": width,
-
             "display_height_px": height,
-
             "display_number": self.display_num,
-
         }
 
-
     def to_params(self) -> BetaToolComputerUse20241022Param:
-
         return {"name": self.name, "type": self.api_type, **self.options}
 
 
     def __init__(self, is_scaling: bool = False):
         super().__init__()
-
-
         # Get screen width and height using Windows command
-
         self.display_num = None
-
         self.offset_x = 0
-
         self.offset_y = 0
-
         self.is_scaling = is_scaling
-
         self.width, self.height = self.get_screen_size()
-
         print(f"screen size: {self.width}, {self.height}")
-
-
         self.key_conversion = {"Page_Down": "pagedown",
-
                                "Page_Up": "pageup",
-
                                "Super_L": "win",
-
                                "Escape": "esc"}
-
-
-
     async def __call__(
-
         self,
-
         *,
-
         action: Action,
-
         text: str | None = None,
-
         coordinate: tuple[int, int] | None = None,
-
         **kwargs,
-
     ):
-
         print(f"action: {action}, text: {text}, coordinate: {coordinate}, is_scaling: {self.is_scaling}")
-
         if action in ("mouse_move", "left_click_drag"):
-
             if coordinate is None:
-
                 raise ToolError(f"coordinate is required for {action}")
-
             if text is not None:
-
                 raise ToolError(f"text is not accepted for {action}")
-
             if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
-
                 raise ToolError(f"{coordinate} must be a tuple of length 2")
-
             # if not all(isinstance(i, int) and i >= 0 for i in coordinate):
-
             if not all(isinstance(i, int) for i in coordinate):
-
                 raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
-            
-
             if self.is_scaling:
-
                 x, y = self.scale_coordinates(
-
                     ScalingSource.API, coordinate[0], coordinate[1]
                 )
-
             else:
-
                 x, y = coordinate
-
-
             # print(f"scaled_coordinates: {x}, {y}")
-
             # print(f"offset: {self.offset_x}, {self.offset_y}")
-            
-
             # x += self.offset_x # TODO - check if this is needed
-
             # y += self.offset_y
-
-
             print(f"mouse move to {x}, {y}")
-            
-
             if action == "mouse_move":
                 self.run_command(f"pyautogui.moveTo({x}, {y})")
-
                 return ToolResult(output=f"Moved mouse to ({x}, {y})")
-
             elif action == "left_click_drag":
-
                 current_x, current_y = self.run_command("pyautogui.position()")
                 self.run_command(f"pyautogui.dragTo({x}, {y}, duration=0.5)")
-
                 return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
-
-
         if action in ("key", "type"):
-
             if text is None:
-
                 raise ToolError(f"text is required for {action}")
-
             if coordinate is not None:
-
                 raise ToolError(f"coordinate is not accepted for {action}")
-
             if not isinstance(text, str):
-
                 raise ToolError(output=f"{text} must be a string")
-
-
             if action == "key":
-
                 # Handle key combinations
-
                 keys = text.split('+')
-
                 for key in keys:
-
                     key = self.key_conversion.get(key.strip(), key.strip())
-
                     key = key.lower()
                     self.run_command(f"pyautogui.keyDown('{key}')")  # Press down each key
-
                 for key in reversed(keys):
-
                     key = self.key_conversion.get(key.strip(), key.strip())
-
                     key = key.lower()
                     self.run_command(f"pyautogui.keyUp('{key}')")    # Release each key in reverse order
-
                 return ToolResult(output=f"Pressed keys: {text}")
-            
-
             elif action == "type":
-
                 # default click before type TODO: check if this is needed
                 self.run_command("pyautogui.click()")
                 self.run_command(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")
                 self.run_command("pyautogui.press('enter')")
-
                 screenshot_base64 = (await self.screenshot()).base64_image
-
                 return ToolResult(output=text, base64_image=screenshot_base64)
-
         if action in (
-
             "left_click",
-
             "right_click",
-
             "double_click",
-
             "middle_click",
-
             "screenshot",
-
             "cursor_position",
-
             "left_press",
-
         ):
-
             if text is not None:
-
                 raise ToolError(f"text is not accepted for {action}")
-
             if coordinate is not None:
-
                 raise ToolError(f"coordinate is not accepted for {action}")
-
-
             if action == "screenshot":
-
                 return await self.screenshot()
-
             elif action == "cursor_position":
-
                 x, y = self.run_command("pyautogui.position()")
-
                 x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
-
                 return ToolResult(output=f"X={x},Y={y}")
-
             else:
-
                 if action == "left_click":
                     self.run_command("pyautogui.click()")
-
                 elif action == "right_click":
                     self.run_command("pyautogui.rightClick()")
-
                 elif action == "middle_click":
                     self.run_command("pyautogui.middleClick()")
-
                 elif action == "double_click":
                     self.run_command("pyautogui.doubleClick()")
-
                 elif action == "left_press":
                     self.run_command("pyautogui.mouseDown()")
-
                     time.sleep(1)
                     self.run_command("pyautogui.mouseUp()")
-
                 return ToolResult(output=f"Performed {action}")
-
         if action in ("scroll_up", "scroll_down"):
-
             if action == "scroll_up":
                 self.run_command("pyautogui.scroll(100)")
-
             elif action == "scroll_down":
                 self.run_command("pyautogui.scroll(-100)")
-
             return ToolResult(output=f"Performed {action}")
-
         if action == "hover":
-
             return ToolResult(output=f"Performed {action}")
-
         if action == "wait":
-
             time.sleep(1)
-
             return ToolResult(output=f"Performed {action}")
-
         raise ToolError(f"Invalid action: {action}")
-
     def run_command(self, action: str):
         """
 
         Executes a python command on the server. Only return tuple of x,y when action is "pyautogui.position()"
         """
-
         prefix = "import pyautogui; pyautogui.FAILSAFE = False;"
-
         command_list = ["python", "-c", f"{prefix} {action}"]
-
         parse = action == "pyautogui.position()"
-
         if parse:
-
             command_list[-1] = f"{prefix} print({action})"
-
-
         try:
-
             print(f"run command: {command_list}")
-
             # 使用 tool.execute_command 替代 requests.post
-
             response = tool.execute_command(command_list)
-
             time.sleep(0.7) # avoid async error as actions take time to complete
-
             print(f"action executed")
-
             if parse:
-
                 output = response['output'].strip()
-
                 match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output)
-
                 if not match:
-
                     raise ToolError(f"Could not parse coordinates from output: {output}")
-
                 x, y = map(int, match.groups())
-
                 return x, y
-
         except requests.exceptions.RequestException as e:
-
             raise ToolError(f"An error occurred while trying to execute the command: {str(e)}")
     async def screenshot(self):
         if not hasattr(self, 'target_dimension'):
@@ -457,71 +251,39 @@ class ComputerTool(BaseAnthropicTool):
     def scale_coordinates(self, source: ScalingSource, x: int, y: int):
 
         """Scale coordinates to a target maximum resolution."""
-
         if not self._scaling_enabled:
-
             return x, y
-
         ratio = self.width / self.height
-
         target_dimension = None
-
-
         for target_name, dimension in MAX_SCALING_TARGETS.items():
-
             # allow some error in the aspect ratio - not ratios are exactly 16:9
-
             if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
-
                 if dimension["width"] < self.width:
-
                     target_dimension = dimension
-
                     self.target_dimension = target_dimension
-
                     # print(f"target_dimension: {target_dimension}")
-
                 break
-
-
         if target_dimension is None:
-
             # TODO: currently we force the target to be WXGA (16:10), when it cannot find a match
-
             target_dimension = MAX_SCALING_TARGETS["WXGA"]
-
             self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
-
-
         # should be less than 1
-
         x_scaling_factor = target_dimension["width"] / self.width
-
         y_scaling_factor = target_dimension["height"] / self.height
-
         if source == ScalingSource.API:
-
             if x > self.width or y > self.height:
-
                 raise ToolError(f"Coordinates {x}, {y} are out of bounds")
-
             # scale up
-
             return round(x / x_scaling_factor), round(y / y_scaling_factor)
-
         # scale down
-
         return round(x * x_scaling_factor), round(y * y_scaling_factor)
         
-
     def get_screen_size(self):
-
         """Return width and height of the screen"""
         try:
             response = tool.execute_command(
                 ["python", "-c", "import pyautogui; print(pyautogui.size())"]
             )
-            
             output = response['output'].strip()
             match = re.search(r'Size\(width=(\d+),\s*height=(\d+)\)', output)
             if not match:
diff --git a/util/download_weights.py b/util/download_weights.py
index 5e5daf0..ad3797e 100644
--- a/util/download_weights.py
+++ b/util/download_weights.py
@@ -1,4 +1,3 @@
-import os
 import subprocess
 from pathlib import Path
 
diff --git a/util/omniparser.py b/util/omniparser.py
index 6f646d8..2e40d28 100644
--- a/util/omniparser.py
+++ b/util/omniparser.py
@@ -8,7 +8,6 @@ class Omniparser(object):
     def __init__(self, config: Dict):
         self.config = config
         device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
         self.som_model = get_yolo_model(model_path=config['som_model_path'])
         self.caption_model_processor = get_caption_model_processor(model_name=config['caption_model_name'], model_name_or_path=config['caption_model_path'], device=device)
         print('Omniparser initialized!')
diff --git a/util/tool.py b/util/tool.py
index 0aef45d..1dd973a 100644
--- a/util/tool.py
+++ b/util/tool.py
@@ -1,6 +1,4 @@
 import os
-import logging
-import argparse
 import shlex
 import subprocess
 import threading
@@ -31,7 +29,6 @@ def execute_command(command, shell=False):
                 'returncode': result.returncode
             }
         except Exception as e:
-            logger.error("\n" + traceback.format_exc() + "\n")
             return {
                 'status': 'error',
                 'message': str(e)