Update code structure

yuruo 2025-03-06 16:39:52 +08:00
parent d793055105
commit af4a5bf3e4
5 changed files with 0 additions and 341 deletions


@@ -1,98 +0,0 @@
<div align="center"><a name="readme-top"></a>
<img src="./resources/logo.png" width="120" height="120" alt="autoMate logo">
<h1>autoMate</h1>
<p><b>🤖 AI-powered local automation tool | Let your computer do the work itself</b></p>
>"Automate the tedious; give time back to life"
![](./resources/autoMate.png)
</div>
## 💫 Redefine Your Relationship with Your Computer
Worn out by repetitive work late into the night? Trivial tasks eating into your creativity and your time?
autoMate is more than an ordinary tool: it is an intelligent assistant for the third stage of AGI, a digital colleague that works tirelessly and helps you win back your work-life balance.
**Let automation create more possibilities for your life.**
## 💡 Project Overview
autoMate is a revolutionary AI + RPA automation tool built on OmniParser that turns AI into your "digital employee". It can:
- 📊 Operate your computer's interface automatically to complete complex workflows
- 🔍 Understand on-screen content intelligently, simulating human vision and actions
- 🧠 Make decisions autonomously, judging and acting according to the task
- 💻 Deploy locally, keeping your data secure and private
Unlike traditional RPA tools with their tedious rule configuration, autoMate leverages large language models: describe the task in natural language, and the AI carries out the whole automation flow. Say goodbye to repetitive work and focus on what truly creates value!
## 🌟 Why autoMate Will Change the Way You Work
> "Before autoMate, I spent three hours a day on reports; now I spend ten minutes setting up the task and get on with what really matters." (feedback from an imaginary finance manager)
The first time you watch autoMate finish work that used to take you hours, you feel a relief that is hard to put into words. It is not just a gain in efficiency; it is a liberation of creativity.
Imagine waking up each morning to find last night's data cleanup, report generation, and email replies already done, with only the work that genuinely needs your insight and creativity waiting for you. That is the future autoMate brings.
## ✨ Features
- 🔮 No-code automation - describe tasks in natural language, no programming knowledge required
- 🖥️ Full-interface control - operates any visual interface, not just specific software
- 🚅 Simplified installation - a leaner setup than the official version, with Chinese-environment support and one-click deployment
- 🔒 Local execution - your data stays on your machine, with no privacy concerns
- 🌐 Multi-model support - compatible with mainstream large language models
- 💎 Continuous growth - the more you use it, the better it understands your working habits and needs
## 🚀 Quick Start
### 📦 Installation
Clone the project, then set up the environment:
```bash
git clone https://github.com/yuruotong1/autoMate.git
cd autoMate
conda create -n "automate" python==3.12
conda activate automate
pip install -r requirements.txt
```
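Optionally, before launching you can check whether torch can see your GPU (a quick sanity check, not an official setup step; see the FAQ below):
```bash
# Prints the torch version and whether CUDA is available;
# "False" means the app will fall back to the much slower CPU path.
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```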
### 🎮 Launch the App
```bash
python main.py
```
Then open `http://localhost:7888/` in your browser to configure your API key and basic settings.
## 📝 FAQ
### 🔧 CUDA version mismatch
If startup reports "显卡驱动不适配,请根据readme安装合适版本的 torch" (the GPU driver does not match; install a suitable torch build per the README), your installed torch does not match the current GPU driver. You can ignore the message and run on CPU only, though it will be very slow. Alternatively:
1. Run `pip list` to check the installed torch version.
2. Check the CUDA versions supported by your setup on the [official site](https://pytorch.org/get-started/locally/) and reinstall a matching torch build (see the sketch below).
3. Reinstall the NVIDIA driver if needed.
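As a rough sketch (assuming your driver supports CUDA 12.1; check the `CUDA Version` field printed by `nvidia-smi`), reinstalling a matching build could look like this:
```bash
# cu121 is an assumption - substitute the CUDA version your driver supports
pip uninstall -y torch torchvision
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
```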
## 🤝 Contributing
Every great open-source project embodies collective wisdom, and autoMate can only grow with your participation. Whether you fix a bug, add a feature, or improve the documentation, every contribution helps thousands of people escape repetitive work.
Join us and build a smarter future together.
> Strongly recommended reading: [How To Ask Questions The Smart Way](https://github.com/ryanhanwu/How-To-Ask-Questions-The-Smart-Way), [How to Ask Questions of Open-Source Communities](https://github.com/seajs/seajs/issues/545), [How to Report Bugs Effectively](http://www.chiark.greenend.org.uk/%7Esgtatham/bugs-cn.html), and [How to Submit Unanswerable Questions to Open-Source Projects](https://zhuanlan.zhihu.com/p/25795393) - better questions get better help.
<a href="https://github.com/yuruotong1/autoMate/graphs/contributors">
<img src="https://contrib.rocks/image?repo=yuruotong1/autoMate" />
</a>
---
<div align="center">
⭐ Every star encourages the creators and helps more people discover and benefit from autoMate ⭐
Your support today is our motivation for tomorrow!
</div>


@@ -1,439 +1,233 @@
import base64
import time
from enum import StrEnum
from typing import Literal, TypedDict
from PIL import Image
from util import tool
from anthropic.types.beta import BetaToolComputerUse20241022Param
from .base import BaseAnthropicTool, ToolError, ToolResult
from .screen_capture import get_screenshot
import requests
import re
OUTPUT_DIR = "./tmp/outputs"
TYPING_DELAY_MS = 12
TYPING_GROUP_SIZE = 50
Action = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "screenshot",
    "cursor_position",
    "hover",
    "wait",
]
class Resolution(TypedDict):
    width: int
    height: int

MAX_SCALING_TARGETS: dict[str, Resolution] = {
    "XGA": Resolution(width=1024, height=768),  # 4:3
    "WXGA": Resolution(width=1280, height=800),  # 16:10
    "FWXGA": Resolution(width=1366, height=768),  # ~16:9
}

class ScalingSource(StrEnum):
    COMPUTER = "computer"
    API = "api"

class ComputerToolOptions(TypedDict):
    display_height_px: int
    display_width_px: int
    display_number: int | None

def chunks(s: str, chunk_size: int) -> list[str]:
    return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)]
class ComputerTool(BaseAnthropicTool):
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
    Adapted for Windows using 'pyautogui'.
    """

    name: Literal["computer"] = "computer"
    api_type: Literal["computer_20241022"] = "computer_20241022"
    width: int
    height: int
    display_num: int | None

    _screenshot_delay = 2.0
    _scaling_enabled = True

    @property
    def options(self) -> ComputerToolOptions:
        width, height = self.scale_coordinates(
            ScalingSource.COMPUTER, self.width, self.height
        )
        return {
            "display_width_px": width,
            "display_height_px": height,
            "display_number": self.display_num,
        }

    def to_params(self) -> BetaToolComputerUse20241022Param:
        return {"name": self.name, "type": self.api_type, **self.options}

    def __init__(self, is_scaling: bool = False):
        super().__init__()
        # Get screen width and height using Windows command
        self.display_num = None
        self.offset_x = 0
        self.offset_y = 0
        self.is_scaling = is_scaling
        self.width, self.height = self.get_screen_size()
        print(f"screen size: {self.width}, {self.height}")
        self.key_conversion = {
            "Page_Down": "pagedown",
            "Page_Up": "pageup",
            "Super_L": "win",
            "Escape": "esc",
        }
    async def __call__(
        self,
        *,
        action: Action,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        print(f"action: {action}, text: {text}, coordinate: {coordinate}, is_scaling: {self.is_scaling}")
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ToolError(f"{coordinate} must be a tuple of length 2")
            # if not all(isinstance(i, int) and i >= 0 for i in coordinate):
            if not all(isinstance(i, int) for i in coordinate):
                raise ToolError(f"{coordinate} must be a tuple of ints")
            if self.is_scaling:
                x, y = self.scale_coordinates(
                    ScalingSource.API, coordinate[0], coordinate[1]
                )
            else:
                x, y = coordinate
            # print(f"scaled_coordinates: {x}, {y}")
            # print(f"offset: {self.offset_x}, {self.offset_y}")
            # x += self.offset_x  # TODO - check if this is needed
            # y += self.offset_y
            print(f"mouse move to {x}, {y}")
            if action == "mouse_move":
                self.run_command(f"pyautogui.moveTo({x}, {y})")
                return ToolResult(output=f"Moved mouse to ({x}, {y})")
            elif action == "left_click_drag":
                current_x, current_y = self.run_command("pyautogui.position()")
                self.run_command(f"pyautogui.dragTo({x}, {y}, duration=0.5)")
                return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                raise ToolError(f"{text} must be a string")
            if action == "key":
                # Handle key combinations: press every key down in order,
                # then release them in reverse order.
                keys = text.split('+')
                for key in keys:
                    key = self.key_conversion.get(key.strip(), key.strip())
                    key = key.lower()
                    self.run_command(f"pyautogui.keyDown('{key}')")  # press down each key
                for key in reversed(keys):
                    key = self.key_conversion.get(key.strip(), key.strip())
                    key = key.lower()
                    self.run_command(f"pyautogui.keyUp('{key}')")  # release each key in reverse order
                return ToolResult(output=f"Pressed keys: {text}")
            elif action == "type":
                # default click before typing; TODO: check if this is needed
                self.run_command("pyautogui.click()")
                self.run_command(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")
                self.run_command("pyautogui.press('enter')")
                screenshot_base64 = (await self.screenshot()).base64_image
                return ToolResult(output=text, base64_image=screenshot_base64)

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "middle_click",
            "screenshot",
            "cursor_position",
            "left_press",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if action == "screenshot":
                return await self.screenshot()
            elif action == "cursor_position":
                x, y = self.run_command("pyautogui.position()")
                x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
                return ToolResult(output=f"X={x},Y={y}")
            else:
                if action == "left_click":
                    self.run_command("pyautogui.click()")
                elif action == "right_click":
                    self.run_command("pyautogui.rightClick()")
                elif action == "middle_click":
                    self.run_command("pyautogui.middleClick()")
                elif action == "double_click":
                    self.run_command("pyautogui.doubleClick()")
                elif action == "left_press":
                    self.run_command("pyautogui.mouseDown()")
                    time.sleep(1)
                    self.run_command("pyautogui.mouseUp()")
                return ToolResult(output=f"Performed {action}")

        # Note: scroll_up/scroll_down and left_press are handled even though
        # they are not part of the Action literal above.
        if action in ("scroll_up", "scroll_down"):
            if action == "scroll_up":
                self.run_command("pyautogui.scroll(100)")
            elif action == "scroll_down":
                self.run_command("pyautogui.scroll(-100)")
            return ToolResult(output=f"Performed {action}")
        if action == "hover":
            return ToolResult(output=f"Performed {action}")
        if action == "wait":
            time.sleep(1)
            return ToolResult(output=f"Performed {action}")
        raise ToolError(f"Invalid action: {action}")
    def run_command(self, action: str):
        """
        Executes a python command in a subprocess. Only returns a tuple of
        (x, y) when action is "pyautogui.position()".
        """
        prefix = "import pyautogui; pyautogui.FAILSAFE = False;"
        command_list = ["python", "-c", f"{prefix} {action}"]
        parse = action == "pyautogui.position()"
        if parse:
            command_list[-1] = f"{prefix} print({action})"
        try:
            print(f"run command: {command_list}")
            # use tool.execute_command instead of requests.post
            response = tool.execute_command(command_list)
            time.sleep(0.7)  # avoid async errors, as actions take time to complete
            print("action executed")
            if parse:
                output = response['output'].strip()
                match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output)
                if not match:
                    raise ToolError(f"Could not parse coordinates from output: {output}")
                x, y = map(int, match.groups())
                return x, y
        except requests.exceptions.RequestException as e:
            raise ToolError(f"An error occurred while trying to execute the command: {str(e)}")

    async def screenshot(self):
        if not hasattr(self, 'target_dimension'):
@@ -457,71 +251,39 @@ class ComputerTool(BaseAnthropicTool):
    def scale_coordinates(self, source: ScalingSource, x: int, y: int):
        """Scale coordinates to a target maximum resolution."""
        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for target_name, dimension in MAX_SCALING_TARGETS.items():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                if dimension["width"] < self.width:
                    target_dimension = dimension
                    self.target_dimension = target_dimension
                    # print(f"target_dimension: {target_dimension}")
                break
        if target_dimension is None:
            # TODO: currently we force the target to be WXGA (16:10) when no match is found
            target_dimension = MAX_SCALING_TARGETS["WXGA"]
            self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
        # scaling factors should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == ScalingSource.API:
            if x > self.width or y > self.height:
                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
            # scale up from the model's coordinate space to physical pixels
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down from physical pixels to the model's coordinate space
        return round(x * x_scaling_factor), round(y * y_scaling_factor)
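    # Worked example (illustrative): on a 2560x1600 screen, WXGA (1280x800)
    # matches the 16:10 ratio and is smaller, so x_scaling_factor = 1280/2560 = 0.5.
    # An API coordinate (640, 400) scales up to the physical (1280, 800), and the
    # physical (1280, 800) scales back down to (640, 400) for the model.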
    def get_screen_size(self):
        """Return width and height of the screen."""
        try:
            response = tool.execute_command(
                ["python", "-c", "import pyautogui; print(pyautogui.size())"]
            )
            output = response['output'].strip()
            match = re.search(r'Size\(width=(\d+),\s*height=(\d+)\)', output)
            if not match:


@@ -1,4 +1,3 @@
import os
import subprocess
from pathlib import Path


@@ -8,7 +8,6 @@ class Omniparser(object):
    def __init__(self, config: Dict):
        self.config = config
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.som_model = get_yolo_model(model_path=config['som_model_path'])
        self.caption_model_processor = get_caption_model_processor(model_name=config['caption_model_name'], model_name_or_path=config['caption_model_path'], device=device)
        print('Omniparser initialized!')


@@ -1,6 +1,4 @@
import os
import logging
import argparse
import shlex
import subprocess
import threading
@@ -31,7 +29,6 @@ def execute_command(command, shell=False):
            'returncode': result.returncode
        }
    except Exception as e:
        logger.error("\n" + traceback.format_exc() + "\n")
        return {
            'status': 'error',
            'message': str(e)