diff --git a/gradio_ui/.DS_Store b/auto_control/.DS_Store
similarity index 100%
rename from gradio_ui/.DS_Store
rename to auto_control/.DS_Store
diff --git a/gradio_ui/__init__.py b/auto_control/__init__.py
similarity index 100%
rename from gradio_ui/__init__.py
rename to auto_control/__init__.py
diff --git a/gradio_ui/agent/base_agent.py b/auto_control/agent/base_agent.py
similarity index 100%
rename from gradio_ui/agent/base_agent.py
rename to auto_control/agent/base_agent.py
diff --git a/gradio_ui/agent/task_plan_agent.py b/auto_control/agent/task_plan_agent.py
similarity index 95%
rename from gradio_ui/agent/task_plan_agent.py
rename to auto_control/agent/task_plan_agent.py
index 4a082f9..7470c1e 100644
--- a/gradio_ui/agent/task_plan_agent.py
+++ b/auto_control/agent/task_plan_agent.py
@@ -1,9 +1,9 @@
import json
from pydantic import BaseModel, Field
-from gradio_ui.agent.base_agent import BaseAgent
+from auto_control.agent.base_agent import BaseAgent
from xbrain.core.chat import run
-from gradio_ui.tools.computer import Action
+from auto_control.tools.computer import Action
class TaskPlanAgent(BaseAgent):
def __call__(self, messages, parsed_screen_result):
diff --git a/gradio_ui/agent/task_run_agent.py b/auto_control/agent/task_run_agent.py
similarity index 98%
rename from gradio_ui/agent/task_run_agent.py
rename to auto_control/agent/task_run_agent.py
index d83387e..9b3c72f 100644
--- a/gradio_ui/agent/task_run_agent.py
+++ b/auto_control/agent/task_run_agent.py
@@ -2,10 +2,10 @@ import json
import uuid
from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, BetaMessageParam, BetaUsage
from pydantic import Field, create_model
-from gradio_ui.agent.base_agent import BaseAgent
+from auto_control.agent.base_agent import BaseAgent
from xbrain.core.chat import run
-from gradio_ui.tools.computer import Action
+from auto_control.tools.computer import Action
class TaskRunAgent(BaseAgent):
def __init__(self):
self.OUTPUT_DIR = "./tmp/outputs"
diff --git a/gradio_ui/agent/vision_agent.py b/auto_control/agent/vision_agent.py
similarity index 100%
rename from gradio_ui/agent/vision_agent.py
rename to auto_control/agent/vision_agent.py
diff --git a/gradio_ui/app.py b/auto_control/app.py
similarity index 98%
rename from gradio_ui/app.py
rename to auto_control/app.py
index 94df407..6fd7171 100644
--- a/gradio_ui/app.py
+++ b/auto_control/app.py
@@ -7,8 +7,8 @@ import os
from pathlib import Path
import argparse
import gradio as gr
-from gradio_ui.agent.vision_agent import VisionAgent
-from gradio_ui.loop import (
+from auto_control.agent.vision_agent import VisionAgent
+from auto_control.loop import (
sampling_loop_sync,
)
import base64
@@ -349,4 +349,4 @@ def run():
while True:
time.sleep(1)
except KeyboardInterrupt:
- print("\n💤 closing server")
+        print("\n💤 closing server")
diff --git a/gradio_ui/executor/anthropic_executor.py b/auto_control/executor/anthropic_executor.py
similarity index 93%
rename from gradio_ui/executor/anthropic_executor.py
rename to auto_control/executor/anthropic_executor.py
index 020bb65..8e99163 100644
--- a/gradio_ui/executor/anthropic_executor.py
+++ b/auto_control/executor/anthropic_executor.py
@@ -3,7 +3,7 @@ from typing import Any, cast
from anthropic.types.beta import (
BetaContentBlock
)
-from gradio_ui.tools import ComputerTool, ToolCollection
+from auto_control.tools import ComputerTool, ToolCollection
class AnthropicExecutor:
diff --git a/gradio_ui/loop.py b/auto_control/loop.py
similarity index 94%
rename from gradio_ui/loop.py
rename to auto_control/loop.py
index 4ba08c2..5bd84ef 100644
--- a/gradio_ui/loop.py
+++ b/auto_control/loop.py
@@ -4,12 +4,12 @@ Agentic sampling loop that calls the Anthropic API and local implenmentation of
import base64
from io import BytesIO
import cv2
-from gradio_ui.agent.vision_agent import VisionAgent
-from gradio_ui.tools.screen_capture import get_screenshot
+from auto_control.agent.vision_agent import VisionAgent
+from auto_control.tools.screen_capture import get_screenshot
from anthropic.types.beta import (BetaMessageParam)
-from gradio_ui.agent.task_plan_agent import TaskPlanAgent
-from gradio_ui.agent.task_run_agent import TaskRunAgent
-from gradio_ui.executor.anthropic_executor import AnthropicExecutor
+from auto_control.agent.task_plan_agent import TaskPlanAgent
+from auto_control.agent.task_run_agent import TaskRunAgent
+from auto_control.executor.anthropic_executor import AnthropicExecutor
import numpy as np
from PIL import Image
diff --git a/gradio_ui/tools/__init__.py b/auto_control/tools/__init__.py
similarity index 100%
rename from gradio_ui/tools/__init__.py
rename to auto_control/tools/__init__.py
diff --git a/gradio_ui/tools/base.py b/auto_control/tools/base.py
similarity index 100%
rename from gradio_ui/tools/base.py
rename to auto_control/tools/base.py
diff --git a/gradio_ui/tools/collection.py b/auto_control/tools/collection.py
similarity index 100%
rename from gradio_ui/tools/collection.py
rename to auto_control/tools/collection.py
diff --git a/gradio_ui/tools/computer.py b/auto_control/tools/computer.py
similarity index 100%
rename from gradio_ui/tools/computer.py
rename to auto_control/tools/computer.py
diff --git a/gradio_ui/tools/screen_capture.py b/auto_control/tools/screen_capture.py
similarity index 100%
rename from gradio_ui/tools/screen_capture.py
rename to auto_control/tools/screen_capture.py
diff --git a/gradio_ui/.gitignore b/gradio_ui/.gitignore
deleted file mode 100644
index c036379..0000000
--- a/gradio_ui/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-tmp/
\ No newline at end of file
diff --git a/imgs/wechat/chat_select.png b/imgs/wechat/chat_select.png
new file mode 100644
index 0000000..2477a86
Binary files /dev/null and b/imgs/wechat/chat_select.png differ
diff --git a/imgs/wechat/chat_unselect.png b/imgs/wechat/chat_unselect.png
new file mode 100644
index 0000000..6ca3de0
Binary files /dev/null and b/imgs/wechat/chat_unselect.png differ
diff --git a/imgs/wechat/contact_person.png b/imgs/wechat/contact_person.png
new file mode 100644
index 0000000..779bf02
Binary files /dev/null and b/imgs/wechat/contact_person.png differ
diff --git a/imgs/wechat/search.png b/imgs/wechat/search.png
new file mode 100644
index 0000000..7523d59
Binary files /dev/null and b/imgs/wechat/search.png differ
diff --git a/imgs/wechat/send_message.png b/imgs/wechat/send_message.png
new file mode 100644
index 0000000..c045c96
Binary files /dev/null and b/imgs/wechat/send_message.png differ
diff --git a/main.py b/main.py
index af3827d..8c019dc 100644
--- a/main.py
+++ b/main.py
@@ -1,11 +1,9 @@
-from gradio_ui import app
-import os
+from ui.main import main
from util import download_weights
-os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
def run():
download_weights.download()
- app.run()
+ main()
+
+if __name__ == "__main__":
+ run()
-
-if __name__ == '__main__':
- run()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 43c1040..491dee3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,6 @@ timm
einops==0.8.0
modelscope
pynput
-lap
\ No newline at end of file
+lap
+pyqt6==6.8.1
+keyboard==0.13.5
\ No newline at end of file
diff --git a/ui/__init__.py b/ui/__init__.py
new file mode 100644
index 0000000..6396c95
--- /dev/null
+++ b/ui/__init__.py
@@ -0,0 +1,3 @@
+"""
+autoMate UI package
+"""
\ No newline at end of file
diff --git a/ui/agent_worker.py b/ui/agent_worker.py
new file mode 100644
index 0000000..e2da3b6
--- /dev/null
+++ b/ui/agent_worker.py
@@ -0,0 +1,174 @@
+"""
+Worker thread for handling agent operations
+"""
+import json
+from PyQt6.QtCore import QThread, pyqtSignal
+
+from auto_control.loop import sampling_loop_sync
+from xbrain.utils.config import Config
+
+class AgentWorker(QThread):
+ """Worker thread for running agent operations asynchronously"""
+
+    # Emitted with the chat message list and a list of [status, task] rows
+    update_signal = pyqtSignal(list, list)
+    status_signal = pyqtSignal(str)  # Signal for status updates
+    task_signal = pyqtSignal(str)    # Signal for current task
+    error_signal = pyqtSignal(str)   # Error signal
+
+    def __init__(self, user_input, state, vision_agent):
+        """Remember everything run() needs once the thread is started.
+
+        Args:
+            user_input: Text appended to the conversation as the user message.
+            state: Application state dict; run() reads and mutates it in place.
+            vision_agent: Object passed to sampling_loop_sync as ``vision_agent``.
+        """
+        super().__init__()
+        self.vision_agent = vision_agent
+        self.state = state
+        self.user_input = user_input
+
+    def run(self):
+        """Run the agent loop for the stored user input and report progress.
+
+        Configures the API client from ``self.state``, appends the user message
+        to both message lists and then iterates ``sampling_loop_sync``. After each
+        iteration the task statuses and the chat transcript are rebuilt from
+        ``self.state["messages"]`` and published through ``update_signal``.
+        The loop returns early once ``self.state["stop"]`` is set. Any exception
+        is printed with its traceback, appended to the chat as an assistant
+        message and reported through ``error_signal`` and ``status_signal``.
+        """
+        # Reset stop flag left over from a previous run
+        self.state["stop"] = False
+
+        # Configure API
+        config = Config()
+        config.set_openai_config(
+            base_url=self.state["base_url"],
+            api_key=self.state["api_key"],
+            model=self.state["model"]
+        )
+
+        # Add user message
+        self.state["messages"].append({"role": "user", "content": self.user_input})
+        self.state["chatbox_messages"].append({"role": "user", "content": self.user_input})
+
+        # Send initial update
+        self.update_signal.emit(self.state["chatbox_messages"], [])
+        self.status_signal.emit("Starting analysis...")
+
+        try:
+            # Process with agent
+            for _ in sampling_loop_sync(
+                model=self.state["model"],
+                messages=self.state["messages"],
+                vision_agent=self.vision_agent,
+                screen_region=self.state.get("screen_region", None)
+            ):
+                if self.state["stop"]:
+                    self.state["chatbox_messages"].append({"role": "user", "content": "Stop!"})
+                    self.status_signal.emit("Operation stopped by user")
+                    return
+
+                tasks = self.state["tasks"]
+
+                # task_plan_agent first response
+                if len(self.state["messages"]) == 2:
+                    task_list = json.loads(self.state["messages"][-1]["content"])["task_list"]
+                    tasks.extend({"status": "⬜", "task": task} for task in task_list)
+                else:
+                    # Reset all task statuses
+                    for task in tasks:
+                        task["status"] = "⬜"
+
+                    # Update task progress
+                    content_json = json.loads(self.state["messages"][-1]["content"])
+                    task_completed_number = content_json["current_task_id"]
+
+                    # Update status with reasoning
+                    if "reasoning" in content_json:
+                        self.status_signal.emit(content_json["reasoning"])
+
+                    # Update current task (only for ids that address an existing row)
+                    if 0 <= task_completed_number < len(tasks):
+                        self.task_signal.emit(tasks[task_completed_number]["task"])
+
+                    # Mark rows 0..current_task_id as done. The count is clamped to
+                    # the list length so an id at or past the end marks every row
+                    # instead of indexing out of range.
+                    done_count = max(0, min(task_completed_number + 1, len(tasks)))
+                    for task in tasks[:done_count]:
+                        task["status"] = "✅"
+
+                # Reconstruct chat messages from original messages
+                self.state["chatbox_messages"] = []
+
+                for message in self.state["messages"]:
+                    formatted_content, json_reasoning = self.format_message_content(message["content"])
+
+                    # Add json reasoning as a separate message if exists
+                    if json_reasoning:
+                        self.state["chatbox_messages"].append({
+                            "role": message["role"],
+                            "content": json_reasoning
+                        })
+
+                    # Add formatted content
+                    self.state["chatbox_messages"].append({
+                        "role": message["role"],
+                        "content": formatted_content
+                    })
+
+                # Convert data format before returning results
+                tasks_2d = [[task["status"], task["task"]] for task in self.state["tasks"]]
+                self.update_signal.emit(self.state["chatbox_messages"], tasks_2d)
+
+            # All done
+            self.status_signal.emit("Task completed")
+
+        except Exception as e:
+            # Send error signal
+            import traceback
+            error_message = f"Error occurred: {str(e)}\n{traceback.format_exc()}"
+            print(error_message)
+
+            # Add error message to chat
+            # NOTE(review): the patch view shows a raw line break inside this
+            # message (possibly stripped markup); it is written as \n here - confirm.
+            self.state["chatbox_messages"].append({
+                "role": "assistant",
+                "content": f"⚠️ Network connection error: {str(e)}\nPlease check your network connection and API settings, or try again later."
+            })
+            self.update_signal.emit(self.state["chatbox_messages"],
+                                    [[task["status"], task["task"]] for task in self.state["tasks"]])
+            self.error_signal.emit(str(e))
+            self.status_signal.emit(f"Error: {str(e)}")
+
+    def format_message_content(self, content):
+        """Format message content for display.
+
+        Args:
+            content: Either a list of item dicts, each with a "type" key of
+                "image_url" or "text", or a single value handled as a string.
+
+        Returns:
+            A ``(formatted_content, json_reasoning)`` tuple. ``json_reasoning``
+            is None unless a JSON text was found; for list content the last
+            JSON text item wins.
+        """
+        # Handle list-type content (multimodal)
+        if isinstance(content, list):
+            formatted_content = ""
+            json_reasoning = None
+
+            for item in content:
+                if item["type"] == "image_url":
+                    # Changed image style to be smaller
+                    # NOTE(review): the markup inside this literal is not visible in
+                    # this view of the patch (only a line break remains) - confirm
+                    # against the source file.
+                    formatted_content += f'
'
+                elif item["type"] == "text":
+                    if self.is_json_format(item["text"]):
+                        reasoning, details = self.format_json_content(item["text"])
+                        json_reasoning = reasoning
+                        formatted_content += details
+                    else:
+                        formatted_content += item["text"]
+
+            return formatted_content, json_reasoning
+
+        # Handle string content
+        if self.is_json_format(content):
+            # Only the reasoning part is taken from the helper; the body shown is
+            # the whole JSON document re-serialised with indentation.
+            reasoning, _ = self.format_json_content(content)
+            formatted_content = json.dumps(json.loads(content), indent=4, ensure_ascii=False)
+            return formatted_content, reasoning
+
+        return content, None
+
+    def format_json_content(self, json_content):
+        """Format JSON content with reasoning and details.
+
+        Parses ``json_content`` and returns a ``(reasoning, details)`` tuple of
+        strings. The parsed value is subscripted with "reasoning", so it is
+        expected to be a JSON object containing that key.
+        """
+        content_json = json.loads(json_content)
+        # NOTE(review): both literals below appear to have lost their markup in
+        # this view of the patch (they span several lines) - confirm against the
+        # source file before editing them.
+        reasoning = f'
{content_json["reasoning"]}
'
+        details = f'
Detail
{json.dumps(content_json, indent=4, ensure_ascii=False)} '
+        return reasoning, details
+
+    def is_json_format(self, text):
+        """Return True when ``text`` can be parsed by ``json.loads``.
+
+        Args:
+            text: Candidate JSON document.
+
+        Returns:
+            True if parsing succeeds, False if ``text`` is not valid JSON or is
+            not a type ``json.loads`` accepts.
+        """
+        try:
+            json.loads(text)
+        except (TypeError, ValueError):
+            # TypeError: wrong input type; ValueError covers json.JSONDecodeError.
+            # Narrower than a bare except so KeyboardInterrupt/SystemExit propagate.
+            return False
+        return True
\ No newline at end of file
diff --git a/ui/hotkey_edit.py b/ui/hotkey_edit.py
new file mode 100644
index 0000000..4591a3c
--- /dev/null
+++ b/ui/hotkey_edit.py
@@ -0,0 +1,90 @@
+"""
+Hotkey editing widget
+"""
+import keyboard
+from PyQt6.QtWidgets import QWidget, QHBoxLayout, QLineEdit, QPushButton
+
+# Default stop hotkey
+DEFAULT_STOP_HOTKEY = "ctrl+k"
+
+class HotkeyEdit(QWidget):
+ """Widget for recording hotkey combinations"""
+
+    def __init__(self, hotkey="", parent=None):
+        """Create the read-only hotkey field and the Record button.
+
+        Args:
+            hotkey: Initial text shown in the field.
+            parent: Optional parent widget.
+        """
+        super().__init__(parent)
+
+        # Recording state
+        self.recording = False
+        self.keys_pressed = set()
+
+        # Field that displays the hotkey; it is filled by recording, not typing
+        self.hotkey_input = QLineEdit(hotkey)
+        self.hotkey_input.setPlaceholderText("Click to record hotkey")
+        self.hotkey_input.setReadOnly(True)
+
+        # Button that toggles recording
+        self.record_btn = QPushButton("Record")
+        self.record_btn.clicked.connect(self.start_recording)
+
+        # Field takes the stretch, button keeps its natural width
+        row = QHBoxLayout(self)
+        row.setContentsMargins(0, 0, 0, 0)
+        row.addWidget(self.hotkey_input, 1)
+        row.addWidget(self.record_btn)
+
+    def start_recording(self):
+        """Toggle recording: begin a new capture, or finish the one in progress."""
+        if not self.recording:
+            self.keys_pressed = set()
+            self.recording = True
+            self.record_btn.setText("Stop")
+            self.hotkey_input.setText("Press keys...")
+
+            # Hook global events
+            keyboard.hook(self.on_key_event)
+        else:
+            self.stop_recording()
+
+    def stop_recording(self):
+        """Unhook the key listener and show the captured combination."""
+        keyboard.unhook(self.on_key_event)
+        self.recording = False
+        self.record_btn.setText("Record")
+
+        # Joining an empty set yields "", which clears the field when nothing was captured
+        combination = '+'.join(sorted(self.keys_pressed))
+        self.hotkey_input.setText(combination)
+
+    def on_key_event(self, event):
+        """Handle key events during recording.
+
+        Adds the lower-cased name of every key-down event to ``keys_pressed``
+        and mirrors the current combination in the input field. Pressing Escape
+        as the only key discards the capture and stops recording.
+
+        Args:
+            event: Keyboard event object with ``event_type`` and ``name``.
+        """
+        if not self.recording:
+            return
+
+        # Skip key up events
+        if event.event_type != keyboard.KEY_DOWN:
+            return
+
+        # Events for keys without a resolvable name carry no usable name; skip
+        # them instead of failing on .lower()
+        if not event.name:
+            return
+
+        # Modifier and regular keys are recorded the same way
+        self.keys_pressed.add(event.name.lower())
+
+        # Show current keys
+        self.hotkey_input.setText('+'.join(sorted(self.keys_pressed)))
+
+        # Stop recording if user presses Escape alone
+        if len(self.keys_pressed) == 1 and 'esc' in self.keys_pressed:
+            self.keys_pressed.clear()
+            self.stop_recording()
+
+    def get_hotkey(self):
+        """Return the text currently shown in the hotkey field."""
+        current_text = self.hotkey_input.text()
+        return current_text
+
+    def set_hotkey(self, hotkey):
+        """Show ``hotkey`` in the hotkey field."""
+        field = self.hotkey_input
+        field.setText(hotkey)
\ No newline at end of file
diff --git a/ui/main.py b/ui/main.py
new file mode 100644
index 0000000..5c7379c
--- /dev/null
+++ b/ui/main.py
@@ -0,0 +1,25 @@
+"""
+Main entry point for autoMate application
+"""
+import sys
+import argparse
+from PyQt6.QtWidgets import QApplication
+from ui.main_window import MainWindow
+
+def parse_arguments():
+    """Parse the process command line.
+
+    Returns:
+        An argparse namespace with ``windows_host_url`` and
+        ``omniparser_server_url`` string attributes.
+    """
+    option_defaults = (
+        ("--windows_host_url", 'localhost:8006'),
+        ("--omniparser_server_url", "localhost:8000"),
+    )
+    cli = argparse.ArgumentParser(description="PyQt6 App")
+    for flag, default in option_defaults:
+        cli.add_argument(flag, type=str, default=default)
+    return cli.parse_args()
+
+def main():
+    """Start the Qt application, show the main window and exit with Qt's return code."""
+    cli_args = parse_arguments()
+    qt_app = QApplication(sys.argv)
+
+    main_window = MainWindow(cli_args)
+    main_window.show()
+
+    exit_code = qt_app.exec()
+    sys.exit(exit_code)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/ui/main_window.py b/ui/main_window.py
new file mode 100644
index 0000000..020d81e
--- /dev/null
+++ b/ui/main_window.py
@@ -0,0 +1,388 @@
+"""
+Main application window
+"""
+import os
+import keyboard
+from pathlib import Path
+from PyQt6.QtWidgets import (QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
+ QLabel, QLineEdit, QPushButton, QTableWidget, QTableWidgetItem,
+ QTextEdit, QSplitter, QMessageBox, QHeaderView, QDialog, QSystemTrayIcon)
+from PyQt6.QtCore import Qt, pyqtSlot, QSize
+from PyQt6.QtGui import QPixmap, QIcon, QTextCursor, QTextCharFormat, QColor
+
+from xbrain.utils.config import Config
+from auto_control.agent.vision_agent import VisionAgent
+from util.download_weights import OMNI_PARSER_DIR
+
+from ui.theme import apply_theme
+from ui.settings_dialog import SettingsDialog
+from ui.agent_worker import AgentWorker
+from ui.tray_icon import StatusTrayIcon
+from ui.hotkey_edit import DEFAULT_STOP_HOTKEY
+
+# Intro text for application
+INTRO_TEXT = '''
+Based on Omniparser to control desktop!
+'''
+
+class MainWindow(QMainWindow):
+ """Main application window"""
+
+    def __init__(self, args):
+        """Build the window: state, vision agent, tray icon, widgets, theme and stop hotkey.
+
+        Args:
+            args: Parsed command-line arguments; stored on ``self.args``.
+        """
+        super().__init__()
+        self.args = args
+
+        # State must exist before the theme and the hotkey are set up
+        self.state = self.setup_initial_state()
+
+        # Vision agent backed by the icon-detection weights
+        icon_detect_weights = os.path.join(OMNI_PARSER_DIR, "icon_detect", "model.pt")
+        self.vision_agent = VisionAgent(yolo_model_path=icon_detect_weights)
+
+        # Tray icon
+        self.setup_tray_icon()
+
+        # Window chrome
+        self.setWindowTitle("autoMate")
+        self.setMinimumSize(1200, 800)
+
+        # Widgets and styling
+        self.init_ui()
+        self.apply_theme()
+
+        # Global stop hotkey
+        self.hotkey_handler = None
+        self.register_stop_hotkey()
+
+        # Startup banner
+        print("\n\n🚀 PyQt6 application launched")
+
+    def setup_tray_icon(self):
+        """Load imgs/logo.png, use it as window icon and show a tray icon.
+
+        On any failure the error is printed and ``self.tray_icon`` is set to None.
+        """
+        try:
+            # logo.png lives in the imgs directory one level above this file's directory
+            logo_path = Path(__file__).parent.parent / "imgs" / "logo.png"
+
+            # Scale the logo down to a 32x32 box, keeping its aspect ratio
+            scaled_logo = QPixmap(str(logo_path)).scaled(
+                32,
+                32,
+                Qt.AspectRatioMode.KeepAspectRatio,
+                Qt.TransformationMode.SmoothTransformation,
+            )
+            app_icon = QIcon(scaled_logo)
+
+            # Window icon first, then the tray icon built from the same image
+            self.setWindowIcon(app_icon)
+            self.tray_icon = StatusTrayIcon(app_icon, self)
+            self.tray_icon.show()
+        except Exception as e:
+            print(f"Error setting up tray icon: {e}")
+            self.tray_icon = None
+
+    def setup_initial_state(self):
+        """Return the initial application state dict.
+
+        API key, base URL and model come from ``Config`` when set there and
+        otherwise fall back to "", "https://api.openai.com/v1" and "gpt-4o".
+        """
+        config = Config()
+        return {
+            # Values loaded from config, with fallbacks for falsy entries
+            "api_key": config.OPENAI_API_KEY or "",
+            "base_url": config.OPENAI_BASE_URL or "https://api.openai.com/v1",
+            "model": config.OPENAI_MODEL or "gpt-4o",
+            # Default to light theme
+            "theme": "Light",
+            # Default stop hotkey
+            "stop_hotkey": DEFAULT_STOP_HOTKEY,
+            # Conversation and task bookkeeping
+            "messages": [],
+            "chatbox_messages": [],
+            "auth_validated": False,
+            "responses": {},
+            "tools": {},
+            "tasks": [],
+            "only_n_most_recent_images": 2,
+            "stop": False,
+        }
+
+    def register_stop_hotkey(self):
+        """Register the global stop hotkey stored in ``self.state["stop_hotkey"]``.
+
+        A previously registered hotkey is removed first. Only that hotkey is
+        removed, so other keyboard hooks in the process are left alone. An empty
+        hotkey string leaves no hotkey registered. Registration errors are
+        printed and leave ``self.hotkey_handler`` as None.
+        """
+        # First unregister any existing hotkey
+        if self.hotkey_handler is not None:
+            try:
+                keyboard.remove_hotkey(self.hotkey_handler)
+            except (KeyError, ValueError) as e:
+                # The handler may already be gone (e.g. after keyboard.unhook_all())
+                print(f"Error removing previous stop hotkey: {e}")
+            self.hotkey_handler = None
+
+        # Get the current hotkey from state
+        hotkey = self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY)
+
+        # Check if hotkey is valid
+        if not hotkey:
+            return
+
+        try:
+            # Register new hotkey
+            self.hotkey_handler = keyboard.add_hotkey(hotkey, self.handle_stop_hotkey)
+            print(f"Registered stop hotkey: {hotkey}")
+        except Exception as e:
+            print(f"Error registering hotkey '{hotkey}': {e}")
+
+    def handle_stop_hotkey(self):
+        """Set the stop flag and show a short tray notification when a tray icon exists.
+
+        NOTE(review): registered as the keyboard hotkey callback, so this is
+        presumably invoked outside the Qt GUI thread - confirm the tray call is safe.
+        """
+        print("Stop hotkey pressed!")
+        self.state["stop"] = True
+
+        # Show brief notification
+        tray = getattr(self, 'tray_icon', None)
+        if tray is None:
+            return
+        tray.showMessage("autoMate", "Stopping automation...", QSystemTrayIcon.MessageIcon.Information, 1000)
+
+    def apply_theme(self):
+        """Style this window with the theme named in the state ("Light" when unset)."""
+        apply_theme(self, self.state.get("theme", "Light"))
+
+    def init_ui(self):
+        """Initialize UI components.
+
+        Builds, top to bottom: optional header image, title, intro text, the
+        Settings / Clear Chat buttons, the input row (text field, Send, Stop)
+        and a horizontal splitter holding the task table and the chat history.
+        """
+        central_widget = QWidget()
+        main_layout = QVBoxLayout(central_widget)
+
+        # Load top image
+        header_layout = QVBoxLayout()
+        try:
+            script_dir = Path(__file__).parent
+            # Same imgs directory that setup_tray_icon reads logo.png from:
+            # one level above this file's directory.
+            image_path = script_dir.parent / "imgs" / "header_bar_thin.png"
+            if image_path.exists():
+                pixmap = QPixmap(str(image_path))
+                header_label = QLabel()
+                header_label.setPixmap(pixmap.scaledToWidth(self.width()))
+                header_layout.addWidget(header_label)
+        except Exception as e:
+            print(f"Failed to load header image: {e}")
+
+        title_label = QLabel("autoMate")
+        title_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
+        font = title_label.font()
+        font.setPointSize(20)
+        title_label.setFont(font)
+        header_layout.addWidget(title_label)
+
+        # Introduction text
+        intro_label = QLabel(INTRO_TEXT)
+        intro_label.setWordWrap(True)
+        font = intro_label.font()
+        font.setPointSize(12)
+        intro_label.setFont(font)
+
+        # Settings button and clear chat button (at top)
+        top_buttons_layout = QHBoxLayout()
+        self.settings_button = QPushButton("Settings")
+        self.settings_button.clicked.connect(self.open_settings_dialog)
+        self.clear_button = QPushButton("Clear Chat")
+        self.clear_button.clicked.connect(self.clear_chat)
+        top_buttons_layout.addWidget(self.settings_button)
+        top_buttons_layout.addWidget(self.clear_button)
+        top_buttons_layout.addStretch()  # Add elastic space to left-align buttons
+
+        # Input area
+        input_layout = QHBoxLayout()
+        self.chat_input = QLineEdit()
+        self.chat_input.setPlaceholderText("Type a message to send to Omniparser + X ...")
+        # Send message on Enter key
+        self.chat_input.returnPressed.connect(self.process_input)
+        self.submit_button = QPushButton("Send")
+        self.submit_button.clicked.connect(self.process_input)
+        self.stop_button = QPushButton("Stop")
+        self.stop_button.clicked.connect(self.stop_process)
+
+        input_layout.addWidget(self.chat_input, 8)
+        input_layout.addWidget(self.submit_button, 1)
+        input_layout.addWidget(self.stop_button, 1)
+
+        # Main content area
+        content_splitter = QSplitter(Qt.Orientation.Horizontal)
+
+        # Task list
+        task_widget = QWidget()
+        task_layout = QVBoxLayout(task_widget)
+        task_label = QLabel("Task List")
+        self.task_table = QTableWidget(0, 2)
+        self.task_table.setHorizontalHeaderLabels(["Status", "Task"])
+        self.task_table.horizontalHeader().setSectionResizeMode(1, QHeaderView.ResizeMode.Stretch)
+        task_layout.addWidget(task_label)
+        task_layout.addWidget(self.task_table)
+
+        # Chat area
+        chat_widget = QWidget()
+        chat_layout = QVBoxLayout(chat_widget)
+        chat_label = QLabel("Chat History")
+        self.chat_display = QTextEdit()
+        self.chat_display.setReadOnly(True)
+        chat_layout.addWidget(chat_label)
+        chat_layout.addWidget(self.chat_display)
+
+        # Add to splitter (task list gets 20% of the width, chat 80%)
+        content_splitter.addWidget(task_widget)
+        content_splitter.addWidget(chat_widget)
+        content_splitter.setSizes([int(self.width() * 0.2), int(self.width() * 0.8)])
+
+        # Add all components to main layout
+        main_layout.addLayout(header_layout)
+        main_layout.addWidget(intro_label)
+        main_layout.addLayout(top_buttons_layout)  # Add top button area
+        main_layout.addLayout(input_layout)
+        main_layout.addWidget(content_splitter, 1)  # 1 is the stretch factor
+
+        self.setCentralWidget(central_widget)
+
+    def open_settings_dialog(self):
+        """Show the settings dialog and copy accepted values into the state.
+
+        The theme is re-applied and the stop hotkey re-registered only when the
+        respective value changed. Nothing happens when the dialog is cancelled.
+        """
+        dialog = SettingsDialog(self, self.state)
+        if dialog.exec() != QDialog.DialogCode.Accepted:
+            return
+
+        settings = dialog.get_settings()
+
+        # Remember the old hotkey so a change can be detected after the update
+        previous_hotkey = self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY)
+        chosen_hotkey = settings["stop_hotkey"]
+
+        for key in ("model", "base_url", "api_key"):
+            self.state[key] = settings[key]
+        self.state["stop_hotkey"] = chosen_hotkey
+
+        # Update theme if changed
+        chosen_theme = settings["theme"]
+        if chosen_theme != self.state.get("theme", "Light"):
+            self.state["theme"] = chosen_theme
+            self.apply_theme()
+
+        region = settings["screen_region"]
+        if region:
+            self.state["screen_region"] = region
+
+        # Update hotkey if changed
+        if previous_hotkey != chosen_hotkey:
+            self.register_stop_hotkey()
+
+    def process_input(self):
+        """Start an agent run for the text in the input field.
+
+        Blank input is ignored. If a previous worker thread is still running the
+        request is refused with a message box. Replacing ``self.worker`` then
+        would drop the reference to a live thread and let two workers mutate
+        the same state. Otherwise the input is cleared, a reminder about the
+        stop hotkey is shown, the window is minimized and a new ``AgentWorker``
+        is started.
+        """
+        user_input = self.chat_input.text()
+        if not user_input.strip():
+            return
+
+        # Refuse to start a second run while one is in progress
+        previous_worker = getattr(self, "worker", None)
+        if previous_worker is not None and previous_worker.isRunning():
+            QMessageBox.information(self, "Automation Running",
+                                    "An automation task is already running. Stop it or wait for it to finish before sending a new message.")
+            return
+
+        # Clear input box
+        self.chat_input.clear()
+
+        # Show hotkey reminder
+        hotkey = self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY)
+        QMessageBox.information(self, "Automation Starting",
+                                f"Automation will start now. You can press {hotkey} to stop at any time.")
+
+        # Minimize main window
+        self.showMinimized()
+
+        # Create and start worker thread
+        self.worker = AgentWorker(user_input, self.state, self.vision_agent)
+        self.worker.update_signal.connect(self.update_ui)
+        self.worker.error_signal.connect(self.handle_error)
+
+        # Connect signals to tray icon if available
+        if hasattr(self, 'tray_icon') and self.tray_icon is not None:
+            self.worker.status_signal.connect(self.tray_icon.update_status)
+            self.worker.task_signal.connect(self.tray_icon.update_task)
+
+        self.worker.start()
+
+    def handle_error(self, error_message):
+        """Bring the window back to the foreground and show ``error_message`` in a warning box."""
+        # Restore main window to show the error
+        self.showNormal()
+        self.activateWindow()
+
+        # Show error message
+        warning_text = (
+            f"Error connecting to AI service:\n{error_message}\n\n"
+            "Please check your network connection and API settings."
+        )
+        QMessageBox.warning(self, "Connection Error", warning_text)
+
+    @pyqtSlot(list, list)
+    def update_ui(self, chatbox_messages, tasks):
+        """Redraw the chat history and the task table.
+
+        Args:
+            chatbox_messages: List of dicts with "role" and "content" keys.
+            tasks: List of (status, task) pairs, one per table row.
+        """
+        # Update chat display
+        self.chat_display.clear()
+
+        for msg in chatbox_messages:
+            content = msg["content"]
+
+            # Role label
+            # NOTE(review): the original prepared blue/green text formats for the
+            # labels but never applied them; the labels stay in the default colour.
+            self.chat_display.append("You:" if msg["role"] == "user" else "AI:")
+
+            # Put the widget's insertion point at the end of the document so
+            # insertHtml below lands after the label
+            cursor = self.chat_display.textCursor()
+            cursor.movePosition(QTextCursor.MoveOperation.End)
+            self.chat_display.setTextCursor(cursor)
+
+            # Special handling for HTML content
+            if "<" in content and ">" in content:
+                self.chat_display.insertHtml(content)
+            else:
+                self.chat_display.append(content)
+            self.chat_display.append("")  # Add empty line
+
+        # Scroll to bottom
+        scroll_bar = self.chat_display.verticalScrollBar()
+        scroll_bar.setValue(scroll_bar.maximum())
+
+        # Update task table
+        self.task_table.setRowCount(len(tasks))
+        for row, (status, task) in enumerate(tasks):
+            self.task_table.setItem(row, 0, QTableWidgetItem(status))
+            self.task_table.setItem(row, 1, QTableWidgetItem(task))
+
+    def stop_process(self):
+        """Raise the stop flag in the state."""
+        self.state.update(stop=True)
+
+    def clear_chat(self):
+        """Reset the conversation state and empty the chat view and task table."""
+        self.state.update({
+            "messages": [],
+            "chatbox_messages": [],
+            "responses": {},
+            "tools": {},
+            "tasks": [],
+        })
+
+        self.task_table.setRowCount(0)
+        self.chat_display.clear()
+
+    def closeEvent(self, event):
+        """Hide to the tray when a visible tray icon exists; otherwise unhook and close."""
+        tray = getattr(self, 'tray_icon', None)
+        if tray is None or not tray.isVisible():
+            # Clean up on exit
+            keyboard.unhook_all()
+            event.accept()
+            return
+
+        # Keep the app running in the system tray after the window is closed
+        self.hide()
+        event.ignore()
\ No newline at end of file
diff --git a/ui/settings_dialog.py b/ui/settings_dialog.py
new file mode 100644
index 0000000..d572644
--- /dev/null
+++ b/ui/settings_dialog.py
@@ -0,0 +1,125 @@
+"""
+Settings dialog for application configuration
+"""
+from PyQt6.QtWidgets import (QDialog, QVBoxLayout, QHBoxLayout,
+ QLabel, QLineEdit, QPushButton, QComboBox)
+from PyQt6.QtCore import QTimer
+from ui.hotkey_edit import HotkeyEdit, DEFAULT_STOP_HOTKEY
+from ui.theme import THEMES
+
+class SettingsDialog(QDialog):
+ """Dialog for application settings"""
+
+    def __init__(self, parent=None, state=None):
+        """Create the settings dialog.
+
+        Args:
+            parent: Parent window; also kept as ``parent_window``.
+            state: Application state dict the widgets are initialised from.
+        """
+        super().__init__(parent)
+        self.parent_window = parent
+        self.state = state
+
+        self.setMinimumWidth(500)
+        self.setWindowTitle("Settings")
+        self.init_ui()
+
+    def init_ui(self):
+        """Build the dialog rows: model, base URL, API key, theme, hotkey, region, buttons."""
+
+        def labeled_row(caption, widget):
+            # One horizontal row: a caption label followed by its input widget
+            row = QHBoxLayout()
+            row.addWidget(QLabel(caption))
+            row.addWidget(widget)
+            return row
+
+        # Model settings
+        self.model_input = QLineEdit(self.state["model"])
+        model_layout = labeled_row("Model:", self.model_input)
+
+        # Base URL settings
+        self.base_url_input = QLineEdit(self.state["base_url"])
+        url_layout = labeled_row("Base URL:", self.base_url_input)
+
+        # API key settings (masked while typing)
+        self.api_key_input = QLineEdit(self.state["api_key"])
+        self.api_key_input.setEchoMode(QLineEdit.EchoMode.Password)
+        api_layout = labeled_row("API Key:", self.api_key_input)
+
+        # Theme selection
+        self.theme_combo = QComboBox()
+        self.theme_combo.addItems(list(THEMES))
+        self.theme_combo.setCurrentText(self.state.get("theme", "Light"))
+        theme_layout = labeled_row("Theme:", self.theme_combo)
+
+        # Stop hotkey setting
+        self.hotkey_edit = HotkeyEdit(self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY))
+        hotkey_layout = labeled_row("Stop Hotkey:", self.hotkey_edit)
+
+        # Screen region selection
+        if "screen_region" in self.state:
+            region_text = f"Selected region: {self.state['screen_region']}"
+        else:
+            region_text = "No region selected"
+        self.select_region_btn = QPushButton("Select Screen Region")
+        self.region_info = QLabel(region_text)
+        self.select_region_btn.clicked.connect(self.select_screen_region)
+        region_layout = QHBoxLayout()
+        region_layout.addWidget(self.select_region_btn)
+        region_layout.addWidget(self.region_info)
+
+        # OK and Cancel buttons
+        self.ok_button = QPushButton("OK")
+        self.cancel_button = QPushButton("Cancel")
+        self.ok_button.clicked.connect(self.accept)
+        self.cancel_button.clicked.connect(self.reject)
+        button_layout = QHBoxLayout()
+        button_layout.addWidget(self.ok_button)
+        button_layout.addWidget(self.cancel_button)
+
+        # Add all elements to main layout, top to bottom
+        layout = QVBoxLayout(self)
+        for row_layout in (model_layout, url_layout, api_layout, theme_layout,
+                           hotkey_layout, region_layout, button_layout):
+            layout.addLayout(row_layout)
+
+    def select_screen_region(self):
+        """Start region selection, minimizing the parent window first when there is one."""
+        if not self.parent_window:
+            self._do_select_region()
+            return
+
+        # Minimize the parent, then give it 500 ms to get out of the way
+        self.parent_window.showMinimized()
+        QTimer.singleShot(500, self._do_select_region)
+
+    def _do_select_region(self):
+        """Run the screen selector, restore the windows and record the result.
+
+        A truthy selection is stored in ``self.state["screen_region"]`` and shown
+        in the info label; otherwise the label reports the cancellation.
+        """
+        from util.screen_selector import ScreenSelector
+        selector = ScreenSelector()
+        region = selector.get_selection()
+
+        # Restore the dialog and parent window
+        self.activateWindow()
+        parent = self.parent_window
+        if parent:
+            parent.showNormal()
+            parent.activateWindow()
+
+        if not region:
+            self.region_info.setText("Selection cancelled")
+            return
+
+        self.state["screen_region"] = region
+        self.region_info.setText(f"Selected region: {region}")
+
+    def get_settings(self):
+        """Return the dialog's current values as a dict.
+
+        Keys: "model", "base_url", "api_key", "screen_region" (None when no
+        region is stored in the state), "theme" and "stop_hotkey".
+        """
+        settings = {}
+        settings["model"] = self.model_input.text()
+        settings["base_url"] = self.base_url_input.text()
+        settings["api_key"] = self.api_key_input.text()
+        settings["screen_region"] = self.state.get("screen_region")
+        settings["theme"] = self.theme_combo.currentText()
+        settings["stop_hotkey"] = self.hotkey_edit.get_hotkey()
+        return settings
\ No newline at end of file
diff --git a/ui/theme.py b/ui/theme.py
new file mode 100644
index 0000000..fed7c91
--- /dev/null
+++ b/ui/theme.py
@@ -0,0 +1,99 @@
+"""
+Theme definitions and theme handling functionality
+"""
+
+# Theme definitions
+#
+# Maps a theme name to its colour palette. Both palettes use the same keys,
+# each holding a "#RRGGBB" colour string that apply_theme() substitutes into
+# the stylesheet it builds:
+#   main_bg      - background of QMainWindow, QDialog and QWidget
+#   widget_bg    - background of line/text edits, tables, combo boxes, scroll bars
+#   text         - foreground colour of windows, labels and input widgets
+#   accent       - button background while hovered
+#   button_bg    - background of buttons, header sections, scroll-bar handles
+#   button_text  - foreground of buttons and header sections
+#   border       - borders of buttons/inputs/headers and the splitter handle
+#   selection_bg - background of selected table items
+THEMES = {
+ "Light": {
+ "main_bg": "#F5F5F5",
+ "widget_bg": "#FFFFFF",
+ "text": "#333333",
+ "accent": "#4A86E8",
+ "button_bg": "#E3E3E3",
+ "button_text": "#333333",
+ "border": "#CCCCCC",
+ "selection_bg": "#D0E2F4"
+ },
+ "Dark": {
+ "main_bg": "#2D2D2D",
+ "widget_bg": "#3D3D3D",
+ "text": "#FFFFFF",
+ "accent": "#4A86E8",
+ "button_bg": "#555555",
+ "button_text": "#FFFFFF",
+ "border": "#555555",
+ "selection_bg": "#3A5F8A"
+ }
+}
+
+def apply_theme(widget, theme_name="Light"):
+    """Apply the named colour theme to ``widget`` through a Qt stylesheet.
+
+    Args:
+        widget: Object exposing ``setStyleSheet(str)``.
+        theme_name: Key into ``THEMES``. An unknown name falls back to the
+            "Light" palette instead of raising ``KeyError``, so a stale or
+            mistyped saved setting cannot prevent the UI from being styled.
+    """
+    theme = THEMES.get(theme_name, THEMES["Light"])
+
+    # Create stylesheet for the application; doubled braces are literal
+    # braces inside the f-string.
+    stylesheet = f"""
+    QMainWindow, QDialog {{
+        background-color: {theme['main_bg']};
+        color: {theme['text']};
+    }}
+
+    QWidget {{
+        background-color: {theme['main_bg']};
+        color: {theme['text']};
+    }}
+
+    QLabel {{
+        color: {theme['text']};
+    }}
+
+    QPushButton {{
+        background-color: {theme['button_bg']};
+        color: {theme['button_text']};
+        border: 1px solid {theme['border']};
+        border-radius: 4px;
+        padding: 5px 10px;
+    }}
+
+    QPushButton:hover {{
+        background-color: {theme['accent']};
+        color: white;
+    }}
+
+    QLineEdit, QTextEdit, QTableWidget, QComboBox {{
+        background-color: {theme['widget_bg']};
+        color: {theme['text']};
+        border: 1px solid {theme['border']};
+        border-radius: 4px;
+        padding: 4px;
+    }}
+
+    QTextEdit {{
+        background-color: {theme['widget_bg']};
+    }}
+
+    QTableWidget::item:selected {{
+        background-color: {theme['selection_bg']};
+    }}
+
+    QHeaderView::section {{
+        background-color: {theme['button_bg']};
+        color: {theme['button_text']};
+        padding: 4px;
+        border: 1px solid {theme['border']};
+    }}
+
+    QSplitter::handle {{
+        background-color: {theme['border']};
+    }}
+
+    QScrollBar {{
+        background-color: {theme['widget_bg']};
+    }}
+
+    QScrollBar::handle {{
+        background-color: {theme['button_bg']};
+        border-radius: 4px;
+    }}
+    """
+
+    widget.setStyleSheet(stylesheet)
\ No newline at end of file
diff --git a/ui/tray_icon.py b/ui/tray_icon.py
new file mode 100644
index 0000000..dc63ff9
--- /dev/null
+++ b/ui/tray_icon.py
@@ -0,0 +1,60 @@
+"""
+System tray icon implementation
+"""
+from PyQt6.QtWidgets import QSystemTrayIcon, QMenu, QApplication
+from PyQt6.QtGui import QAction
+
+class StatusTrayIcon(QSystemTrayIcon):
+    """System tray icon whose context menu shows the status and current task."""
+
+    def __init__(self, icon, parent=None):
+        """Create the tray icon, its context menu and its signal connections."""
+        super().__init__(icon, parent)
+        # NOTE(review): this instance attribute shadows the inherited
+        # QObject.parent() method; left unchanged because other code may read it.
+        self.parent = parent
+        self.setToolTip("autoMate")
+
+        # Actions: one to bring the window back, two disabled read-only
+        # entries for status/task, and one to quit the application.
+        self.show_action = QAction("Show Main Window")
+        self.show_action.triggered.connect(self.show_main_window)
+
+        self.menu_status = QAction("Status: Idle")
+        self.menu_status.setEnabled(False)
+
+        self.menu_task = QAction("Task: None")
+        self.menu_task.setEnabled(False)
+
+        self.exit_action = QAction("Exit")
+        self.exit_action.triggered.connect(QApplication.quit)
+
+        # Context menu layout: show | status, task | exit
+        self.menu = QMenu()
+        self.menu.addAction(self.show_action)
+        self.menu.addSeparator()
+        for info_action in (self.menu_status, self.menu_task):
+            self.menu.addAction(info_action)
+        self.menu.addSeparator()
+        self.menu.addAction(self.exit_action)
+        self.setContextMenu(self.menu)
+
+        # React to activation of the tray icon itself
+        self.activated.connect(self.icon_activated)
+
+    @staticmethod
+    def _shorten(text, limit=50):
+        """Return ``text``, cut to ``limit`` characters plus "..." if longer."""
+        if len(text) > limit:
+            return text[:limit] + "..."
+        return text
+
+    def show_main_window(self):
+        """Restore and activate the parent window, if one was given."""
+        window = self.parent
+        if not window:
+            return
+        window.showNormal()
+        window.activateWindow()
+
+    def icon_activated(self, reason):
+        """Show the main window when the tray icon is double-clicked."""
+        if reason != QSystemTrayIcon.ActivationReason.DoubleClick:
+            return
+        self.show_main_window()
+
+    def update_status(self, status_text):
+        """Show ``status_text`` in the status menu entry and as a tray message.
+
+        The menu entry receives the text truncated to 50 characters; the tray
+        message receives the full text.
+        """
+        self.menu_status.setText(f"Status: {self._shorten(status_text)}")
+
+        # Request a very brief (500 ms) notification so it does not interfere
+        # with visual automation.
+        self.showMessage("autoMate Status", status_text, QSystemTrayIcon.MessageIcon.Information, 500)
+
+    def update_task(self, task_text):
+        """Show ``task_text``, truncated to 50 characters, in the task menu entry."""
+        self.menu_task.setText(f"Task: {self._shorten(task_text)}")
\ No newline at end of file
diff --git a/util/auto_control.py b/util/auto_control.py
index 8913f66..b2852d9 100644
--- a/util/auto_control.py
+++ b/util/auto_control.py
@@ -4,12 +4,12 @@ import time
# Add the project root directory to Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from gradio_ui.agent.vision_agent import VisionAgent
+from auto_control.agent.vision_agent import VisionAgent
from util.download_weights import MODEL_DIR
from pynput import mouse, keyboard
-# Now you can import from gradio_ui
-from gradio_ui.tools.screen_capture import get_screenshot
+# Now you can import from auto_control
+from auto_control.tools.screen_capture import get_screenshot
class AutoControl:
def __init__(self):
@@ -81,8 +81,7 @@ class AutoControl:
if key == keyboard.Key.esc:
print("self.auto_list", self.auto_list)
- vision_agent = VisionAgent(yolo_model_path=os.path.join(MODEL_DIR, "icon_detect", "model.pt"),
- caption_model_path=os.path.join(MODEL_DIR, "icon_caption"))
+ vision_agent = VisionAgent(yolo_model_path=os.path.join(MODEL_DIR, "icon_detect", "model.pt"))
for item in self.auto_list:
element_list =vision_agent(str(item["path"]))
diff --git a/util/auto_util.py b/util/auto_util.py
new file mode 100644
index 0000000..ba98092
--- /dev/null
+++ b/util/auto_util.py
@@ -0,0 +1,34 @@
+import os
+import platform
+import pyautogui
+from enum import Enum
+
+import pyperclip
+class AppName(Enum):
+ """Applications that AutoUtil can be created for."""
+ # The value is used by AutoUtil as the sub-directory name under imgs/.
+ WECHAT = "wechat"
+
+
+class AutoUtil:
+    """Image-driven GUI helpers (locate-and-click, paste text) for one app."""
+
+    def __init__(self, app_name: AppName):
+        """Use ``<this file's dir>/../imgs/<app_name.value>`` as image directory."""
+        self.img_dir = os.path.join(os.path.dirname(__file__), "..", "imgs", app_name.value)
+
+    def click_multi_img(self, img_names, offset_x=0, offset_y=0, minSearchTime=0):
+        """Locate and click every image in ``img_names``, in order.
+
+        The offsets and ``minSearchTime`` are passed unchanged to
+        ``find_click_img`` for each name.
+        """
+        for img_name in img_names:
+            self.find_click_img(img_name, offset_x, offset_y, minSearchTime)
+
+    def find_click_img(self, img_name, offset_x=0, offset_y=0, minSearchTime=0):
+        """Locate ``<img_name>.png`` on screen and click relative to its centre.
+
+        Args:
+            img_name: Image file name WITHOUT the ``.png`` extension.
+            offset_x: Horizontal offset added to the centre before clicking.
+            offset_y: Vertical offset added to the centre before clicking.
+            minSearchTime: Forwarded to ``pyautogui.locateOnScreen``.
+
+        Raises:
+            pyautogui.ImageNotFoundException: The image was not found.
+        """
+        img_path = os.path.join(self.img_dir, img_name + ".png")
+        box = pyautogui.locateOnScreen(img_path, minSearchTime=minSearchTime)
+        if box is None:
+            # Some PyAutoGUI/PyScreeze versions return None instead of raising;
+            # normalise to the exception callers already catch.
+            raise pyautogui.ImageNotFoundException(f"Could not locate {img_path} on screen")
+        x, y = pyautogui.center(box)
+        # Add offset to click position
+        pyautogui.click(x + offset_x, y + offset_y)
+
+    def send_text(self, text):
+        """Paste ``text`` via the clipboard, then restore the clipboard.
+
+        Uses Command+V on macOS and Ctrl+V elsewhere. The previous clipboard
+        content is restored even if sending the hotkey raises.
+        """
+        clipboard_data = pyperclip.paste()
+        pyperclip.copy(text)
+        try:
+            if platform.system() == 'Darwin':
+                pyautogui.hotkey('command', 'v', interval=0.1)
+            else:
+                pyautogui.hotkey('ctrl', 'v')
+        finally:
+            # Copy old data back to clipboard
+            pyperclip.copy(clipboard_data)
diff --git a/util/opencv._detect.py b/util/opencv._detect.py
deleted file mode 100644
index d637388..0000000
--- a/util/opencv._detect.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import cv2
-import sys
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from gradio_ui.tools.screen_capture import get_screenshot
-
-def detect_and_draw_edges():
- # Read the image
- screenshot, path = get_screenshot(is_cursor=False)
- img = cv2.imread(path)
- if img is None:
- print("Error: Could not read the image.")
- return
-
- # Create a copy for drawing contours later
- original = img.copy()
-
- # Convert to grayscale
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # Apply Gaussian blur to reduce noise
- blurred = cv2.GaussianBlur(gray, (5, 5), 0)
-
- # Detect edges using Canny algorithm
- edges = cv2.Canny(blurred, 50, 150)
-
- # Find contours from the edges
- contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-
- # Draw all detected contours
- cv2.drawContours(original, contours, -1, (0, 255, 0), 2)
-
- print(f"Found {len(contours)} contours in the image")
-
- # Display results
- # cv2.imshow("Original Image", img)
- cv2.imshow("Edges", edges)
- # cv2.imshow("Contours", original)
- cv2.waitKey(0)
- cv2.destroyAllWindows()
-
- return original, contours
-
-# Example usage
-if __name__ == "__main__":
- result_image, detected_contours = detect_and_draw_edges()
diff --git a/util/wechat_auto.py b/util/wechat_auto.py
new file mode 100644
index 0000000..4463773
--- /dev/null
+++ b/util/wechat_auto.py
@@ -0,0 +1,30 @@
+import os
+import sys
+import time
+
+import pyautogui
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from util.auto_util import AppName, AutoUtil
+class WechatAuto:
+    """WeChat automation built on AutoUtil's locate-and-click helpers."""
+
+    def __init__(self):
+        self.auto_util = AutoUtil(AppName.WECHAT)
+
+    def go_to_chat(self):
+        """Locate and click the ``chat_unselect`` image."""
+        # Pass the bare name: find_click_img() appends ".png" itself, so
+        # "chat_unselect.png" would look for "chat_unselect.png.png".
+        self.auto_util.find_click_img("chat_unselect")
+
+    def search_friend(self, friend_name):
+        """Type ``friend_name`` into the search box and click through the result.
+
+        Args:
+            friend_name: Text pasted into the search field.
+        """
+        # Click the chat image in whichever of its two states is on screen.
+        try:
+            self.auto_util.find_click_img("chat_unselect")
+        except pyautogui.ImageNotFoundException:
+            self.auto_util.find_click_img("chat_select")
+        # Click 100 px to the right of the search image, then paste the name.
+        self.auto_util.find_click_img("search", offset_x=100)
+        self.auto_util.send_text(friend_name)
+        # Click at fixed offsets from the "contact_person" and "search" images,
+        # searching up to 10 s for each.
+        # NOTE(review): presumably opens the first hit and then dismisses the
+        # search panel; confirm against the reference images.
+        self.auto_util.find_click_img("contact_person", offset_x=100, offset_y=100, minSearchTime=10)
+        self.auto_util.find_click_img("search", offset_x=-100, offset_y=-100, minSearchTime=10)
+
+if __name__ == "__main__":
+ # Manual run: wait 3 s before starting
+ # (presumably so the user can bring WeChat to the front; confirm).
+ time.sleep(3)
+ wechat_auto = WechatAuto()
+ wechat_auto.search_friend("李杨林")
+