Replace Gradio UI with PyQt

2025-12-25 21:06:47 +08:00 · 2025-03-24 17:31:53 +08:00 · 2025-03-24 17:31:53 +08:00 · 6cc993e537
commit 6cc993e537
parent 1733facad8
34 changed files with 1053 additions and 74 deletions
--- a/auto_control/.DS_Store
+++ b/auto_control/.DS_Store
--- a/auto_control/init.py
+++ b/auto_control/init.py
--- a/auto_control/agent/base_agent.py
+++ b/auto_control/agent/base_agent.py
--- a/auto_control/agent/task_plan_agent.py
+++ b/auto_control/agent/task_plan_agent.py
@ -1,9 +1,9 @@
 import json
 from pydantic import BaseModel, Field
-from gradio_ui.agent.base_agent import BaseAgent
+from auto_control.agent.base_agent import BaseAgent
 from xbrain.core.chat import run

-from gradio_ui.tools.computer import Action
+from auto_control.tools.computer import Action

 class TaskPlanAgent(BaseAgent):
    def __call__(self, messages, parsed_screen_result):
--- a/auto_control/agent/task_run_agent.py
+++ b/auto_control/agent/task_run_agent.py
@ -2,10 +2,10 @@ import json
 import uuid
 from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, BetaMessageParam, BetaUsage
 from pydantic import Field, create_model
-from gradio_ui.agent.base_agent import BaseAgent
+from auto_control.agent.base_agent import BaseAgent
 from xbrain.core.chat import run

-from gradio_ui.tools.computer import Action
+from auto_control.tools.computer import Action
 class TaskRunAgent(BaseAgent):
    def __init__(self):
        self.OUTPUT_DIR = "./tmp/outputs"
--- a/auto_control/agent/vision_agent.py
+++ b/auto_control/agent/vision_agent.py
--- a/auto_control/app.py
+++ b/auto_control/app.py
@ -7,8 +7,8 @@ import os
 from pathlib import Path
 import argparse
 import gradio as gr
-from gradio_ui.agent.vision_agent import VisionAgent
-from gradio_ui.loop import (
+from auto_control.agent.vision_agent import VisionAgent
+from auto_control.loop import (
    sampling_loop_sync,
 )
 import base64
@ -349,4 +349,4 @@ def run():
            while True:
                time.sleep(1) 
        except KeyboardInterrupt:
-            print("\n💤 closing server")
+            print("\n<EFBFBD><EFBFBD> closing server")
--- a/auto_control/executor/anthropic_executor.py
+++ b/auto_control/executor/anthropic_executor.py
@ -3,7 +3,7 @@ from typing import Any, cast
 from anthropic.types.beta import (
    BetaContentBlock
 )
-from gradio_ui.tools import ComputerTool, ToolCollection
+from auto_control.tools import ComputerTool, ToolCollection


 class AnthropicExecutor:
--- a/auto_control/loop.py
+++ b/auto_control/loop.py
@ -4,12 +4,12 @@ Agentic sampling loop that calls the Anthropic API and local implenmentation of
 import base64
 from io import BytesIO
 import cv2
-from gradio_ui.agent.vision_agent import VisionAgent
-from gradio_ui.tools.screen_capture import get_screenshot
+from auto_control.agent.vision_agent import VisionAgent
+from auto_control.tools.screen_capture import get_screenshot
 from anthropic.types.beta import (BetaMessageParam)
-from gradio_ui.agent.task_plan_agent import TaskPlanAgent
-from gradio_ui.agent.task_run_agent import TaskRunAgent
-from gradio_ui.executor.anthropic_executor import AnthropicExecutor
+from auto_control.agent.task_plan_agent import TaskPlanAgent
+from auto_control.agent.task_run_agent import TaskRunAgent
+from auto_control.executor.anthropic_executor import AnthropicExecutor
 import numpy as np
 from PIL import Image

--- a/auto_control/tools/init.py
+++ b/auto_control/tools/init.py
--- a/auto_control/tools/base.py
+++ b/auto_control/tools/base.py
--- a/auto_control/tools/collection.py
+++ b/auto_control/tools/collection.py
--- a/auto_control/tools/computer.py
+++ b/auto_control/tools/computer.py
--- a/auto_control/tools/screen_capture.py
+++ b/auto_control/tools/screen_capture.py
--- a/gradio_ui/.gitignore
+++ b/gradio_ui/.gitignore
@ -1 +0,0 @@
-tmp/
--- a/imgs/wechat/chat_select.png
+++ b/imgs/wechat/chat_select.png
--- a/imgs/wechat/chat_unselect.png
+++ b/imgs/wechat/chat_unselect.png
--- a/imgs/wechat/contact_person.png
+++ b/imgs/wechat/contact_person.png
--- a/imgs/wechat/search.png
+++ b/imgs/wechat/search.png
--- a/imgs/wechat/send_message.png
+++ b/imgs/wechat/send_message.png
--- a/main.py
+++ b/main.py
@ -1,11 +1,9 @@
-from gradio_ui import app
-import os
+from ui.main import main
 from util import download_weights
-os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 def run():
    download_weights.download() 
-    app.run()
+    main()
+    
+if __name__ == "__main__":
+    run()

-
-if __name__ == '__main__':
-    run()
--- a/requirements.txt
+++ b/requirements.txt
@ -13,4 +13,6 @@ timm
 einops==0.8.0
 modelscope
 pynput
-lap
+lap
+pyqt6==6.8.1
+keyboard==0.13.5
--- a/ui/init.py
+++ b/ui/init.py
@ -0,0 +1,3 @@
+"""
+autoMate UI package
+""" 
--- a/ui/agent_worker.py
+++ b/ui/agent_worker.py
@ -0,0 +1,174 @@
+"""
+Worker thread for handling agent operations
+"""
+import json
+from PyQt6.QtCore import QThread, pyqtSignal
+
+from auto_control.loop import sampling_loop_sync
+from xbrain.utils.config import Config
+
+class AgentWorker(QThread):
+    """Worker thread for running agent operations asynchronously"""
+    
+    update_signal = pyqtSignal(list, list)
+    status_signal = pyqtSignal(str)  # Signal for status updates
+    task_signal = pyqtSignal(str)    # Signal for current task
+    error_signal = pyqtSignal(str)   # Error signal
+    
+    def __init__(self, user_input, state, vision_agent):
+        """Store what the worker needs for one run.
+
+        Args:
+            user_input: Text stored as the user message when run() starts.
+            state: Mutable dict that run() reads settings from and writes
+                messages and tasks into.
+            vision_agent: Object forwarded to sampling_loop_sync.
+        """
+        super().__init__()
+        self.vision_agent = vision_agent
+        self.state = state
+        self.user_input = user_input
+        
+    def run(self):
+        """Run one agent session for ``self.user_input`` on this thread.
+
+        Configures the API client from ``self.state``, records the user
+        message, then iterates ``sampling_loop_sync``. After every step the
+        task statuses and the chat transcript are rebuilt from
+        ``state["messages"]`` and emitted through ``update_signal``. Any
+        exception is printed, appended to the chat as an error entry and
+        reported through ``error_signal`` and ``status_signal``.
+        """
+        # Reset stop flag
+        if self.state["stop"]:
+            self.state["stop"] = False
+
+        # Configure API
+        config = Config()
+        config.set_openai_config(
+            base_url=self.state["base_url"],
+            api_key=self.state["api_key"],
+            model=self.state["model"]
+        )
+
+        # Add user message
+        self.state["messages"].append({"role": "user", "content": self.user_input})
+        self.state["chatbox_messages"].append({"role": "user", "content": self.user_input})
+
+        # Send initial update
+        self.update_signal.emit(self.state["chatbox_messages"], [])
+        self.status_signal.emit("Starting analysis...")
+
+        try:
+            # Process with agent
+            for _ in sampling_loop_sync(
+                model=self.state["model"],
+                messages=self.state["messages"],
+                vision_agent=self.vision_agent,
+                screen_region=self.state.get("screen_region", None)
+            ):
+                if self.state["stop"]:
+                    self.state["chatbox_messages"].append({"role": "user", "content": "Stop!"})
+                    self.status_signal.emit("Operation stopped by user")
+                    return
+
+                # task_plan_agent first response
+                if len(self.state["messages"]) == 2:
+                    task_list = json.loads(self.state["messages"][-1]["content"])["task_list"]
+                    for task in task_list:
+                        self.state["tasks"].append({
+                            "status": "⬜",
+                            "task": task
+                        })
+                else:
+                    tasks = self.state["tasks"]
+
+                    # Reset all task statuses
+                    for task in tasks:
+                        task["status"] = "⬜"
+
+                    # Update task progress
+                    content_json = json.loads(self.state["messages"][-1]["content"])
+                    task_completed_number = content_json["current_task_id"]
+
+                    # Update status with reasoning
+                    if "reasoning" in content_json:
+                        self.status_signal.emit(content_json["reasoning"])
+
+                    # Update current task (only for an id that names a real task)
+                    if 0 <= task_completed_number < len(tasks):
+                        self.task_signal.emit(tasks[task_completed_number]["task"])
+
+                    # Mark every task up to and including the reported id as done.
+                    # The count is clamped to the list bounds so an id equal to or
+                    # just past len(tasks) marks everything instead of raising
+                    # IndexError, and a negative id marks nothing.
+                    done_count = max(0, min(task_completed_number + 1, len(tasks)))
+                    for task in tasks[:done_count]:
+                        task["status"] = "✅"
+
+                # Reconstruct chat messages from original messages
+                self.state["chatbox_messages"] = []
+
+                for message in self.state["messages"]:
+                    formatted_content, json_reasoning = self.format_message_content(message["content"])
+
+                    # Add json reasoning as a separate message if exists
+                    if json_reasoning:
+                        self.state["chatbox_messages"].append({
+                            "role": message["role"],
+                            "content": json_reasoning
+                        })
+
+                    # Add formatted content
+                    self.state["chatbox_messages"].append({
+                        "role": message["role"],
+                        "content": formatted_content
+                    })
+
+                # Convert data format before returning results
+                tasks_2d = [[task["status"], task["task"]] for task in self.state["tasks"]]
+                self.update_signal.emit(self.state["chatbox_messages"], tasks_2d)
+
+            # All done
+            self.status_signal.emit("Task completed")
+
+        except Exception as e:
+            # Send error signal
+            import traceback
+            error_message = f"Error occurred: {str(e)}\n{traceback.format_exc()}"
+            print(error_message)
+
+            # Add error message to chat
+            self.state["chatbox_messages"].append({
+                "role": "assistant",
+                "content": f"<span style='color:red'>⚠️ Network connection error: {str(e)}</span><br>Please check your network connection and API settings, or try again later."
+            })
+            self.update_signal.emit(self.state["chatbox_messages"],
+                                   [[task["status"], task["task"]] for task in self.state["tasks"]])
+            self.error_signal.emit(str(e))
+            self.status_signal.emit(f"Error: {str(e)}")
+            
+    def format_message_content(self, content):
+        """Turn one message's content into display text.
+
+        Args:
+            content: Either a string or a list of items keyed by "type"
+                ("image_url" or "text").
+
+        Returns:
+            A ``(formatted_content, json_reasoning)`` pair. ``json_reasoning``
+            is the reasoning value from format_json_content when JSON text was
+            found, otherwise None.
+        """
+        if not isinstance(content, list):
+            # Plain string: pretty-print it when it parses as JSON
+            if not self.is_json_format(content):
+                return content, None
+            reasoning, _ = self.format_json_content(content)
+            pretty = json.dumps(json.loads(content), indent=4, ensure_ascii=False)
+            return pretty, reasoning
+
+        # List content (multimodal): collect the pieces, join once at the end
+        pieces = []
+        json_reasoning = None
+        for item in content:
+            kind = item["type"]
+            if kind == "image_url":
+                # Images are rendered at reduced size
+                pieces.append(f'<br/><img style="width: 50%; max-width: 400px;" src="{item["image_url"]["url"]}">')
+            elif kind == "text":
+                text = item["text"]
+                if self.is_json_format(text):
+                    json_reasoning, details = self.format_json_content(text)
+                    pieces.append(details)
+                else:
+                    pieces.append(text)
+
+        return "".join(pieces), json_reasoning
+    
+    def format_json_content(self, json_content):
+        """Format JSON text as a reasoning heading plus a collapsible dump.
+
+        Args:
+            json_content: A string containing valid JSON.
+
+        Returns:
+            A ``(reasoning, details)`` pair. ``reasoning`` is an ``<h3>``
+            heading built from the "reasoning" key, or None when the JSON is
+            not an object or has no such key. ``details`` is a ``<details>``
+            element holding the pretty-printed JSON.
+
+        Raises:
+            ValueError: If ``json_content`` is not valid JSON.
+        """
+        content_json = json.loads(json_content)
+        # Valid JSON is not always an object with "reasoning": the plan
+        # response carries "task_list", and a user message such as "123"
+        # parses to a bare number. Report "no reasoning" instead of raising.
+        reasoning = None
+        if isinstance(content_json, dict) and "reasoning" in content_json:
+            reasoning = f'<h3>{content_json["reasoning"]}</h3>'
+        details = f'<br/> <details> <summary>Detail</summary> <pre>{json.dumps(content_json, indent=4, ensure_ascii=False)}</pre> </details>'
+        return reasoning, details
+    
+    def is_json_format(self, text):
+        """Return True when ``text`` can be parsed by ``json.loads``."""
+        try:
+            json.loads(text)
+        except (TypeError, ValueError):
+            # TypeError: not a str/bytes value; ValueError covers JSONDecodeError
+            return False
+        return True
--- a/ui/hotkey_edit.py
+++ b/ui/hotkey_edit.py
@ -0,0 +1,90 @@
+"""
+Hotkey editing widget
+"""
+import keyboard
+from PyQt6.QtWidgets import QWidget, QHBoxLayout, QLineEdit, QPushButton
+
+# Default stop hotkey
+DEFAULT_STOP_HOTKEY = "ctrl+k"
+
+class HotkeyEdit(QWidget):
+    """Widget for recording hotkey combinations"""
+    
+    def __init__(self, hotkey="", parent=None):
+        """Build a read-only hotkey field next to a Record button.
+
+        Args:
+            hotkey: Initial text shown in the field.
+            parent: Optional parent widget.
+        """
+        super().__init__(parent)
+
+        # Recording state
+        self.recording = False
+        self.keys_pressed = set()
+
+        self.hotkey_input = QLineEdit(hotkey)
+        self.hotkey_input.setReadOnly(True)
+        self.hotkey_input.setPlaceholderText("Click to record hotkey")
+
+        self.record_btn = QPushButton("Record")
+        self.record_btn.clicked.connect(self.start_recording)
+
+        # The field takes the stretch; the button keeps its natural width
+        row = QHBoxLayout(self)
+        row.setContentsMargins(0, 0, 0, 0)
+        row.addWidget(self.hotkey_input, 1)
+        row.addWidget(self.record_btn)
+        
+    def start_recording(self):
+        """Toggle recording: stop if already recording, otherwise begin."""
+        if self.recording:
+            self.stop_recording()
+        else:
+            self.hotkey_input.setText("Press keys...")
+            self.record_btn.setText("Stop")
+            self.recording = True
+            self.keys_pressed = set()
+
+            # Listen to global keyboard events until stop_recording() unhooks
+            keyboard.hook(self.on_key_event)
+        
+    def stop_recording(self):
+        """Unhook the keyboard listener and show the recorded combination."""
+        keyboard.unhook(self.on_key_event)
+        self.recording = False
+        self.record_btn.setText("Record")
+
+        # Joining an empty set yields "", which also clears the field
+        recorded = '+'.join(sorted(self.keys_pressed))
+        self.hotkey_input.setText(recorded)
+    
+    def on_key_event(self, event):
+        """Collect pressed keys while recording and mirror them in the field.
+
+        Args:
+            event: Keyboard event passed by ``keyboard.hook``; only
+                ``event_type`` and ``name`` are read.
+
+        Pressing Escape as the only key cancels the recording and leaves the
+        field empty.
+        """
+        # NOTE(review): keyboard hooks presumably run on the library's listener
+        # thread, not the Qt GUI thread, so these widget calls may need to be
+        # marshalled through a signal. Confirm.
+        if not self.recording:
+            return
+
+        # Only key presses are recorded; releases are ignored
+        if event.event_type != keyboard.KEY_DOWN:
+            return
+
+        # Defensive: skip events that carry no usable key name
+        if not event.name:
+            return
+
+        # Modifiers and ordinary keys are stored the same way
+        self.keys_pressed.add(event.name.lower())
+
+        # Show current keys
+        self.hotkey_input.setText('+'.join(sorted(self.keys_pressed)))
+
+        # Stop recording if user presses Escape alone
+        if self.keys_pressed == {'esc'}:
+            self.keys_pressed.clear()
+            self.stop_recording()
+    
+    def get_hotkey(self):
+        """Return the text currently shown in the hotkey field."""
+        shown_text = self.hotkey_input.text()
+        return shown_text
+        
+    def set_hotkey(self, hotkey):
+        """Show ``hotkey`` in the hotkey field."""
+        field = self.hotkey_input
+        field.setText(hotkey)
--- a/ui/main.py
+++ b/ui/main.py
@ -0,0 +1,25 @@
+"""
+Main entry point for autoMate application
+"""
+import sys
+import argparse
+from PyQt6.QtWidgets import QApplication
+from ui.main_window import MainWindow
+
+def parse_arguments():
+    """Build the command line parser and return the parsed namespace."""
+    url_options = (
+        ("--windows_host_url", 'localhost:8006'),
+        ("--omniparser_server_url", "localhost:8000"),
+    )
+    parser = argparse.ArgumentParser(description="PyQt6 App")
+    for flag, default_url in url_options:
+        parser.add_argument(flag, type=str, default=default_url)
+    return parser.parse_args()
+
+def main():
+    """Parse arguments, show the main window and run the Qt event loop.
+
+    The process exits with the event loop's return code.
+    """
+    cli_args = parse_arguments()
+    qt_app = QApplication(sys.argv)
+    main_window = MainWindow(cli_args)
+    main_window.show()
+    exit_code = qt_app.exec()
+    sys.exit(exit_code)
+
+if __name__ == "__main__":
+    main() 
--- a/ui/main_window.py
+++ b/ui/main_window.py
@ -0,0 +1,388 @@
+"""
+Main application window
+"""
+import os
+import keyboard
+from pathlib import Path
+from PyQt6.QtWidgets import (QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, 
+                           QLabel, QLineEdit, QPushButton, QTableWidget, QTableWidgetItem,
+                           QTextEdit, QSplitter, QMessageBox, QHeaderView, QDialog, QSystemTrayIcon)
+from PyQt6.QtCore import Qt, pyqtSlot, QSize
+from PyQt6.QtGui import QPixmap, QIcon, QTextCursor, QTextCharFormat, QColor
+
+from xbrain.utils.config import Config
+from auto_control.agent.vision_agent import VisionAgent
+from util.download_weights import OMNI_PARSER_DIR
+
+from ui.theme import apply_theme
+from ui.settings_dialog import SettingsDialog
+from ui.agent_worker import AgentWorker
+from ui.tray_icon import StatusTrayIcon
+from ui.hotkey_edit import DEFAULT_STOP_HOTKEY
+
+# Intro text for application
+INTRO_TEXT = '''
+Based on Omniparser to control desktop!
+'''
+
+class MainWindow(QMainWindow):
+    """Main application window"""
+    
+    def __init__(self, args):
+        """Create the window, its agent, tray icon, widgets and stop hotkey.
+
+        Args:
+            args: Parsed command line arguments, kept on ``self.args``.
+        """
+        super().__init__()
+        self.args = args
+
+        # State comes first: apply_theme() and register_stop_hotkey() read it
+        self.state = self.setup_initial_state()
+
+        # Vision agent backed by the icon-detection weights
+        icon_detect_weights = os.path.join(OMNI_PARSER_DIR, "icon_detect", "model.pt")
+        self.vision_agent = VisionAgent(yolo_model_path=icon_detect_weights)
+
+        self.setup_tray_icon()
+
+        self.setWindowTitle("autoMate")
+        self.setMinimumSize(1200, 800)
+
+        self.init_ui()
+        self.apply_theme()
+
+        # No hotkey registered yet; register_stop_hotkey() stores the handle
+        self.hotkey_handler = None
+        self.register_stop_hotkey()
+
+        # Startup banner
+        print("\n\n🚀 PyQt6 application launched")
+    
+    def setup_tray_icon(self):
+        """Load imgs/logo.png as the window icon and show a tray icon.
+
+        On any exception the error is printed and ``self.tray_icon`` is set
+        to None.
+        """
+        try:
+            # imgs/ sits next to the package directory containing this file
+            logo_path = Path(__file__).parent.parent / "imgs" / "logo.png"
+
+            # Scale the logo into a 32x32 box, keeping its aspect ratio
+            scaled_logo = QPixmap(str(logo_path)).scaled(
+                32, 32,
+                Qt.AspectRatioMode.KeepAspectRatio,
+                Qt.TransformationMode.SmoothTransformation,
+            )
+            app_icon = QIcon(scaled_logo)
+            self.setWindowIcon(app_icon)
+
+            # The tray icon reuses the same icon
+            self.tray_icon = StatusTrayIcon(app_icon, self)
+            self.tray_icon.show()
+        except Exception as e:
+            print(f"Error setting up tray icon: {e}")
+            self.tray_icon = None
+    
+    def setup_initial_state(self):
+        """Return the initial state dict.
+
+        API key, base URL and model come from ``Config`` and fall back to
+        built-in defaults when the configured value is empty.
+        """
+        config = Config()
+        return {
+            # Connection settings, with fallbacks for empty config values
+            "api_key": config.OPENAI_API_KEY or "",
+            "base_url": config.OPENAI_BASE_URL or "https://api.openai.com/v1",
+            "model": config.OPENAI_MODEL or "gpt-4o",
+            # UI preferences
+            "theme": "Light",
+            "stop_hotkey": DEFAULT_STOP_HOTKEY,
+            # Conversation and task bookkeeping
+            "messages": [],
+            "chatbox_messages": [],
+            "auth_validated": False,
+            "responses": {},
+            "tools": {},
+            "tasks": [],
+            "only_n_most_recent_images": 2,
+            "stop": False,
+        }
+    
+    def register_stop_hotkey(self):
+        """Register ``state["stop_hotkey"]`` as the global stop hotkey.
+
+        A previously registered hotkey is removed first. With an empty hotkey
+        nothing is registered. Registration errors are printed and leave
+        ``self.hotkey_handler`` as None.
+        """
+        # Remove only our own hotkey. unhook_all() would also remove hooks
+        # owned by other widgets, such as a HotkeyEdit that is recording.
+        if self.hotkey_handler:
+            try:
+                keyboard.remove_hotkey(self.hotkey_handler)
+            except (KeyError, ValueError) as e:
+                print(f"Error removing previous stop hotkey: {e}")
+            self.hotkey_handler = None
+
+        # Get the current hotkey from state
+        hotkey = self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY)
+
+        # Check if hotkey is valid
+        if not hotkey:
+            return
+
+        try:
+            # Register new hotkey
+            self.hotkey_handler = keyboard.add_hotkey(hotkey, self.handle_stop_hotkey)
+            print(f"Registered stop hotkey: {hotkey}")
+        except Exception as e:
+            print(f"Error registering hotkey '{hotkey}': {e}")
+    
+    def handle_stop_hotkey(self):
+        """Set the stop flag and, when a tray icon exists, show a short notice."""
+        print("Stop hotkey pressed!")
+        self.state["stop"] = True
+
+        tray = getattr(self, 'tray_icon', None)
+        if tray is None:
+            return
+        # Brief notification (1000 ms)
+        tray.showMessage("autoMate", "Stopping automation...", QSystemTrayIcon.MessageIcon.Information, 1000)
+    
+    def apply_theme(self):
+        """Apply the theme named in ``state["theme"]`` (default "Light")."""
+        # The name below resolves to the module-level apply_theme from ui.theme
+        apply_theme(self, self.state.get("theme", "Light"))
+    
+    def init_ui(self):
+        """Build the central widget.
+
+        Top to bottom: optional header image and title, intro text, the
+        Settings / Clear Chat buttons, the input row (field, Send, Stop), and
+        a horizontal splitter with the task table and the chat history.
+        """
+        central_widget = QWidget()
+        main_layout = QVBoxLayout(central_widget)
+
+        # Load top image
+        header_layout = QVBoxLayout()
+        try:
+            script_dir = Path(__file__).parent
+            # imgs/ sits next to this package, the same location
+            # setup_tray_icon() loads logo.png from
+            image_path = script_dir.parent / "imgs" / "header_bar_thin.png"
+            if image_path.exists():
+                pixmap = QPixmap(str(image_path))
+                header_label = QLabel()
+                header_label.setPixmap(pixmap.scaledToWidth(self.width()))
+                header_layout.addWidget(header_label)
+        except Exception as e:
+            print(f"Failed to load header image: {e}")
+
+        title_label = QLabel("autoMate")
+        title_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
+        font = title_label.font()
+        font.setPointSize(20)
+        title_label.setFont(font)
+        header_layout.addWidget(title_label)
+
+        # Introduction text
+        intro_label = QLabel(INTRO_TEXT)
+        intro_label.setWordWrap(True)
+        font = intro_label.font()
+        font.setPointSize(12)
+        intro_label.setFont(font)
+
+        # Settings button and clear chat button (at top)
+        top_buttons_layout = QHBoxLayout()
+        self.settings_button = QPushButton("Settings")
+        self.settings_button.clicked.connect(self.open_settings_dialog)
+        self.clear_button = QPushButton("Clear Chat")
+        self.clear_button.clicked.connect(self.clear_chat)
+        top_buttons_layout.addWidget(self.settings_button)
+        top_buttons_layout.addWidget(self.clear_button)
+        top_buttons_layout.addStretch()  # Add elastic space to left-align buttons
+
+        # Input area
+        input_layout = QHBoxLayout()
+        self.chat_input = QLineEdit()
+        self.chat_input.setPlaceholderText("Type a message to send to Omniparser + X ...")
+        # Send message on Enter key
+        self.chat_input.returnPressed.connect(self.process_input)
+        self.submit_button = QPushButton("Send")
+        self.submit_button.clicked.connect(self.process_input)
+        self.stop_button = QPushButton("Stop")
+        self.stop_button.clicked.connect(self.stop_process)
+
+        input_layout.addWidget(self.chat_input, 8)
+        input_layout.addWidget(self.submit_button, 1)
+        input_layout.addWidget(self.stop_button, 1)
+
+        # Main content area
+        content_splitter = QSplitter(Qt.Orientation.Horizontal)
+
+        # Task list
+        task_widget = QWidget()
+        task_layout = QVBoxLayout(task_widget)
+        task_label = QLabel("Task List")
+        self.task_table = QTableWidget(0, 2)
+        self.task_table.setHorizontalHeaderLabels(["Status", "Task"])
+        self.task_table.horizontalHeader().setSectionResizeMode(1, QHeaderView.ResizeMode.Stretch)
+        task_layout.addWidget(task_label)
+        task_layout.addWidget(self.task_table)
+
+        # Chat area
+        chat_widget = QWidget()
+        chat_layout = QVBoxLayout(chat_widget)
+        chat_label = QLabel("Chat History")
+        self.chat_display = QTextEdit()
+        self.chat_display.setReadOnly(True)
+        chat_layout.addWidget(chat_label)
+        chat_layout.addWidget(self.chat_display)
+
+        # Add to splitter (roughly 20% tasks / 80% chat)
+        content_splitter.addWidget(task_widget)
+        content_splitter.addWidget(chat_widget)
+        content_splitter.setSizes([int(self.width() * 0.2), int(self.width() * 0.8)])
+
+        # Add all components to main layout
+        main_layout.addLayout(header_layout)
+        main_layout.addWidget(intro_label)
+        main_layout.addLayout(top_buttons_layout)  # Add top button area
+        main_layout.addLayout(input_layout)
+        main_layout.addWidget(content_splitter, 1)  # 1 is the stretch factor
+
+        self.setCentralWidget(central_widget)
+    
+    def open_settings_dialog(self):
+        """Show the settings dialog and apply its values when accepted."""
+        dialog = SettingsDialog(self, self.state)
+        if dialog.exec() != QDialog.DialogCode.Accepted:
+            return
+
+        settings = dialog.get_settings()
+
+        # Remember the old hotkey so a change can be detected below
+        previous_hotkey = self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY)
+        chosen_hotkey = settings["stop_hotkey"]
+
+        for key in ("model", "base_url", "api_key"):
+            self.state[key] = settings[key]
+        self.state["stop_hotkey"] = chosen_hotkey
+
+        # Re-theme only when the selection actually changed
+        if settings["theme"] != self.state.get("theme", "Light"):
+            self.state["theme"] = settings["theme"]
+            self.apply_theme()
+
+        # An empty screen region leaves the stored one untouched
+        if settings["screen_region"]:
+            self.state["screen_region"] = settings["screen_region"]
+
+        # Re-register the global hotkey only when it changed
+        if previous_hotkey != chosen_hotkey:
+            self.register_stop_hotkey()
+    
+    def process_input(self):
+        """Start an agent run for the text in the input field.
+
+        Blank input is ignored. If a previous worker is still running, the
+        user is told and no new run starts. Otherwise the field is cleared, a
+        reminder of the stop hotkey is shown, the window is minimized and an
+        ``AgentWorker`` is started with its signals connected.
+        """
+        user_input = self.chat_input.text()
+        if not user_input.strip():
+            return
+
+        # Starting a second worker would let two threads mutate self.state and
+        # would drop the only reference to a QThread that is still running.
+        previous_worker = getattr(self, "worker", None)
+        if previous_worker is not None and previous_worker.isRunning():
+            QMessageBox.information(self, "Automation Running",
+                                   "A task is still running. Stop it before starting a new one.")
+            return
+
+        # Clear input box
+        self.chat_input.clear()
+
+        # Show hotkey reminder
+        hotkey = self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY)
+        QMessageBox.information(self, "Automation Starting",
+                               f"Automation will start now. You can press {hotkey} to stop at any time.")
+
+        # Minimize main window
+        self.showMinimized()
+
+        # Create and start worker thread
+        self.worker = AgentWorker(user_input, self.state, self.vision_agent)
+        self.worker.update_signal.connect(self.update_ui)
+        self.worker.error_signal.connect(self.handle_error)
+
+        # Connect signals to tray icon if available
+        if hasattr(self, 'tray_icon') and self.tray_icon is not None:
+            self.worker.status_signal.connect(self.tray_icon.update_status)
+            self.worker.task_signal.connect(self.tray_icon.update_task)
+
+        self.worker.start()
+    
+    def handle_error(self, error_message):
+        """Bring the window back to the front and show the error in a dialog.
+
+        Args:
+            error_message: Text inserted into the warning dialog.
+        """
+        # The window may be minimized; restore it so the dialog is visible
+        self.showNormal()
+        self.activateWindow()
+
+        dialog_text = f"Error connecting to AI service:\n{error_message}\n\nPlease check your network connection and API settings."
+        QMessageBox.warning(self, "Connection Error", dialog_text)
+    
+    @pyqtSlot(list, list)
+    def update_ui(self, chatbox_messages, tasks):
+        """Redraw the chat history and the task table.
+
+        Args:
+            chatbox_messages: Dicts with "role" and "content" keys, in display
+                order.
+            tasks: ``[status, task]`` pairs, one per table row.
+        """
+        # Rebuild the chat display from scratch
+        self.chat_display.clear()
+
+        for msg in chatbox_messages:
+            role = msg["role"]
+            content = msg["content"]
+
+            # Speaker label
+            self.chat_display.append("You:" if role == "user" else "AI:")
+
+            # Content that looks like markup is inserted as HTML
+            if "<" in content and ">" in content:
+                # insertHtml() writes at the widget's text cursor, so move that
+                # cursor to the end before inserting
+                self.chat_display.moveCursor(QTextCursor.MoveOperation.End)
+                self.chat_display.insertHtml(content)
+            else:
+                self.chat_display.append(content)
+            self.chat_display.append("")  # Add empty line
+
+        # Scroll to bottom once, after everything has been added
+        scroll_bar = self.chat_display.verticalScrollBar()
+        scroll_bar.setValue(scroll_bar.maximum())
+
+        # Update task table
+        self.task_table.setRowCount(len(tasks))
+        for i, (status, task) in enumerate(tasks):
+            self.task_table.setItem(i, 0, QTableWidgetItem(status))
+            self.task_table.setItem(i, 1, QTableWidgetItem(task))
+    
+    def stop_process(self):
+        """Raise the stop flag that the worker checks between agent steps."""
+        self.state.update(stop=True)
+        
+    def clear_chat(self):
+        """Reset conversation and task state and empty both views."""
+        # Fresh containers are built on every call, so nothing is shared
+        fresh_values = (
+            ("messages", []),
+            ("chatbox_messages", []),
+            ("responses", {}),
+            ("tools", {}),
+            ("tasks", []),
+        )
+        for key, empty in fresh_values:
+            self.state[key] = empty
+
+        self.chat_display.clear()
+        self.task_table.setRowCount(0)
+    
+    def closeEvent(self, event):
+        """Hide to the tray when a tray icon is visible; otherwise really close."""
+        tray = getattr(self, 'tray_icon', None)
+        if tray is not None and tray.isVisible():
+            # Keep running in the system tray: hide the window, veto the close
+            self.hide()
+            event.ignore()
+            return
+
+        # Real exit: drop all keyboard hooks first
+        keyboard.unhook_all()
+        event.accept()
--- a/ui/settings_dialog.py
+++ b/ui/settings_dialog.py
@ -0,0 +1,125 @@
+"""
+Settings dialog for application configuration
+"""
+from PyQt6.QtWidgets import (QDialog, QVBoxLayout, QHBoxLayout, 
+                          QLabel, QLineEdit, QPushButton, QComboBox)
+from PyQt6.QtCore import QTimer
+from ui.hotkey_edit import HotkeyEdit, DEFAULT_STOP_HOTKEY
+from ui.theme import THEMES
+
+class SettingsDialog(QDialog):
+    """Dialog for editing model, API, theme, hotkey and screen-region settings.
+
+    Initial values are read from ``state``. The selected screen region is
+    written straight back into ``state``; all other fields are only exposed
+    through :meth:`get_settings`.
+    """
+
+    def __init__(self, parent=None, state=None):
+        """Create the dialog.
+
+        Args:
+            parent: Owning window; it is minimized while a screen region is
+                being selected.
+            state: Mapping with the current settings. ``None`` is treated as
+                an empty mapping so the declared default does not crash.
+        """
+        super().__init__(parent)
+        # Use an identity check so a caller-supplied (even empty) mapping is
+        # kept and region updates remain visible to the caller.
+        self.state = state if state is not None else {}
+        self.parent_window = parent
+        self.setWindowTitle("Settings")
+        self.setMinimumWidth(500)
+        self.init_ui()
+
+    def init_ui(self):
+        """Build one labelled row per setting plus the OK/Cancel buttons."""
+        layout = QVBoxLayout(self)
+
+        # Model settings
+        model_layout = QHBoxLayout()
+        model_label = QLabel("Model:")
+        self.model_input = QLineEdit(self.state.get("model", ""))
+        model_layout.addWidget(model_label)
+        model_layout.addWidget(self.model_input)
+
+        # Base URL settings
+        url_layout = QHBoxLayout()
+        url_label = QLabel("Base URL:")
+        self.base_url_input = QLineEdit(self.state.get("base_url", ""))
+        url_layout.addWidget(url_label)
+        url_layout.addWidget(self.base_url_input)
+
+        # API key settings (masked while typing)
+        api_layout = QHBoxLayout()
+        api_label = QLabel("API Key:")
+        self.api_key_input = QLineEdit(self.state.get("api_key", ""))
+        self.api_key_input.setEchoMode(QLineEdit.EchoMode.Password)
+        api_layout.addWidget(api_label)
+        api_layout.addWidget(self.api_key_input)
+
+        # Theme selection
+        theme_layout = QHBoxLayout()
+        theme_label = QLabel("Theme:")
+        self.theme_combo = QComboBox()
+        self.theme_combo.addItems(list(THEMES.keys()))
+        current_theme = self.state.get("theme", "Light")
+        self.theme_combo.setCurrentText(current_theme)
+        theme_layout.addWidget(theme_label)
+        theme_layout.addWidget(self.theme_combo)
+
+        # Stop hotkey setting
+        hotkey_layout = QHBoxLayout()
+        hotkey_label = QLabel("Stop Hotkey:")
+        self.hotkey_edit = HotkeyEdit(self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY))
+        hotkey_layout.addWidget(hotkey_label)
+        hotkey_layout.addWidget(self.hotkey_edit)
+
+        # Screen region selection
+        region_layout = QHBoxLayout()
+        self.select_region_btn = QPushButton("Select Screen Region")
+        # get_settings() reports an unset region as None, so a stored None
+        # must read as "no region" rather than "Selected region: None".
+        region = self.state.get("screen_region")
+        region_text = "No region selected" if region is None else f"Selected region: {region}"
+        self.region_info = QLabel(region_text)
+        self.select_region_btn.clicked.connect(self.select_screen_region)
+        region_layout.addWidget(self.select_region_btn)
+        region_layout.addWidget(self.region_info)
+
+        # OK and Cancel buttons
+        button_layout = QHBoxLayout()
+        self.ok_button = QPushButton("OK")
+        self.cancel_button = QPushButton("Cancel")
+        self.ok_button.clicked.connect(self.accept)
+        self.cancel_button.clicked.connect(self.reject)
+        button_layout.addWidget(self.ok_button)
+        button_layout.addWidget(self.cancel_button)
+
+        # Add all elements to main layout
+        layout.addLayout(model_layout)
+        layout.addLayout(url_layout)
+        layout.addLayout(api_layout)
+        layout.addLayout(theme_layout)
+        layout.addLayout(hotkey_layout)
+        layout.addLayout(region_layout)
+        layout.addLayout(button_layout)
+
+    def select_screen_region(self):
+        """Start region selection, minimizing the parent window first if there is one."""
+        if self.parent_window:
+            self.parent_window.showMinimized()
+            # Wait a moment for the window to minimize before selecting
+            QTimer.singleShot(500, self._do_select_region)
+        else:
+            self._do_select_region()
+
+    def _do_select_region(self):
+        """Run the screen selector, restore the windows and record the result."""
+        # Imported lazily, only when the user actually asks for a selection.
+        from util.screen_selector import ScreenSelector
+        region = ScreenSelector().get_selection()
+
+        # Restore the dialog and parent window
+        self.activateWindow()
+        if self.parent_window:
+            self.parent_window.showNormal()
+            self.parent_window.activateWindow()
+
+        if region:
+            self.state["screen_region"] = region
+            self.region_info.setText(f"Selected region: {region}")
+        else:
+            # A falsy result leaves any previously stored region untouched.
+            self.region_info.setText("Selection cancelled")
+
+    def get_settings(self):
+        """Return the current field values as a dict.
+
+        ``screen_region`` comes from ``state`` and is ``None`` when no region
+        is stored there.
+        """
+        return {
+            "model": self.model_input.text(),
+            "base_url": self.base_url_input.text(),
+            "api_key": self.api_key_input.text(),
+            "screen_region": self.state.get("screen_region", None),
+            "theme": self.theme_combo.currentText(),
+            "stop_hotkey": self.hotkey_edit.get_hotkey()
+        }
--- a/ui/theme.py
+++ b/ui/theme.py
@ -0,0 +1,99 @@
+"""
+Theme definitions and theme handling functionality
+"""
+
+# Theme definitions.
+# Maps a theme name to its color palette. The names are the entries offered
+# in the settings dialog's theme selector, and every palette must provide the
+# same set of color-role keys because apply_theme() reads each of them when
+# building the stylesheet. Values are "#RRGGBB" color strings.
+THEMES = {
+    "Light": {
+        "main_bg": "#F5F5F5",
+        "widget_bg": "#FFFFFF",
+        "text": "#333333",
+        "accent": "#4A86E8",
+        "button_bg": "#E3E3E3",
+        "button_text": "#333333",
+        "border": "#CCCCCC",
+        "selection_bg": "#D0E2F4"
+    },
+    "Dark": {
+        "main_bg": "#2D2D2D",
+        "widget_bg": "#3D3D3D",
+        "text": "#FFFFFF",
+        "accent": "#4A86E8",
+        "button_bg": "#555555",
+        "button_text": "#FFFFFF",
+        "border": "#555555",
+        "selection_bg": "#3A5F8A"
+    }
+}
+
+def apply_theme(widget, theme_name="Light"):
+    """Apply the named color theme to ``widget`` through a Qt stylesheet.
+
+    Args:
+        widget: Object exposing ``setStyleSheet``; it receives the generated
+            stylesheet.
+        theme_name: Key into ``THEMES``. An unknown name (for example a stale
+            value from saved settings) falls back to ``"Light"`` instead of
+            raising ``KeyError``.
+    """
+    # Fall back to the default palette rather than failing on a bad name.
+    theme = THEMES.get(theme_name, THEMES["Light"])
+
+    # Create stylesheet for the application
+    stylesheet = f"""
+    QMainWindow, QDialog {{
+        background-color: {theme['main_bg']};
+        color: {theme['text']};
+    }}
+    
+    QWidget {{
+        background-color: {theme['main_bg']};
+        color: {theme['text']};
+    }}
+    
+    QLabel {{
+        color: {theme['text']};
+    }}
+    
+    QPushButton {{
+        background-color: {theme['button_bg']};
+        color: {theme['button_text']};
+        border: 1px solid {theme['border']};
+        border-radius: 4px;
+        padding: 5px 10px;
+    }}
+    
+    QPushButton:hover {{
+        background-color: {theme['accent']};
+        color: white;
+    }}
+    
+    QLineEdit, QTextEdit, QTableWidget, QComboBox {{
+        background-color: {theme['widget_bg']};
+        color: {theme['text']};
+        border: 1px solid {theme['border']};
+        border-radius: 4px;
+        padding: 4px;
+    }}
+    
+    QTextEdit {{
+        background-color: {theme['widget_bg']};
+    }}
+    
+    QTableWidget::item:selected {{
+        background-color: {theme['selection_bg']};
+    }}
+    
+    QHeaderView::section {{
+        background-color: {theme['button_bg']};
+        color: {theme['button_text']};
+        padding: 4px;
+        border: 1px solid {theme['border']};
+    }}
+    
+    QSplitter::handle {{
+        background-color: {theme['border']};
+    }}
+    
+    QScrollBar {{
+        background-color: {theme['widget_bg']};
+    }}
+    
+    QScrollBar::handle {{
+        background-color: {theme['button_bg']};
+        border-radius: 4px;
+    }}
+    """
+
+    widget.setStyleSheet(stylesheet)
--- a/ui/tray_icon.py
+++ b/ui/tray_icon.py
@ -0,0 +1,60 @@
+"""
+System tray icon implementation
+"""
+from PyQt6.QtWidgets import QSystemTrayIcon, QMenu, QApplication
+from PyQt6.QtGui import QAction
+
+class StatusTrayIcon(QSystemTrayIcon):
+    """Tray icon whose context menu shows the current status and task."""
+
+    def __init__(self, icon, parent=None):
+        """Create the icon, build its context menu and hook up activation."""
+        super().__init__(icon, parent)
+        # NOTE(review): this attribute shadows the inherited QObject.parent()
+        # accessor; the name is kept because the methods below rely on it.
+        self.parent = parent
+        self.setToolTip("autoMate")
+
+        # Menu entries; the status and task rows are display-only.
+        self.menu = QMenu()
+        self.show_action = QAction("Show Main Window")
+        self.show_action.triggered.connect(self.show_main_window)
+        self.menu_status = QAction("Status: Idle")
+        self.menu_status.setEnabled(False)
+        self.menu_task = QAction("Task: None")
+        self.menu_task.setEnabled(False)
+        self.exit_action = QAction("Exit")
+        self.exit_action.triggered.connect(QApplication.quit)
+
+        # Layout: show | status, task | exit
+        self.menu.addAction(self.show_action)
+        self.menu.addSeparator()
+        for info_action in (self.menu_status, self.menu_task):
+            self.menu.addAction(info_action)
+        self.menu.addSeparator()
+        self.menu.addAction(self.exit_action)
+
+        self.setContextMenu(self.menu)
+
+        # React to clicks on the tray icon itself.
+        self.activated.connect(self.icon_activated)
+
+    @staticmethod
+    def _shorten(text):
+        """Return ``text`` cut to 50 characters plus "..." when it is longer."""
+        if len(text) > 50:
+            return text[:50] + "..."
+        return text
+
+    def show_main_window(self):
+        """Restore and focus the parent window, if there is one."""
+        if not self.parent:
+            return
+        self.parent.showNormal()
+        self.parent.activateWindow()
+
+    def icon_activated(self, reason):
+        """Bring the main window back when the icon is double-clicked."""
+        if reason == QSystemTrayIcon.ActivationReason.DoubleClick:
+            self.show_main_window()
+
+    def update_status(self, status_text):
+        """Show ``status_text`` in the menu (truncated) and as a notification."""
+        self.menu_status.setText(f"Status: {self._shorten(status_text)}")
+
+        # The notification carries the full text and requests only 500 ms
+        # so it interferes as little as possible with visual automation.
+        self.showMessage(
+            "autoMate Status",
+            status_text,
+            QSystemTrayIcon.MessageIcon.Information,
+            500,
+        )
+
+    def update_task(self, task_text):
+        """Show ``task_text`` (truncated) in the tray menu."""
+        self.menu_task.setText(f"Task: {self._shorten(task_text)}")
--- a/util/auto_control.py
+++ b/util/auto_control.py
@ -4,12 +4,12 @@ import time

 # Add the project root directory to Python path
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from gradio_ui.agent.vision_agent import VisionAgent
+from auto_control.agent.vision_agent import VisionAgent
 from util.download_weights import MODEL_DIR
 from pynput import mouse, keyboard

-# Now you can import from gradio_ui
-from gradio_ui.tools.screen_capture import get_screenshot
+# Now you can import from auto_control
+from auto_control.tools.screen_capture import get_screenshot

 class AutoControl:
    def __init__(self):
@ -81,8 +81,7 @@ class AutoControl:
        if key == keyboard.Key.esc:
            
            print("self.auto_list", self.auto_list)
-            vision_agent = VisionAgent(yolo_model_path=os.path.join(MODEL_DIR, "icon_detect", "model.pt"),
-                                 caption_model_path=os.path.join(MODEL_DIR, "icon_caption"))
+            vision_agent = VisionAgent(yolo_model_path=os.path.join(MODEL_DIR, "icon_detect", "model.pt"))
            
            for item in self.auto_list:
                element_list =vision_agent(str(item["path"]))
--- a/util/auto_util.py
+++ b/util/auto_util.py
@ -0,0 +1,34 @@
+import os
+import platform
+import pyautogui
+from enum import Enum
+
+import pyperclip
+class AppName(Enum):
+    """Applications that can be automated through AutoUtil."""
+    # The value is used by AutoUtil as the sub-directory name under "imgs".
+    WECHAT = "wechat"
+   
+
+class AutoUtil:
+    """Image-driven GUI helpers (locate, click, paste) for one application."""
+
+    def __init__(self, app_name: AppName):
+        """Point the helper at ``../imgs/<app value>`` relative to this file.
+
+        Args:
+            app_name: Application whose template images should be used.
+        """
+        self.img_dir = os.path.join(os.path.dirname(__file__), "..", "imgs", app_name.value)
+
+    def click_multi_img(self, img_names, offset_x=0, offset_y=0, minSearchTime=0):
+        """Locate and click each image in ``img_names`` in order.
+
+        The same offsets and search time are applied to every image; see
+        :meth:`find_click_img` for their meaning.
+        """
+        for img_name in img_names:
+            self.find_click_img(img_name, offset_x, offset_y, minSearchTime)
+
+    def find_click_img(self, img_name, offset_x=0, offset_y=0, minSearchTime=0):
+        """Find a template image on screen and click relative to its center.
+
+        Args:
+            img_name: Image file name without the ``.png`` extension.
+            offset_x: Horizontal offset added to the center before clicking.
+            offset_y: Vertical offset added to the center before clicking.
+            minSearchTime: Passed through to ``pyautogui.locateOnScreen``.
+
+        Raises:
+            pyautogui.ImageNotFoundException: If the image is not on screen.
+        """
+        img_path = os.path.join(self.img_dir, img_name + ".png")
+        box = pyautogui.locateOnScreen(img_path, minSearchTime=minSearchTime)
+        if box is None:
+            # Some pyautogui versions return None instead of raising; turn
+            # that into the same exception type so callers handle one case.
+            raise pyautogui.ImageNotFoundException(
+                f"Could not locate {img_path} on screen"
+            )
+        x, y = pyautogui.center(box)
+        # Add offset to click position
+        pyautogui.click(x + offset_x, y + offset_y)
+
+    def send_text(self, text):
+        """Paste ``text`` into the focused control via the clipboard.
+
+        The previous clipboard content is put back afterwards, even if
+        sending the paste hotkey fails.
+        """
+        clipboard_data = pyperclip.paste()
+        pyperclip.copy(text)
+        try:
+            if platform.system() == 'Darwin':
+                pyautogui.hotkey('command', 'v', interval=0.1)
+            else:
+                pyautogui.hotkey('ctrl', 'v')
+        finally:
+            # Copy old data back to clipboard
+            pyperclip.copy(clipboard_data)
--- a/util/opencv._detect.py
+++ b/util/opencv._detect.py
@ -1,47 +0,0 @@
-import cv2
-import sys
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from gradio_ui.tools.screen_capture import get_screenshot
-
-def detect_and_draw_edges():
-    # Read the image
-    screenshot, path = get_screenshot(is_cursor=False)
-    img = cv2.imread(path)
-    if img is None:
-        print("Error: Could not read the image.")
-        return
-    
-    # Create a copy for drawing contours later
-    original = img.copy()
-    
-    # Convert to grayscale
-    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-    
-    # Apply Gaussian blur to reduce noise
-    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
-    
-    # Detect edges using Canny algorithm
-    edges = cv2.Canny(blurred, 50, 150)
-    
-    # Find contours from the edges
-    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    
-    # Draw all detected contours
-    cv2.drawContours(original, contours, -1, (0, 255, 0), 2)
-    
-    print(f"Found {len(contours)} contours in the image")
-    
-    # Display results
-    # cv2.imshow("Original Image", img)
-    cv2.imshow("Edges", edges)
-    # cv2.imshow("Contours", original)
-    cv2.waitKey(0)
-    cv2.destroyAllWindows()
-    
-    return original, contours
-
-# Example usage
-if __name__ == "__main__":
-    result_image, detected_contours = detect_and_draw_edges()
--- a/util/wechat_auto.py
+++ b/util/wechat_auto.py
@ -0,0 +1,30 @@
+import os
+import sys
+import time
+
+import pyautogui
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from util.auto_util import AppName, AutoUtil
+class WechatAuto:
+    """Drive the WeChat desktop client through on-screen image matching."""
+
+    def __init__(self):
+        """Create the image helper bound to the WeChat template images."""
+        self.auto_util = AutoUtil(AppName.WECHAT)
+
+    def go_to_chat(self):
+        """Click the chat tab, whether it is currently unselected or selected."""
+        # find_click_img appends ".png" itself, so names are passed without
+        # an extension ("chat_unselect.png" would look for "....png.png").
+        try:
+            self.auto_util.find_click_img("chat_unselect")
+        except pyautogui.ImageNotFoundException:
+            # The unselected icon is not on screen; try the selected one.
+            self.auto_util.find_click_img("chat_select")
+
+    def search_friend(self, friend_name):
+        """Search for ``friend_name`` in WeChat and click the matching contact.
+
+        Args:
+            friend_name: Text pasted into the search box.
+
+        Raises:
+            pyautogui.ImageNotFoundException: If a required image cannot be
+                found on screen.
+        """
+        self.go_to_chat()
+        # Click to the right of the search icon, i.e. inside the input field.
+        self.auto_util.find_click_img("search", offset_x=100)
+        self.auto_util.send_text(friend_name)
+        self.auto_util.find_click_img("contact_person", offset_x=100, offset_y=100, minSearchTime=10)
+        self.auto_util.find_click_img("search", offset_x=-100, offset_y=-100, minSearchTime=10)
+
+if __name__ == "__main__":
+    # Manual smoke run. The pause is presumably there so the operator can
+    # bring the WeChat window to the front first - TODO confirm.
+    time.sleep(3)
+    WechatAuto().search_friend("李杨林")
+