当刷新界面时，更新输入内容为上一次保存的数据。增加录制功能

2026-03-22 13:07:17 +08:00 · 2025-03-16 17:31:36 +08:00
parent d9044e0a87
commit b778dcc87d
4 changed files with 153 additions and 12 deletions
--- a/gradio_ui/app.py
+++ b/gradio_ui/app.py
@@ -63,11 +63,14 @@ def setup_state(state):
        state["only_n_most_recent_images"] = 2
    if 'stop' not in state:
        state['stop'] = False
-
+    # update state
-async def main(state):
+    return (
-    """Render loop for Gradio"""
+        state["model"],      # model textbox
-    setup_state(state)
+        state["base_url"],   # base_url textbox
-    return "Setup completed"
+        state["api_key"],    # api_key textbox
        state["chatbox_messages"],  # chatbot
        [[task["status"], task["task"]] for task in state["tasks"]]  # task_list
    )
 def load_from_storage(filename: str) -> str | None:
    """Load data from a file in the storage directory."""
@@ -324,5 +327,9 @@ def run():
        stop_button.click(stop_app, [state], None)
        base_url.change(fn=update_base_url, inputs=[base_url, state], outputs=None)
-
+        demo.load(
-    demo.launch(server_name="0.0.0.0", server_port=7888)
+            setup_state, 
            inputs=[state], 
            outputs=[model, base_url, api_key, chatbot, task_list]
        )
        demo.launch(server_name="0.0.0.0", server_port=7888)
--- a/gradio_ui/tools/screen_capture.py
+++ b/gradio_ui/tools/screen_capture.py
@@ -1,29 +1,36 @@
 from io import BytesIO
 from pathlib import Path
 from uuid import uuid4
 from PIL import Image
 import pyautogui
 from .base import ToolError
 from util import tool
 OUTPUT_DIR = "./tmp/outputs"
-def get_screenshot(screen_region):
+def get_screenshot(screen_region=None, is_cursor=True):
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    path = output_dir / f"screenshot_{uuid4().hex}.png"
    try:
-        img_io = tool.capture_screen_with_cursor()
+        if is_cursor:
            img_io = tool.capture_screen_with_cursor()
        else:
            pyautogui_screenshot =  pyautogui.screenshot()
            img_io = BytesIO()
            pyautogui_screenshot.save(img_io, 'PNG')
        screenshot = Image.open(img_io)
        # Create a black mask of the same size
        black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
        # If screen_region is provided and valid, copy only that region
        if screen_region and len(screen_region) == 4:
            black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
            x1, y1, x2, y2 = screen_region
            region = screenshot.crop((x1, y1, x2, y2))
            # Paste the region onto the black mask
            black_mask.paste(region, (x1, y1, x2, y2))
            # Use the modified image as screenshot
            screenshot = black_mask
        screenshot.save(path)
        return screenshot, path
    except Exception as e:
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ anthropic[bedrock,vertex]>=0.37.1
 pyxbrain==1.1.31
 timm
 einops==0.8.0
-modelscope
+modelscope
 pynput
--- a/util/auto_control.py
+++ b/util/auto_control.py
@@ -0,0 +1,126 @@
 import sys
 import os
 import time
 # Add the project root directory to Python path
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from gradio_ui.agent.vision_agent import VisionAgent
 from util.download_weights import MODEL_DIR
 from pynput import mouse, keyboard
 # Now you can import from gradio_ui
 from gradio_ui.tools.screen_capture import get_screenshot
 class AutoControl:
    """Record mouse clicks with screenshots, then crop the clicked UI element.

    While listening, every mouse-button release stores a screenshot together
    with the click position in ``self.auto_list``. When ESC is released the
    listeners stop and each stored screenshot is passed to ``VisionAgent``;
    the first detected element whose box contains the click position is
    cropped and saved next to the screenshot as ``*_cropped.png``.
    """

    def __init__(self):
        # One dict per recorded button release, with the keys
        # "button", "pressed", "position", "path" and "image" (see on_click).
        self.auto_list = []

    def start_listen(self):
        """Listen for mouse/keyboard events until ESC, then process the clicks.

        Blocks the calling thread until the keyboard listener stops (ESC
        released). The mouse listener is stopped before any processing so
        that no new clicks are recorded while the recorded ones are analysed.
        """
        # Create both mouse and keyboard listeners
        mouse_listener = mouse.Listener(
            on_move=self.on_move,
            on_click=self.on_click,
            on_scroll=self.on_scroll)
        keyboard_listener = keyboard.Listener(
            on_press=self.on_press,
            on_release=self.on_release)
        # Start both listeners
        mouse_listener.start()
        keyboard_listener.start()
        try:
            # Keep the program running until keyboard listener stops
            keyboard_listener.join()
        finally:
            # After keyboard stops (ESC pressed), stop mouse listener too
            mouse_listener.stop()
        # Heavy work runs here, on the caller's thread, not inside a listener
        # callback: pynput warns that long-running callbacks can freeze input.
        self._process_recorded_clicks()

    def on_move(self, x, y, injected=False):
        """Log a pointer move; ``injected`` marks synthetic events."""
        # ``injected`` has a default so the callback still works if the
        # listener does not supply it (presumably older pynput; verify).
        print('Pointer moved to {}; it was {}'.format(
            (x, y), 'faked' if injected else 'not faked'))

    def on_click(self, x, y, button, pressed, injected=False):
        """Log a click and, on button release, record a screenshot of it.

        Args:
            x, y: pointer position reported by the listener.
            button: the mouse button involved.
            pressed: True on press, False on release.
            injected: whether the event was synthetic.
        """
        print('Mouse {} {} at {}; it was {}'.format(
            button,
            'Pressed' if pressed else 'Released',
            (x, y),
            'faked' if injected else 'not faked'))
        if pressed:
            return
        # wait right click window
        # NOTE(review): this sleep blocks the listener callback for a second;
        # consider deferring the capture to a worker thread.
        if button == mouse.Button.right:
            time.sleep(1)
        screenshot, path = get_screenshot(is_cursor=False)
        self.auto_list.append(
            {"button": button,
             "pressed": pressed,
             "position": (x, y),
             "path": path,
             "image": screenshot
             }
        )

    def on_scroll(self, x, y, dx, dy, injected=False):
        """Log a scroll event and its vertical direction."""
        print('Scrolled {} at {}; it was {}'.format(
            'down' if dy < 0 else 'up',
            (x, y), 'faked' if injected else 'not faked'))

    def on_press(self, key, injected=False):
        """Log a key press, distinguishing character keys from special keys."""
        try:
            print('alphanumeric key {} pressed; it was {}'.format(
                key.char, 'faked' if injected else 'not faked'))
        except AttributeError:
            # Special keys have no ``char`` attribute.
            print('special key {} pressed'.format(
                key))

    def on_release(self, key, injected=False):
        """Log a key release; returning False on ESC stops the listener."""
        print('{} released; it was {}'.format(
            key, 'faked' if injected else 'not faked'))
        if key == keyboard.Key.esc:
            # Stop listener; start_listen() then processes the recorded clicks.
            return False

    def _process_recorded_clicks(self):
        """Run element detection on every recorded screenshot and crop the hit."""
        print("self.auto_list", self.auto_list)
        # Snapshot so a late callback appending to the list cannot extend
        # the iteration below.
        recorded = list(self.auto_list)
        if not recorded:
            # Nothing was clicked; avoid loading the detection models.
            return
        vision_agent = VisionAgent(yolo_model_path=os.path.join(MODEL_DIR, "icon_detect", "model.pt"),
                             caption_model_path=os.path.join(MODEL_DIR, "icon_caption"))
        for item in recorded:
            # NOTE(review): assumes element.coordinates and the recorded click
            # position share the same pixel space; confirm on scaled displays.
            for element in vision_agent(str(item["path"])):
                if self.crop_image_if_position_in_coordinates(
                        item["image"], item["path"], item["position"], element.coordinates):
                    break

    def crop_image_if_position_in_coordinates(self, image, image_path, position, coordinates):
        """
        Check if position is within coordinates and crop image if true
        Args:
            image: PIL Image object
            image_path: path of the source screenshot; the crop is saved
                beside it with ``_cropped.png`` replacing the extension
            position: tuple of (x, y) - current position
            coordinates: tuple of (x1, y1, x2, y2) - target area
        Returns:
            bool: True if position is in coordinates
        """
        x, y = position
        x1, y1, x2, y2 = coordinates
        # Check if position is within coordinates (bounds are inclusive)
        if not ((x1 <= x <= x2) and (y1 <= y <= y2)):
            return False
        # Crop the image to the coordinates
        cropped_image = image.crop(coordinates)
        # Replace only the final extension. A plain str.replace('.png', ...)
        # would also rewrite '.png' inside directory names, and would leave
        # the path unchanged (overwriting the screenshot) if it had no '.png'.
        root, _ext = os.path.splitext(str(image_path))
        save_path = root + '_cropped.png'
        cropped_image.save(save_path, 'PNG')
        return True
 if __name__ == "__main__":
    # Script entry point: build a recorder and block until ESC ends the session.
    AutoControl().start_listen()