When the interface is refreshed, repopulate the inputs with the last saved data. Add a recording feature.

yuruo 2025-03-16 17:31:36 +08:00
parent d9044e0a87
commit b778dcc87d
4 changed files with 153 additions and 12 deletions

View File

@@ -63,11 +63,14 @@ def setup_state(state):
        state["only_n_most_recent_images"] = 2
    if 'stop' not in state:
        state['stop'] = False

async def main(state):
    """Render loop for Gradio"""
    setup_state(state)
    return "Setup completed"
    # update state
    return (
        state["model"],  # model textbox
        state["base_url"],  # base_url textbox
        state["api_key"],  # api_key textbox
        state["chatbox_messages"],  # chatbot
        [[task["status"], task["task"]] for task in state["tasks"]]  # task_list
    )

def load_from_storage(filename: str) -> str | None:
    """Load data from a file in the storage directory."""
@@ -324,5 +327,9 @@ def run():
    stop_button.click(stop_app, [state], None)
    base_url.change(fn=update_base_url, inputs=[base_url, state], outputs=None)
    demo.launch(server_name="0.0.0.0", server_port=7888)
    demo.load(
        setup_state,
        inputs=[state],
        outputs=[model, base_url, api_key, chatbot, task_list]
    )
    demo.launch(server_name="0.0.0.0", server_port=7888)
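For context, here is a minimal, self-contained sketch of the refresh-restore pattern wired up above: demo.load runs on every page load, so it can push the last saved values back into the input components instead of letting them reset. The settings path and load_settings helper below are illustrative stand-ins, not the project's actual setup_state/load_from_storage code.

import json
from pathlib import Path

import gradio as gr

SETTINGS_FILE = Path("./tmp/settings.json")  # illustrative location, not the project's real storage file

def load_settings():
    # Return the last saved values, or empty strings if nothing was saved yet.
    saved = json.loads(SETTINGS_FILE.read_text()) if SETTINGS_FILE.exists() else {}
    return saved.get("model", ""), saved.get("base_url", ""), saved.get("api_key", "")

with gr.Blocks() as demo:
    model = gr.Textbox(label="model")
    base_url = gr.Textbox(label="base_url")
    api_key = gr.Textbox(label="api_key")
    # demo.load fires on every page load/refresh and repopulates the
    # textboxes with whatever was saved last instead of blank defaults.
    demo.load(load_settings, inputs=None, outputs=[model, base_url, api_key])

demo.launch()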

View File

@@ -1,29 +1,36 @@
from io import BytesIO
from pathlib import Path
from uuid import uuid4

from PIL import Image
import pyautogui

from .base import ToolError
from util import tool

OUTPUT_DIR = "./tmp/outputs"

def get_screenshot(screen_region):
def get_screenshot(screen_region=None, is_cursor=True):
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    path = output_dir / f"screenshot_{uuid4().hex}.png"
    try:
        img_io = tool.capture_screen_with_cursor()
        if is_cursor:
            img_io = tool.capture_screen_with_cursor()
        else:
            pyautogui_screenshot = pyautogui.screenshot()
            img_io = BytesIO()
            pyautogui_screenshot.save(img_io, 'PNG')
        screenshot = Image.open(img_io)
        # Create a black mask of the same size
        black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
        # If screen_region is provided and valid, copy only that region
        if screen_region and len(screen_region) == 4:
            black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
            x1, y1, x2, y2 = screen_region
            region = screenshot.crop((x1, y1, x2, y2))
            # Paste the region onto the black mask
            black_mask.paste(region, (x1, y1, x2, y2))
            # Use the modified image as screenshot
            screenshot = black_mask
        screenshot.save(path)
        return screenshot, path
    except Exception as e:
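For reference, a hedged usage sketch of the updated helper; the import path matches the one used in util/auto_control.py below, while the region coordinates are made-up values.

from gradio_ui.tools.screen_capture import get_screenshot

# Full-screen capture with the cursor overlaid (the previous default behaviour).
screenshot, path = get_screenshot()

# Capture without the cursor, keeping only an assumed (100, 100, 800, 600)
# region visible; everything outside it is masked to black.
screenshot, path = get_screenshot(screen_region=(100, 100, 800, 600), is_cursor=False)
print(f"masked screenshot saved to {path}")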

View File

@@ -11,4 +11,5 @@ anthropic[bedrock,vertex]>=0.37.1
pyxbrain==1.1.31
timm
einops==0.8.0
modelscope
modelscope
pynput

util/auto_control.py — new file (126 lines)
View File

@@ -0,0 +1,126 @@
import sys
import os
import time
# Add the project root directory to Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gradio_ui.agent.vision_agent import VisionAgent
from util.download_weights import MODEL_DIR
from pynput import mouse, keyboard
# Now you can import from gradio_ui
from gradio_ui.tools.screen_capture import get_screenshot
class AutoControl:
    def __init__(self):
        self.auto_list = []

    def start_listen(self):
        # Create both mouse and keyboard listeners
        mouse_listener = mouse.Listener(
            on_move=self.on_move,
            on_click=self.on_click,
            on_scroll=self.on_scroll)
        keyboard_listener = keyboard.Listener(
            on_press=self.on_press,
            on_release=self.on_release)
        # Start both listeners
        mouse_listener.start()
        keyboard_listener.start()
        # Keep the program running until keyboard listener stops
        keyboard_listener.join()
        # After keyboard stops (ESC pressed), stop mouse listener too
        mouse_listener.stop()
    def on_move(self, x, y, injected):
        print('Pointer moved to {}; it was {}'.format(
            (x, y), 'faked' if injected else 'not faked'))

    def on_click(self, x, y, button, pressed, injected):
        print('Mouse {} {} at {}; it was {}'.format(
            button,
            'Pressed' if pressed else 'Released',
            (x, y),
            'faked' if injected else 'not faked'))
        if not pressed:
            # Wait for the right-click menu to render before capturing the screen
            if button == mouse.Button.right:
                time.sleep(1)
            # Record the release: screenshot (without cursor), position and button
            screenshot, path = get_screenshot(is_cursor=False)
            self.auto_list.append(
                {"button": button,
                 "pressed": pressed,
                 "position": (x, y),
                 "path": path,
                 "image": screenshot
                 }
            )
    def on_scroll(self, x, y, dx, dy, injected):
        print('Scrolled {} at {}; it was {}'.format(
            'down' if dy < 0 else 'up',
            (x, y), 'faked' if injected else 'not faked'))

    def on_press(self, key, injected):
        try:
            print('alphanumeric key {} pressed; it was {}'.format(
                key.char, 'faked' if injected else 'not faked'))
        except AttributeError:
            print('special key {} pressed'.format(key))

    def on_release(self, key, injected):
        print('{} released; it was {}'.format(
            key, 'faked' if injected else 'not faked'))
        if key == keyboard.Key.esc:
            print("self.auto_list", self.auto_list)
            # Run the vision model over each recorded screenshot and crop the
            # element that was clicked
            vision_agent = VisionAgent(
                yolo_model_path=os.path.join(MODEL_DIR, "icon_detect", "model.pt"),
                caption_model_path=os.path.join(MODEL_DIR, "icon_caption"))
            for item in self.auto_list:
                element_list = vision_agent(str(item["path"]))
                for element in element_list:
                    if self.crop_image_if_position_in_coordinates(
                            item["image"], item["path"], item["position"], element.coordinates):
                        break
            # Stop listener
            return False
    def crop_image_if_position_in_coordinates(self, image, image_path, position, coordinates):
        """
        Check whether position falls inside coordinates and crop the image if it does.

        Args:
            image: PIL Image object
            image_path: path of the saved screenshot, used to derive the cropped file name
            position: tuple of (x, y) - current position
            coordinates: tuple of (x1, y1, x2, y2) - target area

        Returns:
            bool: True if position is in coordinates
        """
        x, y = position
        x1, y1, x2, y2 = coordinates
        # Check if position is within coordinates
        if (x1 <= x <= x2) and (y1 <= y <= y2):
            # Crop the image to the coordinates
            cropped_image = image.crop(coordinates)
            # Save the cropped image with proper path and format
            save_path = str(image_path).replace('.png', '_cropped.png')
            cropped_image.save(save_path, 'PNG')
            return True
        return False
if __name__ == "__main__":
    auto_control = AutoControl()
    auto_control.start_listen()