diff --git a/gradio_ui/app.py b/gradio_ui/app.py
index 858706b..5527bd2 100644
--- a/gradio_ui/app.py
+++ b/gradio_ui/app.py
@@ -63,11 +63,14 @@ def setup_state(state):
         state["only_n_most_recent_images"] = 2
     if 'stop' not in state:
         state['stop'] = False
-
-async def main(state):
-    """Render loop for Gradio"""
-    setup_state(state)
-    return "Setup completed"
+    # update state
+    return (
+        state["model"],             # model textbox
+        state["base_url"],          # base_url textbox
+        state["api_key"],           # api_key textbox
+        state["chatbox_messages"],  # chatbot
+        [[task["status"], task["task"]] for task in state["tasks"]]  # task_list
+    )
 
 def load_from_storage(filename: str) -> str | None:
     """Load data from a file in the storage directory."""
@@ -324,5 +327,9 @@ def run():
         stop_button.click(stop_app, [state], None)
         base_url.change(fn=update_base_url, inputs=[base_url, state], outputs=None)
-
-    demo.launch(server_name="0.0.0.0", server_port=7888)
+        demo.load(
+            setup_state,
+            inputs=[state],
+            outputs=[model, base_url, api_key, chatbot, task_list]
+        )
+    demo.launch(server_name="0.0.0.0", server_port=7888)
diff --git a/gradio_ui/tools/screen_capture.py b/gradio_ui/tools/screen_capture.py
index df61757..aa2089c 100644
--- a/gradio_ui/tools/screen_capture.py
+++ b/gradio_ui/tools/screen_capture.py
@@ -1,29 +1,36 @@
+from io import BytesIO
 from pathlib import Path
 from uuid import uuid4
 from PIL import Image
+import pyautogui
 from .base import ToolError
 from util import tool
 
 OUTPUT_DIR = "./tmp/outputs"
 
-def get_screenshot(screen_region):
+def get_screenshot(screen_region=None, is_cursor=True):
     output_dir = Path(OUTPUT_DIR)
     output_dir.mkdir(parents=True, exist_ok=True)
     path = output_dir / f"screenshot_{uuid4().hex}.png"
     try:
-        img_io = tool.capture_screen_with_cursor()
+        if is_cursor:
+            img_io = tool.capture_screen_with_cursor()
+        else:
+            pyautogui_screenshot = pyautogui.screenshot()
+            img_io = BytesIO()
+            pyautogui_screenshot.save(img_io, 'PNG')
         screenshot = Image.open(img_io)
+
         # Create a black mask of the same size
-        black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
         # If screen_region is provided and valid, copy only that region
         if screen_region and len(screen_region) == 4:
+            black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
             x1, y1, x2, y2 = screen_region
             region = screenshot.crop((x1, y1, x2, y2))
             # Paste the region onto the black mask
             black_mask.paste(region, (x1, y1, x2, y2))
             # Use the modified image as screenshot
             screenshot = black_mask
-
         screenshot.save(path)
         return screenshot, path
     except Exception as e:
diff --git a/requirements.txt b/requirements.txt
index af48c6a..967b8e0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ anthropic[bedrock,vertex]>=0.37.1
 pyxbrain==1.1.31
 timm
 einops==0.8.0
-modelscope
\ No newline at end of file
+modelscope
+pynput
\ No newline at end of file
diff --git a/util/auto_control.py b/util/auto_control.py
new file mode 100644
index 0000000..8913f66
--- /dev/null
+++ b/util/auto_control.py
@@ -0,0 +1,126 @@
+import sys
+import os
+import time
+
+# Add the project root directory to Python path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from gradio_ui.agent.vision_agent import VisionAgent
+from util.download_weights import MODEL_DIR
+from pynput import mouse, keyboard
+
+# Now you can import from gradio_ui
+from gradio_ui.tools.screen_capture import get_screenshot
+
+class AutoControl:
+    def __init__(self):
+        self.auto_list = []
+
+    def start_listen(self):
+        # Create both mouse and keyboard listeners
+        mouse_listener = mouse.Listener(
+            on_move=self.on_move,
+            on_click=self.on_click,
+            on_scroll=self.on_scroll)
+
+        keyboard_listener = keyboard.Listener(
+            on_press=self.on_press,
+            on_release=self.on_release)
+
+        # Start both listeners
+        mouse_listener.start()
+        keyboard_listener.start()
+
+        # Keep the program running until keyboard listener stops
+        keyboard_listener.join()
+
+        # After keyboard stops (ESC pressed), stop mouse listener too
+        mouse_listener.stop()
+
+    def on_move(self, x, y, injected):
+        print('Pointer moved to {}; it was {}'.format(
+            (x, y), 'faked' if injected else 'not faked'))
+
+    def on_click(self, x, y, button, pressed, injected):
+        print('Mouse {} {} at {}; it was {}'.format(
+            button,
+            'Pressed' if pressed else 'Released',
+            (x, y),
+            'faked' if injected else 'not faked'))
+        if not pressed:
+            # Wait for the right-click menu to appear before capturing
+            if button == mouse.Button.right:
+                time.sleep(1)
+            screenshot, path = get_screenshot(is_cursor=False)
+            self.auto_list.append(
+                {"button": button,
+                 "pressed": pressed,
+                 "position": (x, y),
+                 "path": path,
+                 "image": screenshot
+                 }
+            )
+
+    def on_scroll(self, x, y, dx, dy, injected):
+        print('Scrolled {} at {}; it was {}'.format(
+            'down' if dy < 0 else 'up',
+            (x, y), 'faked' if injected else 'not faked'))
+
+    def on_press(self, key, injected):
+        try:
+            print('alphanumeric key {} pressed; it was {}'.format(
+                key.char, 'faked' if injected else 'not faked'))
+        except AttributeError:
+            print('special key {} pressed'.format(key))
+
+    def on_release(self, key, injected):
+        print('{} released; it was {}'.format(
+            key, 'faked' if injected else 'not faked'))
+        if key == keyboard.Key.esc:
+            print("self.auto_list", self.auto_list)
+            # Run the vision model over each recorded screenshot and crop
+            # out the UI element that was under the recorded click position
+            vision_agent = VisionAgent(
+                yolo_model_path=os.path.join(MODEL_DIR, "icon_detect", "model.pt"),
+                caption_model_path=os.path.join(MODEL_DIR, "icon_caption"))
+            for item in self.auto_list:
+                element_list = vision_agent(str(item["path"]))
+                for element in element_list:
+                    if self.crop_image_if_position_in_coordinates(
+                            item["image"], item["path"], item["position"], element.coordinates):
+                        break
+            # Stop listener
+            return False
+
+    def crop_image_if_position_in_coordinates(self, image, image_path, position, coordinates):
+        """
+        Check if position is within coordinates and crop image if true
+
+        Args:
+            image: PIL Image object
+            image_path: path of the screenshot file on disk
+            position: tuple of (x, y) - current position
+            coordinates: tuple of (x1, y1, x2, y2) - target area
+
+        Returns:
+            bool: True if position is in coordinates
+        """
+        x, y = position
+        x1, y1, x2, y2 = coordinates
+
+        # Check if position is within coordinates
+        if (x1 <= x <= x2) and (y1 <= y <= y2):
+            # Crop the image to the coordinates
+            cropped_image = image.crop(coordinates)
+            # Save the cropped image with proper path and format
+            save_path = str(image_path).replace('.png', '_cropped.png')
+            cropped_image.save(save_path, 'PNG')
+            return True
+
+        return False
+
+if __name__ == "__main__":
+    auto_control = AutoControl()
+    auto_control.start_listen()