When the interface is refreshed, repopulate the inputs with the last saved data. Add a recording feature.

yuruo 2025-03-16 17:31:36 +08:00
parent d9044e0a87
commit b778dcc87d
4 changed files with 153 additions and 12 deletions

View File

@@ -63,11 +63,14 @@ def setup_state(state):
        state["only_n_most_recent_images"] = 2
    if 'stop' not in state:
        state['stop'] = False

async def main(state):
    """Render loop for Gradio"""
    setup_state(state)
    return "Setup completed"
    # update state
    return (
        state["model"],  # model textbox
        state["base_url"],  # base_url textbox
        state["api_key"],  # api_key textbox
        state["chatbox_messages"],  # chatbot
        [[task["status"], task["task"]] for task in state["tasks"]]  # task_list
    )

def load_from_storage(filename: str) -> str | None:
    """Load data from a file in the storage directory."""
@@ -324,5 +327,9 @@ def run():
    stop_button.click(stop_app, [state], None)
    base_url.change(fn=update_base_url, inputs=[base_url, state], outputs=None)
    demo.launch(server_name="0.0.0.0", server_port=7888)
    demo.load(
        setup_state,
        inputs=[state],
        outputs=[model, base_url, api_key, chatbot, task_list]
    )
    demo.launch(server_name="0.0.0.0", server_port=7888)
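For context, here is a minimal, self-contained sketch of the refresh-restore pattern wired up above: demo.load runs on every page load, so it can push the last saved values back into the input components instead of letting them reset. The settings path and load_settings helper below are illustrative stand-ins, not the project's actual setup_state/load_from_storage code.

import json
from pathlib import Path

import gradio as gr

SETTINGS_FILE = Path("./tmp/settings.json")  # illustrative location, not the project's real storage file

def load_settings():
    # Return the last saved values, or empty strings if nothing was saved yet.
    saved = json.loads(SETTINGS_FILE.read_text()) if SETTINGS_FILE.exists() else {}
    return saved.get("model", ""), saved.get("base_url", ""), saved.get("api_key", "")

with gr.Blocks() as demo:
    model = gr.Textbox(label="model")
    base_url = gr.Textbox(label="base_url")
    api_key = gr.Textbox(label="api_key")
    # demo.load fires on every page load/refresh and repopulates the
    # textboxes with whatever was saved last instead of blank defaults.
    demo.load(load_settings, inputs=None, outputs=[model, base_url, api_key])

demo.launch()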

View File

@@ -1,29 +1,36 @@
from io import BytesIO
from pathlib import Path
from uuid import uuid4

from PIL import Image
import pyautogui

from .base import ToolError
from util import tool

OUTPUT_DIR = "./tmp/outputs"

def get_screenshot(screen_region):
def get_screenshot(screen_region=None, is_cursor=True):
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    path = output_dir / f"screenshot_{uuid4().hex}.png"
    try:
        img_io = tool.capture_screen_with_cursor()
        if is_cursor:
            img_io = tool.capture_screen_with_cursor()
        else:
            pyautogui_screenshot = pyautogui.screenshot()
            img_io = BytesIO()
            pyautogui_screenshot.save(img_io, 'PNG')
        screenshot = Image.open(img_io)
        # Create a black mask of the same size
        black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
        # If screen_region is provided and valid, copy only that region
        if screen_region and len(screen_region) == 4:
            black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
            x1, y1, x2, y2 = screen_region
            region = screenshot.crop((x1, y1, x2, y2))
            # Paste the region onto the black mask
            black_mask.paste(region, (x1, y1, x2, y2))
            # Use the modified image as screenshot
            screenshot = black_mask
        screenshot.save(path)
        return screenshot, path
    except Exception as e:
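For reference, a hedged usage sketch of the updated helper; the import path matches the one used in util/auto_control.py below, while the region coordinates are made-up values.

from gradio_ui.tools.screen_capture import get_screenshot

# Full-screen capture with the cursor overlaid (the previous default behaviour).
screenshot, path = get_screenshot()

# Capture without the cursor, keeping only an assumed (100, 100, 800, 600)
# region visible; everything outside it is masked to black.
screenshot, path = get_screenshot(screen_region=(100, 100, 800, 600), is_cursor=False)
print(f"masked screenshot saved to {path}")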

View File

@@ -11,4 +11,5 @@ anthropic[bedrock,vertex]>=0.37.1
pyxbrain==1.1.31
timm
einops==0.8.0
modelscope
modelscope
pynput

util/auto_control.py — new file (126 lines)
View File

@@ -0,0 +1,126 @@
import sys
import os
import time
# Add the project root directory to Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gradio_ui.agent.vision_agent import VisionAgent
from util.download_weights import MODEL_DIR
from pynput import mouse, keyboard
# Now you can import from gradio_ui
from gradio_ui.tools.screen_capture import get_screenshot
class AutoControl:
    def __init__(self):
        self.auto_list = []

    def start_listen(self):
        # Create both mouse and keyboard listeners
        mouse_listener = mouse.Listener(
            on_move=self.on_move,
            on_click=self.on_click,
            on_scroll=self.on_scroll)
        keyboard_listener = keyboard.Listener(
            on_press=self.on_press,
            on_release=self.on_release)
        # Start both listeners
        mouse_listener.start()
        keyboard_listener.start()
        # Keep the program running until keyboard listener stops
        keyboard_listener.join()
        # After keyboard stops (ESC pressed), stop mouse listener too
        mouse_listener.stop()
    def on_move(self, x, y, injected):
        print('Pointer moved to {}; it was {}'.format(
            (x, y), 'faked' if injected else 'not faked'))

    def on_click(self, x, y, button, pressed, injected):
        print('Mouse {} {} at {}; it was {}'.format(
            button,
            'Pressed' if pressed else 'Released',
            (x, y),
            'faked' if injected else 'not faked'))
        if not pressed:
            # Wait for the right-click menu to render before capturing the screen
            if button == mouse.Button.right:
                time.sleep(1)
            # Record the release: screenshot (without cursor), position and button
            screenshot, path = get_screenshot(is_cursor=False)
            self.auto_list.append(
                {"button": button,
                 "pressed": pressed,
                 "position": (x, y),
                 "path": path,
                 "image": screenshot
                 }
            )
    def on_scroll(self, x, y, dx, dy, injected):
        print('Scrolled {} at {}; it was {}'.format(
            'down' if dy < 0 else 'up',
            (x, y), 'faked' if injected else 'not faked'))

    def on_press(self, key, injected):
        try:
            print('alphanumeric key {} pressed; it was {}'.format(
                key.char, 'faked' if injected else 'not faked'))
        except AttributeError:
            print('special key {} pressed'.format(key))

    def on_release(self, key, injected):
        print('{} released; it was {}'.format(
            key, 'faked' if injected else 'not faked'))
        if key == keyboard.Key.esc:
            print("self.auto_list", self.auto_list)
            # Run the vision model over each recorded screenshot and crop the
            # element that was clicked
            vision_agent = VisionAgent(
                yolo_model_path=os.path.join(MODEL_DIR, "icon_detect", "model.pt"),
                caption_model_path=os.path.join(MODEL_DIR, "icon_caption"))
            for item in self.auto_list:
                element_list = vision_agent(str(item["path"]))
                for element in element_list:
                    if self.crop_image_if_position_in_coordinates(
                            item["image"], item["path"], item["position"], element.coordinates):
                        break
            # Stop listener
            return False
    def crop_image_if_position_in_coordinates(self, image, image_path, position, coordinates):
        """
        Check whether position falls inside coordinates and crop the image if it does.

        Args:
            image: PIL Image object
            image_path: path of the saved screenshot, used to derive the cropped file name
            position: tuple of (x, y) - current position
            coordinates: tuple of (x1, y1, x2, y2) - target area

        Returns:
            bool: True if position is in coordinates
        """
        x, y = position
        x1, y1, x2, y2 = coordinates
        # Check if position is within coordinates
        if (x1 <= x <= x2) and (y1 <= y <= y2):
            # Crop the image to the coordinates
            cropped_image = image.crop(coordinates)
            # Save the cropped image with proper path and format
            save_path = str(image_path).replace('.png', '_cropped.png')
            cropped_image.save(save_path, 'PNG')
            return True
        return False
if __name__ == "__main__":
    auto_control = AutoControl()
    auto_control.start_listen()