mirror of
https://github.com/yuruotong1/autoMate.git
synced 2026-03-22 13:07:17 +08:00
picture upload
This commit is contained in:
@@ -4,6 +4,7 @@ Conversation manager module for handling dialog flow and states
|
||||
import json
|
||||
import time
|
||||
from PyQt6.QtCore import QObject, QThread, QTimer
|
||||
from src.core.few_shot_agent import FewShotGenerateAgent
|
||||
from src.core.input_listener import InputListener
|
||||
from xbrain.core.chat import run
|
||||
|
||||
@@ -95,11 +96,11 @@ class ConversationManager(QObject):
|
||||
|
||||
# Initialize status text
|
||||
status_text = f"Action detected: {action}"
|
||||
|
||||
few_shot_agent = FewShotGenerateAgent()
|
||||
# Format display based on action type
|
||||
if action["type"] == "mouse":
|
||||
|
||||
self.text_buffer = ""
|
||||
status_text = few_shot_agent(action)
|
||||
|
||||
elif action["type"] == "keyboard":
|
||||
current_time = time.time()
|
||||
@@ -115,7 +116,8 @@ class ConversationManager(QObject):
|
||||
elif "key.space" in key_str.lower():
|
||||
self.text_buffer += " "
|
||||
elif "key.enter" in key_str.lower() or "return" in key_str.lower():
|
||||
status_text = f"Keyboard input completed: \"{self.text_buffer}\""
|
||||
# status_text = f"Keyboard input completed: \"{self.text_buffer}\""
|
||||
status_text = few_shot_agent(action)
|
||||
self.update_mini_window_status(status_text)
|
||||
self.text_buffer = ""
|
||||
return
|
||||
@@ -125,6 +127,7 @@ class ConversationManager(QObject):
|
||||
# Display buffer if timeout occurred
|
||||
if current_time - self.last_keypress_time > 2.0 and self.text_buffer:
|
||||
status_text = f"Keyboard input: \"{self.text_buffer}\""
|
||||
status_text = few_shot_agent(action)
|
||||
else:
|
||||
status_text = f"Keyboard action: {action['event']} (current input: \"{self.text_buffer}\")"
|
||||
|
||||
|
||||
@@ -1,10 +1,30 @@
|
||||
from pdb import run
|
||||
from typing import Any
|
||||
from xbrain.core.chat import run
|
||||
class FewShotGenerateAgent:
|
||||
def __call__(self, action):
|
||||
# Create content list with text-image pairs for each action
|
||||
# Create action message without base64 image
|
||||
action_copy = action.copy()
|
||||
action_copy.pop('base64_image', None)
|
||||
messages = [
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": f"action:\n {action_copy}"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/png;base64,{action['base64_image']}"}
|
||||
}]}
|
||||
]
|
||||
response = run(
|
||||
messages,
|
||||
user_prompt=prompt)
|
||||
return "【THINKING】\n" + response
|
||||
|
||||
prompt = """Please analyze this sequence of user input actions and create few-shot learning examples.
|
||||
The recorded actions include mouse clicks, keyboard inputs, and special key presses, along with their timing and UI context.
|
||||
|
||||
class FewShotAgent:
|
||||
def __init__(self):
|
||||
self.messages = []
|
||||
Please create structured examples that show:
|
||||
1. The user's intent and context
|
||||
2. The sequence of actions needed
|
||||
3. Important UI elements involved
|
||||
4. Any timing or order dependencies
|
||||
|
||||
def __call__(self, *args: Any, **kwds: Any) -> Any:
|
||||
pass
|
||||
Format each example to demonstrate the complete interaction pattern."""
|
||||
|
||||
@@ -44,32 +44,32 @@ class InputListener(QObject):
|
||||
Only emit on release (when pressed is False)
|
||||
"""
|
||||
if not pressed:
|
||||
_, screenshot_path = get_screenshot()
|
||||
screenshot, _ = get_screenshot(is_base64=True)
|
||||
self.action_detected.emit({
|
||||
"type": "mouse",
|
||||
"event": button.name + " click",
|
||||
"position": (x, y),
|
||||
"screenshot_path": str(screenshot_path)
|
||||
"base64_image": screenshot
|
||||
})
|
||||
|
||||
def on_scroll(self, x, y, dx, dy, injected):
|
||||
"""Handle mouse scroll events"""
|
||||
_, screenshot_path = get_screenshot()
|
||||
screenshot, _ = get_screenshot(is_base64=True)
|
||||
scroll_direction = 'down' if dy < 0 else 'up'
|
||||
self.action_detected.emit({
|
||||
"type": "mouse",
|
||||
"event": f"scroll {scroll_direction}",
|
||||
"position": (x, y),
|
||||
"screenshot_path": str(screenshot_path)
|
||||
"base64_image": screenshot
|
||||
})
|
||||
|
||||
def on_release(self, key, injected):
|
||||
"""Handle keyboard release events"""
|
||||
_, screenshot_path = get_screenshot()
|
||||
screenshot, _ = get_screenshot(is_base64=True)
|
||||
self.action_detected.emit({
|
||||
"type": "keyboard",
|
||||
"event": str(key),
|
||||
"screenshot_path": str(screenshot_path)
|
||||
"base64_image": screenshot
|
||||
})
|
||||
|
||||
def stop_listen(self):
|
||||
|
||||
@@ -17,10 +17,10 @@ def main():
|
||||
config = Config()
|
||||
base_url = "https://api.openai-next.com/v1"
|
||||
api_key = "sk-fb4R0ieuTV2OISKX715e7e4a588447F0A6A0AaE6123d16C7"
|
||||
model = "gpt-4o"
|
||||
model = "gpt-4o-2024-11-20"
|
||||
config.set_openai_config(base_url=base_url, api_key=api_key, model=model)
|
||||
|
||||
app = QApplication(sys.argv)
|
||||
app = QApplication(sys .argv)
|
||||
window = MainWindow()
|
||||
window.show()
|
||||
sys.exit(app.exec())
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
Input area component for user message entry
|
||||
"""
|
||||
from PyQt6.QtWidgets import (QWidget, QTextEdit, QPushButton, QHBoxLayout, QVBoxLayout)
|
||||
|
||||
from PyQt6.QtGui import QFont
|
||||
from PyQt6.QtCore import Qt
|
||||
|
||||
class InputArea(QWidget):
|
||||
"""
|
||||
@@ -108,8 +109,9 @@ class InputArea(QWidget):
|
||||
if message:
|
||||
# Call the callback
|
||||
self.message_callback(message)
|
||||
# Clear the input
|
||||
self.text_edit.clear()
|
||||
# Clear the input only if there is text
|
||||
if len(message) > 0:
|
||||
self.text_edit.clear()
|
||||
|
||||
def set_enabled(self, enabled):
|
||||
"""
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""
|
||||
Screenshot utility module for capturing screen content
|
||||
"""
|
||||
import base64
|
||||
from io import BytesIO
|
||||
import os
|
||||
from pathlib import Path
|
||||
@@ -12,7 +13,7 @@ import pyautogui
|
||||
OUTPUT_DIR = "./tmp/outputs"
|
||||
|
||||
|
||||
def get_screenshot(screen_region=None, is_cursor=True):
|
||||
def get_screenshot(screen_region=None, is_cursor=True, is_base64=False):
|
||||
"""
|
||||
Capture a screenshot with or without cursor
|
||||
|
||||
@@ -34,8 +35,6 @@ def get_screenshot(screen_region=None, is_cursor=True):
|
||||
img_io = BytesIO()
|
||||
pyautogui_screenshot.save(img_io, 'PNG')
|
||||
|
||||
screenshot = Image.open(img_io)
|
||||
|
||||
# Apply region mask if specified
|
||||
if screen_region and len(screen_region) == 4:
|
||||
black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
|
||||
@@ -45,8 +44,11 @@ def get_screenshot(screen_region=None, is_cursor=True):
|
||||
black_mask.paste(region, (x1, y1, x2, y2))
|
||||
# Use the modified image as screenshot
|
||||
screenshot = black_mask
|
||||
|
||||
screenshot.save(path)
|
||||
if is_base64:
|
||||
screenshot = base64.b64encode(img_io.getvalue()).decode('utf-8')
|
||||
else:
|
||||
screenshot = Image.open(img_io)
|
||||
screenshot.save(path)
|
||||
return screenshot, path
|
||||
|
||||
|
||||
|
||||
@@ -1,110 +1 @@
|
||||
[
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
1184,
|
||||
1025
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_1d542843e6e745199a36fa367995a7be.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
1188,
|
||||
711
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_82bc33a76fda43c5b1faec1ff0dffe60.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
1324,
|
||||
577
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_7f8b51c9937e46e3a6e829e3426c2aab.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
1402,
|
||||
467
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_5b3e8d35a309483d9979fd1cfd991af1.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
1457,
|
||||
289
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_44de70ef74234ee082139da58d0512d2.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
1444,
|
||||
396
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_8364d28720c54f6cb4abf34c0b16ebc1.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
1201,
|
||||
385
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_b67f8a493fc144ceb656c8aad3d368b0.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
1052,
|
||||
344
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_c657989d97d94e54b5173f911eeacf29.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "right click",
|
||||
"position": [
|
||||
1007,
|
||||
345
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_d80d3f85d51f41cc9ae4bf573a14106d.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
979,
|
||||
453
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_458eca72c66f4fb8bb63a2b61897c209.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
1137,
|
||||
570
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_bd5a272513864f6f82b664ecf63084ac.png"
|
||||
},
|
||||
{
|
||||
"type": "mouse",
|
||||
"event": "left click",
|
||||
"position": [
|
||||
1947,
|
||||
1250
|
||||
],
|
||||
"screenshot_path": "tmp\\outputs\\screenshot_db51b11fe77b4819a34876117fbc85b3.png"
|
||||
}
|
||||
]
|
||||
[]
|
||||
Reference in New Issue
Block a user