picture upload

This commit is contained in:
yuruo
2025-03-26 17:29:34 +08:00
parent 431ea7075c
commit 9a7ae21d1c
7 changed files with 54 additions and 136 deletions

View File

@@ -4,6 +4,7 @@ Conversation manager module for handling dialog flow and states
import json
import time
from PyQt6.QtCore import QObject, QThread, QTimer
from src.core.few_shot_agent import FewShotGenerateAgent
from src.core.input_listener import InputListener
from xbrain.core.chat import run
@@ -95,11 +96,11 @@ class ConversationManager(QObject):
# Initialize status text
status_text = f"Action detected: {action}"
few_shot_agent = FewShotGenerateAgent()
# Format display based on action type
if action["type"] == "mouse":
self.text_buffer = ""
status_text = few_shot_agent(action)
elif action["type"] == "keyboard":
current_time = time.time()
@@ -115,7 +116,8 @@ class ConversationManager(QObject):
elif "key.space" in key_str.lower():
self.text_buffer += " "
elif "key.enter" in key_str.lower() or "return" in key_str.lower():
status_text = f"Keyboard input completed: \"{self.text_buffer}\""
# status_text = f"Keyboard input completed: \"{self.text_buffer}\""
status_text = few_shot_agent(action)
self.update_mini_window_status(status_text)
self.text_buffer = ""
return
@@ -125,6 +127,7 @@ class ConversationManager(QObject):
# Display buffer if timeout occurred
if current_time - self.last_keypress_time > 2.0 and self.text_buffer:
status_text = f"Keyboard input: \"{self.text_buffer}\""
status_text = few_shot_agent(action)
else:
status_text = f"Keyboard action: {action['event']} (current input: \"{self.text_buffer}\")"

View File

@@ -1,10 +1,30 @@
from pdb import run
from typing import Any
from xbrain.core.chat import run
class FewShotGenerateAgent:
    """Callable agent that asks the chat model to comment on one recorded action.

    Calling an instance builds a single user message from the action and
    forwards it to ``run`` together with the module-level ``prompt``.
    """

    def __call__(self, action):
        """Send one action, plus its screenshot when available, to the model.

        Args:
            action: Mapping describing the recorded input event. Its optional
                ``base64_image`` entry is embedded as a PNG data URL
                (presumably the base64 screenshot captured by the input
                listener -- verify against the caller).

        Returns:
            The response from ``run`` prefixed with the "【THINKING】" marker
            line.
        """
        # Work on a copy so the caller's action dict is left untouched, and
        # pull the image out so the (large) base64 payload is not repeated in
        # the text part of the message.
        action_copy = action.copy()
        image_b64 = action_copy.pop('base64_image', None)

        content = [{"type": "text", "text": f"action:\n {action_copy}"}]
        # Attach the screenshot only when there is one: indexing
        # action['base64_image'] directly raised KeyError for actions
        # recorded without an image, even though the pop above tolerated it.
        if image_b64:
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{image_b64}"},
            })

        messages = [{"role": "user", "content": content}]
        response = run(
            messages,
            user_prompt=prompt)
        return "【THINKING】\n" + response
prompt = """Please analyze this sequence of user input actions and create few-shot learning examples.
The recorded actions include mouse clicks, keyboard inputs, and special key presses, along with their timing and UI context.
class FewShotAgent:
def __init__(self):
self.messages = []
Please create structured examples that show:
1. The user's intent and context
2. The sequence of actions needed
3. Important UI elements involved
4. Any timing or order dependencies
def __call__(self, *args: Any, **kwds: Any) -> Any:
pass
Format each example to demonstrate the complete interaction pattern."""

View File

@@ -44,32 +44,32 @@ class InputListener(QObject):
Only emit on release (when pressed is False)
"""
if not pressed:
_, screenshot_path = get_screenshot()
screenshot, _ = get_screenshot(is_base64=True)
self.action_detected.emit({
"type": "mouse",
"event": button.name + " click",
"position": (x, y),
"screenshot_path": str(screenshot_path)
"base64_image": screenshot
})
def on_scroll(self, x, y, dx, dy, injected):
"""Handle mouse scroll events"""
_, screenshot_path = get_screenshot()
screenshot, _ = get_screenshot(is_base64=True)
scroll_direction = 'down' if dy < 0 else 'up'
self.action_detected.emit({
"type": "mouse",
"event": f"scroll {scroll_direction}",
"position": (x, y),
"screenshot_path": str(screenshot_path)
"base64_image": screenshot
})
def on_release(self, key, injected):
"""Handle keyboard release events"""
_, screenshot_path = get_screenshot()
screenshot, _ = get_screenshot(is_base64=True)
self.action_detected.emit({
"type": "keyboard",
"event": str(key),
"screenshot_path": str(screenshot_path)
"base64_image": screenshot
})
def stop_listen(self):

View File

@@ -17,10 +17,10 @@ def main():
config = Config()
base_url = "https://api.openai-next.com/v1"
api_key = "sk-fb4R0ieuTV2OISKX715e7e4a588447F0A6A0AaE6123d16C7"
model = "gpt-4o"
model = "gpt-4o-2024-11-20"
config.set_openai_config(base_url=base_url, api_key=api_key, model=model)
app = QApplication(sys.argv)
app = QApplication(sys .argv)
window = MainWindow()
window.show()
sys.exit(app.exec())

View File

@@ -2,7 +2,8 @@
Input area component for user message entry
"""
from PyQt6.QtWidgets import (QWidget, QTextEdit, QPushButton, QHBoxLayout, QVBoxLayout)
from PyQt6.QtGui import QFont
from PyQt6.QtCore import Qt
class InputArea(QWidget):
"""
@@ -108,8 +109,9 @@ class InputArea(QWidget):
if message:
# Call the callback
self.message_callback(message)
# Clear the input
self.text_edit.clear()
# Clear the input only if there is text
if len(message) > 0:
self.text_edit.clear()
def set_enabled(self, enabled):
"""

View File

@@ -1,6 +1,7 @@
"""
Screenshot utility module for capturing screen content
"""
import base64
from io import BytesIO
import os
from pathlib import Path
@@ -12,7 +13,7 @@ import pyautogui
OUTPUT_DIR = "./tmp/outputs"
def get_screenshot(screen_region=None, is_cursor=True):
def get_screenshot(screen_region=None, is_cursor=True, is_base64=False):
"""
Capture a screenshot with or without cursor
@@ -34,8 +35,6 @@ def get_screenshot(screen_region=None, is_cursor=True):
img_io = BytesIO()
pyautogui_screenshot.save(img_io, 'PNG')
screenshot = Image.open(img_io)
# Apply region mask if specified
if screen_region and len(screen_region) == 4:
black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
@@ -45,8 +44,11 @@ def get_screenshot(screen_region=None, is_cursor=True):
black_mask.paste(region, (x1, y1, x2, y2))
# Use the modified image as screenshot
screenshot = black_mask
screenshot.save(path)
if is_base64:
screenshot = base64.b64encode(img_io.getvalue()).decode('utf-8')
else:
screenshot = Image.open(img_io)
screenshot.save(path)
return screenshot, path

View File

@@ -1,110 +1 @@
[
{
"type": "mouse",
"event": "left click",
"position": [
1184,
1025
],
"screenshot_path": "tmp\\outputs\\screenshot_1d542843e6e745199a36fa367995a7be.png"
},
{
"type": "mouse",
"event": "left click",
"position": [
1188,
711
],
"screenshot_path": "tmp\\outputs\\screenshot_82bc33a76fda43c5b1faec1ff0dffe60.png"
},
{
"type": "mouse",
"event": "left click",
"position": [
1324,
577
],
"screenshot_path": "tmp\\outputs\\screenshot_7f8b51c9937e46e3a6e829e3426c2aab.png"
},
{
"type": "mouse",
"event": "left click",
"position": [
1402,
467
],
"screenshot_path": "tmp\\outputs\\screenshot_5b3e8d35a309483d9979fd1cfd991af1.png"
},
{
"type": "mouse",
"event": "left click",
"position": [
1457,
289
],
"screenshot_path": "tmp\\outputs\\screenshot_44de70ef74234ee082139da58d0512d2.png"
},
{
"type": "mouse",
"event": "left click",
"position": [
1444,
396
],
"screenshot_path": "tmp\\outputs\\screenshot_8364d28720c54f6cb4abf34c0b16ebc1.png"
},
{
"type": "mouse",
"event": "left click",
"position": [
1201,
385
],
"screenshot_path": "tmp\\outputs\\screenshot_b67f8a493fc144ceb656c8aad3d368b0.png"
},
{
"type": "mouse",
"event": "left click",
"position": [
1052,
344
],
"screenshot_path": "tmp\\outputs\\screenshot_c657989d97d94e54b5173f911eeacf29.png"
},
{
"type": "mouse",
"event": "right click",
"position": [
1007,
345
],
"screenshot_path": "tmp\\outputs\\screenshot_d80d3f85d51f41cc9ae4bf573a14106d.png"
},
{
"type": "mouse",
"event": "left click",
"position": [
979,
453
],
"screenshot_path": "tmp\\outputs\\screenshot_458eca72c66f4fb8bb63a2b61897c209.png"
},
{
"type": "mouse",
"event": "left click",
"position": [
1137,
570
],
"screenshot_path": "tmp\\outputs\\screenshot_bd5a272513864f6f82b664ecf63084ac.png"
},
{
"type": "mouse",
"event": "left click",
"position": [
1947,
1250
],
"screenshot_path": "tmp\\outputs\\screenshot_db51b11fe77b4819a34876117fbc85b3.png"
}
]
[]