Picture upload: send screenshots to the few-shot agent as base64 images

2026-03-22 13:07:17 +08:00 · 2025-03-26 17:29:34 +08:00
parent 431ea7075c
commit 9a7ae21d1c
7 changed files with 54 additions and 136 deletions
--- a/src/core/conversation_manager.py
+++ b/src/core/conversation_manager.py
@@ -4,6 +4,7 @@ Conversation manager module for handling dialog flow and states
 import json
 import time
 from PyQt6.QtCore import QObject, QThread, QTimer
+from src.core.few_shot_agent import FewShotGenerateAgent
 from src.core.input_listener import InputListener
 from xbrain.core.chat import run

@@ -95,11 +96,11 @@ class ConversationManager(QObject):
        
        # Initialize status text
        status_text = f"Action detected: {action}"
-        
+        few_shot_agent = FewShotGenerateAgent()
        # Format display based on action type
        if action["type"] == "mouse":
-           
            self.text_buffer = ""
+            status_text = few_shot_agent(action)

        elif action["type"] == "keyboard":
            current_time = time.time()
@@ -115,7 +116,8 @@ class ConversationManager(QObject):
            elif "key.space" in key_str.lower():
                self.text_buffer += " "
            elif "key.enter" in key_str.lower() or "return" in key_str.lower():
-                status_text = f"Keyboard input completed: \"{self.text_buffer}\""
+                # status_text = f"Keyboard input completed: \"{self.text_buffer}\""
+                status_text = few_shot_agent(action)
                self.update_mini_window_status(status_text)
                self.text_buffer = ""
                return
@@ -125,6 +127,7 @@ class ConversationManager(QObject):
            # Display buffer if timeout occurred
            if current_time - self.last_keypress_time > 2.0 and self.text_buffer:
                status_text = f"Keyboard input: \"{self.text_buffer}\""
+                status_text = few_shot_agent(action)
            else:
                status_text = f"Keyboard action: {action['event']} (current input: \"{self.text_buffer}\")"
            
--- a/src/core/few_shot_agent.py
+++ b/src/core/few_shot_agent.py
@@ -1,10 +1,30 @@
-from pdb import run
-from typing import Any
+from xbrain.core.chat import run
+class FewShotGenerateAgent:
+    def __call__(self, action):
+        # Build one user message that pairs the action description with its screenshot.
+        # The base64 image is removed from the text part and sent separately as an image_url.
+        action_copy = action.copy()
+        action_copy.pop('base64_image', None)
+        messages = [
+            {"role": "user", "content": [
+            {"type": "text", "text": f"action:\n {action_copy}"},
+            {
+                "type": "image_url", 
+                "image_url": {"url": f"data:image/png;base64,{action['base64_image']}"}
+            }]}
+        ]
+        response = run(
+            messages,
+            user_prompt=prompt)
+        return "【THINKING】\n" + response

+prompt = """Please analyze this sequence of user input actions and create few-shot learning examples.
+The recorded actions include mouse clicks, keyboard inputs, and special key presses, along with their timing and UI context.

-class FewShotAgent:
-    def __init__(self):
-        self.messages = []
+Please create structured examples that show:
+1. The user's intent and context
+2. The sequence of actions needed
+3. Important UI elements involved
+4. Any timing or order dependencies

-    def __call__(self, *args: Any, **kwds: Any) -> Any:
-          pass
+Format each example to demonstrate the complete interaction pattern."""
--- a/src/core/input_listener.py
+++ b/src/core/input_listener.py
@@ -44,32 +44,32 @@ class InputListener(QObject):
        Only emit on release (when pressed is False)
        """
        if not pressed:
-            _, screenshot_path = get_screenshot()
+            screenshot, _ = get_screenshot(is_base64=True)
            self.action_detected.emit({
                "type": "mouse",
                "event": button.name + " click",
                "position": (x, y),
-                "screenshot_path": str(screenshot_path)
+                "base64_image": screenshot
            })

    def on_scroll(self, x, y, dx, dy, injected):
        """Handle mouse scroll events"""
-        _, screenshot_path = get_screenshot()
+        screenshot, _ = get_screenshot(is_base64=True)
        scroll_direction = 'down' if dy < 0 else 'up'
        self.action_detected.emit({
            "type": "mouse",
            "event": f"scroll {scroll_direction}",
            "position": (x, y),
-            "screenshot_path": str(screenshot_path)
+            "base64_image": screenshot
        })

    def on_release(self, key, injected):
        """Handle keyboard release events"""
-        _, screenshot_path = get_screenshot()
+        screenshot, _ = get_screenshot(is_base64=True)
        self.action_detected.emit({
            "type": "keyboard",
            "event": str(key),
-            "screenshot_path": str(screenshot_path)
+            "base64_image": screenshot
        })

    def stop_listen(self):
--- a/src/main.py
+++ b/src/main.py
@@ -17,10 +17,10 @@ def main():
    config = Config()
    base_url = "https://api.openai-next.com/v1"
    api_key = "sk-fb4R0ieuTV2OISKX715e7e4a588447F0A6A0AaE6123d16C7"
-    model = "gpt-4o"
+    model = "gpt-4o-2024-11-20"
    config.set_openai_config(base_url=base_url, api_key=api_key, model=model)

-    app = QApplication(sys.argv)
+    app = QApplication(sys .argv)
    window = MainWindow()
    window.show()
    sys.exit(app.exec())
--- a/src/ui/input_area.py
+++ b/src/ui/input_area.py
@@ -2,7 +2,8 @@
 Input area component for user message entry
 """
 from PyQt6.QtWidgets import (QWidget, QTextEdit, QPushButton, QHBoxLayout, QVBoxLayout)
-
+from PyQt6.QtGui import QFont
+from PyQt6.QtCore import Qt

 class InputArea(QWidget):
    """
@@ -108,8 +109,9 @@ class InputArea(QWidget):
        if message:
            # Call the callback
            self.message_callback(message)
-            # Clear the input
-            self.text_edit.clear()
+            # Clear the input only if there is text
+            if len(message) > 0:
+                self.text_edit.clear()
    
    def set_enabled(self, enabled):
        """
--- a/src/utils/screenshot.py
+++ b/src/utils/screenshot.py
@@ -1,6 +1,7 @@
 """
 Screenshot utility module for capturing screen content
 """
+import base64
 from io import BytesIO
 import os
 from pathlib import Path
@@ -12,7 +13,7 @@ import pyautogui
 OUTPUT_DIR = "./tmp/outputs"


-def get_screenshot(screen_region=None, is_cursor=True):
+def get_screenshot(screen_region=None, is_cursor=True, is_base64=False):
    """
    Capture a screenshot with or without cursor
    
@@ -34,8 +35,6 @@ def get_screenshot(screen_region=None, is_cursor=True):
        img_io = BytesIO()
        pyautogui_screenshot.save(img_io, 'PNG')
    
-    screenshot = Image.open(img_io)
-    
    # Apply region mask if specified
    if screen_region and len(screen_region) == 4:
        black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255))
@@ -45,8 +44,11 @@ def get_screenshot(screen_region=None, is_cursor=True):
        black_mask.paste(region, (x1, y1, x2, y2))
        # Use the modified image as screenshot
        screenshot = black_mask
-    
-    screenshot.save(path)
+    if is_base64:
+        screenshot = base64.b64encode(img_io.getvalue()).decode('utf-8')
+    else:
+        screenshot = Image.open(img_io)
+        screenshot.save(path)
    return screenshot, path


--- a/task_demonstration.json
+++ b/task_demonstration.json
@@ -1,110 +1 @@
-[
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      1184,
-      1025
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_1d542843e6e745199a36fa367995a7be.png"
-  },
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      1188,
-      711
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_82bc33a76fda43c5b1faec1ff0dffe60.png"
-  },
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      1324,
-      577
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_7f8b51c9937e46e3a6e829e3426c2aab.png"
-  },
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      1402,
-      467
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_5b3e8d35a309483d9979fd1cfd991af1.png"
-  },
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      1457,
-      289
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_44de70ef74234ee082139da58d0512d2.png"
-  },
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      1444,
-      396
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_8364d28720c54f6cb4abf34c0b16ebc1.png"
-  },
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      1201,
-      385
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_b67f8a493fc144ceb656c8aad3d368b0.png"
-  },
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      1052,
-      344
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_c657989d97d94e54b5173f911eeacf29.png"
-  },
-  {
-    "type": "mouse",
-    "event": "right click",
-    "position": [
-      1007,
-      345
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_d80d3f85d51f41cc9ae4bf573a14106d.png"
-  },
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      979,
-      453
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_458eca72c66f4fb8bb63a2b61897c209.png"
-  },
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      1137,
-      570
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_bd5a272513864f6f82b664ecf63084ac.png"
-  },
-  {
-    "type": "mouse",
-    "event": "left click",
-    "position": [
-      1947,
-      1250
-    ],
-    "screenshot_path": "tmp\\outputs\\screenshot_db51b11fe77b4819a34876117fbc85b3.png"
-  }
-]
+[]