update few_shot

2026-03-22 13:07:17 +08:00 · 2025-03-27 15:43:46 +08:00
parent b70842376f
commit fd0f6fc722
4 changed files with 103 additions and 213 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@ weights**
 .venv
 tmp**
 build**
-dist**
+dist**
+task_demonstration.json
--- a/src/core/conversation_manager.py
+++ b/src/core/conversation_manager.py
@@ -8,25 +8,9 @@ from src.core.few_shot_agent import FewShotGenerateAgent
 from src.core.input_listener import InputListener
 from xbrain.core.chat import run
 import multiprocessing
-from multiprocessing import Process, Queue
+from multiprocessing import Process, Queue, Manager
+

-def run_agent_analysis_process(action_data):
-    """
-    独立的进程函数，运行在单独的进程中
-    """
-    try:
-        from src.core.few_shot_agent import FewShotGenerateAgent
-        agent = FewShotGenerateAgent()
-        result = agent(action_data)
-        return {
-            'step_number': action_data['step_number'],
-            'analysis': result
-        }
-    except Exception as e:
-        return {
-            'step_number': action_data['step_number'],
-            'analysis': f"Error analyzing step: {str(e)}"
-        }

 class ConversationManager(QObject):
    """
@@ -56,6 +40,11 @@ class ConversationManager(QObject):
        self.analysis_results = {}  # Store analysis results by step number
        self.pool = multiprocessing.Pool(processes=1)  # 使用进程池
        
+        # 使用Manager创建共享字典
+        self.manager = Manager()
+        self.analysis_results = self.manager.dict()
+        self.pool = multiprocessing.Pool(processes=1)
+        self.user_instruction = ""
        # Start the conversation
        self.start_conversation()
    
@@ -82,8 +71,6 @@ class ConversationManager(QObject):
            self.handle_greeting_response(message)
        elif self.conversation_state == "ask_for_demo":
            self.handle_demo_request(message)
-        elif self.conversation_state == "task_demonstration" and self.is_recording:
-            self.handle_task_demonstration(message)
        elif self.conversation_state == "ready":
            self.handle_ready_state(message)
    
@@ -92,6 +79,7 @@ class ConversationManager(QObject):
        response = "Nice to meet you! I heard you want to demonstrate a task for me, " + \
                  "so I can learn and help you with similar tasks in the future. When would you like to start?"
        self.chat_area.add_message("Xiao Hong", response)
+        self.user_instruction = message
        self.conversation_state = "ask_for_demo"
    
    def handle_demo_request(self, message):
@@ -117,7 +105,8 @@ class ConversationManager(QObject):
        action_data = {
            'type': action['type'],
            'event': str(action['event']),
-            'step_number': self.step_counter
+            'step_number': self.step_counter,
+            'base64_image': action['base64_image']
        }
        
        if action['type'] == 'keyboard' and self.text_buffer:
@@ -125,7 +114,7 @@ class ConversationManager(QObject):
            
        # 记录动作
        action['step_number'] = self.step_counter
-        self.task_demonstration.append(action)
+        self.task_demonstration.append(action_data)
        
        # 状态文本
        status_text = f"Step {self.step_counter}: "
@@ -165,31 +154,10 @@ class ConversationManager(QObject):
                status_text += f"Keyboard action: {action['event']} (current input: \"{self.text_buffer}\")"
            
            self.last_keypress_time = current_time
-        
-        # 异步提交分析任务
-        self.pool.apply_async(
-            run_agent_analysis_process, 
-            args=(action_data,),
-            callback=self._handle_analysis_result
-        )
-        
        # 更新状态显示
        self.update_mini_window_status(status_text)
    
-    def _handle_analysis_result(self, result):
-        """处理分析结果的回调函数"""
-        if result and 'step_number' in result:
-            self.analysis_results[result['step_number']] = result.get('analysis', '')

-    def _get_combined_results(self):
-        """Combine all available results in step order"""
-        if not self.analysis_results:
-            return None
-            
-        combined_text = "Analysis summary:\n"
-        for step in sorted(self.analysis_results.keys()):
-            combined_text += f"Step {step}: {self.analysis_results[step]}\n"
-        return combined_text

    def update_mini_window_status(self, text):
        """
@@ -226,10 +194,7 @@ class ConversationManager(QObject):
    
    def finish_demonstration(self):
        """Complete the demonstration recording process"""
-        # 关闭进程池并等待所有任务完成
-        self.pool.close()
-        self.pool.join()
-        
+        # Stop input capture first; the process pool is closed once the analysis has finished.
        # Clean up
        self.keyboard_mouse_listen.stop_listen()
        
@@ -243,41 +208,33 @@ class ConversationManager(QObject):
        self.is_recording = False
        self.save_task_demonstration()
        
-        # 显示分析结果
-        combined_results = self._get_combined_results()
-        if combined_results:
-            self.chat_area.add_message("System", "Task Analysis Summary:")
-            self.chat_area.add_message("System", combined_results)
-        
-        # 重新初始化进程池
+        # 显示学习中的消息
+        self.chat_area.add_message("System", "Learning in progress, please wait...")
+        # Create process pool for few shot agent
        self.pool = multiprocessing.Pool(processes=1)
        
-        # Show summary
-        response = f"I've successfully learned this task! Recorded and analyzed {self.step_counter} steps. " + \
-                  "Feel free to assign similar tasks to me in the future. 😊"
-        self.chat_area.add_message("Xiao Hong", response)
-        self.step_counter = 0  # Reset step counter
-        self.conversation_state = "ready"
+        # Call few shot agent asynchronously
+        agent = FewShotGenerateAgent()
+        # Get user instruction from main window
+        result = self.pool.apply_async(agent, args=(self.task_demonstration, self.user_instruction))
+        
+        try:
+            # Get result with timeout
+            response = result.get(timeout=999)
+            # Display response from agent
+            self.chat_area.add_message("Xiao Hong", "I've analyzed your demonstration. Here's what I learned:\n" + response)
+            
+        except TimeoutError:
+            self.chat_area.add_message("System", "Analysis timed out. Please try again.")
+        except Exception as e:
+            self.chat_area.add_message("System", f"Error during analysis: {str(e)}")
+        finally:
+            # Clean up pool
+            self.pool.close()
+            self.pool.join()
+       
+    
    
-    def handle_task_demonstration(self, message):
-        """
-        Handle messages during task demonstration
-        
-        Args:
-            message: User message
-        """
-        self.task_demonstration.append(message)
-        
-        if any(keyword in message.lower() for keyword in ["done", "finish", "completed", "complete"]):
-            self.is_recording = False
-            self.save_task_demonstration()
-            response = "I've learned this task! Thank you for the demonstration. " + \
-                      "You can now assign similar tasks to me in the future. 😊"
-            self.chat_area.add_message("Xiao Hong", response)
-            self.conversation_state = "ready"
-        else:
-            response = "I'm still learning... Please continue your demonstration."
-            self.chat_area.add_message("Xiao Hong", response)
    
    def handle_ready_state(self, message):
        """
@@ -302,4 +259,6 @@ class ConversationManager(QObject):
        """析构函数，确保进程池正确关闭"""
        if hasattr(self, 'pool'):
            self.pool.close()
-            self.pool.join() 
+            self.pool.join()
+        if hasattr(self, 'manager'):
+            self.manager.shutdown() 
--- a/src/core/few_shot_agent.py
+++ b/src/core/few_shot_agent.py
@@ -1,30 +1,72 @@
 from xbrain.core.chat import run
 class FewShotGenerateAgent:
-    def __call__(self, action):
+    def __call__(self, action_list, user_instruction):
         # Create content list with text-image pairs for each action
         # Create action message without base64 image
-        action_copy = action.copy()
-        action_copy.pop('base64_image', None)
-        messages = [
-            {"role": "user", "content": [
-            {"type": "text", "text": f"action:\n {action_copy}"},
-            {
-                "type": "image_url", 
-                "image_url": {"url": f"data:image/png;base64,{action['base64_image']}"}
-            }]}
-        ]
+        """Build one user message from the demonstration and return run()'s response.
+
+        action_list: dicts that each carry a 'base64_image' screenshot; left unmodified.
+        user_instruction: text placed ahead of the action descriptions.
+        Returns whatever run() returns for the single message built here.
+        """
+        # Build new dicts instead of pop(): the image loop below still needs the key.
+        action_texts = [str({k: v for k, v in action.items() if k != 'base64_image'})
+                        for action in action_list]
+        text = "用户的指令是" + user_instruction + "\n\n 用户的动作序列是：\n" + "\n".join(action_texts)
+        messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+        # One image part per action, appended in list order after the text part.
+        for action in action_list:
+            image_url = {"url": f"data:image/png;base64,{action['base64_image']}"}
+            messages[0]["content"].append({"type": "image_url", "image_url": image_url})
         response = run(
             messages,
             user_prompt=prompt)
-        return "【THINKING】\n" + response
+        return response

-prompt = """Please analyze this sequence of user input actions and create few-shot learning examples.
-The recorded actions include mouse clicks, keyboard inputs, and special key presses, along with their timing and UI context.
+prompt = """
+角色： 你的角色是分析用户界面交互、并为用于任务自动化的多模态大模型生成few-shot案例的专家。

-Please create structured examples that show:
-1. The user's intent and context
-2. The sequence of actions needed
-3. Important UI elements involved
-4. Any timing or order dependencies
+背景： 我正在开发一个能够理解视觉UI元素并给出自动化步骤多模态推理的智能体。为了训练或调整（condition）这个智能体，我需要将记录下来的用户交互序列转换为清晰、结构化的few-shot示例。

-Format each example to demonstrate the complete interaction pattern."""
+目标： 根据提供的用户指令、动作序列（包括事件类型、步骤编号和相应截图），生成一个简洁准确的few-shot示例。这个示例应清晰地将用户的高级指令和视觉上下文映射到执行的低级动作，使其适用于智能体的学习上下文。
+
+你将收到的输入：
+
+[{
+'type':动作类型（例如 'mouse', 'keyboard'）。
+'event':具体事件（例如 'left click', 'type', 'scroll down'）。
+'step_number':动作的顺序编号,每一个动作都对应着一张图片。
+'text_buffer':如果是键盘动作，则记录的是输入的文本缓冲内容。
+}]
+
+
+分析提供的type、event，并仔细检查图片中的视觉内容。精确地按照以下格式生成一个连贯的few-shot示例：
+
+```
+**指令：** [在此处插入准确的用户意图]
+
+**初始状态：**
+* [根据步骤1的图像，简要描述与指令相关的初始屏幕状态。提及关键可见元素。]
+
+**演示动作序列：**
+1.  **动作：** `[标准化的动作类型，例如 CLICK, TYPE, SCROLL, SELECT_TEXT]`
+    * **目标：** `[描述此动作针对的具体UI元素，参考其在对应图像中的外观或文本内容。要精确。例如：“以‘1. 熟悉 C/C++’开头的文本块”，“标签为‘项目经历’的按钮”，“主内容区域的滚动条”]`
+    * **值 (如适用)：** `[插入输入或选择的值]`
+    * *(基于步骤 [step_number] 的图像)*
+2.  **动作：** `[标准化的动作类型]`
+    * **目标：** `[描述此动作针对的具体UI元素]`
+    * **值 (如适用)：** `[插入值]`
+    * *(基于步骤 [step_number] 的图像)*
+... （对Action_Sequence中的每一步重复）
+
+**最终状态（可选但推荐）：**
+* [根据最后一步动作后的图像，描述结果状态，表明任务完成或进入下一阶段。]
+
+生成时的关键注意事项：
+
+标准化动作： 为动作使用一致的动词（例如 CLICK, TYPE, SCROLL, DRAG, SELECT_TEXT 等）。
+视觉定位： 目标描述必须基于对应步骤图像中的视觉信息和任何提供的元素描述。使其足够具体，以便智能体能够定位。
+简洁性： 信息要丰富，但避免不必要的术语。
+准确性： 确保生成的序列准确反映提供的Action_Sequence和视觉上下文。
+重点： 突出与完成User_Instruction相关的交互点，重点关注鼠标位置周围的情况，不要关注其他无关的元素。
+"""
--- a/task_demonstration.json
+++ b/task_demonstration.json