diff --git a/gradio_ui/agent/task_plan_agent.py b/gradio_ui/agent/task_plan_agent.py index b68eaef..4a082f9 100644 --- a/gradio_ui/agent/task_plan_agent.py +++ b/gradio_ui/agent/task_plan_agent.py @@ -7,7 +7,6 @@ from gradio_ui.tools.computer import Action class TaskPlanAgent(BaseAgent): def __call__(self, messages, parsed_screen_result): - screen_info = str([{"box_id": i.element_id, "caption": i.caption, "text": i.text} for i in parsed_screen_result['parsed_content_list']]) messages[-1] = {"role": "user", "content": [ {"type": "text", "text": messages[-1]["content"]}, @@ -17,7 +16,7 @@ class TaskPlanAgent(BaseAgent): } ] } - response = run(messages, user_prompt=system_prompt.format(screen_info=screen_info, action_list=str(Action)), response_format=TaskPlanResponse) + response = run(messages, user_prompt=system_prompt.format(action_list=str(Action)), response_format=TaskPlanResponse) print("task_plan_agent response: ", response) return json.loads(response) @@ -31,8 +30,6 @@ system_prompt = """ ### 目标 ### 你是自动化操作规划专家,根据屏幕内容和用户需求,规划精确可执行的操作序列。 -当前屏幕内容如下: -{screen_info} ### 输入 ### 1. 用户需求:文本描述形式的任务目标 diff --git a/gradio_ui/agent/task_run_agent.py b/gradio_ui/agent/task_run_agent.py index 58db06a..d83387e 100644 --- a/gradio_ui/agent/task_run_agent.py +++ b/gradio_ui/agent/task_run_agent.py @@ -25,8 +25,7 @@ class TaskRunAgent(BaseAgent): task_list = json.loads(messages[1]['content'])['task_list'] # Convert task_list to a numbered format formatted_task_list = "\n".join([f"{i}.{task}" for i, task in enumerate(task_list)]) - screen_info = str([{"box_id": i.element_id, "caption": i.caption, "text": i.text} for i in parsed_screen_result['parsed_content_list']]) - system_prompt = prompt.format(screen_info=screen_info, task_list=formatted_task_list) + system_prompt = prompt.format(task_list=formatted_task_list) vlm_response = run( messages, user_prompt=system_prompt, @@ -34,14 +33,35 @@ class TaskRunAgent(BaseAgent): ) vlm_response_json = json.loads(vlm_response) response_content = [BetaTextBlock(text=vlm_response_json["reasoning"], type='text')] - if "box_id" in vlm_response_json and vlm_response_json["next_action"] not in ["None", "key", "type", "scroll_down", "scroll_up","cursor_position", "wait"]: - bbox = self.find_element_by_id(parsed_screen_result, vlm_response_json["box_id"]).coordinates - box_centroid_coordinate = [int((bbox[0] + bbox[2]) / 2 ), int((bbox[1] + bbox[3]) / 2 )] - move_cursor_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}', - input={'action': 'mouse_move', 'coordinate': box_centroid_coordinate}, - name='computer', type='tool_use') - response_content.append(move_cursor_block) - + # Handle cursor movement based on box_id + if "box_id" in vlm_response_json: + action_types_without_cursor = ["None", "key", "type", "scroll_down", "scroll_up", "cursor_position", "wait"] + + if vlm_response_json["box_id"] != -1 and vlm_response_json["next_action"] not in action_types_without_cursor: + # Move cursor to the center of the identified element + element = self.find_element_by_id(parsed_screen_result, vlm_response_json["box_id"]) + bbox = element.coordinates + box_centroid_coordinate = [ + int((bbox[0] + bbox[2]) / 2), + int((bbox[1] + bbox[3]) / 2) + ] + move_cursor_block = BetaToolUseBlock( + id=f'toolu_{uuid.uuid4()}', + input={'action': 'mouse_move', 'coordinate': box_centroid_coordinate}, + name='computer', + type='tool_use' + ) + response_content.append(move_cursor_block) + + elif vlm_response_json["box_id"] == -1 and len(vlm_response_json["coordinates"]) == 2: + # Move cursor to specified coordinates + move_cursor_block = BetaToolUseBlock( + id=f'toolu_{uuid.uuid4()}', + input={'action': 'mouse_move', 'coordinate': vlm_response_json["coordinates"]}, + name='computer', + type='tool_use' + ) + response_content.append(move_cursor_block) if vlm_response_json["next_action"] == "None": print("Task paused/completed.") elif vlm_response_json["next_action"] == "type": @@ -66,6 +86,7 @@ class TaskRunAgent(BaseAgent): def create_dynamic_response_model(parsed_screen_result): available_box_ids = [item.element_id for item in parsed_screen_result['parsed_content_list']] + available_box_ids.append(-1) task_run_agent_response = create_model( 'TaskRunAgentResponse', reasoning = (str, Field( @@ -78,11 +99,14 @@ def create_dynamic_response_model(parsed_screen_result): } )), box_id = (int, Field( - description="要操作的框ID", + description="要操作的框ID,如果框ID不存在就返回-1", json_schema_extra={ "enum": available_box_ids } )), + coordinates = (list[int], Field( + description="当 box_id 为-1时,直接返回要操作对象的坐标,只返回x,y这2个整数" + )), value = (str, Field( description="仅当next_action为type时提供,否则为None" )), @@ -97,15 +121,12 @@ prompt = """ ### 目标 ### 你是一个任务执行者。请你根据屏幕截图和【所有元素】确定接下来要做什么,如果任务完成把next_action设置为None: -以下是当前屏幕上的【所有元素】,caption和text是辅助你理解当前屏幕内容的,你的决策主要依靠这两个信息截图仅限参考,图标左上角的数字为box_id: -{screen_info} - 请根据以下任务列表判断一下你正在执行第几个任务(current_task_id),第一个任务是0,任务列表如下: {task_list} ########## ### 注意 ### -- box_id 要严格参考【所有元素】中的box_id给出。 +- 要结合用户传入的屏幕图片观察其中的 box_id 框框和标号,确定要操作哪一个box_id,如果没有合适的请返回-1,然后通过coordinates给出要操作对象的坐标。 - 每次应该只给出一个操作,告诉我要对哪个box_id进行操作、输入什么内容或者滚动或者其他操作。 - 应该对当前屏幕进行分析,通过查看历史记录反思已完成的工作,然后描述您如何实现任务的逐步思考。 - 避免连续多次选择相同的操作/元素,如果发生这种情况,反思自己,可能出了什么问题,并预测不同的操作。 @@ -122,7 +143,8 @@ prompt = """ "next_action": str, # 要执行的动作。 "box_id": int, # 要操作的框ID,当next_action为left_click、right_click、double_click、hover时提供,否则为None "value": "xxx" # 仅当操作为type时提供value字段,否则不包括value键 - "current_task_id": int # 当前正在执行第几个任务,第一个任务是0 + "current_task_id": int # 当前正在执行第几个任务,第一个任务是0, + "coordinates": list[int] # 仅当box_id为-1时提供,返回要操作对象的坐标,只返回x,y这2个整数 }} ``` diff --git a/gradio_ui/agent/vision_agent.py b/gradio_ui/agent/vision_agent.py index e5e8cdf..6957f75 100644 --- a/gradio_ui/agent/vision_agent.py +++ b/gradio_ui/agent/vision_agent.py @@ -1,72 +1,27 @@ -import os -from typing import List, Optional +from typing import List import cv2 -import torch from ultralytics import YOLO -from transformers import AutoModelForCausalLM, AutoProcessor -import easyocr import supervision as sv import numpy as np -import time from pydantic import BaseModel -import base64 -from PIL import Image -from transformers import AutoConfig -import os class UIElement(BaseModel): element_id: int coordinates: list[float] - caption: Optional[str] = None - text: Optional[str] = None class VisionAgent: - def __init__(self, yolo_model_path: str, florence_model_path: str): + def __init__(self, yolo_model_path: str): """ Initialize the vision agent Parameters: yolo_model_path: Path to YOLO model - caption_model_path: Path to image caption model """ # determine the available device and the best dtype - self.device, self.dtype = self._get_optimal_device_and_dtype() # load the YOLO model self.yolo_model = YOLO(yolo_model_path) - - # load the image caption model and processor - self.caption_processor = AutoProcessor.from_pretrained( - florence_model_path, - trust_remote_code=True, - local_files_only=True - ) - - - try: - self.caption_model = AutoModelForCausalLM.from_pretrained( - florence_model_path, # 这里使用包含代码和权重的完整目录 - torch_dtype=self.dtype, - trust_remote_code=True, - local_files_only=True - ).to(self.device) - - # 不需要额外加载权重,因为权重已经包含在 florence_base_path 中 - - except Exception as e: - print(f"Model loading failed: {e}") - raise e - self.prompt = "" - - # set the batch size - if self.device.type == 'cuda': - self.batch_size = 128 - elif self.device.type == 'mps': - self.batch_size = 128 - else: - self.batch_size = 16 self.elements: List[UIElement] = [] - self.ocr_reader = easyocr.Reader(['en', 'ch_sim']) def __call__(self, image_path: str) -> List[UIElement]: """Process an image from file path.""" @@ -76,26 +31,6 @@ class VisionAgent: raise FileNotFoundError(f"Vision agent: Failed to read image") return self.analyze_image(image) - def _get_optimal_device_and_dtype(self): - """determine the optimal device and dtype""" - if torch.cuda.is_available(): - device = torch.device("cuda") - # check if the GPU is suitable for using float16 - capability = torch.cuda.get_device_capability() - # only use float16 on newer GPUs - if capability[0] >= 7: - dtype = torch.float16 - else: - dtype = torch.float32 - elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): - device = torch.device("mps") - dtype = torch.float32 - else: - device = torch.device("cpu") - dtype = torch.float32 - - return device, dtype - def _reset_state(self): """Clear previous analysis results""" self.elements = [] @@ -112,114 +47,15 @@ class VisionAgent: """ self._reset_state() - element_crops, boxes = self._detect_objects(image) - start = time.time() - element_texts = self._extract_text(element_crops) - end = time.time() - ocr_time = (end-start) * 10 ** 3 - print(f"Speed: {ocr_time:.2f} ms OCR of {len(element_texts)} icons.") - start = time.time() - element_captions = self._get_caption(element_crops, 5) - end = time.time() - caption_time = (end-start) * 10 ** 3 - print(f"Speed: {caption_time:.2f} ms captioning of {len(element_captions)} icons.") - for idx in range(len(element_crops)): + boxes = self._detect_objects(image) + + for idx in range(len(boxes)): new_element = UIElement(element_id=idx, - coordinates=boxes[idx], - text=element_texts[idx][0] if len(element_texts[idx]) > 0 else '', - caption=element_captions[idx] - ) + coordinates=boxes[idx]) self.elements.append(new_element) return self.elements - def _extract_text(self, images: np.ndarray) -> list[str]: - """ - Run OCR in sequential mode - TODO: It is possible to run in batch mode for a speed up, but the result quality needs test. - https://github.com/JaidedAI/EasyOCR/pull/458 - """ - texts = [] - for image in images: - text = self.ocr_reader.readtext(image, detail=0, paragraph=True, text_threshold=0.85) - texts.append(text) - # print(texts) - return texts - - def _get_caption(self, element_crops, batch_size=None): - """get the caption of the element crops""" - if not element_crops: - return [] - - # if batch_size is not specified, use the instance's default value - if batch_size is None: - batch_size = self.batch_size - - # resize the image to 64x64 - resized_crops = [] - for img in element_crops: - # convert to numpy array, resize, then convert back to PIL - img_np = np.array(img) - resized_np = cv2.resize(img_np, (64, 64)) - resized_crops.append(Image.fromarray(resized_np)) - - generated_texts = [] - device = self.device - - # process in batches - for i in range(0, len(resized_crops), batch_size): - batch = resized_crops[i:i+batch_size] - try: - # select the dtype according to the device type - if device.type == 'cuda': - inputs = self.caption_processor( - images=batch, - text=[self.prompt] * len(batch), - return_tensors="pt", - do_resize=False - ).to(device=device, dtype=torch.float16) - else: - # MPS and CPU use float32 - inputs = self.caption_processor( - images=batch, - text=[self.prompt] * len(batch), - return_tensors="pt" - ).to(device=device) - - # special treatment for Florence-2 - with torch.no_grad(): - if 'florence' in self.caption_model.config.model_type: - generated_ids = self.caption_model.generate( - input_ids=inputs["input_ids"], - pixel_values=inputs["pixel_values"], - max_new_tokens=20, - num_beams=5, - do_sample=False - ) - else: - generated_ids = self.caption_model.generate( - **inputs, - max_length=50, - num_beams=3, - early_stopping=True - ) - - # decode the generated IDs - texts = self.caption_processor.batch_decode( - generated_ids, - skip_special_tokens=True - ) - texts = [text.strip() for text in texts] - generated_texts.extend(texts) - - # clean the cache - if device.type == 'cuda' and torch.cuda.is_available(): - torch.cuda.empty_cache() - - except RuntimeError as e: - raise e - return generated_texts - def _detect_objects(self, image: np.ndarray) -> tuple[list[np.ndarray], list]: """Run object detection pipeline""" results = self.yolo_model(image)[0] @@ -250,42 +86,7 @@ class VisionAgent: # Map back to original indices keep_indices = sorted_indices[keep_sorted] filtered_boxes = boxes[keep_indices] - - # Extract element crops - element_crops = [] - for box in filtered_boxes: - x1, y1, x2, y2 = map(int, map(round, box)) - element = image[y1:y2, x1:x2] - element_crops.append(np.array(element)) - - return element_crops, filtered_boxes - - def load_image(self, image_source: str) -> np.ndarray: - try: - # Handle potential Data URL prefix (like "data:image/png;base64,") - if ',' in image_source: - _, payload = image_source.split(',', 1) - else: - payload = image_source - - # Base64 decode -> bytes -> numpy array - image_bytes = base64.b64decode(payload) - np_array = np.frombuffer(image_bytes, dtype=np.uint8) - - # OpenCV decode image - image = cv2.imdecode(np_array, cv2.IMREAD_COLOR) - - if image is None: - raise ValueError("Failed to decode image: Invalid image data") - - return self.analyze_image(image) - - except (base64.binascii.Error, ValueError) as e: - # Generate clearer error message - error_msg = f"Input is neither a valid file path nor valid Base64 image data" - raise ValueError(error_msg) from e - - + return filtered_boxes \ No newline at end of file diff --git a/gradio_ui/app.py b/gradio_ui/app.py index eb7af7b..bdefde5 100644 --- a/gradio_ui/app.py +++ b/gradio_ui/app.py @@ -324,8 +324,7 @@ def run(): model.change(fn=update_model, inputs=[model, state], outputs=None) api_key.change(fn=update_api_key, inputs=[api_key, state], outputs=None) chatbot.clear(fn=clear_chat, inputs=[state], outputs=[chatbot]) - vision_agent = VisionAgent(yolo_model_path=os.path.join(OMNI_PARSER_DIR, "icon_detect", "model.pt"), - florence_model_path=FLORENCE_DIR) + vision_agent = VisionAgent(yolo_model_path=os.path.join(OMNI_PARSER_DIR, "icon_detect", "model.pt")) vision_agent_state = gr.State({"agent": vision_agent}) submit_button.click(process_input, [chat_input, state, vision_agent_state], [chatbot, task_list]) stop_button.click(stop_app, [state], None) diff --git a/install.py b/install.py index 24b602a..71fd07c 100644 --- a/install.py +++ b/install.py @@ -1,64 +1,18 @@ import subprocess -import os import sys - from util import download_weights -def check_cuda_version(): - try: - # try to get cuda version from nvidia-smi - result = subprocess.run(['nvidia-smi'], capture_output=True, text=True) - for line in result.stdout.split('\n'): - if 'CUDA Version:' in line: - cuda_version = line.split('CUDA Version:')[1].strip() - return cuda_version - - # try to get cuda version from nvcc - result = subprocess.run(['nvcc', '--version'], capture_output=True, text=True) - for line in result.stdout.split('\n'): - if 'release' in line: - version = line.split('V')[-1].split('.')[0:2] - return '.'.join(version) - - return None - except: - return None - -def install_pytorch(): - cuda_version = check_cuda_version() - if cuda_version is None: - print("CUDA not found. Installing CPU version of PyTorch") - cmd = "pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu --timeout 3000" - elif cuda_version.startswith("11."): - print(f"CUDA {cuda_version} found. Installing PyTorch for CUDA 11.8") - cmd = "pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118 --timeout 3000" - elif cuda_version.startswith("12.4"): - print(f"CUDA {cuda_version} found. Installing PyTorch for CUDA 12.4") - cmd = "pip install torch torchvision --index-url https://download.pytorch.org/whl/cu124 --timeout 3000" - elif cuda_version.startswith("12.6"): - print(f"CUDA {cuda_version} found. Installing PyTorch for CUDA 12.6") - cmd = "pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126 --timeout 3000" - else: - print(f"CUDA {cuda_version} found, but not in 11.8, 12.4, 12.6, please reinstall cuda and try again") - exit(1) - - print(f"Running: {cmd}") - subprocess.run(cmd, shell=True) - def install_requirements(): subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt']) - def adjust_python_env(): # check if python is 3.12 if sys.version_info.major != 3 or sys.version_info.minor != 12: print("Python version is not 3.12, please install python 3.12") exit(1) - def install(): adjust_python_env() - install_pytorch() install_requirements() # download the weight files download_weights.download() diff --git a/requirements.txt b/requirements.txt index 8993bad..6cfd63c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ # torch # torchvision -easyocr +# easyocr supervision==0.18.0 -transformers +# transformers ultralytics==8.3.70 numpy==1.26.4 gradio diff --git a/util/config.json b/util/config.json deleted file mode 100644 index bafc7be..0000000 --- a/util/config.json +++ /dev/null @@ -1,239 +0,0 @@ -{ - "_name_or_path": "./Florence-2-base-ft", - "architectures": [ - "Florence2ForConditionalGeneration" - ], - "auto_map": { - "AutoConfig": "configuration_florence2.Florence2Config", - "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration" - }, - "bos_token_id": 2, - "eos_token_id": 1, - "ignore_index": -100, - "is_encoder_decoder": true, - "model_type": "florence2", - "pad_token_id": 0, - "projection_dim": 768, - "text_config": { - "_attn_implementation_autoset": true, - "_name_or_path": "", - "activation_dropout": 0.1, - "activation_function": "gelu", - "add_bias_logits": false, - "add_cross_attention": false, - "add_final_layer_norm": false, - "architectures": null, - "attention_dropout": 0.1, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": 0, - "chunk_size_feed_forward": 0, - "classif_dropout": 0.1, - "classifier_dropout": 0.0, - "cross_attention_hidden_size": null, - "d_model": 768, - "decoder_attention_heads": 12, - "decoder_ffn_dim": 3072, - "decoder_layerdrop": 0.0, - "decoder_layers": 6, - "decoder_start_token_id": 2, - "diversity_penalty": 0.0, - "do_sample": false, - "dropout": 0.1, - "early_stopping": true, - "encoder_attention_heads": 12, - "encoder_ffn_dim": 3072, - "encoder_layerdrop": 0.0, - "encoder_layers": 6, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": 2, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": 0, - "forced_eos_token_id": 2, - "gradient_checkpointing": false, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1", - "2": "LABEL_2" - }, - "init_std": 0.02, - "is_decoder": false, - "is_encoder_decoder": true, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1, - "LABEL_2": 2 - }, - "length_penalty": 1.0, - "max_length": 20, - "max_position_embeddings": 1024, - "min_length": 0, - "model_type": "florence2_language", - "no_repeat_ngram_size": 3, - "normalize_before": false, - "num_beam_groups": 1, - "num_beams": 3, - "num_hidden_layers": 6, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": 1, - "prefix": null, - "problem_type": null, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "scale_embedding": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "typical_p": 1.0, - "use_bfloat16": false, - "use_cache": true, - "vocab_size": 51289 - }, - "torch_dtype": "float32", - "transformers_version": "4.46.1", - "vision_config": { - "_attn_implementation_autoset": false, - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "depths": [ - 1, - 1, - 9, - 1 - ], - "dim_embed": [ - 128, - 256, - 512, - 1024 - ], - "diversity_penalty": 0.0, - "do_sample": false, - "drop_path_rate": 0.1, - "early_stopping": false, - "enable_checkpoint": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_feature_source": [ - "spatial_avg_pool", - "temporal_avg_pool" - ], - "image_pos_embed": { - "max_pos_embeddings": 50, - "type": "learned_abs_2d" - }, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "davit", - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_groups": [ - 4, - 8, - 16, - 32 - ], - "num_heads": [ - 4, - 8, - 16, - 32 - ], - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_padding": [ - 3, - 1, - 1, - 1 - ], - "patch_prenorm": [ - false, - true, - true, - true - ], - "patch_size": [ - 7, - 3, - 3, - 3 - ], - "patch_stride": [ - 4, - 2, - 2, - 2 - ], - "prefix": null, - "problem_type": null, - "projection_dim": 768, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "typical_p": 1.0, - "use_bfloat16": false, - "visual_temporal_embedding": { - "max_temporal_embeddings": 100, - "type": "COSINE" - }, - "window_size": 12 - }, - "vocab_size": 51289 -}