simplify ocr code

yuruo 2025-03-19 22:02:08 +08:00
parent b7435e9b17
commit a48ff2d37d
7 changed files with 49 additions and 515 deletions

View File

@@ -7,7 +7,6 @@ from gradio_ui.tools.computer import Action
class TaskPlanAgent(BaseAgent):
def __call__(self, messages, parsed_screen_result):
screen_info = str([{"box_id": i.element_id, "caption": i.caption, "text": i.text} for i in parsed_screen_result['parsed_content_list']])
messages[-1] = {"role": "user",
"content": [
{"type": "text", "text": messages[-1]["content"]},
@@ -17,7 +16,7 @@ class TaskPlanAgent(BaseAgent):
}
]
}
response = run(messages, user_prompt=system_prompt.format(screen_info=screen_info, action_list=str(Action)), response_format=TaskPlanResponse)
response = run(messages, user_prompt=system_prompt.format(action_list=str(Action)), response_format=TaskPlanResponse)
print("task_plan_agent response: ", response)
return json.loads(response)
@@ -31,8 +30,6 @@ system_prompt = """
### Goal ###
You are an automation operation planning expert. Based on the screen content and the user's requirements, plan a precise, executable sequence of operations.
The current screen content is as follows:
{screen_info}
### Input ###
1. User requirement: the task objective, described in text form

View File

@@ -25,8 +25,7 @@ class TaskRunAgent(BaseAgent):
task_list = json.loads(messages[1]['content'])['task_list']
# Convert task_list to a numbered format
formatted_task_list = "\n".join([f"{i}.{task}" for i, task in enumerate(task_list)])
screen_info = str([{"box_id": i.element_id, "caption": i.caption, "text": i.text} for i in parsed_screen_result['parsed_content_list']])
system_prompt = prompt.format(screen_info=screen_info, task_list=formatted_task_list)
system_prompt = prompt.format(task_list=formatted_task_list)
vlm_response = run(
messages,
user_prompt=system_prompt,
@@ -34,14 +33,35 @@ class TaskRunAgent(BaseAgent):
)
vlm_response_json = json.loads(vlm_response)
response_content = [BetaTextBlock(text=vlm_response_json["reasoning"], type='text')]
if "box_id" in vlm_response_json and vlm_response_json["next_action"] not in ["None", "key", "type", "scroll_down", "scroll_up","cursor_position", "wait"]:
bbox = self.find_element_by_id(parsed_screen_result, vlm_response_json["box_id"]).coordinates
box_centroid_coordinate = [int((bbox[0] + bbox[2]) / 2 ), int((bbox[1] + bbox[3]) / 2 )]
move_cursor_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}',
input={'action': 'mouse_move', 'coordinate': box_centroid_coordinate},
name='computer', type='tool_use')
response_content.append(move_cursor_block)
# Handle cursor movement based on box_id
if "box_id" in vlm_response_json:
action_types_without_cursor = ["None", "key", "type", "scroll_down", "scroll_up", "cursor_position", "wait"]
if vlm_response_json["box_id"] != -1 and vlm_response_json["next_action"] not in action_types_without_cursor:
# Move cursor to the center of the identified element
element = self.find_element_by_id(parsed_screen_result, vlm_response_json["box_id"])
bbox = element.coordinates
box_centroid_coordinate = [
int((bbox[0] + bbox[2]) / 2),
int((bbox[1] + bbox[3]) / 2)
]
move_cursor_block = BetaToolUseBlock(
id=f'toolu_{uuid.uuid4()}',
input={'action': 'mouse_move', 'coordinate': box_centroid_coordinate},
name='computer',
type='tool_use'
)
response_content.append(move_cursor_block)
elif vlm_response_json["box_id"] == -1 and len(vlm_response_json["coordinates"]) == 2:
# Move cursor to specified coordinates
move_cursor_block = BetaToolUseBlock(
id=f'toolu_{uuid.uuid4()}',
input={'action': 'mouse_move', 'coordinate': vlm_response_json["coordinates"]},
name='computer',
type='tool_use'
)
response_content.append(move_cursor_block)
if vlm_response_json["next_action"] == "None":
print("Task paused/completed.")
elif vlm_response_json["next_action"] == "type":
@ -66,6 +86,7 @@ class TaskRunAgent(BaseAgent):
def create_dynamic_response_model(parsed_screen_result):
available_box_ids = [item.element_id for item in parsed_screen_result['parsed_content_list']]
available_box_ids.append(-1)
task_run_agent_response = create_model(
'TaskRunAgentResponse',
reasoning = (str, Field(
@@ -78,11 +99,14 @@ def create_dynamic_response_model(parsed_screen_result):
}
)),
box_id = (int, Field(
description="要操作的框ID",
description="要操作的框ID如果框ID不存在就返回-1",
json_schema_extra={
"enum": available_box_ids
}
)),
coordinates = (list[int], Field(
description="当 box_id 为-1时直接返回要操作对象的坐标只返回x,y这2个整数"
)),
value = (str, Field(
description="仅当next_action为type时提供否则为None"
)),
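For context, create_model builds the pydantic response model at call time, so the schema describing the expected reply only admits the box_ids actually detected on the current screen (plus -1 for "no suitable box"). A minimal standalone sketch of the same pattern, with hypothetical element ids in place of parsed_screen_result:

```python
# Standalone illustration of the dynamic-schema pattern (pydantic v2).
# The element ids are hypothetical; in the agent they come from parsed_screen_result.
from pydantic import Field, create_model

available_box_ids = [0, 1, 2, -1]  # detected boxes plus -1 for "no suitable box"

TaskRunAgentResponse = create_model(
    'TaskRunAgentResponse',
    box_id=(int, Field(
        description="The box ID to operate on; -1 if none fits",
        json_schema_extra={"enum": available_box_ids},
    )),
    coordinates=(list[int], Field(
        description="x, y to act on when box_id is -1",
    )),
)

# The generated schema carries the enum constraint, which structured-output APIs can enforce.
print(TaskRunAgentResponse.model_json_schema()["properties"]["box_id"]["enum"])
```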
@@ -97,15 +121,12 @@ prompt = """
### Goal ###
You are a task executor. Based on the screenshot and all the elements, decide what to do next; if the task is complete, set next_action to None.
Below are all the elements on the current screen. caption and text are there to help you understand the screen content, and your decision should rely mainly on these two pieces of information; the screenshot is for reference only. The number at the top-left corner of each icon is its box_id.
{screen_info}
Based on the task list below, determine which task you are currently executing (current_task_id); the first task is 0. The task list is as follows:
{task_list}
##########
### Notes ###
- box_id must be taken strictly from the box_ids listed among all the elements
- Look at the screenshot provided by the user and use the box_id boxes and labels drawn on it to decide which box_id to operate on; if none is suitable, return -1 and give the target's coordinates via coordinates
- Give only one action at a time: which box_id to operate on, what to type, whether to scroll, or another action
- Analyze the current screen and reflect on what has already been done by reviewing the history, then describe your step-by-step thinking about how to accomplish the task
- Avoid choosing the same action/element many times in a row; if that happens, reflect on what may have gone wrong and predict a different action
@@ -122,7 +143,8 @@ prompt = """
"next_action": str, # The action to perform.
"box_id": int, # The box ID to operate on; provided when next_action is left_click, right_click, double_click, or hover, otherwise None
"value": "xxx" # Provide the value field only when the action is type; otherwise omit the value key
"current_task_id": int # Index of the task currently being executed; the first task is 0
"current_task_id": int # Index of the task currently being executed; the first task is 0,
"coordinates": list[int] # Provided only when box_id is -1; the coordinates of the target object, only the two integers x, y
}}
```
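For illustration only, replies that follow the format above might look like the following; every value here is hypothetical:

```python
# Hypothetical example replies matching the format above (values invented for illustration).
example_response = {
    "reasoning": "The search box (box_id 12) is visible; the next step is to click it.",
    "next_action": "left_click",
    "box_id": 12,
    "current_task_id": 0,
}

# When no detected box fits, box_id is -1 and coordinates supply the click target:
fallback_response = {
    "reasoning": "No labelled box covers the close button, so click by raw coordinates.",
    "next_action": "left_click",
    "box_id": -1,
    "coordinates": [1024, 12],
    "current_task_id": 1,
}
```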

View File

@@ -1,72 +1,27 @@
import os
from typing import List, Optional
from typing import List
import cv2
import torch
from ultralytics import YOLO
from transformers import AutoModelForCausalLM, AutoProcessor
import easyocr
import supervision as sv
import numpy as np
import time
from pydantic import BaseModel
import base64
from PIL import Image
from transformers import AutoConfig
import os
class UIElement(BaseModel):
element_id: int
coordinates: list[float]
caption: Optional[str] = None
text: Optional[str] = None
class VisionAgent:
def __init__(self, yolo_model_path: str, florence_model_path: str):
def __init__(self, yolo_model_path: str):
"""
Initialize the vision agent
Parameters:
yolo_model_path: Path to YOLO model
caption_model_path: Path to image caption model
"""
# determine the available device and the best dtype
self.device, self.dtype = self._get_optimal_device_and_dtype()
# load the YOLO model
self.yolo_model = YOLO(yolo_model_path)
# load the image caption model and processor
self.caption_processor = AutoProcessor.from_pretrained(
florence_model_path,
trust_remote_code=True,
local_files_only=True
)
try:
self.caption_model = AutoModelForCausalLM.from_pretrained(
florence_model_path, # use the complete directory that contains both the code and the weights
torch_dtype=self.dtype,
trust_remote_code=True,
local_files_only=True
).to(self.device)
# No need to load the weights separately; they are already included in florence_base_path
except Exception as e:
print(f"Model loading failed: {e}")
raise e
self.prompt = "<CAPTION>"
# set the batch size
if self.device.type == 'cuda':
self.batch_size = 128
elif self.device.type == 'mps':
self.batch_size = 128
else:
self.batch_size = 16
self.elements: List[UIElement] = []
self.ocr_reader = easyocr.Reader(['en', 'ch_sim'])
def __call__(self, image_path: str) -> List[UIElement]:
"""Process an image from file path."""
@@ -76,26 +31,6 @@ class VisionAgent:
raise FileNotFoundError(f"Vision agent: Failed to read image")
return self.analyze_image(image)
def _get_optimal_device_and_dtype(self):
"""determine the optimal device and dtype"""
if torch.cuda.is_available():
device = torch.device("cuda")
# check if the GPU is suitable for using float16
capability = torch.cuda.get_device_capability()
# only use float16 on newer GPUs
if capability[0] >= 7:
dtype = torch.float16
else:
dtype = torch.float32
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
device = torch.device("mps")
dtype = torch.float32
else:
device = torch.device("cpu")
dtype = torch.float32
return device, dtype
def _reset_state(self):
"""Clear previous analysis results"""
self.elements = []
@@ -112,114 +47,15 @@ class VisionAgent:
"""
self._reset_state()
element_crops, boxes = self._detect_objects(image)
start = time.time()
element_texts = self._extract_text(element_crops)
end = time.time()
ocr_time = (end-start) * 10 ** 3
print(f"Speed: {ocr_time:.2f} ms OCR of {len(element_texts)} icons.")
start = time.time()
element_captions = self._get_caption(element_crops, 5)
end = time.time()
caption_time = (end-start) * 10 ** 3
print(f"Speed: {caption_time:.2f} ms captioning of {len(element_captions)} icons.")
for idx in range(len(element_crops)):
boxes = self._detect_objects(image)
for idx in range(len(boxes)):
new_element = UIElement(element_id=idx,
coordinates=boxes[idx],
text=element_texts[idx][0] if len(element_texts[idx]) > 0 else '',
caption=element_captions[idx]
)
coordinates=boxes[idx])
self.elements.append(new_element)
return self.elements
def _extract_text(self, images: np.ndarray) -> list[str]:
"""
Run OCR in sequential mode
TODO: It is possible to run in batch mode for a speedup, but the result quality needs testing.
https://github.com/JaidedAI/EasyOCR/pull/458
"""
texts = []
for image in images:
text = self.ocr_reader.readtext(image, detail=0, paragraph=True, text_threshold=0.85)
texts.append(text)
# print(texts)
return texts
def _get_caption(self, element_crops, batch_size=None):
"""get the caption of the element crops"""
if not element_crops:
return []
# if batch_size is not specified, use the instance's default value
if batch_size is None:
batch_size = self.batch_size
# resize the image to 64x64
resized_crops = []
for img in element_crops:
# convert to numpy array, resize, then convert back to PIL
img_np = np.array(img)
resized_np = cv2.resize(img_np, (64, 64))
resized_crops.append(Image.fromarray(resized_np))
generated_texts = []
device = self.device
# process in batches
for i in range(0, len(resized_crops), batch_size):
batch = resized_crops[i:i+batch_size]
try:
# select the dtype according to the device type
if device.type == 'cuda':
inputs = self.caption_processor(
images=batch,
text=[self.prompt] * len(batch),
return_tensors="pt",
do_resize=False
).to(device=device, dtype=torch.float16)
else:
# MPS and CPU use float32
inputs = self.caption_processor(
images=batch,
text=[self.prompt] * len(batch),
return_tensors="pt"
).to(device=device)
# special treatment for Florence-2
with torch.no_grad():
if 'florence' in self.caption_model.config.model_type:
generated_ids = self.caption_model.generate(
input_ids=inputs["input_ids"],
pixel_values=inputs["pixel_values"],
max_new_tokens=20,
num_beams=5,
do_sample=False
)
else:
generated_ids = self.caption_model.generate(
**inputs,
max_length=50,
num_beams=3,
early_stopping=True
)
# decode the generated IDs
texts = self.caption_processor.batch_decode(
generated_ids,
skip_special_tokens=True
)
texts = [text.strip() for text in texts]
generated_texts.extend(texts)
# clean the cache
if device.type == 'cuda' and torch.cuda.is_available():
torch.cuda.empty_cache()
except RuntimeError as e:
raise e
return generated_texts
def _detect_objects(self, image: np.ndarray) -> tuple[list[np.ndarray], list]:
"""Run object detection pipeline"""
results = self.yolo_model(image)[0]
@@ -250,42 +86,7 @@ class VisionAgent:
# Map back to original indices
keep_indices = sorted_indices[keep_sorted]
filtered_boxes = boxes[keep_indices]
# Extract element crops
element_crops = []
for box in filtered_boxes:
x1, y1, x2, y2 = map(int, map(round, box))
element = image[y1:y2, x1:x2]
element_crops.append(np.array(element))
return element_crops, filtered_boxes
def load_image(self, image_source: str) -> np.ndarray:
try:
# Handle potential Data URL prefix (like "data:image/png;base64,")
if ',' in image_source:
_, payload = image_source.split(',', 1)
else:
payload = image_source
# Base64 decode -> bytes -> numpy array
image_bytes = base64.b64decode(payload)
np_array = np.frombuffer(image_bytes, dtype=np.uint8)
# OpenCV decode image
image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
if image is None:
raise ValueError("Failed to decode image: Invalid image data")
return self.analyze_image(image)
except (base64.binascii.Error, ValueError) as e:
# Generate clearer error message
error_msg = f"Input is neither a valid file path nor valid Base64 image data"
raise ValueError(error_msg) from e
return filtered_boxes
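After this change the vision pipeline is detection-only: YOLO boxes in, UIElement ids and coordinates out, with no OCR or captioning. A minimal usage sketch, assuming the same OmniParser icon_detect weights that the Gradio app wires up below; the module path, weights directory value, and screenshot filename are assumptions:

```python
# Hypothetical usage of the simplified, detection-only VisionAgent.
import os

from gradio_ui.agent.vision_agent import VisionAgent  # module path is an assumption

OMNI_PARSER_DIR = "weights/OmniParser"  # assumed layout, mirroring the path used in the Gradio app

agent = VisionAgent(yolo_model_path=os.path.join(OMNI_PARSER_DIR, "icon_detect", "model.pt"))
elements = agent("screenshot.png")  # __call__ reads the file and delegates to analyze_image

for element in elements:
    # Each UIElement now carries only an id and a bounding box.
    print(element.element_id, element.coordinates)
```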

View File

@@ -324,8 +324,7 @@ def run():
model.change(fn=update_model, inputs=[model, state], outputs=None)
api_key.change(fn=update_api_key, inputs=[api_key, state], outputs=None)
chatbot.clear(fn=clear_chat, inputs=[state], outputs=[chatbot])
vision_agent = VisionAgent(yolo_model_path=os.path.join(OMNI_PARSER_DIR, "icon_detect", "model.pt"),
florence_model_path=FLORENCE_DIR)
vision_agent = VisionAgent(yolo_model_path=os.path.join(OMNI_PARSER_DIR, "icon_detect", "model.pt"))
vision_agent_state = gr.State({"agent": vision_agent})
submit_button.click(process_input, [chat_input, state, vision_agent_state], [chatbot, task_list])
stop_button.click(stop_app, [state], None)

View File

@@ -1,64 +1,18 @@
import subprocess
import os
import sys
from util import download_weights
def check_cuda_version():
try:
# try to get cuda version from nvidia-smi
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
for line in result.stdout.split('\n'):
if 'CUDA Version:' in line:
cuda_version = line.split('CUDA Version:')[1].strip()
return cuda_version
# try to get cuda version from nvcc
result = subprocess.run(['nvcc', '--version'], capture_output=True, text=True)
for line in result.stdout.split('\n'):
if 'release' in line:
version = line.split('V')[-1].split('.')[0:2]
return '.'.join(version)
return None
except:
return None
def install_pytorch():
cuda_version = check_cuda_version()
if cuda_version is None:
print("CUDA not found. Installing CPU version of PyTorch")
cmd = "pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu --timeout 3000"
elif cuda_version.startswith("11."):
print(f"CUDA {cuda_version} found. Installing PyTorch for CUDA 11.8")
cmd = "pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118 --timeout 3000"
elif cuda_version.startswith("12.4"):
print(f"CUDA {cuda_version} found. Installing PyTorch for CUDA 12.4")
cmd = "pip install torch torchvision --index-url https://download.pytorch.org/whl/cu124 --timeout 3000"
elif cuda_version.startswith("12.6"):
print(f"CUDA {cuda_version} found. Installing PyTorch for CUDA 12.6")
cmd = "pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126 --timeout 3000"
else:
print(f"CUDA {cuda_version} found, but not in 11.8, 12.4, 12.6, please reinstall cuda and try again")
exit(1)
print(f"Running: {cmd}")
subprocess.run(cmd, shell=True)
def install_requirements():
subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'])
def adjust_python_env():
# check if python is 3.12
if sys.version_info.major != 3 or sys.version_info.minor != 12:
print("Python version is not 3.12, please install python 3.12")
exit(1)
def install():
adjust_python_env()
install_pytorch()
install_requirements()
# download the weight files
download_weights.download()

View File

@@ -1,8 +1,8 @@
# torch
# torchvision
easyocr
# easyocr
supervision==0.18.0
transformers
# transformers
ultralytics==8.3.70
numpy==1.26.4
gradio

View File

@@ -1,239 +0,0 @@
{
"_name_or_path": "./Florence-2-base-ft",
"architectures": [
"Florence2ForConditionalGeneration"
],
"auto_map": {
"AutoConfig": "configuration_florence2.Florence2Config",
"AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
},
"bos_token_id": 2,
"eos_token_id": 1,
"ignore_index": -100,
"is_encoder_decoder": true,
"model_type": "florence2",
"pad_token_id": 0,
"projection_dim": 768,
"text_config": {
"_attn_implementation_autoset": true,
"_name_or_path": "",
"activation_dropout": 0.1,
"activation_function": "gelu",
"add_bias_logits": false,
"add_cross_attention": false,
"add_final_layer_norm": false,
"architectures": null,
"attention_dropout": 0.1,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bos_token_id": 0,
"chunk_size_feed_forward": 0,
"classif_dropout": 0.1,
"classifier_dropout": 0.0,
"cross_attention_hidden_size": null,
"d_model": 768,
"decoder_attention_heads": 12,
"decoder_ffn_dim": 3072,
"decoder_layerdrop": 0.0,
"decoder_layers": 6,
"decoder_start_token_id": 2,
"diversity_penalty": 0.0,
"do_sample": false,
"dropout": 0.1,
"early_stopping": true,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072,
"encoder_layerdrop": 0.0,
"encoder_layers": 6,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": 2,
"exponential_decay_length_penalty": null,
"finetuning_task": null,
"forced_bos_token_id": 0,
"forced_eos_token_id": 2,
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1",
"2": "LABEL_2"
},
"init_std": 0.02,
"is_decoder": false,
"is_encoder_decoder": true,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1,
"LABEL_2": 2
},
"length_penalty": 1.0,
"max_length": 20,
"max_position_embeddings": 1024,
"min_length": 0,
"model_type": "florence2_language",
"no_repeat_ngram_size": 3,
"normalize_before": false,
"num_beam_groups": 1,
"num_beams": 3,
"num_hidden_layers": 6,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": 1,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"scale_embedding": false,
"sep_token_id": null,
"suppress_tokens": null,
"task_specific_params": null,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"typical_p": 1.0,
"use_bfloat16": false,
"use_cache": true,
"vocab_size": 51289
},
"torch_dtype": "float32",
"transformers_version": "4.46.1",
"vision_config": {
"_attn_implementation_autoset": false,
"_name_or_path": "",
"add_cross_attention": false,
"architectures": null,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bos_token_id": null,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"depths": [
1,
1,
9,
1
],
"dim_embed": [
128,
256,
512,
1024
],
"diversity_penalty": 0.0,
"do_sample": false,
"drop_path_rate": 0.1,
"early_stopping": false,
"enable_checkpoint": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": null,
"exponential_decay_length_penalty": null,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"image_feature_source": [
"spatial_avg_pool",
"temporal_avg_pool"
],
"image_pos_embed": {
"max_pos_embeddings": 50,
"type": "learned_abs_2d"
},
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"length_penalty": 1.0,
"max_length": 20,
"min_length": 0,
"model_type": "davit",
"no_repeat_ngram_size": 0,
"num_beam_groups": 1,
"num_beams": 1,
"num_groups": [
4,
8,
16,
32
],
"num_heads": [
4,
8,
16,
32
],
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": null,
"patch_padding": [
3,
1,
1,
1
],
"patch_prenorm": [
false,
true,
true,
true
],
"patch_size": [
7,
3,
3,
3
],
"patch_stride": [
4,
2,
2,
2
],
"prefix": null,
"problem_type": null,
"projection_dim": 768,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"suppress_tokens": null,
"task_specific_params": null,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"typical_p": 1.0,
"use_bfloat16": false,
"visual_temporal_embedding": {
"max_temporal_embeddings": 100,
"type": "COSINE"
},
"window_size": 12
},
"vocab_size": 51289
}