Mirror of https://github.com/yuruotong1/autoMate.git (synced 2025-12-25 21:06:47 +08:00)
Continue trying to bypass HF (Hugging Face)
This commit is contained in:
parent 9c5ede79b1
commit 8af1a66ffc
@@ -18,13 +18,13 @@ class UIElement(BaseModel):
     text: Optional[str] = None


 class VisionAgent:
-    def __init__(self, yolo_model_path: str, caption_model_path: str = 'microsoft/Florence-2-base-ft'):
+    def __init__(self, yolo_model_path: str, caption_model_path: str):
         """
         Initialize the vision agent

         Parameters:
             yolo_model_path: Path to YOLO model
-            caption_model_path: Path to image caption model, default is Florence-2
+            caption_model_path: Path to image caption model
         """
         # determine the available device and the best dtype
         self.device, self.dtype = self._get_optimal_device_and_dtype()
@@ -33,14 +33,15 @@ class VisionAgent:

         # load the image caption model and processor
         self.caption_processor = AutoProcessor.from_pretrained(
-            "processor",
-            trust_remote_code=True
+            "weights/AI-ModelScope/Florence-2-base",
+            trust_remote_code=True,
+            local_files_only=True
         )

         try:
             self.caption_model = AutoModelForCausalLM.from_pretrained(
                 caption_model_path,
-                torch_dtype=torch.float32,
+                torch_dtype=self.dtype,
                 trust_remote_code=True
             ).to(self.device)

@@ -53,7 +54,7 @@
         if self.device.type == 'cuda':
             self.batch_size = 128
         elif self.device.type == 'mps':
-            self.batch_size = 32
+            self.batch_size = 128
         else:
             self.batch_size = 16

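The hunks above call `self._get_optimal_device_and_dtype()` but the helper itself sits outside the diff. A minimal sketch of what such a helper might look like, assuming PyTorch and the device order implied by the batch-size branches (CUDA, then MPS, then CPU); the specific dtype choices are an assumption, not taken from the commit:

import torch

def _get_optimal_device_and_dtype(self):
    # Hypothetical helper: prefer CUDA, then Apple MPS, then CPU,
    # and pair each device with a dtype it handles well.
    if torch.cuda.is_available():
        return torch.device('cuda'), torch.float16  # half precision on GPU (assumed)
    if torch.backends.mps.is_available():
        return torch.device('mps'), torch.float16   # Apple Silicon (assumed)
    return torch.device('cpu'), torch.float32       # full precision on CPU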
processor/merges.txt (50001 lines): file diff suppressed because it is too large.
@@ -1,33 +0,0 @@
-{
-  "auto_map": {
-    "AutoProcessor": "microsoft/Florence-2-base--processing_florence2.Florence2Processor"
-  },
-  "crop_size": {
-    "height": 768,
-    "width": 768
-  },
-  "do_center_crop": false,
-  "do_convert_rgb": null,
-  "do_normalize": true,
-  "do_rescale": true,
-  "do_resize": true,
-  "image_mean": [
-    0.485,
-    0.456,
-    0.406
-  ],
-  "image_processor_type": "CLIPImageProcessor",
-  "image_seq_length": 577,
-  "image_std": [
-    0.229,
-    0.224,
-    0.225
-  ],
-  "processor_class": "Florence2Processor",
-  "resample": 3,
-  "rescale_factor": 0.00392156862745098,
-  "size": {
-    "height": 768,
-    "width": 768
-  }
-}
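For reference, the deleted config describes standard CLIP-style preprocessing: bicubic resize to 768×768 (`resample: 3` is PIL's bicubic), rescale by 1/255, and ImageNet mean/std normalization. A rough torchvision equivalent, purely illustrative (this is not how Florence2Processor is actually invoked):

from torchvision import transforms

# Approximation of the deleted CLIPImageProcessor settings (illustration only)
preprocess = transforms.Compose([
    transforms.Resize((768, 768), interpolation=transforms.InterpolationMode.BICUBIC),  # resample=3 -> bicubic
    transforms.ToTensor(),  # applies the 1/255 rescale_factor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])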
processor/tokenizer.json (259573 lines): file diff suppressed because it is too large.
File diff suppressed because it is too large.
File diff suppressed because one or more lines are too long.
util/config.json (new file, 239 lines):
@@ -0,0 +1,239 @@
+{
+  "_name_or_path": "./Florence-2-base-ft",
+  "architectures": [
+    "Florence2ForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_florence2.Florence2Config",
+    "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
+  },
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "ignore_index": -100,
+  "is_encoder_decoder": true,
+  "model_type": "florence2",
+  "pad_token_id": 0,
+  "projection_dim": 768,
+  "text_config": {
+    "_attn_implementation_autoset": true,
+    "_name_or_path": "",
+    "activation_dropout": 0.1,
+    "activation_function": "gelu",
+    "add_bias_logits": false,
+    "add_cross_attention": false,
+    "add_final_layer_norm": false,
+    "architectures": null,
+    "attention_dropout": 0.1,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "classif_dropout": 0.1,
+    "classifier_dropout": 0.0,
+    "cross_attention_hidden_size": null,
+    "d_model": 768,
+    "decoder_attention_heads": 12,
+    "decoder_ffn_dim": 3072,
+    "decoder_layerdrop": 0.0,
+    "decoder_layers": 6,
+    "decoder_start_token_id": 2,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.1,
+    "early_stopping": true,
+    "encoder_attention_heads": 12,
+    "encoder_ffn_dim": 3072,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 6,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": 0,
+    "forced_eos_token_id": 2,
+    "gradient_checkpointing": false,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1",
+      "2": "LABEL_2"
+    },
+    "init_std": 0.02,
+    "is_decoder": false,
+    "is_encoder_decoder": true,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1,
+      "LABEL_2": 2
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 1024,
+    "min_length": 0,
+    "model_type": "florence2_language",
+    "no_repeat_ngram_size": 3,
+    "normalize_before": false,
+    "num_beam_groups": 1,
+    "num_beams": 3,
+    "num_hidden_layers": 6,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_embedding": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 51289
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.1",
+  "vision_config": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "depths": [
+      1,
+      1,
+      9,
+      1
+    ],
+    "dim_embed": [
+      128,
+      256,
+      512,
+      1024
+    ],
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "drop_path_rate": 0.1,
+    "early_stopping": false,
+    "enable_checkpoint": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_feature_source": [
+      "spatial_avg_pool",
+      "temporal_avg_pool"
+    ],
+    "image_pos_embed": {
+      "max_pos_embeddings": 50,
+      "type": "learned_abs_2d"
+    },
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "davit",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_groups": [
+      4,
+      8,
+      16,
+      32
+    ],
+    "num_heads": [
+      4,
+      8,
+      16,
+      32
+    ],
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_padding": [
+      3,
+      1,
+      1,
+      1
+    ],
+    "patch_prenorm": [
+      false,
+      true,
+      true,
+      true
+    ],
+    "patch_size": [
+      7,
+      3,
+      3,
+      3
+    ],
+    "patch_stride": [
+      4,
+      2,
+      2,
+      2
+    ],
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 768,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "visual_temporal_embedding": {
+      "max_temporal_embeddings": 100,
+      "type": "COSINE"
+    },
+    "window_size": 12
+  },
+  "vocab_size": 51289
+}
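The point of this customized config is its `auto_map`: the entries are bare module references ("configuration_florence2.Florence2Config") rather than the Hub-prefixed form ("microsoft/Florence-2-base--…") seen in the deleted processor config, so transformers resolves the custom classes from the .py files sitting next to config.json instead of fetching them from huggingface.co. A minimal loading sketch under that assumption; the path follows MODEL_DIR in the download script below:

from transformers import AutoModelForCausalLM

# Sketch: with config.json, configuration_florence2.py and modeling_florence2.py
# side by side in icon_caption/, the model resolves entirely from local disk.
model = AutoModelForCausalLM.from_pretrained(
    "weights/AI-ModelScope/OmniParser-v2___0/icon_caption",
    trust_remote_code=True,   # needed for the custom Florence-2 classes
    local_files_only=True,    # assumption: forbid any Hub traffic
)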
@@ -1,8 +1,11 @@
 import os
 from pathlib import Path
 from modelscope import snapshot_download
 import subprocess
+import shutil

 __WEIGHTS_DIR = Path("weights")
 MODEL_DIR = os.path.join(__WEIGHTS_DIR, "AI-ModelScope", "OmniParser-v2___0")
+PROCESSOR_DIR = os.path.join(__WEIGHTS_DIR, "AI-ModelScope", "Florence-2-base")

 def download():
     # Create weights directory

@@ -13,9 +16,14 @@ def download():
         "icon_detect/train_args.yaml",
         "icon_detect/model.pt",
         "icon_detect/model.yaml",
         "icon_caption/config.json",
         "icon_caption/generation_config.json",
-        "icon_caption/model.safetensors"
+        "icon_caption/model.safetensors",
     ]

+    # Extra config files downloaded from Florence2 repo
+    config_files = [
+        "configuration_florence2.py",
+        "modeling_florence2.py"
+    ]
+
     # Check and download missing files

@@ -32,9 +40,25 @@ def download():
     snapshot_download(
         'AI-ModelScope/OmniParser-v2.0',
-        cache_dir='weights'
+        cache_dir='weights',
+        ignore_file_pattern=['config.json']
     )

+    snapshot_download(
+        'AI-ModelScope/Florence-2-base',
+        cache_dir='weights',
+        allow_file_pattern=['*.py', '*.json']
+    )
+
+    # Move downloaded Florence config files into icon_caption
+    for file_path in config_files:
+        source_dir = os.path.join(PROCESSOR_DIR, file_path)
+        dest_dir = os.path.join(MODEL_DIR, "icon_caption", file_path)
+        shutil.copy(source_dir, dest_dir)
+
+    # Move customized config.json into icon_caption to load the model from local path
+    shutil.copy(os.path.join("util", "config.json"), os.path.join(MODEL_DIR, "icon_caption", "config.json"))
+
     print("Download complete")


 if __name__ == "__main__":
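Taken together, a hypothetical end-to-end wiring of the pieces in this commit (names assumed from the diffs: `download()` from this script, `VisionAgent` from the agent module, weight paths as produced by `snapshot_download` under MODEL_DIR):

# Sketch only; these paths mirror the file list and MODEL_DIR above
download()
agent = VisionAgent(
    yolo_model_path="weights/AI-ModelScope/OmniParser-v2___0/icon_detect/model.pt",
    caption_model_path="weights/AI-ModelScope/OmniParser-v2___0/icon_caption",
)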