继续尝试绕开HF

This commit is contained in:
Dan Li 2025-03-18 02:00:54 +03:00
parent 9c5ede79b1
commit 8af1a66ffc
11 changed files with 274 additions and 327106 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@ -18,13 +18,13 @@ class UIElement(BaseModel):
text: Optional[str] = None
class VisionAgent:
def __init__(self, yolo_model_path: str, caption_model_path: str = 'microsoft/Florence-2-base-ft'):
def __init__(self, yolo_model_path: str, caption_model_path: str):
"""
Initialize the vision agent
Parameters:
yolo_model_path: Path to YOLO model
caption_model_path: Path to image caption model, default is Florence-2
caption_model_path: Path to image caption model
"""
# determine the available device and the best dtype
self.device, self.dtype = self._get_optimal_device_and_dtype()
@ -33,14 +33,15 @@ class VisionAgent:
# load the image caption model and processor
self.caption_processor = AutoProcessor.from_pretrained(
"processor",
trust_remote_code=True
"weights/AI-ModelScope/Florence-2-base",
trust_remote_code=True,
local_files_only=True
)
try:
self.caption_model = AutoModelForCausalLM.from_pretrained(
caption_model_path,
torch_dtype=torch.float32,
torch_dtype=self.dtype,
trust_remote_code=True
).to(self.device)
@ -53,7 +54,7 @@ class VisionAgent:
if self.device.type == 'cuda':
self.batch_size = 128
elif self.device.type == 'mps':
self.batch_size = 32
self.batch_size = 128
else:
self.batch_size = 16

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,33 +0,0 @@
{
"auto_map": {
"AutoProcessor": "microsoft/Florence-2-base--processing_florence2.Florence2Processor"
},
"crop_size": {
"height": 768,
"width": 768
},
"do_center_crop": false,
"do_convert_rgb": null,
"do_normalize": true,
"do_rescale": true,
"do_resize": true,
"image_mean": [
0.485,
0.456,
0.406
],
"image_processor_type": "CLIPImageProcessor",
"image_seq_length": 577,
"image_std": [
0.229,
0.224,
0.225
],
"processor_class": "Florence2Processor",
"resample": 3,
"rescale_factor": 0.00392156862745098,
"size": {
"height": 768,
"width": 768
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

239
util/config.json Normal file
View File

@ -0,0 +1,239 @@
{
"_name_or_path": "./Florence-2-base-ft",
"architectures": [
"Florence2ForConditionalGeneration"
],
"auto_map": {
"AutoConfig": "configuration_florence2.Florence2Config",
"AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
},
"bos_token_id": 2,
"eos_token_id": 1,
"ignore_index": -100,
"is_encoder_decoder": true,
"model_type": "florence2",
"pad_token_id": 0,
"projection_dim": 768,
"text_config": {
"_attn_implementation_autoset": true,
"_name_or_path": "",
"activation_dropout": 0.1,
"activation_function": "gelu",
"add_bias_logits": false,
"add_cross_attention": false,
"add_final_layer_norm": false,
"architectures": null,
"attention_dropout": 0.1,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bos_token_id": 0,
"chunk_size_feed_forward": 0,
"classif_dropout": 0.1,
"classifier_dropout": 0.0,
"cross_attention_hidden_size": null,
"d_model": 768,
"decoder_attention_heads": 12,
"decoder_ffn_dim": 3072,
"decoder_layerdrop": 0.0,
"decoder_layers": 6,
"decoder_start_token_id": 2,
"diversity_penalty": 0.0,
"do_sample": false,
"dropout": 0.1,
"early_stopping": true,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072,
"encoder_layerdrop": 0.0,
"encoder_layers": 6,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": 2,
"exponential_decay_length_penalty": null,
"finetuning_task": null,
"forced_bos_token_id": 0,
"forced_eos_token_id": 2,
"gradient_checkpointing": false,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1",
"2": "LABEL_2"
},
"init_std": 0.02,
"is_decoder": false,
"is_encoder_decoder": true,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1,
"LABEL_2": 2
},
"length_penalty": 1.0,
"max_length": 20,
"max_position_embeddings": 1024,
"min_length": 0,
"model_type": "florence2_language",
"no_repeat_ngram_size": 3,
"normalize_before": false,
"num_beam_groups": 1,
"num_beams": 3,
"num_hidden_layers": 6,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": 1,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"scale_embedding": false,
"sep_token_id": null,
"suppress_tokens": null,
"task_specific_params": null,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"typical_p": 1.0,
"use_bfloat16": false,
"use_cache": true,
"vocab_size": 51289
},
"torch_dtype": "float32",
"transformers_version": "4.46.1",
"vision_config": {
"_attn_implementation_autoset": false,
"_name_or_path": "",
"add_cross_attention": false,
"architectures": null,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bos_token_id": null,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"depths": [
1,
1,
9,
1
],
"dim_embed": [
128,
256,
512,
1024
],
"diversity_penalty": 0.0,
"do_sample": false,
"drop_path_rate": 0.1,
"early_stopping": false,
"enable_checkpoint": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": null,
"exponential_decay_length_penalty": null,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"image_feature_source": [
"spatial_avg_pool",
"temporal_avg_pool"
],
"image_pos_embed": {
"max_pos_embeddings": 50,
"type": "learned_abs_2d"
},
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"length_penalty": 1.0,
"max_length": 20,
"min_length": 0,
"model_type": "davit",
"no_repeat_ngram_size": 0,
"num_beam_groups": 1,
"num_beams": 1,
"num_groups": [
4,
8,
16,
32
],
"num_heads": [
4,
8,
16,
32
],
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": null,
"patch_padding": [
3,
1,
1,
1
],
"patch_prenorm": [
false,
true,
true,
true
],
"patch_size": [
7,
3,
3,
3
],
"patch_stride": [
4,
2,
2,
2
],
"prefix": null,
"problem_type": null,
"projection_dim": 768,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"suppress_tokens": null,
"task_specific_params": null,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"typical_p": 1.0,
"use_bfloat16": false,
"visual_temporal_embedding": {
"max_temporal_embeddings": 100,
"type": "COSINE"
},
"window_size": 12
},
"vocab_size": 51289
}

View File

@ -1,8 +1,11 @@
import os
from pathlib import Path
from modelscope import snapshot_download
import subprocess
import shutil
__WEIGHTS_DIR = Path("weights")
MODEL_DIR = os.path.join(__WEIGHTS_DIR, "AI-ModelScope", "OmniParser-v2___0")
MODEL_DIR = os.path.join(__WEIGHTS_DIR, "AI-ModelScope", "OmniParser-v2___0")
PROCESSOR_DIR = os.path.join(__WEIGHTS_DIR, "AI-ModelScope", "Florence-2-base")
def download():
# Create weights directory
@ -13,9 +16,14 @@ def download():
"icon_detect/train_args.yaml",
"icon_detect/model.pt",
"icon_detect/model.yaml",
"icon_caption/config.json",
"icon_caption/generation_config.json",
"icon_caption/model.safetensors"
"icon_caption/model.safetensors",
]
# Extra config files downloaded from Florence2 repo
config_files = [
"configuration_florence2.py",
"modeling_florence2.py"
]
# Check and download missing files
@ -32,9 +40,25 @@ def download():
snapshot_download(
'AI-ModelScope/OmniParser-v2.0',
cache_dir='weights'
cache_dir='weights',
ignore_file_pattern=['config.json']
)
snapshot_download(
'AI-ModelScope/Florence-2-base',
cache_dir='weights',
allow_file_pattern=['*.py', '*.json']
)
# Copy the downloaded Florence config files into icon_caption (the originals stay in the Florence-2-base directory)
for file_path in config_files:
source_dir = os.path.join(PROCESSOR_DIR, file_path)
dest_dir = os.path.join(MODEL_DIR, "icon_caption", file_path)
shutil.copy(source_dir, dest_dir)
# Copy the customized config.json into icon_caption so the model can be loaded from a local path
shutil.copy(os.path.join("util", "config.json"), os.path.join(MODEL_DIR, "icon_caption", "config.json"))
print("Download complete")
if __name__ == "__main__":