Downloading and using HuggingFace CLIP-family models
Downloading a model to a local directory
from huggingface_hub import snapshot_download

model_name = "google/siglip2-base-patch16-224"
model_path = "models/huggingface/siglip2-base-patch16-224"

snapshot_download(
    repo_id=model_name,
    local_dir=model_path,
    local_dir_use_symlinks=False,
    revision="main",
    # use_auth_token="<YOUR_ACCESS_TOKEN>",  # only needed for gated/private repos
    resume_download=True,
)
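After the download finishes, a quick sanity check is to load the snapshot purely from disk. A minimal sketch, assuming the model_path directory from above and a transformers version recent enough to know the SigLIP2 architecture:

from transformers import AutoModel, AutoProcessor

# local_files_only forbids any network access, so this fails fast if the snapshot is incomplete
model = AutoModel.from_pretrained(model_path, local_files_only=True)
processor = AutoProcessor.from_pretrained(model_path, local_files_only=True)
print(type(model).__name__, type(processor).__name__)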
Extracting image features
Option 1: use the explicit vision model class (CLIPVisionModel)
import numpy as np
import torch
from PIL import Image
from transformers import CLIPVisionModel, CLIPImageProcessor

def _crop_and_resize_pad(image, height=480, width=720):
    """Pad the image (with white) to the target aspect ratio, then resize."""
    image = np.array(image)
    image_height, image_width, _ = image.shape
    if image_height / image_width < height / width:
        # image is too wide relative to the target: pad top and bottom
        pad = int((((height / width) * image_width) - image_height) / 2.)
        padded_image = np.ones((image_height + pad * 2, image_width, 3), dtype=np.uint8) * 255
        # padded_image = np.zeros((image_height + pad * 2, image_width, 3), dtype=np.uint8)
        padded_image[pad:pad + image_height, :] = image
        image = Image.fromarray(padded_image).resize((width, height))
    else:
        # image is too tall relative to the target: pad left and right
        pad = int((((width / height) * image_height) - image_width) / 2.)
        padded_image = np.ones((image_height, image_width + pad * 2, 3), dtype=np.uint8) * 255
        # padded_image = np.zeros((image_height, image_width + pad * 2, 3), dtype=np.uint8)
        padded_image[:, pad:pad + image_width] = image
        image = Image.fromarray(padded_image).resize((width, height))
    return image

pipeline_path = "models/huggingface/clip-vit-large-patch14-336"  # e.g. a local CLIP checkpoint directory
device = "cuda" if torch.cuda.is_available() else "cpu"

image_encoder = CLIPVisionModel.from_pretrained(pipeline_path)
image_processor = CLIPImageProcessor.from_pretrained(pipeline_path)
image_encoder = image_encoder.to(torch.float32).to(device)

img = np.random.rand(236, 621, 3) * 255  # dummy image for testing
image = Image.fromarray(img.astype(np.uint8))
image = _crop_and_resize_pad(image, height=512, width=512)
image.save("tmp/res.jpg")

key = "pixel_values"
image = image_processor(images=image, return_tensors="pt")
image = image[key].to(device)
# torch.Size([1, 3, 336, 336])
image_embeds = image_encoder(pixel_values=image, output_hidden_states=True)
res = image_embeds.hidden_states[-2]  # penultimate-layer features
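If a single pooled embedding per image is enough (rather than patch-level hidden states), the full dual-encoder class exposes get_image_features. A minimal sketch, reusing the pipeline_path, device, and preprocessed image tensor from the snippet above:

from transformers import CLIPModel

clip_model = CLIPModel.from_pretrained(pipeline_path).to(device).eval()
with torch.no_grad():
    pooled = clip_model.get_image_features(pixel_values=image)  # projected, pooled embedding
print(pooled.shape)  # e.g. torch.Size([1, 768]) for clip-vit-large-patch14-336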
Option 2: use the Auto classes
from transformers import AutoProcessor, AutoModel
pipeline_path="models/huggingface/siglip2-large-patch16-512"
model = AutoModel.from_pretrained(pipeline_path)
processor = AutoProcessor.from_pretrained(pipeline_path)

key = "pixel_values"
# `image` here is a PIL image (e.g. the padded image created above)
image = processor(images=image, return_tensors="pt")
# print(image.keys())
image = image[key]  # .to(device)
# torch.Size([1, 3, 512, 512]) for siglip2-large-patch16-512
print(image.shape)

image_embeds = model.vision_model(pixel_values=image, output_hidden_states=True)
for k, v in image_embeds.items():
    if k == "hidden_states":
        print(k)
        [print(e.shape) for e in v]
    else:
        print(k, v.shape)
res = image_embeds.hidden_states[-2]
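Beyond feature extraction, the same Auto-loaded dual encoder can score image/text similarity, which is the usual zero-shot use of CLIP/SigLIP checkpoints. A minimal sketch, assuming pil_image is a PIL image (e.g. the padded one created earlier) and the two captions are placeholders; SigLIP is trained with a sigmoid loss, so a sigmoid rather than a softmax is applied to the logits:

import torch

texts = ["a photo of a cat", "a photo of a dog"]
inputs = processor(text=texts, images=pil_image, padding="max_length", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
probs = torch.sigmoid(outputs.logits_per_image)  # shape: (num_images, num_texts)
print(probs)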
Printed output (note the token counts: clip-vit-large-patch14-336 gives 24×24 = 576 patch tokens plus a CLS token = 577, while siglip2-large-patch16-512 gives 32×32 = 1024 patch tokens and no CLS token):
# clip-vit-large-patch14-336
last_hidden_state torch.Size([1, 577, 1024])
pooler_output torch.Size([1, 1024])
hidden_states
torch.Size([1, 577, 1024])

# siglip2-large-patch16-512
last_hidden_state torch.Size([1, 1024, 1024])
pooler_output torch.Size([1, 1024])
hidden_states
torch.Size([1, 1024, 1024])