目标:获取图片的embedding
一、方法一:下载DINOv2模型
(前提是安装好pytorch)
1. pip安装
pip install transformers -i https://mirror.baidu.com/pypi/simple
2. 下载dinov2提供的预训练模型,需要自行前往HuggingFace网站上查找dinov2模型进行下载。模型需要下载三个文件:
config.json
preprocessor_config.json
pytorch_model.bin
下载完成后将上述三个文件放置在本地一个名为dinov2_base的文件夹中即可。
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import torch.nn as nn
import torch

# Select the compute device: GPU when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

# Load the locally downloaded DINOv2 model (config.json,
# preprocessor_config.json, pytorch_model.bin placed under ./dinov2_base).
model_folder = './dinov2_base'
processor = AutoImageProcessor.from_pretrained(model_folder)
model = AutoModel.from_pretrained(model_folder).to(device)

# Extract image features.
image = Image.open('img.jpg')
with torch.no_grad():
    # BUG FIX: the original referenced undefined names (image1, inputs1,
    # outputs1, image_features1); use the variables actually defined above.
    inputs = processor(images=image, return_tensors="pt").to(device)
    outputs = model(**inputs)
    # last_hidden_state is (batch, num_tokens, embed_dim); averaging over the
    # token axis yields one embedding vector per image.
    image_features = outputs.last_hidden_state
    image_features = image_features.mean(dim=1)
二、方法二:安装dinov2包
参考文章:《基于 DINOv2 分类模型修改》(CSDN 博客;代码基于 GitHub 上的 facebookresearch/dinov2 仓库)
# 实例化模型代码
# Instantiate the DINOv2 backbone plus a linear classification head.
# NOTE(review): this fragment is pasted out of a class body — `self`, `device`,
# `torch` and `dinov2_vits14` must already be in scope where it is used.
from functools import partial
from dinov2.eval.linear import create_linear_input
from dinov2.eval.linear import LinearClassifier
from dinov2.eval.utils import ModelWithIntermediateLayers

# Load the ViT-S/14 backbone from a local checkpoint.
model = dinov2_vits14(weights={'LVD142M': './model/dinoV2/dinov2_vits14_pretrain.pth'})
# Run the backbone under fp16 autocast for speed.
autocast_ctx = partial(torch.cuda.amp.autocast, enabled=True, dtype=torch.float16)
self.feature_model = ModelWithIntermediateLayers(
    model, n_last_blocks=1, autocast_ctx=autocast_ctx).to(device)

# Fully connected classification layer.
self.embed_dim = model.embed_dim
# num_classes=100 is the number of target classes; embed_dim*2 because
# use_avgpool=True concatenates the class token with avg-pooled patch tokens.
self.classifier = LinearClassifier(
    self.embed_dim * 2, use_n_blocks=1, use_avgpool=True, num_classes=100).to(device)

# Freeze the backbone.
# BUG FIX: the original iterated `model.feature_model.parameters()`, but the
# backbone `model` has no `feature_model` attribute (that name is set on
# `self` above), which raises AttributeError at runtime.
for param in self.feature_model.parameters():
    param.requires_grad = False
# neck结构,在输出后添加卷积的过程。
def autopad(k, p=None):
    """Return 'same' padding for a convolution kernel.

    When `p` is not supplied, it defaults to half the kernel size —
    element-wise for list/tuple kernels — so output size matches input size.
    """
    if p is not None:
        return p
    return k // 2 if isinstance(k, int) else [x // 2 for x in k]
class Conv(nn.Module):
    """Standard conv block: 1-D convolution -> batch norm -> ReLU.

    Note: the `act` flag is accepted but not consulted — the activation is
    always ReLU, matching the original implementation.
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1,
                 act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super().__init__()
        pad = autopad(k, p)  # 'same' padding unless explicitly provided
        self.conv = nn.Conv1d(c1, c2, k, s, pad, groups=g, bias=False)
        self.bn = nn.BatchNorm1d(c2)
        self.act = nn.ReLU()

    def forward(self, x):
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)
class neck_dinov2(nn.Module):
    """Neck + linear classification head over DINOv2 intermediate features.

    Args:
        c0: number of patch tokens (token-sequence length of the patch features).
        c1: backbone embedding dimension.
        nc: number of output classes.
        dropout: dropout probability applied after the conv stack.
    """

    def __init__(self, c0, c1, nc, dropout=0.5):
        super().__init__()
        # BUG FIX: the original called `Conv1d(...)`, an undefined name; the
        # helper defined above is `Conv` (Conv1d + BatchNorm1d + ReLU).
        self.conv1 = Conv(c0, c0)
        self.conv2 = Conv(c0, c0)
        self.drop = nn.Dropout(p=dropout)
        # Input is mean-pooled patch features concatenated with the class
        # token, hence the c1 * 2 input width.
        self.linear = nn.Linear(c1 * 2, nc)
        self.linear.weight.data.normal_(mean=0.0, std=0.01)
        self.linear.bias.data.zero_()

    def forward(self, x):
        # x[0] is a (patch_tokens, class_token) pair, as produced by
        # ModelWithIntermediateLayers with n_last_blocks=1.
        feature, class_token = x[0]
        # Detach so no gradients flow back into the (frozen) backbone.
        # (The original wrapped each tensor in a single-element torch.cat,
        # which is an identity op and has been removed.)
        feature = feature.detach()
        class_token = class_token.detach()
        feature = self.drop(self.conv2(self.conv1(feature)))
        # Concatenate token-averaged patch features with the class token.
        x0 = torch.cat((torch.mean(feature, dim=1), class_token), dim=-1)
        return self.linear(x0)
class HubConf(nn.Module):
    """DINOv2 backbone with three parallel `neck_dinov2` classification heads.

    The backbone is built from the model name in the config, loaded from a
    local checkpoint, and (when `pretrain_choice == 'frozen'`) frozen so only
    the heads train.
    """

    def __init__(self,cfg,pretrain_choice = 'frozen'):
        super(HubConf, self).__init__()
        model_path = cfg.MODEL.PRETRAIN_PATH
        self.cfg = cfg
        # NOTE(review): eval() on a config-supplied string is a code-injection
        # risk if the config file is not trusted; a name->constructor dict
        # would be safer.
        self.base = eval(cfg.MODEL.NAME)(weights={'LVD142M':model_path})
        self.in_planes = self.base.embed_dim
        # Number of 14x14 patch tokens for the configured training input size
        # (assumes SIZE_TRAIN dims are multiples of 14 — TODO confirm).
        self.consize = int((cfg.INPUT.SIZE_TRAIN[0]/14)*(cfg.INPUT.SIZE_TRAIN[1]/14))
        # Run the backbone under fp16 autocast.
        autocast_ctx = partial(torch.cuda.amp.autocast, enabled=True, dtype=torch.float16)
        self.feature_model = ModelWithIntermediateLayers(self.base, n_last_blocks=1, autocast_ctx=autocast_ctx)
        if pretrain_choice == 'frozen':
            # Freeze the backbone; gradients only reach the heads.
            for param in self.feature_model.parameters():
                param.requires_grad = False
        self.country_cls = neck_dinov2(self.consize, self.in_planes, cfg.MODEL.nc1, dropout=cfg.MODEL.DROPOUT)  # classification head 1
        self.cn_cls = neck_dinov2(self.consize,self.in_planes, cfg.MODEL.nc2, dropout=cfg.MODEL.DROPOUT)  # classification head 2
        self.ct_cls = neck_dinov2(self.consize,self.in_planes, cfg.MODEL.nc3, dropout=cfg.MODEL.DROPOUT)  # classification head 3

    def forward(self, x):
        # global_feat shape: ((bs, patch_h*patch_w, embed_dim), (bs, embed_dim)),
        # e.g. ((1, (224/14)*(224/14), 384), (1, 384)) for a 224x224 input.
        global_feat = self.feature_model(x)
        country_score = self.country_cls(global_feat)
        cn_score = self.cn_cls(global_feat)
        ct_score = self.ct_cls(global_feat)
        return (country_score, cn_score,ct_score)

    def load_param(self, trained_path, device='cuda:0'):
        """Copy matching parameters from a checkpoint file into this model in place.

        Keys present in the checkpoint but absent from this model are reported
        and skipped rather than raising.
        """
        param_dict = torch.load(trained_path, map_location=device)
        for i in param_dict:
            if i not in self.state_dict():
                print('not load param ', i)
                continue
            self.state_dict()[i].copy_(param_dict[i])
import torch
import torchvision.transforms as T
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import matplotlib
from dinov2.hub.backbones import dinov2_vitb14, dinov2_vitg14, dinov2_vitl14, dinov2_vits14
# Patch-token feature extraction with DINOv2 ViT-S/14.
patch_h = 50
patch_w = 100
feat_dim = 384  # embedding dimension of dinov2_vits14

# Resize/crop to multiples of the 14-px patch size so the token grid is
# exactly patch_h x patch_w.
transform = T.Compose([
    T.GaussianBlur(9, sigma=(0.1, 2.0)),
    T.Resize((patch_h * 14, patch_w * 14)),
    T.CenterCrop((patch_h * 14, patch_w * 14)),
    T.ToTensor(),
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

# Load the backbone from the local hubconf with a local checkpoint.
vits14 = torch.hub.load('', 'dinov2_vits14',
                        weights={'LVD142M': './model/dinoV2/dinov2_vits14_pretrain.pth'},
                        source='local').cuda()

# Batch of 4 images; only slot 0 is filled below, the rest remain zeros.
imgs_tensor = torch.zeros(4, 3, patch_h * 14, patch_w * 14).cuda()
img_path = '1.jpg'  # plain string; the original f-string had no placeholders
img = Image.open(img_path).convert('RGB')
imgs_tensor[0] = transform(img)[:3]

with torch.no_grad():
    features_dict = vits14.forward_features(imgs_tensor)
    # (4, patch_h * patch_w, feat_dim) patch-token embeddings.
    # BUG FIX: removed the dead `features = torch.zeros(...)` pre-allocation —
    # it was unconditionally overwritten here.
    features = features_dict['x_norm_patchtokens']