【打卡】图像检索与重复图像识别3
任务3:深度全局特征:
CNN/VIT模型特征提取:介绍CNN和VIT模型在图像特征提取中的应用,包括如何利用预训练模型提取图像的全局特征。
CLIP模型特征提取:讲解CLIP模型的原理和应用,包括如何将图像和文本的特征嵌入到同一个向量空间中,以及如何利用CLIP模型进行图像检索和分类。
深度全局特征的优缺点:讨论深度全局特征和传统算法的差异,包括特征表达能力、泛化能力、计算效率等方面。
步骤1:使用CNN模型预训练模型(如ResNet18)提取图片的CNN特征,计算query与dataset最相似的图片
步骤2:使用VIT模型预训练模型提取图片特征,计算query与dataset最相似的图片
步骤3:使用CLIP模型预训练模型提取图片特征,计算query与dataset最相似的图片
步骤4:分别将每种思路的计算结果提交到实践比赛地址:https://competition.coggle.club/
代码中,CLIP使用openAI发布的CLIP模型,VIT使用huggingface中VIT base 21k预训练
# 使用CNN模型预训练模型(如ResNet18)提取图片的CNN特征,计算query与dataset最相似的图片
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import torch
import numpy as np
from torch.nn.functional import normalize
import glob
import torchvision
import pandas as pd
import argparse
import os
import clip
from transformers import ViTImageProcessor, ViTModel
from PIL import Image
# Extract an L2-normalized feature vector for one image batch.
def get_feat(args, img: torch.Tensor, model):
    """Run ``model`` on ``img`` and return the L2-normalized feature.

    For the CNN models (resnet*) the input batch is first moved to the GPU,
    where the model is assumed to live; other models get the tensor as-is.
    """
    if args.model in ['resnet18', 'resnet50', 'resnet101', 'resnet152']:
        img = img.cuda()
    return normalize(model(img))
# Find, for every query feature, the most similar dataset feature.
def get_sim(query_feat: torch.Tensor, dataset_feat: torch.Tensor):
    """Return the best-matching dataset index for each query.

    param:
        query_feat: query image features, shape [query_num, feat_dim]
        dataset_feat: dataset image features, shape [dataset_num, feat_dim]
    return: numpy array of best-match indices, shape [query_num]
    """
    # Dot-product similarity matrix, shape [query_num, dataset_num].
    scores = query_feat @ dataset_feat.t()
    # Highest-scoring dataset image per query, moved to a numpy array.
    return torch.argmax(scores, dim=1).cpu().numpy()
# Load an image from disk.
def read_img(path):
    """Open the file at ``path`` and return it as a PIL image."""
    return Image.open(path)
# Write the query -> dataset match results to ./submit/<model>.csv.
def save_csv(args, top_index):
    """Save the matching results as a submission CSV.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI options; ``args.model`` names the output file.
    top_index : np.ndarray
        For each query image (in ``glob('./query/*.jpg')`` order), the index
        of its best match within ``glob('./dataset/*.jpg')``.
    """
    dataset_path = np.array(glob.glob('./dataset/*.jpg'))
    # Portable directory creation (os.system('mkdir -p') is POSIX-only).
    os.makedirs('submit', exist_ok=True)
    # Map each query's best-match index back to a dataset file name.
    top_paths = [os.path.basename(p) for p in dataset_path[top_index]]
    pd.DataFrame({
        'source': top_paths,
        'query': [os.path.basename(p) for p in glob.glob('./query/*.jpg')]
    }).to_csv(os.path.join('./submit/', args.model + '.csv'), index=False)
# Command-line arguments.
def get_args():
    """Parse CLI options; ``--model`` selects the feature extractor."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        type=str,
        default='resnet18',
        choices=['resnet18', 'resnet50',
                 'vit_base', 'clip_vit_base_patch16_224'],
        help='model name',
    )
    return parser.parse_args()
# Build the CNN feature extractor.
def get_model(args):
    """Return a pretrained ResNet with its classification layer stripped.

    Only 'resnet18' and 'resnet50' are handled; any other ``args.model``
    leaves ``model`` unbound at the return (same as the original behavior,
    the CLI choices prevent this in practice).
    """
    builders = {'resnet18': models.resnet18, 'resnet50': models.resnet50}
    if args.model in builders:
        backbone = builders[args.model](pretrained=True)
        # Drop the final fc layer; keep everything up to the pooled feature.
        model = torch.nn.Sequential(*list(backbone.children())[:-1])
    return model
# Build the transformer feature extractor together with its preprocessor.
def get_vit_model(args):
    """Return ``(model, preprocess)`` for the ViT / CLIP extractors.

    'vit_base' loads the HuggingFace ViT-base (21k) checkpoint; the
    clip variant loads OpenAI CLIP ViT-B/16 directly onto CUDA. Any other
    ``args.model`` leaves the names unbound (original behavior).
    """
    if args.model == 'vit_base':
        # HuggingFace ViT base, pretrained on ImageNet-21k.
        preprocess = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
        model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
    elif args.model == 'clip_vit_base_patch16_224':
        # OpenAI CLIP image encoder, loaded onto the GPU.
        model, preprocess = clip.load('ViT-B/16', device='cuda')
    return model, preprocess
# Data augmentation applied before CNN feature extraction.
def data_aug(img):
    """Apply the augmentation pipeline and return a batch tensor.

    Parameters
    ----------
    img : PIL.Image.Image
        Input image of any size.

    Returns
    -------
    torch.Tensor
        Augmented image as a [1, C, H, W] tensor.
    """
    # NOTE(review): RandomResizedCrop below uses the *original* image size
    # even though the tensor was already resized to 224x224 — confirm intent.
    w, h = img.size
    aug = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        # Resize to 224x224.
        torchvision.transforms.Resize((224, 224)),
        # torchvision.transforms.RandAugment(),
        torchvision.transforms.RandomInvert(0.2),
        torchvision.transforms.RandomGrayscale(0.2),
        torchvision.transforms.RandomHorizontalFlip(p=0.5),
        torchvision.transforms.RandomVerticalFlip(p=0.5),
        torchvision.transforms.RandomAutocontrast(),
        torchvision.transforms.RandomRotation(10),
        torchvision.transforms.RandomAdjustSharpness(0.2),
        torchvision.transforms.RandomChoice([
            torchvision.transforms.Pad(10),
            torchvision.transforms.RandomResizedCrop(size=(w - 30, h - 30),
                                                     scale=(0.8, 1))
        ]),
    ])
    # Removed the unused `aug_test` pipeline that was defined here: it was
    # dead code, and its `normalize` element was torch.nn.functional.normalize,
    # which is not a torchvision transform and would fail if ever used.
    return aug(img).unsqueeze(0)
# Main: extract a global feature for every dataset/query image with the
# selected model, match each query to its nearest dataset image, save a CSV.
if __name__ == '__main__':
    # Parse CLI arguments.
    args = get_args()
    if args.model in ['resnet18', 'resnet50', 'resnet101', 'resnet152']:
        model = get_model(args)
    if args.model in ['vit_base', 'vit_large', 'clip_vit_base_patch16_224', 'clip_vit_large_patch16_224']:
        model, preprocess = get_vit_model(args)
    model = model.cuda()
    model.eval()
    # Inference only — no gradients needed.
    with torch.no_grad():
        # Compute features for every image under ./dataset.
        dataset_feat = []  # list of [1, feat_dim] tensors
        for i, path in enumerate(glob.glob('./dataset/*.jpg')):
            if i%100 == 0:
                print('dataset: ', i)
            # if i==5:
            #     break
            img = read_img(path)
            if args.model == 'vit_base':
                img = preprocess(images=img, return_tensors="pt")
                img['pixel_values'] = img['pixel_values'].cuda()
                outputs = model(**img)
                last_hidden_states = outputs.last_hidden_state  # torch.Size([1, 16*16+1, 768])
                # Use the CLS token embedding as the global feature.
                feat = last_hidden_states[:, 0, :]  # torch.Size([1, 768])
                feat /= feat.norm(dim=-1, keepdim=True)
            elif args.model == 'clip_vit_base_patch16_224':
                img = preprocess(img).unsqueeze(0).cuda()
                feat = model.encode_image(img)
                feat /= feat.norm(dim=-1, keepdim=True)
            else:
                img = data_aug(img)
                feat = get_feat(args, img, model)
            dataset_feat.append(feat)
        # Stack and flatten to [num_images, feat_dim].
        dataset_feat = torch.stack(dataset_feat, dim=0)
        dataset_feat = dataset_feat.reshape(dataset_feat.shape[0], -1)
        # dataset_feat = normalize(dataset_feat)
        # Compute features for every image under ./query.
        query_feat = []
        for i, path in enumerate(glob.glob('./query/*.jpg')):
            if i%50 == 0:
                print('query: ', i)
            # if i==5:
            #     break
            img = read_img(path)
            if args.model == 'vit_base':
                img = preprocess(images=img, return_tensors="pt")
                img['pixel_values'] = img['pixel_values'].cuda()
                outputs = model(**img)
                last_hidden_states = outputs.last_hidden_state  # torch.Size([1, 16*16+1, 768])
                feat = last_hidden_states[:, 0, :]  # torch.Size([1, 768])
                feat /= feat.norm(dim=-1, keepdim=True)
            elif args.model == 'clip_vit_base_patch16_224':
                img = preprocess(img).unsqueeze(0).cuda()
                feat = model.encode_image(img)
                feat /= feat.norm(dim=-1, keepdim=True)
            else:
                img = data_aug(img)
                feat = get_feat(args, img, model)
            query_feat.append(feat)
        # Stack and flatten to [num_queries, feat_dim].
        query_feat = torch.stack(query_feat, dim=0)
        query_feat = query_feat.reshape(query_feat.shape[0], -1)
        # query_feat = normalize(query_feat)
        # Best dataset match for each query.
        top_index = get_sim(query_feat, dataset_feat)
        # Write the submission CSV.
        save_csv(args, top_index)
重构了代码,同时提高了准确率(CNN部分)
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset
from PIL import Image
import glob
import pandas as pd
import numpy as np
import os
class ImageDataset(Dataset):
    """Minimal dataset over all files matching a glob pattern.

    Each item is the file opened as an RGB PIL image, passed through
    ``transform`` when one is given.
    """

    def __init__(self, path, transform=None):
        super(ImageDataset, self).__init__()
        # Collect every file matching the glob pattern.
        self.path = list(glob.glob(path))
        self.transform = transform

    def __getitem__(self, index):
        image = Image.open(self.path[index]).convert('RGB')
        return image if self.transform is None else self.transform(image)

    def __len__(self):
        return len(self.path)
# Pretrained feature-embedding network.
class ImageEmbeddingNet(nn.Module):
    """Wraps the ResNet50 feature extractor defined in this file."""
    def __init__(self):
        super(ImageEmbeddingNet, self).__init__()
        model = ResNet50(pretrained=True)
        # NOTE(review): this replaces `avgpool` on the ResNet50 *wrapper*,
        # whose forward() never calls avgpool — so the assignment appears to
        # be a no-op and the returned features stay spatial. Confirm whether
        # max-pooling was actually intended here.
        model.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))
        model.eval()
        self.model = model
    def forward(self, x):
        # Delegate to the wrapped backbone; returns its feature map.
        x = self.model(x)
        return x
# ResNet-18 backbone used as a feature extractor: runs the stem and the four
# residual stages by hand and returns the layer4 feature map, bypassing the
# backbone's own avgpool and fc.
class ResNet18(nn.Module):
    def __init__(self, pretrained=True):
        super(ResNet18, self).__init__()
        backbone = models.resnet18(pretrained=pretrained)
        backbone.fc = nn.Identity()  # classifier head is never used
        backbone.eval()
        self.model = backbone

    def forward(self, x):
        net = self.model
        # Stem: conv1 -> bn1 -> relu -> maxpool.
        out = net.maxpool(net.relu(net.bn1(net.conv1(x))))
        # Residual stages; only the final stage's spatial map is returned.
        for stage in (net.layer1, net.layer2, net.layer3, net.layer4):
            out = stage(out)
        return out
# ResNet-50 backbone used as a feature extractor: returns the layer4 spatial
# feature map, bypassing the backbone's own avgpool and fc.
class ResNet50(nn.Module):
    def __init__(self, pretrained=True):
        super(ResNet50, self).__init__()
        self.model = models.resnet50(pretrained=pretrained)
        self.model.fc = nn.Identity()  # classifier head is never used
        self.model.eval()

    def forward(self, x):
        h = self.model.conv1(x)
        h = self.model.maxpool(self.model.relu(self.model.bn1(h)))
        h = self.model.layer2(self.model.layer1(h))
        h = self.model.layer4(self.model.layer3(h))
        # Spatial feature map from the final residual stage.
        return h
# Instantiate the feature extractor and move it to the GPU.
imgmodel = ImageEmbeddingNet()
imgmodel = imgmodel.cuda()
# Augmentation / preprocessing pipeline applied to every image.
trans = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    # Resize to 224x224.
    torchvision.transforms.Resize((224, 224)),
    # torchvision.transforms.RandAugment(),
    torchvision.transforms.RandomInvert(0.2),
    torchvision.transforms.RandomGrayscale(0.2),
    torchvision.transforms.RandomHorizontalFlip(p=0.5),
    torchvision.transforms.RandomVerticalFlip(p=0.5),
    torchvision.transforms.RandomAutocontrast(),
    torchvision.transforms.RandomRotation(10),
    torchvision.transforms.RandomAdjustSharpness(0.2),
    torchvision.transforms.RandomChoice([
        torchvision.transforms.Pad(10),
        # torchvision.transforms.RandomResizedCrop(size=(w - 30, h - 30),
        #                                          scale=(0.8, 1))
    ]),
])
# Data loaders over the query and dataset image folders.
query_dataset = ImageDataset(path='./query/*.jpg',transform=trans)
query_loader = torch.utils.data.DataLoader(query_dataset, batch_size=4, shuffle=False, num_workers=4)
match_dataset = ImageDataset(path='./dataset/*.jpg',transform=trans)
# NOTE(review): drop_last=True is harmless at batch_size=1, but it would
# silently drop trailing images if the batch size were ever increased.
match_loader = torch.utils.data.DataLoader(match_dataset, batch_size=1, shuffle=False, num_workers=4, drop_last=True)
# Write the query -> dataset match results to ./submit/resnet50.csv.
def save_csv(top_index):
    """Save the matching results as a submission CSV.

    Parameters
    ----------
    top_index : np.ndarray
        For each query image (in ``glob('./query/*.jpg')`` order), the index
        of its best match within ``glob('./dataset/*.jpg')``.
    """
    dataset_path = np.array(glob.glob('./dataset/*.jpg'))
    # Portable directory creation (os.system('mkdir -p') is POSIX-only).
    os.makedirs('submit', exist_ok=True)
    # Map each query's best-match index back to a dataset file name.
    top_paths = [os.path.basename(p) for p in dataset_path[top_index]]
    pd.DataFrame({
        'source': top_paths,
        'query': [os.path.basename(p) for p in glob.glob('./query/*.jpg')]
    }).to_csv(os.path.join('./submit/', 'resnet50' + '.csv'), index=False)
# Extract features for every query and dataset image (inference only).
query_feature = []
match_feature = []
with torch.no_grad():
    for i, data in enumerate(query_loader):
        if i % 100 == 0:
            print('query', i)
        data = data.cuda()
        feature = imgmodel(data)
        query_feature.append(feature)
    for i, data in enumerate(match_loader):
        if i % 100 == 0:
            print('dataset', i)
        data = data.cuda()
        feature = imgmodel(data)
        match_feature.append(feature)
# Stack the per-batch features into single tensors.
query_feature = torch.stack(query_feature, dim=0)
match_feature = torch.stack(match_feature, dim=0)
# print(query_feature.shape, match_feature.shape) #torch.Size([625, 4, 512, 8, 8]) torch.Size([11139, 1, 512, 8, 8])
# Flatten query features to [2500, 512*8*8].
query_feature = query_feature.view(-1,query_feature.shape[-3]*query_feature.shape[-2]*query_feature.shape[-1])
# Flatten match features to [11139, 512*8*8].
match_feature = match_feature.view(-1, match_feature.shape[-3]*match_feature.shape[-2]*match_feature.shape[-1])
# Cosine similarity = dot product of L2-normalized feature vectors.
query_feature = F.normalize(query_feature, dim=1)
match_feature = F.normalize(match_feature, dim=1)
similarity = torch.mm(query_feature, match_feature.t()) # torch.Size([2500, 11139])
# Best dataset match (top-1) for each query.
top1 = torch.argmax(similarity, dim=1) # torch.Size([2500])
top1 = top1.cpu().numpy()
save_csv(top1)
任务6:特征压缩与扩展查询
PQ量化(Product Quantization)是一种压缩和加速高维向量相似性搜索的技术,通常用于图像和视频检索领域。PQ量化将高维向量划分成多个较小的子向量,并对每个子向量使用独立的编码器进行编码,从而将高维向量转换为一系列的子编码。这些子编码通常被存储在内存中,并通过查询相似性搜索引擎进行搜索。
特征PCA白化是一种预处理技术,它将输入数据的协方差矩阵进行特征分解,并将其变换为一个对角矩阵,然后将原始数据进行线性变换,使得变换后的数据的协方差矩阵为单位矩阵。这样做的好处是可以去除数据中的冗余信息和相关性,并且提高数据的稳定性和可解释性。
步骤1:对query与dataset特征进行白化处理。重新计算query与dataset最相似的图片,提交到实践比赛地址:https://competition.coggle.club/
步骤2:使用PCA将query与dataset特征降维到100/200。分别重新计算query与dataset最相似的图片,提交到实践比赛地址:https://competition.coggle.club/
在原有的代码中添加白化、PCA数据处理部分,很简单,只有几行。
同时,在使用PCA降维后,我们可以提取ResNet更多的特征(比如不经过maxPooling操作的特征),再进行PCA
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset
from PIL import Image
import glob
import pandas as pd
import numpy as np
import os
class ImageDataset(Dataset):
    """Dataset yielding (optionally transformed) RGB images from a glob pattern."""

    def __init__(self, path, transform=None):
        super(ImageDataset, self).__init__()
        # Expand the glob pattern once, up front.
        self.path = [p for p in glob.glob(path)]
        self.transform = transform

    def __len__(self):
        return len(self.path)

    def __getitem__(self, index):
        sample = Image.open(self.path[index]).convert('RGB')
        if self.transform is not None:
            sample = self.transform(sample)
        return sample
# Pretrained feature-embedding network.
class ImageEmbeddingNet(nn.Module):
    """Wraps the ResNet18 feature extractor defined in this file."""
    def __init__(self):
        super(ImageEmbeddingNet, self).__init__()
        model = ResNet18(pretrained=True)
        # NOTE(review): this sets `avgpool` on the *wrapper* object, but the
        # wrapper's forward() just calls the inner torchvision model, which
        # applies its own built-in pooling — so this assignment appears to
        # have no effect. Confirm whether max-pooling was intended.
        model.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))
        model.eval()
        self.model = model
    def forward(self, x):
        # Delegate to the wrapped backbone.
        x = self.model(x)
        return x
# Pretrained backbone used as a feature extractor (pooled embedding output).
class ResNet18(nn.Module):
    """Feature extractor returning the backbone's pooled embedding.

    NOTE(review): despite the class name, this loads torchvision's
    ``resnet50``; the submission file elsewhere is named
    'resnet-50-withwhite', so the mismatch looks intentional — confirm.
    """
    def __init__(self, pretrained=True):
        super(ResNet18, self).__init__()
        net = models.resnet50(pretrained=pretrained)
        net.fc = nn.Identity()  # drop the classification head
        net.eval()
        self.model = net

    def forward(self, x):
        return self.model(x)
# Pretrained ResNet-50 with the classification head removed.
class ResNet50(nn.Module):
    """Feature extractor returning torchvision ResNet-50's pooled embedding."""
    def __init__(self, pretrained=True):
        super(ResNet50, self).__init__()
        self.model = models.resnet50(pretrained=pretrained)
        self.model.fc = nn.Identity()  # keep only the pooled embedding
        self.model.eval()

    def forward(self, x):
        return self.model(x)
# Instantiate the feature extractor and move it to the GPU.
imgmodel = ImageEmbeddingNet()
imgmodel = imgmodel.cuda()
# Augmentation / preprocessing pipeline applied to every image.
trans = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    # Resize to 224x224.
    torchvision.transforms.Resize((224, 224)),
    # torchvision.transforms.RandAugment(),
    torchvision.transforms.RandomInvert(0.2),
    torchvision.transforms.RandomGrayscale(0.2),
    torchvision.transforms.RandomHorizontalFlip(p=0.5),
    torchvision.transforms.RandomVerticalFlip(p=0.5),
    torchvision.transforms.RandomAutocontrast(),
    torchvision.transforms.RandomRotation(10),
    torchvision.transforms.RandomAdjustSharpness(0.2),
])
# Data loaders over the query and dataset image folders.
query_dataset = ImageDataset(path='./query/*.jpg',transform=trans)
query_loader = torch.utils.data.DataLoader(query_dataset, batch_size=100, shuffle=False, num_workers=4)
match_dataset = ImageDataset(path='./dataset/*.jpg',transform=trans)
# NOTE(review): drop_last=True is harmless at batch_size=1, but it would
# silently drop trailing images if the batch size were ever increased.
match_loader = torch.utils.data.DataLoader(match_dataset, batch_size=1, shuffle=False, num_workers=4, drop_last=True)
# Write the query -> dataset match results to ./submit/resnet-50-withwhite.csv.
def save_csv(top_index):
    """Save the matching results as a submission CSV.

    Parameters
    ----------
    top_index : np.ndarray
        For each query image (in ``glob('./query/*.jpg')`` order), the index
        of its best match within ``glob('./dataset/*.jpg')``.
    """
    dataset_path = np.array(glob.glob('./dataset/*.jpg'))
    # Portable directory creation (os.system('mkdir -p') is POSIX-only).
    os.makedirs('submit', exist_ok=True)
    # Map each query's best-match index back to a dataset file name.
    top_paths = [os.path.basename(p) for p in dataset_path[top_index]]
    pd.DataFrame({
        'source': top_paths,
        'query': [os.path.basename(p) for p in glob.glob('./query/*.jpg')]
    }).to_csv(os.path.join('./submit/', 'resnet-50-withwhite' + '.csv'), index=False)
def whiten_matrix(matrix):
    """ZCA-whiten ``matrix`` so its sample-space covariance is ~identity.

    Rows are centered along dim=1 (per-row mean) and the covariance is taken
    across rows, giving an [N, N] matrix — i.e. whitening happens in sample
    space; the output keeps the input's [N, D] shape.
    """
    # Center each row around its own mean.
    matrix = matrix - torch.mean(matrix, dim=1, keepdim=True)
    # Sample-space covariance, shape [N, N].
    cov_matrix = torch.mm(matrix, matrix.t()) / (matrix.size(1) - 1)
    # torch.symeig was removed from modern PyTorch; torch.linalg.eigh is the
    # drop-in replacement (eigenvalues ascending, like symeig returned).
    eigenvalues, eigenvectors = torch.linalg.eigh(cov_matrix)
    # Scale by 1/sqrt(lambda); the epsilon guards near-zero eigenvalues.
    diag_matrix = torch.diag(torch.sqrt(1.0 / (eigenvalues + 1e-5)))
    # ZCA whitening transform: E * D^{-1/2} * E^T.
    whitening_matrix = torch.mm(torch.mm(eigenvectors, diag_matrix), eigenvectors.t())
    # Apply the transform.
    matrix = torch.mm(whitening_matrix, matrix)
    return matrix
# Whitened PCA in sample space (implemented in PyTorch).
def pca(matrix, n_components):
    """Whiten ``matrix`` within the span of its top ``n_components``
    sample-space principal directions.

    NOTE(review): the output shape equals the input shape [N, D] — the
    transform E_k * D^{-1/2} * E_k^T projects onto the top-k subspace and
    whitens there, so this is subspace whitening rather than a literal
    dimensionality reduction. Confirm that matches the task's intent.
    """
    # Center each row around its own mean.
    matrix = matrix - torch.mean(matrix, dim=1, keepdim=True)
    # Sample-space covariance, shape [N, N].
    cov_matrix = torch.mm(matrix, matrix.t()) / (matrix.size(1) - 1)
    # torch.symeig was removed from modern PyTorch; torch.linalg.eigh is the
    # drop-in replacement (eigenvalues ascending, like symeig returned).
    eigenvalues, eigenvectors = torch.linalg.eigh(cov_matrix)
    # Keep the n_components largest eigenpairs.
    idx = torch.argsort(eigenvalues, descending=True)
    eigenvalues = eigenvalues[idx][:n_components]
    eigenvectors = eigenvectors[:, idx][:, :n_components]
    # 1/sqrt(lambda) scaling; the epsilon guards near-zero eigenvalues.
    diag_matrix = torch.diag(torch.sqrt(1.0 / (eigenvalues + 1e-5)))
    # Subspace whitening transform: E_k * D^{-1/2} * E_k^T.
    whitening_matrix = torch.mm(torch.mm(eigenvectors, diag_matrix), eigenvectors.t())
    # Apply the transform.
    matrix = torch.mm(whitening_matrix, matrix)
    return matrix
# Extract features for every query and dataset image (inference only).
query_feature = []
match_feature = []
with torch.no_grad():
    for i, data in enumerate(query_loader):
        if i % 5 == 0:
            print('query', i)
        data = data.cuda()
        feature = imgmodel(data)
        query_feature.append(feature)
    for i, data in enumerate(match_loader):
        if i % 100 == 0:
            print('dataset', i)
        data = data.cuda()
        feature = imgmodel(data)
        match_feature.append(feature)
# Stack the per-batch features into single tensors.
query_feature = torch.stack(query_feature, dim=0)
match_feature = torch.stack(match_feature, dim=0)
print(query_feature.shape, match_feature.shape)
# Flatten to [num_images, feat_dim].
query_feature = query_feature.view(-1,query_feature.shape[-1])
match_feature = match_feature.view(-1, match_feature.shape[-1])
use_PCA = True
# NOTE(review): query and dataset features are whitened / PCA-projected with
# *separate* statistics (two independent calls), which may reduce cross-set
# comparability — confirm this is intended.
if not use_PCA:
    # Whitening only, no component selection.
    query_feature = whiten_matrix(query_feature)
    match_feature = whiten_matrix(match_feature)
else:
    # Whitened PCA restricted to the top 200 components.
    query_feature = pca(query_feature, 200)
    match_feature = pca(match_feature, 200)
# Cosine similarity = dot product of L2-normalized feature vectors.
query_feature = F.normalize(query_feature, dim=1)
match_feature = F.normalize(match_feature, dim=1)
# Similarity matrix between all queries and all dataset images.
similarity = torch.mm(query_feature, match_feature.t()) # torch.Size([2500, 11139])
# Best dataset match (top-1) for each query.
top1 = torch.argmax(similarity, dim=1) # torch.Size([2500])
top1 = top1.cpu().numpy()
save_csv(top1)
准确率可达32.6