Image Search in Practice: Image-Text Contrastive Learning

Classic Image Search Methods

Traditional image-search techniques include perceptual hashing (pHash), difference hashing (dHash), locality-sensitive hashing (LSH), and the like. With the rapid progress of deep learning, deep models have reached human-comparable performance on image recognition and classification, so networks from the VGG and ResNet families are increasingly used to understand image content: common object-detection frameworks such as Faster R-CNN and CenterNet, for instance, typically use a ResNet-style backbone for object localization and classification. Because the high-level features of these networks carry strong semantic understanding, they are more and more widely applied to image search. A minimal dHash sketch follows for reference, after which VGG and ResNet50 feature extraction are implemented with PyTorch.
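To make the hashing baseline concrete, here is a minimal dHash sketch (assumes OpenCV and NumPy; the helper names are illustrative, not from any library). It shrinks the image, compares horizontally adjacent pixels to build a 64-bit fingerprint, and measures similarity by Hamming distance:

import cv2
import numpy as np

def dhash(image, hash_size=8):
    # difference hash: compare each pixel to its right-hand neighbor
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # width hash_size+1 yields hash_size horizontal comparisons per row
    resized = cv2.resize(gray, (hash_size + 1, hash_size))
    diff = resized[:, 1:] > resized[:, :-1]
    return diff.flatten()          # 8x8 booleans = a 64-bit fingerprint

def hamming_distance(h1, h2):
    # number of differing bits; near-duplicates differ by only a few
    return int(np.count_nonzero(h1 != h2))

# usage (illustrative):
# a, b = cv2.imread("a.jpg"), cv2.imread("b.jpg")
# print(hamming_distance(dhash(a), dhash(b)))

Hashing handles near-duplicates well but breaks down under semantic variation, which is where the learned features below come in.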

Usage: run the script and pass an image URL as the argument; the extracted feature vector is printed.

VGG feature-extraction code

import cv2
import numpy as np
import torch
import torchvision.models as models
import torch.nn.functional as F

class VGGFeaExtractor(object):
    def __init__(self):
        self.image_size = 224
        self.dimision = 512
        self.load_model()

    def load_model(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = models.vgg16(pretrained=True).to(self.device)
        self.model = self.model.eval()
        self.PIXEL_MEANS = torch.tensor((0.485, 0.456, 0.406)).to(self.device)
        self.PIXEL_STDS = torch.tensor((0.229, 0.224, 0.225)).to(self.device)
        self.num = torch.tensor(255.0).to(self.device)

    def preprocess_input(self, image):
        image = cv2.resize(image, (self.image_size, self.image_size))
        # OpenCV decodes to BGR; convert to RGB to match the ImageNet statistics below
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_tensor = torch.from_numpy(image.copy()).to(self.device).float()
        image_tensor /= self.num                 # scale to [0, 1]
        image_tensor -= self.PIXEL_MEANS         # normalize with ImageNet mean
        image_tensor /= self.PIXEL_STDS          # and std
        image_tensor = image_tensor.permute(2, 0, 1)  # HWC -> CHW
        return image_tensor

    def forward(self, x):
        x = self.preprocess_input(x).unsqueeze(0)
        with torch.no_grad():
            x = self.model.features(x)                # 512 x 7 x 7 for a 224 input
            x = F.max_pool2d(x, kernel_size=(7, 7))   # global max pooling -> 512-d
        x = x.view(x.size(0), -1)
        return self.torch2list(x)

    def torch2list(self, torch_data):
        return torch_data.cpu().detach().numpy().tolist()
    
def test(url):
    model = VGGFeaExtractor()
    import urllib.request
    def test_url(imageurl):
        resp = urllib.request.urlopen(imageurl).read()
        image = np.asarray(bytearray(resp), dtype="uint8")
        image = cv2.imdecode(image, cv2.IMREAD_COLOR)
        feat = np.array(model.forward(image)[0])
        return feat / np.linalg.norm(feat)  # L2-normalize for cosine similarity
    print(test_url(url))

if __name__ == "__main__":
    import sys
    test(sys.argv[1])

ResNet50 feature-extraction code

import cv2
import numpy as np
import torch
import torchvision.models as models
import torch.nn.functional as F

class Resnet50FeaExtractor(object):

    def __init__(self):
        self.image_size = 224
        self.dimision = 2048
        self.load_model()

    def load_model(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = models.resnet50(pretrained=True).to(self.device)
        self.model = self.model.eval()
        self.PIXEL_MEANS = torch.tensor((0.485, 0.456, 0.406)).to(self.device)
        self.PIXEL_STDS = torch.tensor((0.229, 0.224, 0.225)).to(self.device)
        self.num = torch.tensor(255.0).to(self.device)
        
    def preprocess_input(self, image):
        image = cv2.resize(image, (self.image_size, self.image_size))
        # OpenCV decodes to BGR; convert to RGB to match the ImageNet statistics below
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_tensor = torch.from_numpy(image.copy()).to(self.device).float()
        image_tensor /= self.num
        image_tensor -= self.PIXEL_MEANS
        image_tensor /= self.PIXEL_STDS
        image_tensor = image_tensor.permute(2, 0, 1)  # HWC -> CHW
        return image_tensor

    def forward(self, x):
        x = self.preprocess_input(x).unsqueeze(0)
        with torch.no_grad():
            # run the ResNet50 trunk manually, stopping before the fc classifier
            x = self.model.conv1(x)
            x = self.model.bn1(x)
            x = self.model.relu(x)
            x = self.model.maxpool(x)

            x = self.model.layer1(x)
            x = self.model.layer2(x)
            x = self.model.layer3(x)
            x = self.model.layer4(x)
            x = F.avg_pool2d(x, kernel_size=x.size()[2:])  # global average pooling -> 2048-d
        x = torch.squeeze(x, -1)
        x = torch.squeeze(x, -1)
        return self.torch2list(x)

    def torch2list(self, torch_data):
        return torch_data.cpu().detach().numpy().tolist()

def load_model():
    return Resnet50FeaExtractor()

def test(url):
    model = Resnet50FeaExtractor()
    import urllib.request
    def test_url(imageurl):
        resp = urllib.request.urlopen(imageurl).read()
        image = np.asarray(bytearray(resp), dtype="uint8")
        image = cv2.imdecode(image, cv2.IMREAD_COLOR)
        feat = np.array(model.forward(image)[0])
        return (feat / np.linalg.norm(feat)).tolist()  # L2-normalize
    print(test_url(url))

if __name__ == "__main__":
    import sys
    test(sys.argv[1])
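With features L2-normalized as in the test functions above, image search reduces to nearest-neighbor lookup by inner product (equivalent to cosine similarity). A brute-force sketch over a small in-memory gallery; gallery_feats, query_feat, and extract are illustrative names, not part of the code above:

import numpy as np

def search(query_feat, gallery_feats, top_k=5):
    # query_feat: (D,) normalized vector; gallery_feats: (N, D) normalized matrix
    sims = gallery_feats @ query_feat      # dot product = cosine similarity
    return np.argsort(-sims)[:top_k]       # indices of the most similar images

# usage (illustrative):
# gallery_feats = np.stack([extract(p) for p in image_paths])  # built offline
# print(search(extract(query_url), gallery_feats))

At production scale the brute-force scan is normally replaced by an approximate-nearest-neighbor index such as FAISS.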

Contrastive Learning with CLIP

CLIP learns a joint image-text embedding space by contrastive training on large-scale image-caption pairs, so its image features can be compared with other images or directly with text prompts. Install the official package:

pip install git+https://github.com/openai/CLIP.git

import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("RN50x16", device=device)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)  # note the embedding dimension (768 for RN50x16)
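The snippet tokenizes text but never uses it; encoding both sides and scoring them in the shared space follows the pattern from the official CLIP README:

with torch.no_grad():
    text_features = model.encode_text(text)
    # normalize so that dot products become cosine similarities
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    print(similarity)  # probability that each caption matches the image

For image search, the normalized image_features are used exactly like the ResNet50 features above.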

ResNet50 vs. CLIP-RN50x16

Although the visual branch of CLIP-RN50x16 is not identical to ResNet50, the two behave similarly as visual feature extractors. A comparison on e-commerce images: the recall curves are shown below, with the horizontal axis spanning recall@1 through recall@20.

[Figure: recall@1-20 comparison on e-commerce images]
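Here recall@k is the fraction of queries whose ground-truth match appears among the top k retrieved results; a minimal sketch, assuming ranked result lists and a ground-truth mapping (both names illustrative):

def recall_at_k(ranked_lists, ground_truth, k):
    # ranked_lists[q]: ordered gallery ids returned for query q
    # ground_truth[q]: id of the true match for query q
    hits = sum(1 for q, ranked in ranked_lists.items()
               if ground_truth[q] in ranked[:k])
    return hits / len(ranked_lists)

# curves like the figure above: [recall_at_k(results, gt, k) for k in range(1, 21)]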

Result comparison

[Figure: result comparison]
