MoCo算法代码详解
本文代码来源:
- https://colab.research.google.com/github/facebookresearch/moco/blob/colab-notebook/colab/moco_cifar10_demo.ipynb#scrollTo=C69VJZiNxgzn
本文记录我学习代码的一般过程,共勉!
1.导入包
from datetime import datetime
from functools import partial
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import CIFAR10
from torchvision.models import resnet
from tqdm import tqdm
import argparse
import json
import math
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from model.MoCo import ModelMoCo
没有什么好说的
2.参数设置
打印参数就是这个样子
Namespace(arch='resnet18', batch_size=512, bn_splits=8, cos=True, epochs=1, knn_k=200, knn_t=0.1, lr=0.06, moco_dim=128, moco_k=4096, moco_m=0.99, moco_t=0.1, results_dir='./cache-2022-06-12-18-41-15-moco', resume='', schedule=[], symmetric=False, wd=0.0005)
这里有两点需要注意:第一个圈设置权重的保存路径,如果不设置就会用它设置的保存路径,第二个圈是这个是在cmd 的黑框中运行还是在jupyter中运行,可自行测试。
说明,本文中很多维度信息,都可以参考这里的参数和自己数据集的规模。比如这里数据集用的是cifar 10,batch size 为512。那么看到50000就是指数据规模,看到512就是batch size 大小。
3.数据预处理
class CIFAR10Pair(CIFAR10):
    """CIFAR10 dataset that returns two independently augmented views of the
    same image -- the positive pair used by contrastive learning.

    No label is returned: in MoCo the two views serve as query/key for each
    other, so the class annotation is not needed during pre-training.
    """
    def __getitem__(self, index):
        img = self.data[index]
        # Raw HWC uint8 array -> PIL image so torchvision transforms apply.
        img = Image.fromarray(img)
        if self.transform is not None:
            im_1 = self.transform(img)
            im_2 = self.transform(img)
        else:
            # Original code left im_1/im_2 unbound (NameError) when no
            # transform was set; fall back to the raw image twice instead.
            im_1 = im_2 = img
        return im_1, im_2
# Training-time augmentation: the standard MoCo-on-CIFAR10 recipe (random
# resized crop, flip, color jitter, grayscale) followed by per-channel
# normalization with the CIFAR10 mean/std.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(32),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
    transforms.RandomGrayscale(p=0.2),
    transforms.ToTensor(),
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])

# Evaluation-time transform: deterministic, normalization only.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])

# Data prepare. The Windows path is a raw string: the original
# 'E:\HSI_...' only worked because '\H', '\d', '\c' are invalid escape
# sequences that Python currently passes through with a DeprecationWarning.
_DATA_ROOT = r'E:\HSI_Classification\datasets\cifar'

train_data = CIFAR10Pair(root=_DATA_ROOT, train=True, transform=train_transform, download=True)
train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=0, pin_memory=True, drop_last=True)
# memory_data: training images with the deterministic eval transform; its
# features form the bank used by the kNN monitor at test time.
memory_data = CIFAR10(root=_DATA_ROOT, train=True, transform=test_transform, download=True)
memory_loader = DataLoader(memory_data, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True)
test_data = CIFAR10(root=_DATA_ROOT, train=False, transform=test_transform, download=True)
test_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True)
这里有三点需要注意:
- 第一这里CIFAR10Pair 就是构造一个样本对。输出的 target 就是输入。所谓对比学习,无监督学习的精髓。
- 第二这里的train_loader, memory_loader 和 test_loader。 train和 memory 都是训练集的数据,他们的不同之处在于, 数据增广的方式不同,数据的组成也不同。 train的增广是用来训练的,标签就是图像本身,memory是用来测试的,它和test是一致的,标签就是标注的标签。memory主要用于在测试的时候,如果用到 knn,则可以构造memory bank 监控学习精度,test在测试的时候评估模型精度
- 第三这里可以改成自己的数据集。照猫画虎我的数据集:
TrainPair:
train shape: (10249, 33, 33, 3)
# Data augmentation for the author's own dataset (10249 patches of 33x33x3).
train_transform = transforms.Compose([
transforms.RandomResizedCrop(32),
transforms.RandomHorizontalFlip(p=0.5),
transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
transforms.RandomGrayscale(p=0.2),
transforms.ToTensor(),
transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
# NOTE(review): RandomResizedCrop in the *test* transform makes evaluation
# features non-deterministic; a deterministic Resize/CenterCrop to 32 is the
# usual choice -- confirm the randomness here is intended.
test_transform = transforms.Compose([
transforms.RandomResizedCrop(32),
transforms.ToTensor(),
transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
""" Training dataset"""
class TrainPair(torch.utils.data.Dataset):
    """Training dataset yielding two augmented views of the same patch.

    Reads the module-level ``train`` array (N, H, W, C). Labels are not
    returned: each item is a positive pair for contrastive learning.
    """
    def __init__(self, transform=None):
        self.transform = transform          # augmentation applied twice per sample
        self.len = train.shape[0]           # number of samples
        self.data = train
        self.classes = np.max(train_label)  # class count -- assumes labels run 1..C, TODO confirm

    def __getitem__(self, index):
        img = self.data[index]
        img = Image.fromarray(np.uint8(img))
        if self.transform is not None:
            im_1 = self.transform(img)
            im_2 = self.transform(img)
        else:
            # Original code left im_1/im_2 unbound (NameError) when
            # transform was None; return the raw image twice instead.
            im_1 = im_2 = img
        return im_1, im_2

    def __len__(self):
        # Number of samples in the dataset.
        return self.len
""" memory dataset"""
class TrainDS(torch.utils.data.Dataset):
    """Memory-bank dataset: training patches together with their true labels.

    Used at evaluation time to build the feature bank for the kNN monitor;
    the deterministic ``test_transform`` is expected here.
    """
    def __init__(self, transform=None):
        self.transform = transform
        self.len = train.shape[0]           # number of samples
        self.data = train
        self.targets = train_label          # ground-truth labels, aligned with data
        self.classes = np.max(train_label)  # class count -- assumes labels run 1..C, TODO confirm

    def __getitem__(self, index):
        img = self.data[index]
        img = Image.fromarray(np.uint8(img))
        # Guard against the declared default transform=None instead of
        # crashing with TypeError ('NoneType' is not callable).
        if self.transform is not None:
            img = self.transform(img)
        target = self.targets[index]
        # Return the sample and its label for this index.
        return img, target

    def __len__(self):
        # Number of samples in the dataset.
        return self.len
""" Test dataset"""
class TestDS(torch.utils.data.Dataset):
    """Test dataset: held-out patches with their true labels, used to
    measure kNN accuracy against the memory bank."""
    def __init__(self, transform=None):
        self.transform = transform
        self.len = test.shape[0]            # number of test samples
        self.data = test
        self.targets = test_label           # ground-truth labels, aligned with data
        # NOTE(review): class count is taken from train_label, not test_label,
        # mirroring the original -- presumably both share the same label set;
        # verify this is intended.
        self.classes = np.max(train_label)

    def __getitem__(self, index):
        img = self.data[index]
        img = Image.fromarray(np.uint8(img))
        # Guard against the declared default transform=None instead of
        # crashing with TypeError ('NoneType' is not callable).
        if self.transform is not None:
            img = self.transform(img)
        target = self.targets[index]
        # Return the sample and its label for this index.
        return img, target

    def __len__(self):
        # Number of samples in the dataset.
        return self.len
# Create trainloader / memoryloader / testloader. Only the contrastive train
# loader drops the last incomplete batch; the evaluation loaders must see
# every sample -- the original drop_last=True silently excluded up to
# batch_size-1 samples from the feature bank and the accuracy computation
# (the reference CIFAR10 code does not drop them either).
train_data = TrainPair(transform=train_transform)
memory_data = TrainDS(transform=test_transform)
test_data = TestDS(transform=test_transform)
train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=args.batch_size, shuffle=True, num_workers=0, pin_memory=True, drop_last=True)
memory_loader = torch.utils.data.DataLoader(dataset=memory_data, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True)
test_loader = torch.utils.data.DataLoader(dataset=test_data, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True)
4. 模型
# Create the MoCo model (query encoder + momentum key encoder + queue).
model = ModelMoCo(
dim=args.moco_dim,         # feature dimension (128 in the printed Namespace)
K=args.moco_k,             # queue size, i.e. number of negatives (4096)
m=args.moco_m,             # momentum for the key-encoder update (0.99)
T=args.moco_t,             # softmax temperature for the InfoNCE loss (0.1)
arch=args.arch,            # backbone architecture (resnet18)
bn_splits=args.bn_splits,  # split-BN setting -- presumably emulates multi-GPU shuffle BN; see ModelMoCo
symmetric=args.symmetric,  # whether to use the symmetric two-way loss
).cuda()
# print(model.encoder_q)
模型的理解是重点但不是难点，因为对比学习重在思想，而不在模型的复杂程度。下面是训练过程的数据流图
有四个点需要关注一下:
4.1moment update key encoder
就是每次更新的时候，用上一步权重的α 加上另一个分支权重的(1-α)。α 是自己设置的比例，即 moco_m。第二个是 dequeue_and_enqueue。我模拟了一下这个过程。
4.2进队出队
做了一个小例子
import torch
# Inputs for the toy dequeue/enqueue example
K = 12                               # queue length (4096 in the real model)
dim = 2                              # feature dimension (128 in the real model)
queue = torch.rand(dim, K)           # queue stores features column-wise: [dim, K]
print(queue)
ptr = 3                              # current write position in the queue
batch_size = 3
keys = torch.randn(batch_size, dim)  # shape : batch, dim
print(keys)
------------------------
tensor([[0.4031, 0.8880, 0.2820, 0.4275, 0.2731, 0.0415, 0.8560, 0.7286, 0.4262,
0.5516, 0.7505, 0.9933],
[0.6802, 0.3471, 0.5023, 0.6566, 0.1310, 0.8469, 0.0206, 0.3822, 0.8273,
0.7649, 0.0019, 0.9808]])
tensor([[-0.7294, 0.0370],
[ 0.0453, 1.8152],
[-0.0329, 0.7642]])
------------------------
# replace the keys at ptr (dequeue and enqueue)
queue[:, ptr:ptr + batch_size] = keys.t()  # transpose: keys are row-wise, queue is column-wise
print("queue", queue.shape)  # [2, 12] in this toy example ([128, 4096] in the real model)
print(queue)
------------------------
queue torch.Size([2, 12])
tensor([[ 0.4031, 0.8880, 0.2820, **-0.7294, 0.0453, -0.0329**, 0.8560, 0.7286,
0.4262, 0.5516, 0.7505, 0.9933],
[ 0.6802, 0.3471, 0.5023, **0.0370, 1.8152, 0.7642**, 0.0206, 0.3822,
0.8273, 0.7649, 0.0019, 0.9808]])
------------------------
# Advance the pointer by one batch per update; it wraps around modulo K.
for i in range(10):
    ptr = (ptr + batch_size) % K # move pointer
    print(ptr)
------------------------
6
9
0
3
---
------------------------
如上面那个小例子,随机初始化之后,每次更新队列中的一个batch的值,代码中用双星表示。这样就扩充了负样本。
4.3shuffle
在为GPU分配数据之前先将数据进行打乱再进行分配。论文中提出仅对关键字(key)网络的输入进行shuffle。不是很重要，有兴趣的可以看一看。我看很多帖子说BN层不好，少用，这个shuffle 也是为了克服BN层泄露样本信息的问题。
4.4损失计算
损失计算中作者把正样本计算的值放在了第一个位置,即
logits = torch.cat([l_pos, l_neg], dim=1)
那么,根据(带温度的)交叉熵的计算规则,我们只需要取第一个值的位置就行了,而不用管其它位置的值,所以直接可以把所有标签设置为0。这里我理解了好久,看了不少帖子,感觉算是理解了。最重要的一点就是,这里不是标签全设置成了0,而是因为标签正样本的位置在0位置,所以这里是全零的矩阵。具体如果有必要我放在下篇博客里吧,这里太多了。在这里举一个小例子。假设预测pred = tensor.size(4,6).常规做法,标签是随机分布对,即target = [0,1,5,3],正样本的位置在0,1,5,3。
损失为:
-(log_result[0][0] + log_result[1][1] + log_result[2][5]+ log_result[3][3])/4
如果我们把正样本都放在零位置,那么target = [0,0,0,0]:
-(log_result[0][0] + log_result[1][0] + log_result[2][0]+ log_result[3][0])/4
这个例子如此优秀,我感觉我不用写 loss 函数的了。
5.训练
# train for one epoch
def train(net, data_loader, train_optimizer, epoch, args):
    """Run one training epoch and return the mean contrastive loss.

    Bug fix vs. the original: the learning-rate schedule and the progress-bar
    lr readout referenced the *global* ``optimizer`` instead of the
    ``train_optimizer`` parameter; both now use the optimizer actually stepped.
    """
    net.train()
    adjust_learning_rate(train_optimizer, epoch, args)

    total_loss, total_num, train_bar = 0.0, 0, tqdm(data_loader)
    for im_1, im_2 in train_bar:
        im_1, im_2 = im_1.cuda(non_blocking=True), im_2.cuda(non_blocking=True)

        # The model computes the InfoNCE loss internally from the two views.
        loss = net(im_1, im_2)

        train_optimizer.zero_grad()
        loss.backward()
        train_optimizer.step()

        # drop_last=True on the train loader, so every batch is full-size.
        total_num += data_loader.batch_size
        total_loss += loss.item() * data_loader.batch_size
        train_bar.set_description('Train Epoch: [{}/{}], lr: {:.6f}, Loss: {:.4f}'.format(epoch, args.epochs, train_optimizer.param_groups[0]['lr'], total_loss / total_num))

    return total_loss / total_num
# lr scheduler for training
def adjust_learning_rate(optimizer, epoch, args):
    """Set every param group's lr for this epoch.

    Cosine mode anneals args.lr down to 0 over args.epochs; otherwise the lr
    is decayed by 10x for each schedule milestone already passed.
    """
    if args.cos:
        # Cosine annealing: lr = base * 0.5 * (1 + cos(pi * t / T)).
        lr = args.lr * 0.5 * (1. + math.cos(math.pi * epoch / args.epochs))
    else:
        # Step decay: one 0.1 factor per milestone reached so far.
        lr = args.lr
        for milestone in args.schedule:
            if epoch >= milestone:
                lr *= 0.1
    for group in optimizer.param_groups:
        group['lr'] = lr
常规训练,没有什么好说的。
6.测试
# test using a knn monitor
def test(net, memory_data_loader, test_data_loader, epoch, args):
    """Evaluate the encoder with a weighted-kNN classifier; return top-1 acc (%).

    The memory loader yields (image, label) pairs from the training set; their
    L2-normalized features form the bank each test feature is matched against.
    (The unused ``total_top5`` accumulator from the original was removed.)
    """
    net.eval()
    classes = len(memory_data_loader.dataset.classes)
    total_top1, total_num, feature_bank = 0.0, 0, []
    with torch.no_grad():
        # Generate the feature bank: one normalized feature per train image.
        for data, target in tqdm(memory_data_loader, desc='Feature extracting'):
            feature = net(data.cuda(non_blocking=True))
            feature = F.normalize(feature, dim=1)
            feature_bank.append(feature)
        # [D, N] -- transposed so a (B, D) query batch can matmul directly.
        feature_bank = torch.cat(feature_bank, dim=0).t().contiguous()
        # [N] labels aligned with the bank columns.
        feature_labels = torch.tensor(memory_data_loader.dataset.targets, device=feature_bank.device)
        # Loop over test data and predict labels by weighted knn search.
        test_bar = tqdm(test_data_loader)
        for data, target in test_bar:
            data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
            feature = net(data)
            feature = F.normalize(feature, dim=1)
            pred_labels = knn_predict(feature, feature_bank, feature_labels, classes, args.knn_k, args.knn_t)

            total_num += data.size(0)
            # Column 0 of pred_labels is the top-scoring class per sample.
            total_top1 += (pred_labels[:, 0] == target).float().sum().item()
            test_bar.set_description('Test Epoch: [{}/{}] Acc@1:{:.2f}%'.format(epoch, args.epochs, total_top1 / total_num * 100))

    return total_top1 / total_num * 100
这里值得说。我自认为这是个难点。有两种精度的测试模式，一个是利用Knn，一个是利用线性预测，这里也会用到上面的 memory_data。但是这个代码只用到了knn预测，没有线性预测器。我应该不会写有关线性预测器的了，我会把有线性预测器的代码链接放在下面，不难理解，就像深度学习的迁移学习，冻结主干网络，换个头，继续训练而已。
6.1 Knn monitor
with torch.no_grad():
# generate feature bank
for data, target in tqdm(memory_data_loader, desc='Feature extracting'):
# print("data1", data.shape, "target1", target.shape)
feature = net(data.cuda(non_blocking=True))
feature = F.normalize(feature, dim=1)
# print("feature1", feature.shape) # 512 * 128
feature_bank.append(feature)
# print("feature_bank1", feature_bank.shape) # # 1,2,3....->50000
# [D, N]
feature_bank = torch.cat(feature_bank, dim=0).t().contiguous()
# print("feature_bank2", feature_bank.shape) # 128,50000
# [N]
feature_labels = torch.tensor(memory_data_loader.dataset.targets, device=feature_bank.device)
图片的上半部分这一部分代码。feature_bank 记录了所有训练集数据。数据维度如图中和代码中所示。整个过程即为把所有数据的特征记录下来。每个输入的图片转化为一个128维的向量。
test_bar = tqdm(test_data_loader, desc="KNN")
for data, target in test_bar:
data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
# print("data2", data.shape, "target2", target.shape) # 512.3,32,32, 512
feature = net(data)
feature = F.normalize(feature, dim=1)
# print("feature2", feature.shape) # 512,128
pred_labels = knn_predict(feature, feature_bank, feature_labels, classes, args.knn_k, args.knn_t)
# print("pred_labels", pred_labels.shape) # 512*10
total_num += data.size(0)
# print("total_num", total_num) # 512
total_top1 += (pred_labels[:, 0] == target).float().sum().item()
test_bar.set_description('Test Epoch: [{}/{}] Acc@1:{:.2f}%'.format(epoch, args.epochs, total_top1 / total_num * 100))
下面对应这个代码。最重要的是knn_predict。对应图片的下半部分。knn_predict 函数为:
# knn monitor as in InstDisc https://arxiv.org/abs/1805.01978
# implementation follows http://github.com/zhirongw/lemniscate.pytorch and https://github.com/leftthomas/SimCLR
def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, knn_t):
    """Weighted-kNN classification of `feature` against `feature_bank`.

    feature:        [B, D] normalized query features
    feature_bank:   [D, N] normalized bank features, stored column-wise
    feature_labels: [N] integer labels aligned with the bank columns
    Returns a [B, classes] tensor of class indices sorted by descending
    vote score; column 0 is the predicted label.
    """
    batch = feature.size(0)
    # Cosine similarity of every query against every bank entry: [B, N].
    sims = feature.mm(feature_bank)
    # Keep the knn_k most similar neighbours per query: weights/indices, [B, K].
    sim_weight, sim_indices = sims.topk(k=knn_k, dim=-1)
    # Gather the labels of those neighbours row-wise: [B, K].
    sim_labels = torch.gather(feature_labels.expand(batch, -1), dim=-1, index=sim_indices)
    # Temperature-scaled soft weights: closer neighbours vote with more weight.
    sim_weight = (sim_weight / knn_t).exp()
    # One-hot encode the neighbour labels: [B*K, classes].
    one_hot_label = torch.zeros(batch * knn_k, classes, device=sim_labels.device)
    one_hot_label = one_hot_label.scatter(dim=-1, index=sim_labels.view(-1, 1), value=1.0)
    # Weighted per-class vote: [B, K, classes] * [B, K, 1] summed over K -> [B, classes].
    pred_scores = torch.sum(one_hot_label.view(batch, -1, classes) * sim_weight.unsqueeze(dim=-1), dim=1)
    # Order classes best-first for each query.
    pred_labels = pred_scores.argsort(dim=-1, descending=True)
    return pred_labels
这个knn的计算过程可以说是个难点，但不是重点。这里极力推荐这个博客：术之多。我就不重复造轮子了。简单的说，就是计算每个预测值和这50000个样本的相似度，选择相似度最高的。只不过这里还加了一个相似度权重，越相似的样本，它的权重越大，由温度系数 knn_t 控制（近邻个数则由 knn_k 控制），细节可以看上面的博客。
6.2 Linear eval
这个代码没有线性预测,这里给出几个有线性预测的代码。
博客:术之多
BYOL
SimSiam
7 总结
学习无监督学习断断续续几个月了，看了几篇论文，又看了几个代码，终于做了一个阶段性总结，对无监督对比学习有了一个不再模糊的认识。但是也只能说刚入门。学习过程中，对pytorch、python的理解加深了不少，继续努力。
1.代码来源
MoCo 官方
2.MoCo 代码解读
3.Loss function
4.Knn monitor
4.SimSiam 代码解读
5.MoCo 代码解读
6.BYOL 代码
7.SimSiam 代码。注:这里面的BYOL的代码没写全,只看SimSiam 和SimCLR即可,也可以根据上一个代码自己更改,推荐。