Preface
These are my notes from the September 29 computer-vision fundamentals lecture on AlexNet, split into two chapters:
- Week 6 homework: LeNet;
- AlexNet.
一、Week 6 Homework: LeNet
#coding:utf-8
# code for week2,recognize_computer_vision.py
# houchangligong,zhaomingming,20200602,
import torch
from torch import nn
from itertools import product
import sys
from mnist import MNIST
import cv2
import numpy as np
def model(feature,layers):
    # LeNet-5 forward pass: conv1 -> pool2 -> conv3 -> pool4 -> conv5 -> f6 -> output
    B = len(feature)
    fea = torch.tensor(feature).view(B,1,28,28).float()
    fea = torch.relu(layers[0](fea))
    fea = layers[1](fea)
    fea = torch.relu(layers[2](fea))
    fea = layers[3](fea)
    fea = torch.relu(layers[4](fea))
    # conv5 outputs 120 feature maps of size 1x1; flatten before the linear layers
    fea = fea.view(B,120)
    fea = torch.relu(layers[5](fea))
    y = torch.sigmoid(layers[6](fea))
    #y = torch.softmax(y,1)
    return y
def get_acc(image_data,image_label,layers,start_i,end_i):
    # accuracy over samples [start_i, end_i)
    correct = 0
    for i in range(start_i,end_i):
        y = model(image_data[i:i+1],layers)
        gt = image_label[i]
        pred = torch.argmax(y).item()
        if gt==pred:
            correct += 1
    return float(correct)/float(end_i-start_i)
def train_model(image_data,image_label,layers,lr):
    loss_value_before = 1e15
    loss_value = 1e13
    for epoch in range(0,300):
        loss_value_before = loss_value
        loss_value = 0
        # train on the first 80 of the 100 loaded samples as a single batch;
        # the remaining 20 serve as the test split
        B = 80
        y = model(image_data[0:B],layers)
        gt = torch.tensor(image_label[0:B]).view(B,1)
        # one-hot encode the labels
        gt_vector = torch.zeros(B,10).scatter_(1,gt,1)
        # squared-error loss over all 10 class outputs:
        # push the positive class toward 1 and the negatives toward 0
        loss = torch.sum((y-gt_vector).mul(y-gt_vector))
        loss_value += loss.data.item()
        # manual SGD step: w = w - grad*lr for every layer with parameters
        loss.backward()
        for i in [0,2,4,5,6]:
            layers[i].weight.data.sub_(layers[i].weight.grad.data*lr)
            layers[i].weight.grad.data.zero_()
            layers[i].bias.data.sub_(layers[i].bias.grad.data*lr)
            layers[i].bias.grad.data.zero_()
        train_acc = get_acc(image_data,image_label,layers,0,80)
        test_acc  = get_acc(image_data,image_label,layers,80,100)
        print("epoch=%s,loss=%s/%s,train/test_acc:%s/%s"%(epoch,loss_value,loss_value_before,train_acc,test_acc))
    return layers
if __name__=="__main__":
    # read the learning rate from the command line
    lr = float(sys.argv[1])
    layers = []
    # conv1: 1 -> 6 channels, 5x5 kernel, padding 2 keeps the 28x28 size
    conv1 = nn.Conv2d(1,6,kernel_size=5,stride=1,padding=2)
    layers.append(conv1)
    pool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
    layers.append(pool2)
    # conv3: 6 -> 16 channels, 5x5 kernel, no padding: 14x14 -> 10x10
    conv3 = nn.Conv2d(6,16,kernel_size=5,stride=1,padding=0)
    layers.append(conv3)
    pool4 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
    layers.append(pool4)
    # conv5: 16 -> 120 channels, 5x5 kernel on the 5x5 map -> 1x1
    conv5 = nn.Conv2d(16,120,kernel_size=5,stride=1,padding=0)
    layers.append(conv5)
    f6 = nn.Linear(120, 84)
    layers.append(f6)
    output = nn.Linear(84,10)
    layers.append(output)
    # load the data: MNIST 28x28, 60000 training samples
    mndata = MNIST('../week4/mnist/python-mnist/data/')
    image_data_all, image_label_all = mndata.load_training()
    image_data = image_data_all[0:100]
    image_label = image_label_all[0:100]
    # run the untrained model once on the data
    y = model(image_data,layers)
    # accuracy of the untrained model
    print("acc before training=%s"%(get_acc(image_data,image_label,layers,80,100)))
    # train the model
    train_model(image_data,image_label,layers,lr)
    # accuracy after training
    print("acc after training=%s"%(get_acc(image_data,image_label,layers,80,100)))
二、AlexNet
1、Dataset
- ImageNet;
- ILSVRC-2010: test-set labels are available.
2、Forward Pass
- Input image: $224 \times 224 \times 3$;
- Kernels of the first layer: $96 \times (11 \times 11 \times 3)$, stride = 4;
- Kernels of the second layer: $256 \times (5 \times 5 \times 48)$;
- Kernels of the third layer: $384 \times (3 \times 3 \times 256)$;
- Kernels of the fourth layer: $384 \times (3 \times 3 \times 192)$;
- Kernels of the fifth layer: $256 \times (3 \times 3 \times 192)$.
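The 48- and 192-channel kernel depths come from the original network's two-GPU split: layers 2, 4 and 5 only see the half of the previous layer's feature maps stored on the same GPU. The spatial sizes implied by this list can be checked in a few lines; a minimal sketch, assuming the strides and paddings used in the PyTorch code later in this post (the paper leaves some of the padding implicit):

```python
# Trace the spatial size through AlexNet's conv/pool stack.
def conv_out(size, kernel, stride=1, padding=0):
    return (size + 2 * padding - kernel) // stride + 1

s = 224
s = conv_out(s, 11, stride=4, padding=2)  # conv1 -> 55
s = conv_out(s, 3, stride=2)              # pool  -> 27
s = conv_out(s, 5, padding=2)             # conv2 -> 27
s = conv_out(s, 2, stride=2)              # pool  -> 13
s = conv_out(s, 3, padding=1)             # conv3 -> 13
s = conv_out(s, 3, padding=1)             # conv4 -> 13
s = conv_out(s, 3, padding=1)             # conv5 -> 13
s = conv_out(s, 3, stride=2)              # pool  -> 6
print(256 * s * s)                        # 9216, the input width of the first FC layer
```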
3、Backpropagation
Parameter update rule:
$$
v_{i+1} := 0.9\,v_i - 0.0005\,\epsilon\,\omega_i - \epsilon\left(\frac{\partial L}{\partial \omega}\Big|_{\omega_i}\right)_{D_i}
$$

$$
\omega_{i+1} := \omega_i + v_{i+1}
$$

where $v_i$ is the momentum variable, $\epsilon$ is the learning rate, and $\left(\frac{\partial L}{\partial \omega}\big|_{\omega_i}\right)_{D_i}$ is the gradient of the loss averaged over batch $D_i$; the constants 0.9 and 0.0005 are the momentum coefficient and the weight decay.
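The training scripts in this post update weights with plain SGD, but the rule above adds momentum and weight decay. A minimal PyTorch sketch of one update step (the function name `sgd_momentum_step` and the default learning rate are my own, not from the paper):

```python
import torch

# One step of the update rule above:
#   v <- 0.9*v - 0.0005*eps*w - eps*grad
#   w <- w + v
def sgd_momentum_step(params, velocities, eps=0.01, momentum=0.9, weight_decay=0.0005):
    with torch.no_grad():
        for w, v in zip(params, velocities):
            v.mul_(momentum).sub_(weight_decay * eps * w).sub_(eps * w.grad)
            w.add_(v)
            w.grad.zero_()
```

The velocities must be zero-initialized tensors with the same shapes as the parameters, e.g. `velocities = [torch.zeros_like(p) for p in params]`.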
4、Quantitative & Qualitative Comparison
The code is as follows:
#coding:utf-8
# code for week2,recognize_computer_vision.py
# houchangligong,zhaomingming,20200602,
import torch
from torch import nn
from itertools import product
import sys
from mnist import MNIST
import cv2
import numpy as np
import time

# print_f was not defined in the original listing; alias it to print
print_f = print
def get_flops(layer,fea):
    # layer: conv2d
    # estimate multiplies: output elements x (k^2 * in_channels)
    output_pixels = fea.data.view(-1).shape
    flops_per_pixel = layer.kernel_size[0]**2*layer.in_channels
    flops = float(output_pixels[0]*flops_per_pixel)
    print_f(layer)
    print_f("flops=%s / %.2f M / %.2f G "%(flops,flops/(1000.**2),flops/(1000.**3)))
    return flops
def forward_flops(layer,fea):
    # run relu(layer(fea)) while measuring wall time and estimated FLOPs
    start_time = time.perf_counter()   # time.clock() was removed in Python 3.8
    fea = torch.relu(layer(fea))
    end_time = time.perf_counter()
    flops = get_flops(layer,fea)
    time_cost = end_time-start_time
    print_f("time cost:%s S,computer flops:%s "%(time_cost,flops/(1000.**3)/time_cost))
    return fea,time_cost,flops
def dropout(fea,flag="train"):
    if flag=="train":
        # random keep mask; each unit survives with probability ~0.5 on average
        size = fea.shape
        a = torch.empty(size[0],size[1]).uniform_(0, 1)
        p = torch.bernoulli(a)
        fea = fea*p
    elif flag=="evaluate":
        # at test time, scale activations by the keep probability instead
        fea = fea*0.5
    return fea
def model(feature,layers):
    # running sums of time cost and FLOPs over the conv layers
    tcs = 0
    fls = 0
    B = len(feature)
    fea = torch.tensor(feature).view(B,1,28,28).float()
    # upsample MNIST to the input size AlexNet expects
    fea = nn.functional.interpolate(fea,(224,224),mode='nearest')
    # replicate the single gray channel to fake an RGB image
    fea = torch.cat([fea,fea,fea],1)
    B = fea.shape[0]
    print_f("feature map size:[%s,%s,%s,%s]"%tuple(fea.shape))
    # conv1, timed by hand
    start_time = time.perf_counter()
    fea = torch.relu(layers[0](fea))
    end_time = time.perf_counter()
    flops = get_flops(layers[0],fea)
    time_cost = end_time-start_time
    print_f("time cost:%s S"%(end_time-start_time))
    tcs += time_cost
    fls += flops
    # pool
    fea = layers[1](fea)
    # conv3
    fea,tc,fl = forward_flops(layers[2],fea)
    tcs += tc
    fls += fl
    # pool
    fea = layers[3](fea)
    # conv5, conv6, conv7
    fea,tc,fl = forward_flops(layers[4],fea)
    tcs += tc
    fls += fl
    fea,tc,fl = forward_flops(layers[5],fea)
    tcs += tc
    fls += fl
    fea,tc,fl = forward_flops(layers[6],fea)
    tcs += tc
    fls += fl
    print_f("sum_time_cost:%s,sum_flops:%s,computer_flops:%s"%(tcs,fls,fls/tcs/(1000.**3)))
    # pool, then flatten 256x6x6 = 9216 for the fully connected layers
    fea = layers[7](fea)
    fea = fea.view(B,9216)
    fea = torch.relu(layers[8](fea))
    fea = dropout(fea)
    fea = torch.relu(layers[9](fea))
    fea = dropout(fea)
    y = torch.sigmoid(dropout(layers[10](fea)))
    #y = torch.softmax(y,1)
    return y
def get_acc(image_data,image_label,layers,start_i,end_i):
    correct = 0
    for i in range(start_i,end_i):
        y = model(image_data[i:i+1],layers)
        gt = image_label[i]
        pred = torch.argmax(y).item()
        if gt==pred:
            correct += 1
    return float(correct)/float(end_i-start_i)
def train_model(image_data,image_label,layers,lr):
    loss_value_before = 1e15
    loss_value = 1e13
    for epoch in range(0,300):
        loss_value_before = loss_value
        loss_value = 0
        # train on the first 80 of the 100 loaded samples as a single batch
        B = 80
        y = model(image_data[0:B],layers)
        gt = torch.tensor(image_label[0:B]).view(B,1)
        # one-hot encode the labels into the 1000-way output
        gt_vector = torch.zeros(B,1000).scatter_(1,gt,1)
        # squared-error loss over all class outputs:
        # push the positive class toward 1 and the negatives toward 0
        loss = torch.sum((y-gt_vector).mul(y-gt_vector))
        loss_value += loss.data.item()
        # manual SGD step: w = w - grad*lr for every layer with parameters
        loss.backward()
        for i in [0,2,4,5,6,8,9,10]:
            layers[i].weight.data.sub_(layers[i].weight.grad.data*lr)
            layers[i].weight.grad.data.zero_()
            layers[i].bias.data.sub_(layers[i].bias.grad.data*lr)
            layers[i].bias.grad.data.zero_()
        train_acc = get_acc(image_data,image_label,layers,0,80)
        test_acc  = get_acc(image_data,image_label,layers,80,100)
        print("epoch=%s,loss=%s/%s,train/test_acc:%s/%s"%(epoch,loss_value,loss_value_before,train_acc,test_acc))
    return layers
def print_params_num(layers):
    # count trainable parameters (weights + biases) per layer and in total
    params_num = 0
    params_num_K = 0
    params_num_M = 0
    print("-"*20)
    for i in [0,2,4,5,6,8,9,10]:
        nw = layers[i].weight.data.view(-1).shape
        nb = layers[i].bias.data.view(-1).shape
        layer_num = nw[0]+nb[0]
        layer_num_K = layer_num/1000.
        layer_num_M = layer_num_K/1000.
        print(layers[i])
        print("layer[%s] has %s / %sK / %sM params"%(i,layer_num,layer_num_K,layer_num_M))
        params_num += layer_num
        params_num_K += layer_num_K
        params_num_M += layer_num_M
    print("alexnet has %s / %sK / %sM params need to train"%(params_num,params_num_K,params_num_M))
    print("-"*20)
if __name__=="__main__":
    # read the learning rate from the command line
    lr = float(sys.argv[1])
    layers = []
    # conv1: 3 -> 96 channels, 11x11 kernel, stride 4: 224x224 -> 55x55
    conv1 = nn.Conv2d(3,96,kernel_size=11,stride=4,padding=2)
    layers.append(conv1)
    # overlapping max pooling: 55x55 -> 27x27
    pool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True)
    layers.append(pool2)
    # conv3: 96 -> 256 channels, 5x5 kernel, padding 2 keeps 27x27
    conv3 = nn.Conv2d(96,256,kernel_size=5,stride=1,padding=2)
    layers.append(conv3)
    # 27x27 -> 13x13
    pool4 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
    layers.append(pool4)
    # conv5/conv6/conv7: 3x3 kernels, padding 1 keeps 13x13
    conv5 = nn.Conv2d(256,384,kernel_size=3,stride=1,padding=1)
    layers.append(conv5)
    conv6 = nn.Conv2d(384,384,kernel_size=3,stride=1,padding=1)
    layers.append(conv6)
    conv7 = nn.Conv2d(384,256,kernel_size=3,stride=1,padding=1)
    layers.append(conv7)
    # 13x13 -> 6x6
    pool8 = nn.MaxPool2d(kernel_size=3, stride=2, padding=0)
    layers.append(pool8)
    # fully connected layers: 256*6*6 = 9216 -> 4096 -> 4096 -> 1000
    fc9 = nn.Linear(9216, 4096)
    layers.append(fc9)
    fc10 = nn.Linear(4096, 4096)
    layers.append(fc10)
    fc11 = nn.Linear(4096, 1000)
    layers.append(fc11)
    # parameter count:
    print_params_num(layers)
    # load the data: MNIST 28x28, 60000 training samples
    mndata = MNIST('../week4/mnist/python-mnist/data/')
    image_data_all, image_label_all = mndata.load_training()
    image_data = image_data_all[0:100]
    image_label = image_label_all[0:100]
    # run the untrained model once on the data
    y = model(image_data,layers)
    # accuracy of the untrained model
    print("acc before training=%s"%(get_acc(image_data,image_label,layers,80,100)))
    # train the model
    train_model(image_data,image_label,layers,lr)
    # accuracy after training
    print("acc after training=%s"%(get_acc(image_data,image_label,layers,80,100)))