文章参考:
http://pytorch123.com/SecondSection/neural_networks/
cs231n assignment2 - http://cs231n.github.io/assignments2019/assignment2/
整体流程
- 定义网络结构
- 输入网络得到输出
- 计算损失
- 反向传播
- 更新权重
1. 定义网络结构
导入最基本的三个模块:
import torch
import torch.nn as nn
import torch.nn.functional as F
然后定义一个网络类,定义网络结构、参数并实例化:
class Net(nn.Module):
    """Small LeNet-style CNN: two conv + max-pool stages, then three linear layers.

    ``fc1`` expects 16 * 5 * 5 flattened features, which is what a
    single-channel 32x32 input yields after the two unpadded 5x5 convolutions
    and the two 2x2 max-pools (32 -> 28 -> 14 -> 10 -> 5).
    """

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the 10 output scores for each sample in the batch ``x``."""
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # A single number is enough when the pooling window is square
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten everything except the batch dimension before the linear layers
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the number of elements per sample, i.e. the product of all
        dimensions of ``x`` except the first (batch) one."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net()
这时候也可以查看一下网络的结构和参数:
# Show the layer-by-layer structure of the network.
print(net)
# Gather the learnable parameter tensors and report how many there are.
params = list(net.parameters())
print(len(params))
# The first entry is conv1's .weight
print(params[0].shape)
2. 前向传播
根据定义的网络结构给出所需的输入,网络直接计算得到输出:
# A dummy batch holding one single-channel 32x32 image. The tutorial never
# defines `input`, so without this line the name would resolve to Python's
# builtin function and the forward pass would fail. The builtin is shadowed
# on purpose: the later snippets in this article reuse the name `input`.
input = torch.randn(1, 1, 32, 32)
output = net(input)
print(output)
3. 定义损失函数(Loss)
根据网络的输出output以及target求取两者之间的loss,可以定义不同的评价函数(criterion):
# Forward pass, then score the result against a dummy target with mean squared error.
output = net(input)
# Ten random values, reshaped to one row so the target matches the output's shape.
target = torch.randn(10).view(1, -1)
criterion = nn.MSELoss()
loss = criterion(output, target)
print(loss)
4. 反向传播
现在,如果沿着 loss 的 .grad_fn 属性沿反向传播的方向逐级回溯,就会看到如下这样的计算图:
input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
-> view -> linear -> relu -> linear -> relu -> linear
-> MSELoss
-> loss
因为之前的操作都是requires_grad的,所以这时候一句命令就能够调用autograd来求取梯度:
# A single call: autograd walks the computation graph shown above and computes
# the gradient of every parameter.
# loss is a scalar, so .backward() needs no argument; it defaults to Tensor(1.)
# retain_graph=True keeps the graph's buffers alive, because the next snippet in
# this article calls loss.backward() on this same loss again; without it that
# second call raises a RuntimeError.
loss.backward(retain_graph=True)
同时,由于梯度默认是累加的,为了保证得到的是当前这一次迭代的梯度,一般需要在反向传播之前先将历史梯度清空(置零):
# Reset the gradient buffers of every parameter before the new backward pass.
net.zero_grad()

# Keep a handle on conv1's bias so its gradient can be shown before and after.
conv1_bias = net.conv1.bias
print('conv1.bias.grad before backward')
print(conv1_bias.grad)

loss.backward()

print('conv1.bias.grad after backward')
print(conv1_bias.grad)
输出:
conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([-0.0054, 0.0011, 0.0012, 0.0148, -0.0186, 0.0087])
5. 更新权重
求取各个参数的梯度之后需要对参数进行更新,最基础的方法:
weight = weight - learning_rate * gradient
但是一般情况下都会使用到各种各样的优化器,也即optim模块:
import torch.optim as optim
# Build an SGD optimizer over every parameter of the network.
optimizer = optim.SGD(net.parameters(), lr=0.01)
# One iteration of a training loop; the order of these calls matters:
optimizer.zero_grad() # clear gradients left over from the previous step
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step() # apply the parameter update using the fresh gradients
以上就是用pytorch定义一个网络的基本步骤,我们也可以对这些步骤进行封装,封装成一些函数便于调用,下面是cs231n中assignment2的封装。
实例-AlexNet-pytorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import sampler
import torchvision.datasets as dset
import torchvision.transforms as T
import numpy as np
import matplotlib.pyplot as plt
import datetime
# Number of leading training images used for training; the remainder of the
# 50000-image training split (indices NUM_TRAIN..49999) is held out for validation.
NUM_TRAIN = 49000

# Preprocessing pipeline from torchvision.transforms: convert each image to a
# tensor, then subtract the hardcoded per-channel RGB mean and divide by the
# hardcoded per-channel standard deviation.
transform = T.Compose([
    T.ToTensor(),
    T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# One Dataset object per split (train / val / test). A Dataset yields single
# examples, so each one is wrapped in a DataLoader that groups them into
# minibatches of 64. Train and val both read the CIFAR-10 training split; the
# Sampler handed to each DataLoader restricts it to its own index range.
cifar10_train = dset.CIFAR10(
    './cs231n/datasets', train=True, download=True, transform=transform)
loader_train = DataLoader(
    cifar10_train,
    batch_size=64,
    sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)),
)

cifar10_val = dset.CIFAR10(
    './cs231n/datasets', train=True, download=True, transform=transform)
loader_val = DataLoader(
    cifar10_val,
    batch_size=64,
    sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN, 50000)),
)

cifar10_test = dset.CIFAR10(
    './cs231n/datasets', train=False, download=True, transform=transform)
loader_test = DataLoader(cifar10_test, batch_size=64)
USE_GPU = True
dtype = torch.float32  # we will be using float throughout this tutorial

# Use CUDA only when it is both requested and available; otherwise use the CPU.
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Constant to control how frequently we print train loss
print_every = 100

print('using device:', device)
def check_accuracy_part34(loader, model):
    """
    Measure the classification accuracy of a model on the data in a loader.

    Inputs:
    - loader: A DataLoader yielding (x, y) minibatches. Its ``dataset.train``
      flag only selects which header line is printed ("validation" when set,
      "test" otherwise).
    - model: A PyTorch Module mapping a batch x to per-class scores.

    Returns: The fraction of correctly classified samples as a Python float.
    Also prints the correct / total counts. The model is left in evaluation mode.
    """
    if loader.dataset.train:
        print('Checking accuracy on validation set')
    else:
        print('Checking accuracy on test set')
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():  # no gradients are needed for evaluation
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)
            scores = model(x)
            # The predicted class is the index of the highest score in each row
            _, preds = scores.max(1)
            num_correct += (preds == y).sum()
            num_samples += preds.size(0)
    acc = float(num_correct) / num_samples
    print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
    return acc
下面这段是最重要的,定义了网络的训练过程:
def train_part34(model, optimizer, epochs=1):
    """
    Train a model on CIFAR-10 using the PyTorch Module API.

    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for

    Returns: Nothing, but prints the training loss and model accuracies during
    training, and finally shows a plot of the recorded validation accuracies.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    history = []  # validation accuracies, plotted at the end
    for epoch in range(epochs):
        for t, (x, y) in enumerate(loader_train):
            # check_accuracy_part34 switches the model to eval mode, so
            # training mode is re-enabled before every step.
            model.train()  # put model to training mode
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()

            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))

        # NOTE(review): the source's indentation was lost, so the nesting of this
        # evaluation is reconstructed. It is placed once per epoch so that the
        # history matches the 'epoch' axis label of the plot below -- confirm.
        acc = check_accuracy_part34(loader_val, model)
        history.append(acc)
        print()

    # Plot the recorded validation accuracy against its index in the history.
    plt.figure(figsize=(15, 6))
    plt.xlabel('epoch')
    plt.ylabel('acc')
    plt.plot(list(range(len(history))), history, 'bo-')
    plt.show()
# Placeholders; both names are assigned real objects further down in this file.
model = optimizer = None
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# 定义我的CONVNET
class LRN(nn.Module):
    """Local response normalization, implemented for the original AlexNet.

    This layer has largely been superseded by other techniques such as dropout
    and batch normalization and is rarely used nowadays.

    With ``ACROSS_CHANNELS=True`` each activation is divided by
    ``(1 + alpha * S) ** beta`` where S is the sum of squared activations over
    ``local_size`` neighbouring channels at the same spatial position.
    Otherwise it is divided by ``(1 + alpha * M) ** beta`` where M is the mean
    of squared activations over a ``local_size`` x ``local_size`` spatial window
    within the same channel. Positions outside the tensor are zero-padded.

    Inputs:
    - local_size: Size of the normalization window. An odd value is expected:
      with an even value the padded average comes out one element shorter than
      the input along the pooled dimension(s).
    - alpha: Scale applied to the pooled squared activations.
    - beta: Exponent of the normalization term.
    - ACROSS_CHANNELS: Normalize across channels (True) or within each
      channel's spatial neighbourhood (False).
    """

    def __init__(self, local_size=1, alpha=1.0, beta=0.75, ACROSS_CHANNELS=False):
        super(LRN, self).__init__()
        self.ACROSS_CHANNELS = ACROSS_CHANNELS
        self.local_size = local_size
        # "Same" padding for an odd window so the output keeps the input's shape
        pad = int((local_size - 1.0) / 2)
        if self.ACROSS_CHANNELS:
            # The padding argument of AvgPool3d is missing in very old PyTorch
            # releases (0.2.0_4 raises an error here).
            self.average = nn.AvgPool3d(kernel_size=(local_size, 1, 1),
                                        stride=1,
                                        padding=(pad, 0, 0))
        else:
            self.average = nn.AvgPool2d(kernel_size=local_size,
                                        stride=1,
                                        padding=pad)
        self.alpha = alpha
        self.beta = beta

    def forward(self, x):
        """Return ``x`` divided element-wise by its local normalization term."""
        if self.ACROSS_CHANNELS:
            # Insert a dummy dimension so the 3-D pooling window slides along
            # the channel axis, then drop it again.
            div = x.pow(2).unsqueeze(1)
            div = self.average(div).squeeze(1)
            # The pool returns a mean; multiplying by the window size turns it
            # back into a sum. This used to be a hard-coded 5, which was only
            # right for local_size == 5. The added 1.0 is the bias term.
            div = div.mul(self.alpha * self.local_size).add(1.0).pow(self.beta)
        else:
            div = x.pow(2)
            div = self.average(div)
            div = div.mul(self.alpha).add(1.0).pow(self.beta)
        x = x.div(div)
        return x
class AlexNet(nn.Module):
    """AlexNet-style CNN scaled down for small images.

    The classifier's first layer and the flatten step in ``forward`` both
    assume the feature extractor outputs 256 x 2 x 2 values per sample. That is
    what a 3 x 32 x 32 input produces: the stride-2 convolution and the three
    2x2 max-pools reduce 32 -> 16 -> 8 -> 4 -> 2.

    Inputs:
    - num_classes: Number of output scores produced per sample.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Convolutional feature extractor; LRN follows the first two pooling layers
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
            LRN(local_size=5, alpha=1e-4, beta=0.75, ACROSS_CHANNELS=True),
            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
            LRN(local_size=5, alpha=1e-4, beta=0.75, ACROSS_CHANNELS=True),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )
        # Fully connected classifier with dropout after the two hidden layers
        self.classifier = nn.Sequential(
            nn.Linear(256 * 2 * 2, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return the ``num_classes`` scores for each sample in the batch ``x``."""
        x = self.features(x)
        # Flatten to (batch, 256 * 2 * 2) for the linear layers
        x = x.view(x.size(0), 256 * 2 * 2)
        x = self.classifier(x)
        return x
# Re-derive the device, honouring the USE_GPU switch defined earlier in this
# file; previously this line ignored it and picked CUDA whenever it was
# available. train_part34 and check_accuracy_part34 read this global.
device = torch.device("cuda" if USE_GPU and torch.cuda.is_available() else "cpu")
print(device)

model = AlexNet(10)
model.to(device)

# SGD with Nesterov momentum and L2 weight decay
optimizer = optim.SGD(model.parameters(), lr=0.01,
                      momentum=0.9, nesterov=True, weight_decay=0.0005)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
################################################################################
# END OF YOUR CODE
################################################################################
# You should get at least 70% accuracy
# Time the 10-epoch training run and print the elapsed wall-clock time.
start = datetime.datetime.now()
train_part34(model, optimizer, epochs=10)
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)