目录
梯度
梯度的本意是一个向量(矢量), 表示某一函数在该点处的方向导数沿着该方向取得最大值,即函数在该点处沿着该方向(此梯度的方向)变化最快,变化率最大(为该梯度的模)。
影响梯度的因素:initialization status(初始状态) learning rate(学习率)momentom(动量)
激活函数
神经网络中的每个神经元节点接受上一层神经元的输出值作为本神经元的输入值,并将输入值传递给下一层,输入层神经元节点会将输入属性值直接传递给下一层(隐层或输出层)。在多层神经网络中,上层节点的输出和下层节点的输入之间具有一个函数关系,这个函数称为激活函数(又称激励函数)。
如果不用激励函数(其实相当于激励函数是f(x) = x),在这种情况下你每一层节点的输入都是上层输出的线性函数,很容易验证,无论你神经网络有多少层,输出都是输入的线性组合,与没有隐藏层效果相当,这种情况就是最原始的感知机(Perceptron)了,那么网络的逼近能力就相当有限。正因为上面的原因,我们决定引入非线性函数作为激励函数,这样深层神经网络表达能力就更加强大(不再是输入的线性组合,而是几乎可以逼近任意函数)。
2D函数的最优化
import torch
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def himmelblau(x):
return (x[0] ** 2 + x[1] - 11) ** 2 + (x[0] + x[1] ** 2 - 7) ** 2
#指定x y 范围
x=np.arange(-6,6,0.1)
y=np.arange(-6,6,0.1)
print('x,y range:',x.shape,y.shape)
#将1维数组x,y拼接成一个二维矩阵 作为Z的坐标
X,Y=np.meshgrid(x, y)
print('X,Y maps:',X.shape,Y.shape)
Z=himmelblau([X,Y])
fig = plt.figure('himmelblau')
ax=fig.gca(projection='3d')
ax.plot_surface(X,Y,Z)
ax.view_init(60,-30)#设置观察视角
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()
if __name__ == '__main__':
# [1., 0.], [-4, 0.], [4, 0.] 其他初始化值
# x_y存储的是坐标值(x,y),目的就是求解一个最优的x_y。
x_y = torch.tensor([0., 0.], requires_grad=True)#初始化值不同 结果可能不同 不可随意初始化
# 定义优化器,优化器的目标就是x_y,学习速率learningrate是0.001
optimizer = torch.optim.Adam([x_y], lr=1e-3)
for step in range(20000):
# 输入坐标,得到预测值
pred = himmelblau(x_y)
# 当网络参量进行反馈时,梯度是被积累的而不是被替换掉,所以把梯度信息清零
optimizer.zero_grad()
# 获取x坐标和y坐标的梯度信息
pred.backward()
# 调用一次.step(),就会优化一次x坐标 x'=x-learningrate*▽x
# 调用一次.step(),就会优化一次y坐标 y'=y-learningrate*▽y
optimizer.step()
if step % 2000 == 0:
print('step {}: x_y = {}, f(x) = {}'
.format(step, x_y.tolist(), pred.item()))
多分类问题
import torch
import os
import torchvision
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as datasets
from torchvision import transforms
batch_size=200
learning_rate=0.01
epochs=10
# 加载的代码操作
train_loader=torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,),(0.3081,))
])),
batch_size=batch_size,shuffle=True
)
test_loader=torch.utils.data.DataLoader(
datasets.MNIST('../data',train=False,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,),(0.3081,))
])),
batch_size=batch_size,shuffle=True
)
def forward(x):
x=x@w1.t()+b1
x=F.relu(x) # 因为后边使用的cross entropy包含softmax,所以这里不能用
x = x @ w2.t() + b2
x = F.relu(x)
x = x @ w3.t() + b3
x = F.relu(x) # 返回的是一个logits(即没有经过sigmoid或者softmax的层) 使用relu也可以
return x
#定义参数
# 新建三个线性层,FC1、FC2、FC3
# 在pytorch中的定义(a,b)a是ch-out输出,b是ch-in输入,也就是(输出,输入)
w1,b1=torch.randn(200,784,requires_grad=True),\
torch.zeros(200,requires_grad=True)
w2,b2=torch.randn(200,200,requires_grad=True),\
torch.zeros(200,requires_grad=True)
#10分类 最后输出是10个
w3,b3=torch.randn(10,200,requires_grad=True),\
torch.zeros(10,requires_grad=True)
#初始化
torch.nn.init.kaiming_normal_(w1)
torch.nn.init.kaiming_normal_(w2)
torch.nn.init.kaiming_normal_(w3)
# 定义一个优化器,优化的目标:w1,b1 w2 b2,w3 b3
optimizer=torch.optim.SGD([w1,b1,w2,b2,w3,b3],lr=learning_rate)
criteon=nn.CrossEntropyLoss()
for epoch in range(epochs):
for batch_idx,(data,target) in enumerate(train_loader):
data=data.view(-1,28*28)
logits=forward(data)
loss=criteon(logits,target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch_idx % 100 == 0:
print('Train Epoch:{}[{}/{} ({:.0f}%)]\tLoss:{:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()
))
test_loss = 0
correct = 0
for data, target in test_loader:
data = data.view(-1, 28 * 28)
logits = forward(data)
test_loss += criteon(logits, target).item()
pred = logits.data.max(1)[1]
correct += pred.eq(target.data).sum()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100 * correct / len(test_loader.dataset)
))
使用全连接方式
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
#设置相关参数
batch_size=200
learning_rate=0.01
epochs=10
#加载数据
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)
#定义模型
class MLP(nn.Module):
def __init__(self):
super(MLP, self).__init__()
self.model = nn.Sequential(
nn.Linear(784, 200),
nn.ReLU(inplace=True),
nn.Linear(200, 200),
nn.ReLU(inplace=True),
nn.Linear(200, 10),
nn.ReLU(inplace=True),
)
def forward(self, x):
x = self.model(x)
return x
#实例化
"""
在该方式下,pytorch会自动初始化参数
所以不需要使用torch.nn.init.kaiming_normal_(w1)
"""
net = MLP()
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
criteon = nn.CrossEntropyLoss()
for epoch in range(epochs):
for batch_idx, (data, target) in enumerate(train_loader):
data = data.view(-1, 28*28)
logits = net(data)
loss = criteon(logits, target)
optimizer.zero_grad()
loss.backward()
# print(w1.grad.norm(), w2.grad.norm())
optimizer.step()
if batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
test_loss = 0
correct = 0
for data, target in test_loader:
data = data.view(-1, 28 * 28)
logits = net(data)
test_loss += criteon(logits, target).item()
pred = logits.data.max(1)[1]
correct += pred.eq(target.data).sum()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
使用GPU加速
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
#设置相关参数
batch_size=200
learning_rate=0.01
epochs=10
#加载数据
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=batch_size, shuffle=True)
#定义模型
class MLP(nn.Module):
def __init__(self):
super(MLP, self).__init__()
self.model = nn.Sequential(
nn.Linear(784, 200),
nn.ReLU(inplace=True),
nn.Linear(200, 200),
nn.ReLU(inplace=True),
nn.Linear(200, 10),
nn.ReLU(inplace=True),
)
def forward(self, x):
x = self.model(x)
return x
#实例化
"""
在该方式下,pytorch会自动初始化参数
所以不需要使用torch.nn.init.kaiming_normal_(w1)
"""
device=torch.device('cuda:0')
net = MLP().to(device)
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
criteon = nn.CrossEntropyLoss().to(device)
for epoch in range(epochs):
for batch_idx, (data, target) in enumerate(train_loader):
data = data.view(-1, 28*28)
data,target=data.to(device),target.to(device)
logits = net(data)
loss = criteon(logits, target)
optimizer.zero_grad()
loss.backward()
# print(w1.grad.norm(), w2.grad.norm())
optimizer.step()
if batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
test_loss = 0
correct = 0
for data, target in test_loader:
data = data.view(-1, 28 * 28)
data, target = data.to(device), target.to(device)
logits = net(data)
test_loss += criteon(logits, target).item()
pred = logits.data.max(1)[1]
correct += pred.eq(target.data).sum()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))