优化与深度学习
优化与估计
尽管优化方法可以最小化深度学习中的损失函数值,但是本质上优化方法达到的目标与深度学习的目标并不相同
· 优化方法目标:训练集损失函数值
· 深度学习目标:测试集损失函数值(泛化性)
import matplotlib.pyplot as plt
import sys
import d2lzh as d2l
import numpy as np
import mpl_toolkits as mplot3d
def f(x):
    """Return x * cos(pi * x); this curve is plotted below as the training error."""
    angle = np.pi * x
    return x * np.cos(angle)
def g(x):
    """Return f(x) plus a 0.2 * cos(5 * pi * x) ripple; plotted below as the test error."""
    ripple = 0.2 * np.cos(5 * np.pi * x)
    return f(x) + ripple
# Draw f (labelled 'training error') and g (labelled 'test error') on one
# figure over x in [0.5, 1.5) and mark one point on each curve with an arrow.
d2l.set_figsize((5, 3))
x = np.arange(0.5, 1.5, 0.01)
# plot() returns a list of lines; the trailing comma unpacks the single line.
fig_f, = d2l.plt.plot(x, f(x), label='training error')
fig_g, = d2l.plt.plot(x, g(x), '--', c='purple', label='test error')
# The first coordinate pair is the annotated point, the second the text position.
fig_f.axes.annotate('empirical risk', (1.0, -1.2), (0.5, -1.1), arrowprops=dict(arrowstyle='->'))
fig_g.axes.annotate('expected risk', (1.1, -1.05), (0.95, -0.5), arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('risk')
d2l.plt.legend(loc='upper right')
plt.show()
优化在深度学习中的挑战
1.局部最小值
2.鞍点
3.梯度消失
· 局部最小值
# Optimisation challenges in deep learning: local minimum.
# Plot f on [-1, 2) in figure 2 and point out its local and global minima.
plt.figure(2)
d2l.set_figsize((4.5, 2.5))
x = np.arange(-1.0, 2.0, 0.1)
fig, = d2l.plt.plot(x, f(x))
fig.axes.annotate('local minimum', (-0.3, -0.25), (-0.77, -1.0), arrowprops=dict(arrowstyle='->'))
fig.axes.annotate('global minimum', (1.1, -0.95), (0.6, 0.8), arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)')
plt.show()
· 鞍点(saddle point)
# Saddle point, one-dimensional example: x**3 has zero slope at x = 0.
plt.figure(3)
x = np.arange(-2.0, 2.0, 0.1)
fig, = d2l.plt.plot(x, x**3)
fig.axes.annotate('saddle point', (0, -0.2), (-0.52, -5.0), arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)')
plt.show()
# Saddle point, two-dimensional example: wireframe of z = x**2 - y**2 with the
# origin marked by a red dot.
fig4 = plt.figure(4)
# A complex step (31j) makes mgrid return 31 evenly spaced points per axis.
x, y = np.mgrid[-1: 1: 31j, -1: 1: 31j]
z = x**2 - y**2
d2l.set_figsize((6, 4))
ax = fig4.add_subplot(111, projection='3d')
ax.plot_wireframe(x, y, z, **{'rstride': 2, 'cstride': 2})
ax.plot([0], [0], [0], 'ro', markersize=10)
ticks = [-1, 0, 1]
d2l.plt.xticks(ticks)
d2l.plt.yticks(ticks)
ax.set_zticks(ticks)
d2l.plt.xlabel('x')
d2l.plt.ylabel('y')
plt.show()
· 梯度消失(vanishing gradient)
# Vanishing gradient: plot tanh on [-2, 5) and point at the flat region near
# x = 4, where the curve is close to 1.
x = np.arange(-2.0, 5.0, 0.01)
fig, = d2l.plt.plot(x, np.tanh(x))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)')
fig.axes.annotate('vanishing gradient', (4, 1), (2, 0.0), arrowprops=dict(arrowstyle='->'))
plt.show()
凸性(convexity)
对于集合内的任意两个点的连线,如果连线上所有的点都在集合内,那么就称该集合为凸集合
两个凸集合的交集仍然是凸集合
两个凸集合的并集不一定是凸集合
函数
import numpy as np
import matplotlib.pyplot as plt
import d2lzh as d2l
def f(x):
    """Convex example: one half of x squared."""
    squared = x**2
    return 0.5 * squared
def g(x):
    """Non-convex example: cos(pi * x)."""
    angle = np.pi * x
    return np.cos(angle)
def h(x):
    """Convex example: exp(0.5 * x)."""
    half = 0.5 * x
    return np.exp(half)
# Plot f, g and h side by side on [-2, 2). On each panel a dashed purple chord
# joins the function values at x = -1.5 and x = 1.
x, segment = np.arange(-2, 2, 0.01), np.array([-1.5, 1])
d2l.use_svg_display()
_, axes = d2l.plt.subplots(1, 3, figsize=(9, 3))
for ax, func in zip(axes, [f, g, h]):
    ax.plot(x, func(x))
    ax.plot(segment, func(segment), '--', color='purple')
plt.show()
Jensen不等式
对于凸函数而言,函数值的期望大于等于期望的函数值,即 $E[f(x)] \ge f(E[x])$
性质:
· 无局部极小值
· 与凸集的关系
· 二阶条件
无局部最小值
与凸集的关系
# Evaluate z = x**2 + 0.5 * cos(2 * pi * y) on a 101 x 101 grid over
# [-1, 1] x [-1, 1]; indexing='ij' makes the first array axis follow x.
x, y = np.meshgrid(np.linspace(-1, 1, 101), np.linspace(-1, 1, 101),
                   indexing='ij')
z = x**2 + 0.5 * np.cos(2 * np.pi * y)
# Plot the 3D surface
d2l.set_figsize((6, 4))
ax = d2l.plt.figure().add_subplot(111, projection='3d')
ax.plot_wireframe(x, y, z, **{'rstride': 10, 'cstride': 10})
# Draw the contour lines at height z = -1, which is also the lower z-limit.
ax.contour(x, y, z, offset=-1)
ax.set_zlim(-1, 1.5)
# Adjust labels
for func in [d2l.plt.xticks, d2l.plt.yticks, ax.set_zticks]:
    func([-1, 0, 1])
凸函数与二阶导数
def f(x):
    """Return one half of x squared."""
    squared = x**2
    return 0.5 * squared
# Plot f on [-2, 2) together with two dash-dot polylines through points on the
# curve: a -> x -> b (purple) and the direct chord a -> b (green), where
# a = -1.5, x = -0.5 and b = 1.
x = np.arange(-2, 2, 0.01)
axb, ab = np.array([-1.5, -0.5, 1]), np.array([-1.5, 1])
d2l.set_figsize((3.5, 2.5))
fig_x, = d2l.plt.plot(x, f(x))
fig_axb, = d2l.plt.plot(axb, f(axb), '-.',color="purple")
fig_ab, = d2l.plt.plot(ab, f(ab),'g-.')
# Label the three points on the curve.
fig_x.axes.annotate('a', (-1.5, f(-1.5)), (-1.5, 1.5),arrowprops=dict(arrowstyle='->'))
fig_x.axes.annotate('b', (1, f(1)), (1, 1.5),arrowprops=dict(arrowstyle='->'))
fig_x.axes.annotate('x', (-0.5, f(-0.5)), (-1.5, f(-0.5)),arrowprops=dict(arrowstyle='->'))
梯度下降
梯度下降就是让自变量沿梯度的反方向、以较小的步长(学习率)反复迭代更新,使目标函数值逐步减小并逐渐收敛的过程
一维梯度下降
Example:
# Objective function for the one-dimensional gradient-descent example.
def f(x):
    """Return x squared."""
    squared = x**2
    return squared
# Derivative of the objective x**2.
def gradf(x):
    """Return 2 * x."""
    slope = 2 * x
    return slope
def gd(eta):
    """Run ten gradient-descent steps from x = 10 with step size eta.

    Each step subtracts eta * gradf(x) from x. The final x is printed and the
    list of all iterates (the start followed by one entry per step) is returned.
    """
    current = 10
    trajectory = [current]
    for _ in range(10):
        current = current - eta * gradf(current)
        trajectory.append(current)
    print('epoch 10, x:', current)
    return trajectory
# Run the descent with step size 0.2 and keep the iterates for plotting.
res = gd(0.2)
Show the Trace
# Plot the iterates of a one-dimensional descent on top of the objective.
def show_trace(x):
    """Draw f as a line and the iterates in x as connected markers.

    The horizontal range is symmetric, [-n, n), where n is the largest
    absolute value among the iterates.
    """
    bound = max(abs(min(x)), abs(max(x)))
    grid = np.arange(-bound, bound, 0.01)
    d2l.set_figsize((3.5, 2.5))
    curve_values = [f(point) for point in grid]
    plt.plot(grid, curve_values, '-')
    iterate_values = [f(point) for point in x]
    plt.plot(x, iterate_values, '-o')
    plt.xlabel('x')
    plt.ylabel('f(x)')
    plt.show()
局部极小值
局部最小值是梯度下降方法的一个挑战:在局部最小值附近梯度趋近于零,而梯度下降只依据当前点的梯度、沿其反方向更新自变量,因此迭代可能停留在局部最小值附近,无法到达全局最小值
学习率过小会使收敛速度过慢
学习率过大会使迭代发散、无法收敛,影响结果
Program:
def f(x):
    """Return the non-convex objective x * cos(c * x) with c = 0.15 * pi.

    NumPy is used instead of `math` because this file never imports `math`;
    it also lets the function accept arrays as well as scalars.
    """
    c = 0.15 * np.pi
    return x * np.cos(c * x)
def gradf(x):
    """Return the derivative of x * cos(c * x) with c = 0.15 * pi.

    By the product rule this is cos(c * x) - c * x * sin(c * x). NumPy is used
    instead of `math` because this file never imports `math`; it also lets the
    function accept arrays as well as scalars.
    """
    c = 0.15 * np.pi
    return np.cos(c * x) - c * x * np.sin(c * x)
def gd(x):
    """Apply the single update x <- x - x * gradf(x) and return [start, updated].

    NOTE(review): the argument serves as both the starting point and the step
    multiplier, and only one step is taken -- confirm this is intended.
    """
    trace = [x]
    step = x * gradf(x)
    x -= step
    trace.append(x)
    return trace
def show_trace(x):
    """Draw f as a line and the iterates in x as connected markers.

    The horizontal range is symmetric, [-n, n), where n is the largest
    absolute value among the iterates.
    """
    bound = max(abs(min(x)), abs(max(x)))
    grid = np.arange(-bound, bound, 0.01)
    d2l.set_figsize((3.5, 2.5))
    curve_values = [f(point) for point in grid]
    plt.plot(grid, curve_values, '-')
    iterate_values = [f(point) for point in x]
    plt.plot(x, iterate_values, '-o')
    plt.xlabel('x')
    plt.ylabel('f(x)')
    plt.show()
# Compute the update starting from 10 and plot the resulting points.
res = gd(10)
show_trace(res)
多维梯度下降
# multidimensional gradient descent
def train_2d(trainer, steps=20):
    """Iterate a two-variable update rule and record every point visited.

    Args:
        trainer: callable taking (x1, x2) and returning the updated (x1, x2).
        steps: number of updates to apply.

    Returns:
        A list of (x1, x2) tuples: the start (-5, -2) followed by one entry
        per update.
    """
    x1, x2 = -5, -2
    results = [(x1, x2)]
    for _ in range(steps):
        x1, x2 = trainer(x1, x2)
        results.append((x1, x2))
    # Report `steps` instead of the loop index: the output is the same for
    # steps >= 1, and steps=0 no longer fails on an unbound loop variable.
    print('epoch %d, x1 %f, x2 %f' % (steps, x1, x2))
    return results
def show_trace_2d(f, results):
    """Plot the visited points in `results` over the contour lines of f.

    The contours are evaluated on a grid covering [-5.5, 1.0) x [-3.0, 1.0)
    with spacing 0.1.
    """
    coordinates = zip(*results)
    plt.plot(*coordinates, '-o', color='#ff7f0e')
    axis_1 = np.arange(-5.5, 1.0, 0.1)
    axis_2 = np.arange(-3.0, 1.0, 0.1)
    x1, x2 = np.meshgrid(axis_1, axis_2)
    heights = f(x1, x2)
    plt.contour(x1, x2, heights)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.show()
# Objective function for the two-dimensional example.
def f_2d(x1, x2):
    """Return x1**2 + 2 * x2**2."""
    first_term = x1 ** 2
    second_term = 2 * x2 ** 2
    return first_term + second_term
# One gradient-descent step on x1**2 + 2 * x2**2.
def gd_2d(x1, x2):
    """Return (x1, x2) after one step of size 0.1 along the negative gradient.

    The partial derivatives are 2 * x1 and 4 * x2.
    """
    eta = 0.1
    next_x1 = x1 - eta * 2 * x1
    next_x2 = x2 - eta * 4 * x2
    return (next_x1, next_x2)
# Run the default number of gd_2d steps and plot the path over the contours of f_2d.
show_trace_2d(f_2d, train_2d(gd_2d))
自适应方法
牛顿法
牛顿法是通过Hessian二阶偏导数矩阵来完成对学习率的一个自适应
在局部最小值附近,Newton法还是可以通过调整学习率的方法来面对局部最小值附近存在的收敛发散的问题
program:
# Scale factor shared by the objective and its derivatives below.
c = 0.5


def f(x):
    """Return the objective cosh(c * x)."""
    scaled = c * x
    return np.cosh(scaled)


def gradf(x):
    """Return the first derivative, c * sinh(c * x)."""
    scaled = c * x
    return c * np.sinh(scaled)


def Hessian(x):
    """Return the second derivative, c**2 * cosh(c * x)."""
    scaled = c * x
    return c ** 2 * np.cosh(scaled)
# The learning rate defaults to 1, which gives the plain Newton step.
def Newton(eta=1):
    """Run ten Newton updates from x = 10.

    Each update subtracts eta * gradf(x) / Hessian(x) from x. The final x is
    printed and the list of all iterates (the start followed by one entry per
    update) is returned.
    """
    current = 10
    trajectory = [current]
    for _ in range(10):
        step = eta * gradf(current) / Hessian(current)
        current = current - step
        trajectory.append(current)
    print('epoch 10, x :', current)
    return trajectory
def show_trace(res):
    """Draw f as a line and the iterates in res as connected markers.

    The horizontal range is symmetric, [-n, n) with spacing 0.1, where n is
    the largest absolute value among the iterates.
    """
    bound = max(abs(max(res)), abs(min(res)))
    grid = np.arange(-bound, bound, 0.1)
    d2l.set_figsize((3.5, 2.5))
    curve_values = [f(point) for point in grid]
    plt.plot(grid, curve_values, '-')
    iterate_values = [f(point) for point in res]
    plt.plot(res, iterate_values, '-o')
    plt.xlabel('x')
    plt.ylabel('f(x)')
    plt.show()
# Run Newton's method with the default learning rate and plot its iterates.
show_trace(Newton())
收敛性分析
预处理(Hessian matrix辅助梯度下降)
随机梯度下降
随机梯度下降参数更新
Program:
def f(x1, x2):
    """Return the objective x1**2 + 2 * x2**2."""
    first_term = x1 ** 2
    second_term = x2 ** 2 * 2
    return first_term + second_term
def gradf(x1, x2):
    """Return the gradient of x1**2 + 2 * x2**2 as the pair (2 * x1, 4 * x2)."""
    partial_x1 = x1 * 2
    partial_x2 = x2 * 4
    return partial_x1, partial_x2
# Gradient step with simulated gradient noise.
def sgd(x1, x2):
    """Return (x1, x2) after one noisy gradient step.

    The exact gradient from gradf is perturbed by one np.random.normal(0.1)
    draw per coordinate (0.1 is passed as the first positional argument, so
    the scale stays at NumPy's default). The step size is the module-level
    `eta` multiplied by the value returned by the module-level scheduler `lr`.
    """
    exact_1, exact_2 = gradf(x1, x2)
    # Draw the noise for the first coordinate before the second, so the
    # random stream is consumed in a fixed order.
    noisy_1 = exact_1 + np.random.normal(0.1)
    noisy_2 = exact_2 + np.random.normal(0.1)
    # Learning rate at the current time step.
    eta_t = eta * lr()
    return x1 - eta_t * noisy_1, x2 - eta_t * noisy_2
def show_trace_2d(f, results):
    """Plot the visited points in `results` over the contour lines of f.

    The contours are evaluated on a grid covering [-5.5, 1.0) x [-3.0, 1.0)
    with spacing 0.1.
    """
    coordinates = zip(*results)
    plt.plot(*coordinates, '-o', color='#ff7f0e')
    axis_1 = np.arange(-5.5, 1.0, 0.1)
    axis_2 = np.arange(-3.0, 1.0, 0.1)
    x1, x2 = np.meshgrid(axis_1, axis_2)
    heights = f(x1, x2)
    plt.contour(x1, x2, heights)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.show()
def train_2d(trainer, steps=20):
    """Iterate a two-variable update rule and record every point visited.

    Args:
        trainer: callable taking (x1, x2) and returning the updated (x1, x2).
        steps: number of updates to apply.

    Returns:
        A list of (x1, x2) tuples: the start (-5.0, -2.0) followed by one
        entry per update.
    """
    x1, x2 = -5.0, -2.0
    results = [(x1, x2)]
    for _ in range(steps):
        x1, x2 = trainer(x1, x2)
        results.append((x1, x2))
    # Report `steps` instead of the loop index: the output is the same for
    # steps >= 1, and steps=0 no longer fails on an unbound loop variable.
    print('epoch %d, x1 %f, x2 %f' % (steps, x1, x2))
    return results
# Base learning rate read by sgd.
eta = 0.1
# Scheduler read by sgd; this one always returns 1, so the rate stays constant.
lr = (lambda: 1)
# Run 50 noisy steps and plot the path over the contours of f.
show_trace_2d(f, train_2d(sgd, steps=50))
动态学习率
Program:
# exponential
# lr = exponential
def exponential():
    """Advance the global step counter and return exp(-0.1 * ctr).

    The module-level `ctr` must be initialised (for example `ctr = 0`) before
    the first call. NumPy is used instead of `math` because this file never
    imports `math`.
    """
    global ctr
    ctr += 1
    return np.exp(-0.1 * ctr)
# polynomial
# lr = polynomial
def polynomial():
    """Advance the global step counter and return (1 + 0.1 * ctr) ** -0.5.

    The exponent is negative so that the factor shrinks as the counter grows;
    with a positive exponent the learning rate would increase at every step.
    The module-level `ctr` must be initialised (for example `ctr = 0`) before
    the first call.
    """
    global ctr
    ctr += 1
    return (1 + 0.1 * ctr) ** (-0.5)
小批量随机梯度下降
def sgd(params, states, hyperparams):
    """Apply one plain SGD step in place to every parameter in params.

    Each parameter's data is reduced by hyperparams['lr'] times its gradient.
    `states` is accepted but not used.
    """
    for param in params:
        step = hyperparams['lr'] * param.grad.data
        param.data.sub_(step)
# This function is saved in the d2lzh_pytorch package for later reuse.
def train_ch7(optimizer_fn, states, hyperparams, features, labels,
              batch_size=10, num_epochs=2):
    """Train a linear-regression model with a custom optimizer and plot the loss.

    Args:
        optimizer_fn: callable invoked as optimizer_fn([w, b], states,
            hyperparams) after every backward pass to update the parameters.
        states: passed through to optimizer_fn unchanged.
        hyperparams: passed through to optimizer_fn unchanged.
        features: input tensor; features.shape[1] sets the number of weights.
        labels: target tensor paired with features in a TensorDataset.
        batch_size: mini-batch size for the shuffled DataLoader.
        num_epochs: number of passes over the data.

    NOTE(review): `start` is only assigned inside the epoch loop, so
    num_epochs=0 would reach the final print with `start` unbound -- confirm
    callers always pass num_epochs >= 1.
    """
    # Initialise the model.
    net, loss = d2l.linreg, d2l.squared_loss
    w = torch.nn.Parameter(torch.tensor(np.random.normal(0, 0.01, size=(features.shape[1], 1)), dtype=torch.float32),
                           requires_grad=True)
    b = torch.nn.Parameter(torch.zeros(1, dtype=torch.float32), requires_grad=True)
    def eval_loss():
        # Mean loss over the full data set, as a Python float.
        return loss(net(features, w, b), labels).mean().item()
    ls = [eval_loss()]
    data_iter = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels), batch_size, shuffle=True)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            l = loss(net(X, w, b), y).mean() # use the mean loss
            # Zero the gradients (they are None before the first backward pass).
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            l.backward()
            optimizer_fn([w, b], states, hyperparams) # update the model parameters
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss()) # record the current training loss every 100 samples
    # Print the result and plot the loss curve.
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    d2l.set_figsize()
    d2l.plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    d2l.plt.xlabel('epoch')
    d2l.plt.ylabel('loss')
def train_sgd(lr, batch_size, num_epochs=2):
    """Run train_ch7 with the plain `sgd` update and learning rate lr.

    `features` and `labels` are read from module scope; no optimizer state is
    passed.
    """
    hyperparams = {'lr': lr}
    train_ch7(sgd, None, hyperparams, features, labels, batch_size, num_epochs)
批量归一化和残差网络(BatchNormalization)
对输入的标准化(浅层模型)
处理后的任意一个特征在数据集中所有样本上的均值是0,标准差是1
标准化处理输入数据使各个特征的分布相近
批量归一化(深度模型)
利用小批量上的均值和标准差,不断调整神经网络中间输出,从而使整个神经网络在各层的中间输出的数值更稳定
对全连接层作批量归一化
对卷积层作批量归一化
位置:卷积计算之后、应用激活函数之前
如果卷积计算输出多个通道,我们需要对这些通道的输出分别作批量归一化,且每一个通道都拥有独立的拉伸和偏移参数
计算:对于单通道,batchsize=m,卷积计算输出=pxq,对该通道中的mpq个元素同时作批量归一化,使用相同的均值和方差
预测时的批量归一化
训练:以batch为单位,对每个batch计算均值和方差
预测:用移动平均估算整个训练数据集的样本均值和方差
Program:
import time
import torch
from torch import nn, optim
import torch.nn.functional as F
import torchvision
import sys
sys.path.append("/home/kesci/input/")
import d2lzh1981 as d2l
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Apply batch normalisation and return the updated moving statistics.

    Args:
        is_training: True to normalise with the statistics of X and update the
            moving averages; False to normalise with the moving averages.
        X: input of rank 2 (normalised over dim 0) or rank 4 (normalised over
            dims 0, 2 and 3, i.e. separately for each index of dim 1).
        gamma: scale applied after normalisation.
        beta: shift applied after normalisation.
        moving_mean: running mean, used as-is in prediction mode.
        moving_var: running variance, used as-is in prediction mode.
        eps: constant added to the variance before the square root.
        momentum: weight given to the previous moving statistics.

    Returns:
        (Y, moving_mean, moving_var). In prediction mode the moving statistics
        are returned unchanged.

    Raises:
        AssertionError: in training mode, if X is not of rank 2 or 4.
    """
    # Decide whether we are in training mode or prediction mode.
    if not is_training:
        # Prediction mode: use the moving averages that were passed in.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Fully connected case: statistics over the batch dimension.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 2-D convolution case: statistics over every dimension except
            # dim 1. keepdim=True keeps the result broadcastable against X.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # Training mode: normalise with the statistics of the current batch.
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the moving averages from detached batch statistics so they
        # do not carry autograd history from one iteration to the next.
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean.detach()
        moving_var = momentum * moving_var + (1.0 - momentum) * var.detach()
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var
class BatchNorm(nn.Module):
    """Batch-normalisation layer built on the `batch_norm` function.

    Args:
        num_features: number of outputs (fully connected) or channels (conv).
        num_dims: 2 for fully connected inputs; any other value selects the
            four-dimensional convolutional parameter shape.
    """

    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)  # fully connected outputs
        else:
            shape = (1, num_features, 1, 1)  # channels
        # Learnable scale and shift: gamma starts at 1 and beta at 0.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Moving statistics take no gradient. Registering them as buffers puts
        # them in state_dict() and lets Module.to()/cuda() move them together
        # with the parameters. Both start at 0.
        self.register_buffer('moving_mean', torch.zeros(shape))
        self.register_buffer('moving_var', torch.zeros(shape))

    def forward(self, X):
        """Normalise X, updating the moving statistics while in training mode."""
        # self.training defaults to True and becomes False after .eval().
        Y, new_mean, new_var = batch_norm(
            self.training, X, self.gamma, self.beta,
            self.moving_mean, self.moving_var, eps=1e-5, momentum=0.9)
        # Store detached values so the buffers never hold autograd history.
        self.moving_mean = new_mean.detach()
        self.moving_var = new_var.detach()
        return Y
残差网络(ResNet)
深度学习的问题:深度CNN网络达到一定深度后再一味增加层数并不能带来进一步的分类性能提高,反而会招致网络收敛变得更慢,准确率也变得更差
残差块(residual block)
在残差块中,输入可通过跨层的数据线路更快地向前传播
稠密连接网络(DenseNet)
主要构建模块
稠密块(dense block):定义了输入和输出是如何连接的
过渡层(transition layer):用来控制通道数,使之不过大