一起学深度学习系列——Softmax
Softmax 回归 理论知识
回归VS分类
回归
- 单连续数输出
- 自然区间 $\mathbb{R}$
- 与真实值的差距作为损失
分类
- 通常是多个输出,输出的是第 $i$ 类的置信度.
多分类
- 对类别进行编码 $y = [y_1, y_2, \dots, y_n]^T$
- 最大值作为预测 $\hat{y} = \arg\max_i o_i$,我们希望对正确类的置信度要远大于其他类别.
Softmax
- 输出匹配概率(非负,和为1)
$\hat{y} = \mathrm{softmax}(o)$
$\hat{y}_i = \dfrac{\exp(o_i)}{\sum_k \exp(o_k)}$
概率 $y$ 与 $\hat{y}$ 的差别作为损失.
交叉熵损失
- 交叉熵刻画的是两个概率向量的区别 $H(p, q) = \sum_i -p_i \log(q_i)$
- 损失函数 $L = -\sum_i y_i \log(\hat{y}_i) = -\log(\hat{y})_y$
- 其梯度 $\partial_{o_i}\, l(y, \hat{y}) = \mathrm{softmax}(o)_i - y_i$
常见的损失函数
- MSELoss $l(y, \hat{y}) = \frac{1}{2}(y - \hat{y})^2$
- L1 Loss $l(y, \hat{y}) = |y - \hat{y}|$(梯度稳定、零点不可导)
- Huber’s Robust Loss
Fashion-MNIST图像分类数据集导入
%matplotlib inline
import torch
import torchvision
from torch.utils import data
from torchvision import transforms
from d2l import torch as d2l
d2l.use_svg_display()
下载数据集
# Download Fashion-MNIST; ToTensor() converts each PIL image to a float32
# tensor with values scaled into [0, 1].
trans = transforms.ToTensor()
mnist_train = torchvision.datasets.FashionMNIST(root='../data',train=True, transform=trans, download= True)
mnist_test = torchvision.datasets.FashionMNIST(root='../data',train=False, transform=trans, download= True)
# Ratio of training-set size to test-set size (6.0 for the standard
# 60,000 / 10,000 Fashion-MNIST split).
len(mnist_train)/len(mnist_test)
可视化数据集
def get_fashion_mnist_labels(labels):
    """Map numeric Fashion-MNIST class indices to their text labels."""
    names = ('t-shirt', 'trouser', 'pullover', 'dress', 'coat',
             'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot')
    # Indices may arrive as tensors; int() normalizes them for list indexing.
    return [names[int(label)] for label in labels]
def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):
    """Plot a list of images."""
    figsize = (num_cols * scale, num_rows * scale)
    _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize)
    axes = axes.flatten()
    for idx, (ax, img) in enumerate(zip(axes, imgs)):
        # Tensors must be converted to numpy before imshow; PIL images plot directly.
        ax.imshow(img.numpy() if torch.is_tensor(img) else img)
        # Hide tick marks and labels on both axes.
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        if titles:
            ax.set_title(titles[idx])
    return axes
构造迭代器
# Grab one mini-batch of 18 examples to visualize; each image is (1, 28, 28)
# after ToTensor, hence the reshape to (18, 28, 28) below.
X, y = next(iter(data.DataLoader(mnist_train, batch_size=18)))
show_images(X.reshape(18, 28, 28), 2, 9, titles=get_fashion_mnist_labels(y));
batch_size = 256
def get_dataloader_workers():
    """Number of worker processes used to read the data (4)."""
    num_workers = 4
    return num_workers
# Training iterator: reshuffle each epoch and read batches in worker subprocesses.
train_iter = data.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=get_dataloader_workers())
读取训练数据所需的时间
# Time one full pass over the training data, to confirm data loading
# is not the training bottleneck.
timer = d2l.Timer()
for X, y in train_iter:
    continue
f'{timer.stop():.2f} sec'
展平每张图片
# Each 28x28 image is flattened into a 784-dimensional vector; 10 output classes.
num_inputs = 784
num_outputs = 10
# Weights drawn from N(0, 0.01^2); bias starts at zero. Both require gradients.
w = torch.normal(0,0.01, size=(num_inputs, num_outputs), requires_grad = True)
b = torch.zeros(num_outputs, requires_grad = True)
定义 Softmax
- 举个例子:矩阵的按轴求和
# Example: summing a matrix along one axis while keeping its dimensionality.
X = torch.tensor([[1.0 , 2.0 , 3.0],
[4.0 , 5.0 , 6.0]])
# Sum down the rows (axis 0): result keeps a row dimension of size 1.
X.sum(0, keepdim = True)
tensor([[5., 7., 9.]])
# Sum across the columns (axis 1): result keeps a column dimension of size 1.
X.sum(1, keepdim = True)
tensor([[ 6.],
[15.]])
$\mathrm{softmax}(X)_{ij} = \dfrac{\exp(X_{ij})}{\sum_k \exp(X_{ik})}$
def softmax(X):
    """Row-wise softmax: map each row of X to a probability distribution.

    Fix: the naive exp(X) overflows to inf (and then produces NaN) for
    large logits. Subtracting each row's max before exponentiating keeps
    exp() bounded; the shift cancels in the ratio, so the result is
    mathematically unchanged.
    """
    X_exp = torch.exp(X - X.max(dim=1, keepdim=True).values)
    partition = X_exp.sum(1, keepdim=True)  # per-row normalizer
    return X_exp / partition  # broadcasting divides each row by its sum
定义网络
def net(X):
    """Softmax-regression model: flatten X to (batch, 784), apply xW + b, then softmax."""
    flat = X.reshape((-1, w.shape[0]))
    logits = torch.matmul(flat, w) + b
    return softmax(logits)
如何在预测之中根据标号,取出预测值?
# Fancy indexing: for each row i of y_hat, pick the column given by y[i] —
# i.e. the predicted probability of the true class.
y = torch.tensor([0,2])
y_hat = torch.tensor([[0.1, 0.3, 0.6],
[0.3, 0.2, 0.5]])
y_hat[[0,1],y]
tensor([0.1000, 0.5000])
def cross_entropy(y_hat,y):
    """Per-sample cross-entropy: minus log of the probability assigned to the true class."""
    true_class_prob = y_hat[range(len(y_hat)), y]
    return -torch.log(true_class_prob)
# Losses for the example above: -log(0.1) ~= 2.3026 and -log(0.5) ~= 0.6931.
cross_entropy(y_hat,y)
tensor([2.3026, 0.6931])
def accuracy(y_hat,y):
    """Count the number of correct predictions."""
    preds = y_hat
    # A 2-D score matrix is reduced to hard class predictions via argmax.
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        preds = y_hat.argmax(axis = 1)
    # Cast before comparing so integer labels match argmax output dtype.
    correct = (preds.type(y.dtype) == y).type(y.dtype)
    return float(correct.sum())
accuracy(y_hat,y) / len(y)# accuracy = fraction of correct predictions
训练
def train_epoch_ch3(net, train_iter, loss, updater):
    """Train the model for one epoch (defined in chapter 3).

    Returns (average training loss, training accuracy) over the epoch.
    `updater` is either a torch.optim.Optimizer or a function of the
    batch size (as with the custom sgd updater below).
    """
    # Put the model in training mode (affects dropout/batchnorm layers).
    if isinstance(net, torch.nn.Module):
        net.train()
    # Running totals: summed loss, number correct, number of examples.
    # NOTE(review): `Accumulator` is not defined in this excerpt —
    # presumably d2l's Accumulator; confirm it is in scope.
    metric = Accumulator(3)
    for X, y in train_iter:
        # Compute gradients and update parameters.
        y_hat = net(X)
        l = loss(y_hat, y)
        if isinstance(updater, torch.optim.Optimizer):
            # Built-in optimizer path: assumes `loss` returned a batch-mean
            # scalar, hence the `float(l) * len(y)` when accumulating.
            updater.zero_grad()
            l.backward()
            updater.step()
            metric.add(float(l) * len(y), accuracy(y_hat, y),
                       y.size().numel())
        else:
            # Custom-updater path: `loss` returns per-sample losses,
            # so sum before backward and before accumulating.
            l.sum().backward()
            updater(X.shape[0])
            metric.add(float(l.sum()), accuracy(y_hat, y), y.numel())
    # Return average training loss and training accuracy.
    return metric[0] / metric[2], metric[1] / metric[2]
# Learning rate for minibatch SGD.
lr = 0.1
def updater(batch_size):
    """Update the global parameters w, b with d2l's minibatch SGD at rate lr."""
    return d2l.sgd([w, b], lr, batch_size)
num_epochs = 10
# NOTE(review): `train_ch3` and `test_iter` are not defined in this excerpt —
# presumably d2l.train_ch3 and a DataLoader over mnist_test; confirm before running.
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, updater)