Softmax
Although softmax regression has "regression" in its name, it is actually a classification model. The overall idea: feed the input into a fully connected layer whose number of outputs equals the number of classes, then pass those outputs through the softmax function. In the result, each value is the predicted probability of the corresponding class, and all the probabilities sum to 1.
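Concretely, if the fully connected layer produces logits o_1, ..., o_q (one per class), softmax maps them to ŷ_i = exp(o_i) / Σ_j exp(o_j). Every ŷ_i lies in (0, 1) and the ŷ_i sum to 1, which is what lets us read them as class probabilities.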
The dimensions of the input data change as follows:
Original data: (batch size, channels, height, width) = (256, 1, 28, 28)
Flattened images: (batch size, channels * height * width) = (256, 1*28*28) = (256, 784)
Once flattened like this, the images can be processed by a fully connected network.
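As a minimal sketch of the flattening step (the names X and X_flat and the batch size 256 are only illustrative):

import torch

X = torch.rand(256, 1, 28, 28)  # a batch of Fashion-MNIST-sized images
X_flat = X.view(256, -1)        # -1 infers 1*28*28 = 784
print(X_flat.shape)             # torch.Size([256, 784])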
3.6 Implementing softmax regression from scratch
import torch
import torchvision
import numpy as np
import sys
sys.path.append("..")  # so that d2lzh_pytorch in the parent directory can be imported
import d2lzh_pytorch as d2l
print(torch.__version__)
print(torchvision.__version__)
2.0.0
0.15.0
The function below constructs iterators for the training and test data. It also handles preprocessing and decides whether to use multiple worker processes:
def load_data_fashion_mnist(batch_size, resize=None, root='~/Datasets/FashionMNIST'):
    """Download the Fashion-MNIST dataset and load it into memory."""
    # data preprocessing: optionally resize, then convert PIL images to tensors
    trans = []
    if resize:
        trans.append(torchvision.transforms.Resize(size=resize))
    trans.append(torchvision.transforms.ToTensor())
    transform = torchvision.transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, download=True, transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, download=True, transform=transform)
    # on Windows, do not use multiprocessing for data loading
    if sys.platform.startswith('win'):
        num_workers = 0  # 0 means no extra worker processes are used to read the data
    else:
        num_workers = 4
    train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    return train_iter, test_iter
3.6.1 Getting and reading the data
batch_size = 256
train_iter, test_iter = load_data_fashion_mnist(batch_size)
3.6.2 Initializing model parameters
num_inputs = 784 # (1*28*28)
num_outputs = 10
# W and b here are the parameters of a single fully connected layer
W = torch.tensor(np.random.normal(0, 0.01, (num_inputs, num_outputs)), dtype=torch.float)
b = torch.zeros(num_outputs, dtype=torch.float)
W and b are parameters that will be updated, so they need gradients:
W.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)
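As an aside, the same initialization can be done in pure PyTorch without the NumPy round trip; a sketch, assuming torch.normal's size/requires_grad keywords (available in recent PyTorch versions):

# equivalent initialization without NumPy
W = torch.normal(0, 0.01, size=(num_inputs, num_outputs), requires_grad=True)
b = torch.zeros(num_outputs, requires_grad=True)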
Summing along one dimension while keeping the original number of dimensions (keepdim=True):
X = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(X.sum(dim=0, keepdim=True))
print(X.sum(dim=1, keepdim=True))
tensor([[5, 7, 9]])
tensor([[ 6],
[15]])
3.6.3 Implementing the softmax operation
def softmax(X):
    X_exp = X.exp()
    partition = X_exp.sum(dim=1, keepdim=True)
    return X_exp / partition  # broadcasting is applied here
X = torch.rand((2, 5))
X_prob = softmax(X)
print(X_prob, X_prob.sum(dim=1))
tensor([[0.1086, 0.2626, 0.2153, 0.2120, 0.2014],
[0.2400, 0.1702, 0.1998, 0.1895, 0.2006]]) tensor([1., 1.])
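One caveat: X.exp() overflows for large logits. A common remedy (a sketch, not part of the original code) subtracts each row's maximum before exponentiating; the result is unchanged because softmax is invariant to adding a constant to all logits in a row:

def stable_softmax(X):
    # shift each row so its largest logit is 0, avoiding overflow in exp
    X_exp = (X - X.max(dim=1, keepdim=True).values).exp()
    return X_exp / X_exp.sum(dim=1, keepdim=True)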
3.6.4 Defining the model
def net(X):
    # num_inputs is fixed in advance; view here flattens each image into a vector
    return softmax(torch.mm(X.view((-1, num_inputs)), W) + b)
3.6.5 Defining the loss function
Note: y_hat here has shape (batch_size, num_classes), while y has shape (batch_size,); each element of y is the index of the true class. gather picks out each example's predicted probability for its true class:
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = torch.LongTensor([0, 2])
y_hat.gather(1, y.view(-1, 1))
tensor([[0.1000],
[0.5000]])
Cross-entropy. The returned tensor has shape (batch_size, 1), one loss value per example:
def cross_entropy(y_hat, y):
    return - torch.log(y_hat.gather(1, y.view(-1, 1)))
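A quick check with the y_hat and y defined above: the per-example losses are just the negative logs of the gathered probabilities.

print(cross_entropy(y_hat, y))
# tensor([[2.3026],   # -log(0.1)
#         [0.6931]])  # -log(0.5)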
3.6.6 Computing classification accuracy
def accuracy(y_hat, y):
    return (y_hat.argmax(dim=1) == y).float().mean().item()
print(accuracy(y_hat, y))
0.5
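This matches a hand check: y_hat.argmax(dim=1) is [2, 2] while y is [0, 2], so only the second of the two predictions is correct, giving an accuracy of 0.5.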
# This function is saved in the d2lzh_pytorch package for later use. It will be improved step by step: its full implementation is described in the "Image Augmentation" section.
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n
print(evaluate_accuracy(test_iter, net))
0.0898
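Since W is randomly initialized and b is zero, the untrained model is essentially guessing among 10 classes, so the accuracy is close to 1/10.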
3.6.7 Training the model
num_epochs, lr = 5, 0.1
# This function is saved in the d2lzh_pytorch package for later use
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()
            # zero the gradients
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            if optimizer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                optimizer.step()  # used in the "concise implementation of softmax regression" section
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
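For reference, d2l.sgd performs mini-batch stochastic gradient descent; a minimal sketch consistent with how it is called here (the loss is a sum over the batch, hence the division by batch_size):

def sgd(params, lr, batch_size):
    for param in params:
        # in-place update; dividing the summed gradient by batch_size
        # averages it over the mini-batch
        param.data -= lr * param.grad / batch_size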
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size, [W, b], lr)
epoch 1, loss 0.7872, train acc 0.748, test acc 0.794
epoch 2, loss 0.5712, train acc 0.812, test acc 0.811
epoch 3, loss 0.5247, train acc 0.826, test acc 0.819
epoch 4, loss 0.5016, train acc 0.832, test acc 0.825
epoch 5, loss 0.4845, train acc 0.837, test acc 0.824
3.6.8 Prediction
X, y = next(iter(test_iter))
print(X.shape)
true_labels = d2l.get_fashion_mnist_labels(y.numpy())
pred_labels = d2l.get_fashion_mnist_labels(net(X).argmax(dim=1).numpy())
titles = [true + '\n' + pred for true, pred in zip(true_labels, pred_labels)]
d2l.show_fashion_mnist(X[0:10], titles[0:10])
torch.Size([256, 1, 28, 28])