此篇尝试了几组实验:正常的训练测试、将偏置值设为 0 的测试,以及增加隐藏层数的测试。
1.正常的测试
import torch
from torch import nn
from d2l import torch as d2l
batch_size = 256
# Load the Fashion-MNIST dataset as training/test iterators (d2l helper).
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
# Initialize model parameters: 784 inputs (28x28 images), 10 classes, 256 hidden units.
num_inputs, num_outputs, num_hiddens = 784, 10, 256
# Weights are scaled by 0.01 to keep initial activations small; biases start at zero.
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad=True) * 0.01)
b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))
W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs, requires_grad=True) * 0.01)
b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad=True))
# All trainable parameters, handed to the optimizer below.
params = [W1, b1, W2, b2]
# 激活函数
# Activation function
def relu(X):
    """Element-wise ReLU: returns max(X, 0).

    Note: the original body lines had lost their indentation, which made the
    file unparseable; indentation is restored here.
    """
    # zeros_like matches X's shape and dtype, so torch.max compares element-wise.
    a = torch.zeros_like(X)
    return torch.max(X, a)
# 模型
# Model: a one-hidden-layer MLP built from the raw parameters above.
def net(X):
    """Flatten X to (batch, num_inputs) and return the output logits.

    No softmax here: CrossEntropyLoss below operates on raw logits.
    (Indentation of the body was restored; the original lines were at column 0.)
    """
    X = X.reshape((-1, num_inputs))
    H = relu(torch.mm(X, W1) + b1)  # hidden layer
    return torch.mm(H, W2) + b2
# 损失函数
# Loss function: reduction='none' keeps one loss value per sample
loss = nn.CrossEntropyLoss(reduction='none')
# Training
num_epochs, lr = 10, 0.1
updater = torch.optim.SGD(params, lr=lr)  # optimizer that updates the parameters
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)
结果如图所示。
2.增加隐藏层数,修改以下内容:
# Initialize model parameters for a 3-hidden-layer MLP.
# Layer widths: 784 -> 256 -> 128 -> 64 -> 10.
# Fix: the original mixed named dimensions with magic numbers (256, 128, 10),
# so resizing one layer silently broke shape agreement; all dims are named now.
num_inputs, num_outputs, num_hiddens = 784, 10, 256
num_hiddens2, num_hiddens3 = 128, 64

W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad=True) * 0.01)
b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))
# Extra layers added for the exercise
W2 = nn.Parameter(torch.randn(num_hiddens, num_hiddens2, requires_grad=True) * 0.01)
b2 = nn.Parameter(torch.zeros(num_hiddens2, requires_grad=True))
W3 = nn.Parameter(torch.randn(num_hiddens2, num_hiddens3, requires_grad=True) * 0.01)
b3 = nn.Parameter(torch.zeros(num_hiddens3, requires_grad=True))
W4 = nn.Parameter(torch.randn(num_hiddens3, num_outputs, requires_grad=True) * 0.01)
b4 = nn.Parameter(torch.zeros(num_outputs, requires_grad=True))
params = [W1, b1, W2, b2, W3, b3, W4, b4]
# Model: three hidden layers, each followed by ReLU.
def net(X):
    """Flatten X to (batch, num_inputs) and return logits of the 4-layer MLP.

    (Indentation of the body was restored; the original lines were at column 0.)
    """
    X = X.reshape((-1, num_inputs))
    H1 = relu(torch.mm(X, W1) + b1)
    H2 = relu(torch.mm(H1, W2) + b2)
    H3 = relu(torch.mm(H2, W3) + b3)
    return torch.mm(H3, W4) + b4
只训练很少的轮数时,效果就已经很不错了。
3.原本的模型下,将偏置值设置为 0 的结果
我没有看出明显的区别。
4.将初始化模型参数的方法由 torch.randn*0.01 修改为数值较大的高斯分布后,情况如下所示:
# Initialize model parameters with a larger-variance Gaussian (std = 1,
# instead of the usual randn * 0.01 scaling).
num_inputs, num_outputs, num_hiddens = 784, 10, 256

def _gaussian_weight(fan_in, fan_out):
    # Weight matrix drawn from N(0, 1), tracked for autograd.
    return nn.Parameter(torch.normal(0, 1, size=(fan_in, fan_out), requires_grad=True))

W1 = _gaussian_weight(num_inputs, num_hiddens)
b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))
W2 = _gaussian_weight(num_hiddens, num_outputs)
b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad=True))
params = [W1, b1, W2, b2]
结果如图所示。初始化数值的大小对训练结果的影响还是比较大的。