深度学习计算
model-construction
如果将MySequential中存储块的方式更改为Python列表,会出现什么样的问题?
#list实现
class MySequential(nn.Module):
    """Sequential container that keeps its blocks in a plain Python list.

    The list is used on purpose to answer the exercise: blocks held in an
    ordinary list are not registered as sub-modules of this module, so they
    do not show up when the module is printed.
    """

    def __init__(self, *args):
        """Store the given blocks, in call order, in ``self._module``."""
        super().__init__()
        # Plain list (not nn.ModuleList): the blocks stay unregistered.
        self._module = list(args)

    def forward(self, X):
        """Pass ``X`` through every stored block in order and return the result."""
        out = X
        for layer in self._module:
            out = layer(out)
        return out
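对于网络的前向计算输出而言,没有任何区别。
下面尝试输出一下网络结构:改为list之后无法输出网络结构,只能输出MySequential()。原因是nn.Module只会把直接赋值为属性的子模块(或放在nn.ModuleList、nn.Sequential等容器中的子模块)注册到内部的_modules字典中,存放在普通Python列表里的块不会被注册。更严重的后果是:net.parameters()和net.state_dict()都是空的,优化器拿不到这些参数(无法训练),保存/加载模型以及net.to(device)也都不会作用到这些块上。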
实现一个块,它以两个块为参数,例如net1和net2,并返回前向传播中两个网络的串联输出。这也被称为平行块。
class combine_net(nn.Module):
    """Parallel block: feed the same input to two networks and concatenate.

    Args:
        net1: First branch; called as ``net1(X)``.
        net2: Second branch; called as ``net2(X)``.
        dim: Dimension passed to ``torch.cat``. The default ``0`` keeps the
            original behaviour (outputs are stacked along the first axis, so
            the two outputs must agree on every other dimension). With
            ``dim=1`` and 2-D outputs the results are joined side by side,
            and the two branches may then have different output widths.
    """

    def __init__(self, net1, net2, dim=0):
        super().__init__()
        # Assigning modules as attributes registers them as children, so
        # their parameters are visible through this module.
        self.net1 = net1
        self.net2 = net2
        self.dim = dim

    def forward(self, X):
        """Return ``torch.cat((net1(X), net2(X)), dim=self.dim)``."""
        out1 = self.net1(X)
        out2 = self.net2(X)
        return torch.cat((out1, out2), dim=self.dim)
# Both branches must produce outputs of the same width; otherwise the
# torch.cat inside combine_net fails.
net1 = NestMLP()
net2 = nn.Sequential(nn.Linear(20, 16))
comb_net = combine_net(net1, net2)
print(comb_net)
X = torch.rand(1, 20)
# Bare expression: the result is only displayed in a notebook/REPL session.
comb_net(X)
测试结果:
combine_net(
(net1): NestMLP(
(net): Sequential(
(0): Linear(in_features=20, out_features=64, bias=True)
(1): ReLU()
(2): Linear(in_features=64, out_features=32, bias=True)
(3): ReLU()
)
(linear): Linear(in_features=32, out_features=16, bias=True)
)
(net2): Sequential(
(0): Linear(in_features=20, out_features=16, bias=True)
)
)
tensor([[-0.0784, -0.0353, -0.0376, -0.0017, 0.1877, 0.0836, 0.0472, 0.0599,
-0.0981, 0.0650, 0.1564, 0.1542, -0.1637, -0.1586, -0.0867, 0.0718],
[ 0.5999, -0.1808, -0.0843, -0.4204, -0.1008, -0.1373, 0.1633, 0.2282,
0.3579, 0.0151, -0.0042, 0.1367, -0.2625, -0.1420, -0.0266, 0.3955]],
grad_fn=<CatBackward0>)
假设你想要连接同一网络的多个实例。实现一个函数,该函数生成同一个块的多个实例,并在此基础上构建更大的网络。(这个题感觉做的很生硬。。)
class NestMLP(nn.Module):
    """MLP built from a nested Sequential followed by a linear output layer.

    Layer sizes are 20 -> 64 -> 32 (each followed by ReLU) inside ``net``,
    then 32 -> 20 in ``linear``.
    """

    def __init__(self):
        super().__init__()
        hidden_layers = [
            nn.Linear(20, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*hidden_layers)
        self.linear = nn.Linear(32, 20)

    def forward(self, X):
        """Apply the nested stack, then the output layer."""
        hidden = self.net(X)
        return self.linear(hidden)
# 3
class muti_module(nn.Module):
    """Chain several freshly constructed instances of the same block.

    Args:
        basic_module: Zero-argument callable (for example an ``nn.Module``
            subclass) that returns a new block on every call. Instances are
            applied one after another, so a block's output must be a valid
            input for the next instance.
        repeat_times: Number of instances to create and chain.
    """

    def __init__(self, basic_module=NestMLP, repeat_times=3):
        super().__init__()
        self.repeat_times = repeat_times
        for i in range(repeat_times):
            # Register through the public API instead of writing into the
            # private ``_modules`` dict; each child keeps the name str(i).
            self.add_module(str(i), basic_module())

    def forward(self, X):
        """Apply the registered blocks in registration order."""
        for block in self.children():
            X = block(X)
        return X
# Batch of two samples with 20 features each.
x = torch.rand(2, 20)
muti_net = muti_module()
# Bare expressions: the forward result and the module structure are only
# displayed when run in a notebook/REPL session.
muti_net(x)
muti_net
输出:tensor([[ 6.1060e-02, -7.1103e-02, 1.2472e-01, 1.3980e-01, 1.1481e-02,
4.9800e-02, -9.4085e-02, -3.8892e-02, -6.2542e-02, -3.8083e-02,
-1.9391e-04, -1.2720e-01, -3.4345e-02, -6.5131e-02, -1.7951e-02,
-1.9207e-01, -1.0449e-01, -9.1771e-02, 1.8197e-01, -1.5724e-01],
[ 6.1160e-02, -7.0994e-02, 1.2481e-01, 1.3945e-01, 1.1382e-02,
4.9842e-02, -9.3823e-02, -3.9059e-02, -6.2436e-02, -3.8149e-02,
-3.6633e-05, -1.2710e-01, -3.4107e-02, -6.5171e-02, -1.7617e-02,
-1.9224e-01, -1.0449e-01, -9.1874e-02, 1.8193e-01, -1.5737e-01]],
grad_fn=<AddmmBackward0>)
muti_module(
(0): NestMLP(
(net): Sequential(
(0): Linear(in_features=20, out_features=64, bias=True)
(1): ReLU()
(2): Linear(in_features=64, out_features=32, bias=True)
(3): ReLU()
)
(linear): Linear(in_features=32, out_features=20, bias=True)
)
(1): NestMLP(
(net): Sequential(
(0): Linear(in_features=20, out_features=64, bias=True)
(1): ReLU()
(2): Linear(in_features=64, out_features=32, bias=True)
(3): ReLU()
)
(linear): Linear(in_features=32, out_features=20, bias=True)
)
(2): NestMLP(
(net): Sequential(
(0): Linear(in_features=20, out_features=64, bias=True)
(1): ReLU()
(2): Linear(in_features=64, out_features=32, bias=True)
(3): ReLU()
)
(linear): Linear(in_features=32, out_features=20, bias=True)
)
)
第三题改进一下,在下一节中有更好的方式:
def block1():
    """Build a 4 -> 8 -> 4 MLP with a ReLU after each linear layer."""
    layers = (nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU())
    return nn.Sequential(*layers)
def block2():
    """Nest four separate block1() instances inside a single Sequential."""
    stacked = nn.Sequential()
    for idx in range(4):
        # add_module lets every nested block carry an explicit name.
        stacked.add_module(f'block {idx}', block1())
    return stacked
# Four nested blocks followed by a 4 -> 1 linear head.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
print(rgnet)
# The network takes 4 input features. Define a matching batch here instead of
# reusing an X left over from an earlier snippet, which has 20 features and
# would be rejected by the first Linear(4, 8).
X = torch.rand(size=(2, 4))
# Bare expression: the result is only displayed in a notebook/REPL session.
rgnet(X)
输出:
Sequential(
(0): Sequential(
(block 0): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
(block 1): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
(block 2): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
(block 3): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
(3): ReLU()
)
)
(1): Linear(in_features=4, out_features=1, bias=True)
)
tensor([[-0.1540],
[-0.1540]], grad_fn=<AddmmBackward0>)
parameters
使用 :numref:sec_model_construction 中定义的FancyMLP模型,访问各个层的参数。
注:实在是没找到FancyMLP(教材该节中对应的模型应为FixedHiddenMLP,习题里的FancyMLP是它的旧名称),这里改用NestMLP代替。
class NestMLP(nn.Module):
    """Nested MLP: a two-layer ReLU stack (``net``) plus a linear head (``linear``).

    Feature sizes go 20 -> 64 -> 32 inside ``net`` and 32 -> 20 in ``linear``.
    """

    def __init__(self):
        super().__init__()
        stack = (
            nn.Linear(20, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        )
        self.net = nn.Sequential(*stack)
        self.linear = nn.Linear(32, 20)

    def forward(self, X):
        """Run ``X`` through ``net`` and then through ``linear``."""
        features = self.net(X)
        return self.linear(features)
nest = NestMLP()
print(nest)
# Parameters of the nested Sequential: names look like "0.weight", "2.bias".
# Only the first row (or first element, for a bias) of each tensor is shown.
for name, param in nest.net.named_parameters():
    print(name, param.data[0])
print("-------")
# Parameters of the output layer: "weight" and "bias".
for name, param in nest.linear.named_parameters():
    print(name, param.data[0])
#####
output:NestMLP(
(net): Sequential(
(0): Linear(in_features=20, out_features=64, bias=True)
(1): ReLU()
(2): Linear(in_features=64, out_features=32, bias=True)
(3): ReLU()
)
(linear): Linear(in_features=32, out_features=20, bias=True)
)
0.weight tensor([-0.0121, 0.1502, 0.0206, 0.0811, -0.1472, 0.2232, 0.1844, 0.1815,
0.0644, -0.0657, -0.1510, 0.1794, 0.1211, -0.1227, 0.1690, -0.1584,
0.0820, -0.1977, 0.1818, -0.2221])
0.bias tensor(0.1070)
2.weight tensor([-0.0773, -0.0455, -0.0791, 0.0426, -0.0795, 0.1097, -0.1031, -0.0393,
0.0961, 0.0029, -0.0169, -0.1043, 0.0238, -0.0089, -0.0372, 0.0023,
0.1178, -0.0222, 0.0725, -0.0785, 0.0668, -0.1187, -0.0600, -0.0005,
0.0310, 0.0084, 0.0361, 0.1194, 0.0752, -0.0097, -0.0888, -0.1221,
-0.0299, 0.0015, 0.0216, 0.1208, 0.0181, 0.0659, 0.0177, 0.0562,
0.0993, -0.1146, -0.0326, -0.0545, -0.1118, 0.0917, 0.1242, 0.0218,
-0.1049, -0.0935, 0.0307, 0.0053, -0.0480, 0.0925, 0.0805, 0.1050,
-0.1102, 0.0749, -0.0075, -0.0605, 0.1057, 0.0090, -0.0954, 0.0724])
2.bias tensor(-0.0780)
-------
weight tensor([-0.0494, -0.0380, -0.0400, -0.1171, 0.1018, -0.0619, -0.0755, 0.1669,
-0.1748, 0.0405, 0.0753, -0.0784, 0.1215, 0.1486, -0.0595, -0.1509,
0.0341, 0.1427, -0.0625, 0.0702, -0.1014, 0.0671, -0.1065, -0.0852,
0.1293, 0.1286, 0.1733, -0.1364, -0.1190, 0.0666, 0.0622, 0.0041])
bias tensor(-0.0166)
查看初始化模块文档以了解不同的初始化方法。
https://pytorch.org/docs/stable/nn.init.html
构建包含共享参数层的多层感知机并对其进行训练。在训练过程中,观察模型各层的参数和梯度。
偷懒 用下前面写的代码
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optimzer
# Synthetic data generation
def data(w, b, num):
    """Generate a noisy linear dataset ``y = x @ w + b + noise``.

    Args:
        w: Weight tensor; ``len(w)`` sets the number of features per sample.
        b: Bias added to every target.
        num: Number of samples to draw.

    Returns:
        Tuple ``(x, y)``: ``x`` is drawn from N(0, 1) with shape
        ``(num, len(w))``; ``y`` is the linear response plus Gaussian noise
        with standard deviation 0.01.
    """
    features = torch.normal(0, 1, (num, len(w)))
    clean = torch.matmul(features, w) + b
    noise = torch.normal(0, 0.01, clean.shape)
    return features, clean + noise
def data_loader(dataset, batch_size, is_train=True):
    """Wrap tensors in a ``TensorDataset`` and return a mini-batch ``DataLoader``.

    :param dataset: iterable of tensors, unpacked into ``TensorDataset(*dataset)``
    :param batch_size: number of samples per mini-batch
    :param is_train: passed to the loader as ``shuffle``; True shuffles the samples
    :return: the constructed ``torch.utils.data.DataLoader``
    """
    tensors = torch.utils.data.TensorDataset(*dataset)
    return torch.utils.data.DataLoader(
        tensors,
        batch_size=batch_size,
        shuffle=is_train,
    )
def my_plot(x, y, xlabel="", ylabel="", title="", gird=True, **kwargs):
    """Show a scatter plot of ``y`` against ``x``.

    Args:
        x: Horizontal coordinates, passed to ``plt.scatter``.
        y: Vertical coordinates, passed to ``plt.scatter``.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Figure title.
        gird: Value passed to ``plt.grid`` (grid on/off). The parameter name
            is kept as spelled so existing keyword callers keep working.
        **kwargs: Accepted but not used by this function.
    """
    plt.scatter(x, y)
    # Decorations are independent of one another; the figure is shown last.
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(gird)
    plt.show()
# Ground-truth parameters, hyper-parameters and network definition
true_w, true_b = torch.tensor([2, -3.4]), torch.tensor(4.2)
num = 1000
batchsize = 10
epoch = 10
net1 = nn.Linear(2, 1)
# net1 sits at both ends of the Sequential, so net[0] and net[2] are the same
# module object and share one weight and one bias.
# Original note: the network weights must be initialized.
net = nn.Sequential(net1, nn.Linear(1, 2), net1)
# Take Adam from torch.optim directly. The original read it from the
# `optimzer` module alias and then rebound that same name to the optimizer
# instance, so running this snippet a second time failed.
optimzer = torch.optim.Adam(net.parameters(), lr=0.03)
loss = nn.HuberLoss()
x, y = data(true_w, true_b, num)
train_loader = data_loader((x, y), batch_size=batchsize, is_train=True)
# Scatter the two input features against each other.
my_plot(x[:, 0].detach().numpy(), x[:, 1].detach().numpy())
# Training loop
losses = []
for i in range(epoch):
    loss_sum = 0.0
    for x, y in train_loader:
        y_hat = net(x)
        # nn.HuberLoss takes (input, target): the prediction goes first.
        loss_temp = loss(y_hat.reshape(y.shape), y)
        # Accumulate a plain float so the running total does not keep every
        # batch's autograd graph alive for the whole epoch.
        loss_sum += loss_temp.item()
        optimzer.zero_grad()
        loss_temp.backward()
        # net[0] and net[2] are the same layer, so this compares one gradient
        # tensor with itself and is True in every position.
        print(net[0].weight.grad == net[2].weight.grad)
        optimzer.step()
    # Each loss_temp is already a per-batch mean (HuberLoss default
    # reduction), so the epoch average divides by the number of batches,
    # not by the number of samples.
    losses.append(loss_sum / len(train_loader))
    print("epoch: ", i, "loss=", loss_sum)
    # NOTE(review): the original indentation was lost; this print is assumed
    # to run once per epoch. Dedent it if only the final values are wanted.
    print('w_hat:', net[0].weight.data, 'b_hat', net[0].bias.data)
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
#######################output
tensor([[True, True]])
tensor([[True, True]])
tensor([[True, True]])
epoch: 9 loss= tensor(0.0120, grad_fn=<AddBackward0>)
w_hat: tensor([[-1.0646, 1.8100]]) b_hat tensor([-1.0874])
为什么共享参数是个好主意?
讨论区老师的解答
共享参数通常可以节省内存,并在以下方面具有特定的好处:
- 对于图像识别中的CNN,共享参数使网络能够在图像中的任何地方而不是仅在某个区域中查找给定的特征。
- 对于RNN,它在序列的各个时间步之间共享参数,因此可以很好地推广到不同序列长度的示例。
- 对于自动编码器,编码器和解码器共享参数。 在具有线性激活的单层自动编码器中,共享权重会强制权重矩阵的不同隐藏单元之间相互正交。