1. 使用 NumPy 实现两层神经网络
import numpy as np
# Two-layer fully connected network (linear -> ReLU -> linear) fitted to
# random data with hand-written gradient descent, using NumPy only.

# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weight matrices; this model has no bias terms.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for it in range(500):
    # Forward pass.
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Sum-of-squared-errors loss.
    loss = np.square(y_pred - y).sum()
    print(it, loss)

    # Backward pass: chain rule applied by hand.
    # The factor is deliberately written as the float literal 2.0.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.T.dot(grad_h)

    # Gradient-descent step.
    w1 = w1 - learning_rate * grad_w1
    w2 = w2 - learning_rate * grad_w2

# Prediction with the final weights.
# NOTE(review): the flattened source does not show whether these lines were
# inside the loop; either placement leaves the same final values.
h = x.dot(w1)
h_relu = np.maximum(h, 0)
y_pred = h_relu.dot(w2)
y_pred  # notebook-style display of the result; a no-op when run as a script
2. PyTorch 的实现方式
2.1方法一
import torch
# Same two-layer network as a plain-tensor PyTorch script: the forward pass
# uses tensor operations and the gradients are still derived by hand.

# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Randomly initialised weight matrices; this model has no bias terms.
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6
for it in range(500):
    # Forward pass.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Sum-of-squared-errors loss, converted to a Python float for printing.
    loss = (y_pred - y).pow(2).sum().item()
    print(it, loss)

    # Backward pass: chain rule applied by hand.
    # The factor is deliberately written as the float literal 2.0.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # clamp(min=0) passes no gradient where h was negative
    grad_w1 = x.t().mm(grad_h)

    # Gradient-descent step.
    w1 = w1 - learning_rate * grad_w1
    w2 = w2 - learning_rate * grad_w2
2.2方法二
import torch
# Two-layer network where autograd computes the gradients: the weights are
# created with requires_grad=True and loss.backward() fills in their .grad.

# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets (no gradients needed for these).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Weights tracked by autograd.
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6
for it in range(500):
    # Forward pass: linear -> ReLU (clamp at 0) -> linear.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum-of-squared-errors loss; this tensor carries the computation graph.
    loss = (y_pred - y).pow(2).sum()
    print(it, loss.item())

    # Backward pass: populates w1.grad and w2.grad.
    loss.backward()

    # The in-place weight update must not itself be recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Gradients accumulate across backward() calls, so reset them.
        w1.grad.zero_()
        w2.grad.zero_()
2.3方法三
import torch
import torch.nn as nn
# Two-layer network built from torch.nn modules, trained with a hand-written
# gradient-descent update over model.parameters().

# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out),
)

# Author's note: if training behaves poorly, try changing the weight
# initialisation (printing `model` helps when inspecting it). With the default
# initialisation left in place, learning_rate=1e-4 was also reported to work.
# `normal_` is the in-place initialiser; the un-suffixed `normal` is deprecated.
torch.nn.init.normal_(model[0].weight)
torch.nn.init.normal_(model[2].weight)

loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-6
for it in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(it, loss.item())

    # Clear old gradients before computing new ones.
    model.zero_grad()
    loss.backward()

    # Manual gradient-descent step, kept out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():  # parameters is a method: call it
            param -= learning_rate * param.grad

model  # notebook-style display of the model; a no-op when run as a script
运行结果:
Sequential(
(0): Linear(in_features=1000, out_features=100, bias=True)
(1): ReLU()
(2): Linear(in_features=100, out_features=10, bias=True)
)
# Notebook cell: display the `weight` parameter of the module at index 0 of `model`.
model[0].weight
运行结果:
Parameter containing:
tensor([[-0.0266, -0.0164, 0.0012, ..., 0.0234, -0.0326, 0.0072],
[ 0.0200, 0.0349, 0.0044, ..., -0.0209, -0.0144, 0.0198],
[-0.0111, 0.0187, -0.0287, ..., 0.0016, -0.0272, -0.0213],
...,
[ 0.0111, 0.0292, 0.0020, ..., 0.0065, -0.0280, -0.0190],
[ 0.0114, 0.0102, -0.0292, ..., -0.0309, 0.0108, 0.0305],
[-0.0181, -0.0353, 0.0144, ..., -0.0284, -0.0189, 0.0016]],
requires_grad=True)
2.4方法四
- Adam 的学习率通常设置在 1e-4 到 1e-3 之间
import torch
import torch.nn as nn
# Two-layer network built from torch.nn modules and trained with the Adam
# optimizer instead of a hand-written update rule.

# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out),
)

# The default nn.Linear initialisation is kept here (no custom normal
# initialisation). Author's note: with the default initialisation,
# learning_rate=1e-4 was reported to train well; if training behaves poorly,
# try changing the weight initialisation.
loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for it in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(it, loss.item())

    # Clear the gradients of the parameters handed to the optimizer.
    optimizer.zero_grad()
    loss.backward()

    # Update the model parameters.
    optimizer.step()
2.5方法五
import torch
# Problem sizes: batch size, input width, hidden width, output width.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input batch, then random target batch (drawn in this order).
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
class TwoLayerNet(torch.nn.Module):
    """Two-layer fully connected network: linear -> ReLU -> linear.

    Args:
        D_in: Number of input features.
        D_out: Number of output features.
        hidden_dim: Width of the hidden layer. When omitted, the module-level
            constant ``H`` is used, which is what the two-argument form
            ``TwoLayerNet(D_in, D_out)`` has always done.
    """

    def __init__(self, D_in, D_out, hidden_dim=None):
        super().__init__()
        if hidden_dim is None:
            # Backward-compatible default: fall back to the global hidden width.
            hidden_dim = H
        # Do not end these assignments with a trailing comma: that would turn
        # the right-hand side into a one-element tuple instead of a layer.
        self.linear1 = torch.nn.Linear(D_in, hidden_dim)
        self.linear2 = torch.nn.Linear(hidden_dim, D_out)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        # clamp(min=0) is the ReLU non-linearity between the two layers.
        hidden = self.linear1(x).clamp(min=0)
        return self.linear2(hidden)
# Train the TwoLayerNet module on the random batch (x, y) with Adam.
model = TwoLayerNet(D_in, D_out)

# This section imports only `torch`, so the loss is referenced through
# `torch.nn` rather than through an `nn` alias from an earlier section.
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for it in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(it, loss.item())

    # Clear the gradients of the parameters handed to the optimizer.
    optimizer.zero_grad()
    loss.backward()

    # Update the model parameters.
    optimizer.step()