import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random, math
import sklearn
import scipy
# Size definitions
N, D_in, H, D_out = 64, 1000, 100, 10

# Variable initialization
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

# Set the learning rate
learning_rate = 1e-6
1.2 Core part
for i in range(6000):
    # Forward pass
    h = x.mm(w1)                # N * H
    h_relu = h.clamp(min=0)     # N * H
    y_pred = h_relu.mm(w2)      # N * D_out

    # Loss: .item() pulls the Python number out of a single-element tensor
    loss = (y_pred - y).pow(2).sum().item()
    if i % 300 == 0:
        print(i, loss, end=' || ')

    # Backward pass
    grad_y_pred = 2 * (y_pred - y)           # N * D_out
    grad_w2 = h_relu.t().mm(grad_y_pred)     # (H * N) times (N * D_out): h_relu must be transposed, result is H * D_out
    grad_h_relu = grad_y_pred.mm(w2.t())     # N * H
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0                        # N * H, ReLU passes gradient only where h > 0
    grad_w1 = x.t().mm(grad_h)               # D_in * H, same shape as w1 so it can be subtracted from it

    # Update the weights w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
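As a sanity check on the hand-written backward pass, the analytic gradient can be compared against a finite-difference estimate. The following is a minimal sketch, not part of the original code: the eps value, the (0, 0) index, and the float64 cast are arbitrary illustrative choices.

# Sketch: finite-difference check of the manual gradient for one entry of w1.
# float64 is used for numerical headroom; eps and the (0, 0) index are arbitrary.
xd, yd, w1d, w2d = x.double(), y.double(), w1.double(), w2.double()

def manual_loss(a, b):
    return (xd.mm(a).clamp(min=0).mm(b) - yd).pow(2).sum().item()

hd = xd.mm(w1d)
grad_hd = (2 * (hd.clamp(min=0).mm(w2d) - yd)).mm(w2d.t())
grad_hd[hd < 0] = 0
grad_w1d = xd.t().mm(grad_hd)                # analytic gradient, same formula as in the loop

eps = 1e-6
w1_pert = w1d.clone()
w1_pert[0, 0] += eps
numeric = (manual_loss(w1_pert, w2d) - manual_loss(w1d, w2d)) / eps
print(numeric, grad_w1d[0, 0].item())        # the two estimates should agree closely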
2 Second improvement: using autograd
2.1 Initialization
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random, math
import sklearn
import scipy
# Size definitions
N, D_in, H, D_out = 64, 1000, 100, 10

# Variable initialization
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H, requires_grad=True)    # gradients must be tracked for the weights
w2 = torch.randn(H, D_out, requires_grad=True)

# Set the learning rate
learning_rate = 1e-6
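requires_grad=True asks autograd to record every operation on these tensors, so that a later loss.backward() call can fill in w1.grad and w2.grad. A tiny standalone illustration (toy values, not part of the original code):

# Sketch: what requires_grad enables, shown on a toy tensor.
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = (a * a).sum()     # b = a0**2 + a1**2, recorded in the autograd graph
b.backward()          # fills a.grad with db/da = 2 * a
print(a.grad)         # tensor([4., 6.])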
2.2 Core part
for i in range(6000):
    # Forward pass: h and h_relu from section 1 folded into a single expression
    y_pred = x.mm(w1).clamp(min=0).mm(w2)    # N * D_out
    # Loss: kept as a tensor so that backward() can be called on it
    loss = (y_pred - y).pow(2).sum()
    if i % 300 == 0:
        print(i, loss.item())
    # Backward pass: autograd replaces the hand-written gradient code from section 1
    loss.backward()
    # Update w1 and w2 without recording the updates in the autograd graph
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Zero the gradients; without these two lines they would keep accumulating
        # across iterations (printing them right after shows all-zero tensors)
        w1.grad.zero_()
        w2.grad.zero_()
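To see why the zero_() calls are needed, the following standalone sketch (toy values) shows that backward() adds each new gradient on top of whatever is already stored in .grad:

# Sketch: .grad accumulates across backward() calls unless it is zeroed.
w = torch.tensor([1.0], requires_grad=True)
(2 * w).sum().backward()
print(w.grad)          # tensor([2.])
(2 * w).sum().backward()
print(w.grad)          # tensor([4.]) -- the second gradient was added to the first
w.grad.zero_()
print(w.grad)          # tensor([0.])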
3 Third improvement: using torch.nn
3.1 Initialization
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random, math
import sklearn
import scipy
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)    # to run on the GPU, append .cuda() or .to('cuda')
y = torch.randn(N, D_out)   # no explicit w1 and w2 any more

# Define the model as a sequence of layers
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H, bias=False),     # computes x @ W1^T (no bias term)
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out, bias=False),
)

# Re-initializing the weights from a normal distribution is optional, but in practice
# the model converges noticeably faster at the same learning rate
torch.nn.init.normal_(model[0].weight)
torch.nn.init.normal_(model[2].weight)
# model = model.cuda()      # run the model on the GPU

# Instantiate a mean-squared-error loss that sums over all elements
loss_fn = nn.MSELoss(reduction='sum')
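With reduction='sum', MSELoss computes the same quantity as the manual (y_pred - y).pow(2).sum() used in the earlier sections. A quick standalone check on toy tensors (illustrative only):

# Sketch: MSELoss(reduction='sum') matches the hand-written sum-of-squares loss.
a = torch.randn(4, 3)
b = torch.randn(4, 3)
print(nn.MSELoss(reduction='sum')(a, b).item(),
      (a - b).pow(2).sum().item())    # the two numbers agree up to float rounding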
3.2 Core part: a qualitative leap!
learning_rate = 1e-7
for i in range(20001):
    # Forward pass
    y_pred = model(x)
    loss = loss_fn(y_pred, y)    # loss is a node in the computation graph
    if i % 2000 == 0:
        print(i, loss.item())
    # Backward pass, then update every parameter; each param's gradient lives in param.grad
    loss.backward()
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
    model.zero_grad()    # zero all gradients for the next iteration
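The Sequential model computes the same function as the hand-rolled version in sections 1 and 2; note that nn.Linear stores its weight with shape (out_features, in_features), so the manual form uses the transposes. A standalone check (variable names W1, W2, manual are illustrative):

# Sketch: model(x) equals the manual two-layer computation on the model's own weights.
with torch.no_grad():
    W1 = model[0].weight    # shape (H, D_in)
    W2 = model[2].weight    # shape (D_out, H)
    manual = x.mm(W1.t()).clamp(min=0).mm(W2.t())
    print(torch.allclose(model(x), manual, atol=1e-5))    # True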