HW2_2 Hessian Matrix
When training a model, we often want to know whether it has reached a local minimum, a saddle point, or neither. The Hessian matrix can be used to tell these cases apart.
Assumptions: a gradient norm below 1e-3 is treated as zero; if the minimum ratio (the fraction of positive Hessian eigenvalues) is greater than 0.5 and the gradient norm is below 1e-3, the point is regarded as a local minimum.
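To make the criterion concrete, here is a minimal sketch (the function name and test matrices are my own illustration, not part of the homework code): assuming the gradient norm is already below 1e-3, the point is classified by the fraction of positive Hessian eigenvalues.

import torch

def classify_critical_point(hessian):
    eigvals = torch.linalg.eigvalsh(hessian)  # eigenvalues of the real symmetric Hessian
    minimum_ratio = (eigvals > 0).float().mean().item()  # fraction of positive eigenvalues
    # ratio above 0.5 (with a near-zero gradient) => treated as a local minimum
    return 'local minimum' if minimum_ratio > 0.5 else 'saddle point'

print(classify_critical_point(torch.tensor([[2., 0.], [0., 1.]])))   # all positive -> local minimum
print(classify_critical_point(torch.tensor([[2., 0.], [0., -1.]])))  # mixed signs -> saddle point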
Important notice
Since I'm only a guest student and don't plan to use the course's Canvas, I simply used the student ID shown in the tutorial video to get past the check.
student_id = 'b06901135' # fill with your student ID
assert student_id != 'your_student_id', 'Please fill in your student_id before you start.'
package installation
!pip install autograd-lib # install autograd-lib, a helper library for computing gradients and Hessians
import numpy as np
from math import pi
from collections import defaultdict
from autograd_lib import autograd_lib
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import warnings
warnings.filterwarnings("ignore") # use these two lines when there are too many warnings; they silence all the red warning output, much easier on the eyes
NN model definition
class MathRegressor(nn.Module): # a simple one-hidden-layer regressor for fitting a 1-D math function
    def __init__(self, num_hidden=128):
        super().__init__()
        self.regressor = nn.Sequential(
            nn.Linear(1, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, 1)
        )

    def forward(self, x):
        x = self.regressor(x)
        return x
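A quick shape check (my own snippet, not from the homework): the regressor maps a batch of scalar inputs to scalar outputs.

x = torch.randn(6, 1)            # a batch of 6 scalar inputs
print(MathRegressor()(x).shape)  # torch.Size([6, 1])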
get pretrained checkpoints
The checkpoints here were trained ahead of time; we just download and use them directly.
!gdown --id 1ym6G7KKNkbsqSnMmnxdQKHO1JBoF0LPR
# find the key from student_id
import re
key = student_id[-1] # last character of the student ID
if re.match('[0-9]', key) is not None: # if it is a digit character, convert it to an int
    key = int(key)
else: # otherwise map the letter to a digit via its ASCII value mod 10
    key = ord(key) % 10
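For example, the 'b06901135' used above ends in '5', so key = 5.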
# load checkpoint and data corresponding to the key
model = MathRegressor()
autograd_lib.register(model)
data = torch.load('data.pth')[key]
model.load_state_dict(data['model'])
train, target = data['data']
Compute the gradient norm
# function to compute gradient norm
def compute_gradient_norm(model, criterion, train, target):
    model.train()
    model.zero_grad()

    output = model(train)
    loss = criterion(output, target) # MSE loss here
    loss.backward()

    grads = []
    for p in model.regressor.children(): # iterate only over the direct child modules, no recursive traversal
        if isinstance(p, nn.Linear): # keep only the Linear layers
            param_norm = p.weight.grad.norm(2).item()
            grads.append(param_norm)

    grad_mean = np.mean(grads) # compute mean of gradient norms

    return grad_mean
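A quick smoke test on random data (my own sketch; the real call with the pretrained checkpoint happens in main below):

dummy_x = torch.randn(6, 1)
dummy_y = torch.randn(6, 1)
print(compute_gradient_norm(MathRegressor(), nn.MSELoss(), dummy_x, dummy_y))  # some small positive float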
Compute the minimum ratio
# source code from the official document https://github.com/cybertronai/autograd-lib
# helper function to save activations
def save_activations(layer, A, _):
    '''
    A is the input of the layer; we use a batch size of 6 here
    layer 1: A has size of (6, 1)
    layer 2: A has size of (6, 128)
    '''
    activations[layer] = A
# helper function to compute Hessian matrix
def compute_hess(layer, _, B):
    '''
    B is the backprop value of the layer
    layer 1: B has size of (6, 128)
    layer 2: B has size of (6, 1)
    '''
    A = activations[layer]
    BA = torch.einsum('nl,ni->nli', B, A) # batch-wise outer product (Einstein summation), giving an (n, l, i) tensor

    # full Hessian
    hess[layer] += torch.einsum('nli,nkj->likj', BA, BA) # outer product again, then sum over the batch dimension
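The first einsum is just a batched outer product; a small self-check (my own sketch) against an explicit per-sample torch.outer:

B = torch.randn(6, 128)  # backprop values, one row per sample
A = torch.randn(6, 1)    # saved activations
BA = torch.einsum('nl,ni->nli', B, A)
manual = torch.stack([torch.outer(B[n], A[n]) for n in range(6)])  # explicit outer product per sample
print(torch.allclose(BA, manual))  # True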
# function to compute the minimum ratio
def compute_minimum_ratio(model, criterion, train, target):
    model.zero_grad()
    # compute Hessian matrix
    # save the input (activation) of each layer
    with autograd_lib.module_hook(save_activations):
        output = model(train)
        loss = criterion(output, target)

    # compute the Hessian from the values stored in the previous step
    with autograd_lib.module_hook(compute_hess):
        autograd_lib.backward_hessian(output, loss='LeastSquares') # compute the Hessian for a least-squares loss

    layer_hess = list(hess.values())
    minimum_ratio = []

    # compute eigenvalues of the Hessian matrix
    for h in layer_hess:
        size = h.shape[0] * h.shape[1]
        h = h.reshape(size, size)
        h_eig = torch.symeig(h).eigenvalues # torch.symeig() returns eigenvalues and eigenvectors of a real symmetric matrix
        num_greater = torch.sum(h_eig > 0).item() # count the eigenvalues greater than 0
        minimum_ratio.append(num_greater / len(h_eig)) # fraction of positive eigenvalues for this layer

    ratio_mean = np.mean(minimum_ratio) # compute mean of minimum ratio

    return ratio_mean
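Note: torch.symeig is deprecated in recent PyTorch versions (and removed in the newest ones). If that line errors out, this should be a drop-in replacement, returning the same eigenvalues in ascending order:

h_eig = torch.linalg.eigvalsh(h)  # eigenvalues only, for a real symmetric matrix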
# the main function to compute gradient norm and minimum ratio
def main(model, train, target):
    criterion = nn.MSELoss() # MSE loss
    gradient_norm = compute_gradient_norm(model, criterion, train, target)
    minimum_ratio = compute_minimum_ratio(model, criterion, train, target)
    print('gradient norm: {}, minimum ratio: {}'.format(gradient_norm, minimum_ratio))

if __name__ == '__main__':
    # fix random seed
    torch.manual_seed(0)

    # reset compute dictionaries
    activations = defaultdict(int)
    hess = defaultdict(float)

    # compute Hessian
    main(model, train, target)
gradient norm: 0.0722242183983326, minimum ratio: 0.47265625
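Matching this against the criteria at the top: the gradient norm here (about 0.072) is well above 1e-3, so this checkpoint has not reached a critical point, and with a minimum ratio below 0.5 it would not count as a local minimum in any case.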
Summary
This part is pure math, so it's fairly normal not to fully follow it; running through it once is enough.