1. The OptimizerHook class
This hook implements the optimizer-related work done after every training iteration: clearing gradients, backpropagation, optional gradient clipping, and the parameter update.
Constructor arguments:
grad_clip: gradient clipping configuration (a dict of keyword arguments for clip_grad_norm_); None disables clipping
detect_anomalous_params: whether to detect anomalous parameters that are not part of the computational graph
import logging

from torch.nn.utils import clip_grad

# In mmcv 1.x the registry and base class are exposed from mmcv.runner.
from mmcv.runner import HOOKS, Hook


@HOOKS.register_module()
class OptimizerHook(Hook):

    def __init__(self, grad_clip=None, detect_anomalous_params=False):
        self.grad_clip = grad_clip
        self.detect_anomalous_params = detect_anomalous_params

    def clip_grads(self, params):
        # Only clip parameters that are trainable and actually received a gradient.
        params = list(
            filter(lambda p: p.requires_grad and p.grad is not None, params))
        if len(params) > 0:
            return clip_grad.clip_grad_norm_(params, **self.grad_clip)

    def after_train_iter(self, runner):
        runner.optimizer.zero_grad()
        if self.detect_anomalous_params:
            self.detect_anomalous_parameters(runner.outputs['loss'], runner)
        runner.outputs['loss'].backward()

        if self.grad_clip is not None:
            grad_norm = self.clip_grads(runner.model.parameters())
            if grad_norm is not None:
                # Add grad norm to the logger
                runner.log_buffer.update({'grad_norm': float(grad_norm)},
                                         runner.outputs['num_samples'])
        runner.optimizer.step()

    def detect_anomalous_parameters(self, loss, runner):
        logger = runner.logger
        parameters_in_graph = set()
        visited = set()

        def traverse(grad_fn):
            # Walk the autograd graph backwards from the loss and collect
            # every leaf parameter that contributed to it.
            if grad_fn is None:
                return
            if grad_fn not in visited:
                visited.add(grad_fn)
                if hasattr(grad_fn, 'variable'):
                    parameters_in_graph.add(grad_fn.variable)
                parents = grad_fn.next_functions
                if parents is not None:
                    for parent in parents:
                        grad_fn = parent[0]
                        traverse(grad_fn)

        traverse(loss.grad_fn)
        # Any trainable parameter that never appeared in the graph is anomalous.
        for n, p in runner.model.named_parameters():
            if p not in parameters_in_graph and p.requires_grad:
                logger.log(
                    level=logging.ERROR,
                    msg=f'{n} with shape {p.size()} is not '
                    f'in the computational graph \n')
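Note that clip_grads unpacks self.grad_clip directly into torch.nn.utils.clip_grad.clip_grad_norm_, so grad_clip is expected to be a dict of that function's keyword arguments. A minimal sketch of how this is usually wired up in an MMDetection config (the concrete numbers are illustrative, not recommendations):

# In the config file: enable gradient clipping for the OptimizerHook.
# max_norm and norm_type are forwarded verbatim to clip_grad_norm_.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# Equivalent direct instantiation of the hook:
# hook = OptimizerHook(grad_clip=dict(max_norm=35, norm_type=2))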
Here we focus on the after_train_iter function (a plain-PyTorch sketch of the same sequence follows this list):
runner.optimizer.zero_grad():
Clears the gradient (grad) of every parameter; without this call, gradients would accumulate across iterations.
self.detect_anomalous_parameters(runner.outputs['loss'], runner):
Detects anomalous parameters that are not part of the computational graph.
runner.outputs['loss'].backward():
Backpropagates from the loss and computes the gradient (grad) of every parameter.
grad_norm = self.clip_grads(runner.model.parameters()):
Clips the gradients (only when grad_clip is configured).
runner.optimizer.step():
Updates every parameter (the weights) from its gradient according to the optimizer's update rule.
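The same sequence written as a standalone PyTorch snippet, with a toy model, batch and optimizer standing in for runner.model, the dataloader and runner.optimizer (an illustrative sketch, not mmcv code):

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad

# Toy stand-ins for runner.model, one batch of data and runner.optimizer.
model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
inputs, targets = torch.randn(8, 4), torch.randn(8, 2)

optimizer.zero_grad()                        # runner.optimizer.zero_grad()
loss = nn.functional.mse_loss(model(inputs), targets)
loss.backward()                              # runner.outputs['loss'].backward()
grad_norm = clip_grad.clip_grad_norm_(       # self.clip_grads(...)
    model.parameters(), max_norm=35, norm_type=2)
optimizer.step()                             # runner.optimizer.step()
print(f'grad_norm: {float(grad_norm):.4f}')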
2. Optimizers
The optimizers most commonly used in mmdetection are SGD and Adam.
SGD: stochastic gradient descent with momentum
Adam: adaptive per-parameter learning rates combined with momentum (adaptive moment estimation)
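In an MMDetection config the optimizer is selected with a dict; the hyper-parameter values below are illustrative, not recommendations:

# Config-style optimizer choice (values are illustrative).
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
# optimizer = dict(type='Adam', lr=0.001, weight_decay=0.0001)

# Roughly equivalent plain-PyTorch constructions:
# torch.optim.SGD(model.parameters(), lr=0.02, momentum=0.9, weight_decay=0.0001)
# torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)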
During the training of a deep neural network, backpropagation computes the gradient of the loss with respect to every parameter, and the optimizer then uses those gradients to update the parameters so that the loss decreases. The optimizer.step() method is what performs this parameter update.
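A tiny runnable check of what optimizer.step() does for plain SGD (no momentum) on a single scalar parameter, so the update w = w - lr * grad can be verified by hand:

import torch

w = torch.tensor([2.0], requires_grad=True)
opt = torch.optim.SGD([w], lr=0.1)

loss = (w ** 2).sum()   # loss = w^2, so dloss/dw = 2w = 4.0
opt.zero_grad()
loss.backward()         # w.grad == 4.0
opt.step()              # w = 2.0 - 0.1 * 4.0 = 1.6
print(w.item())         # 1.6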