# Lookahead, LazyOptimizer and AccumOptimizer
Lookahead usage:

```python
model.compile(optimizer=Adam(1e-3), loss='mse')  # compile with whichever inner optimizer you like
lookahead = Lookahead(k=5, alpha=0.5)            # initialize Lookahead
lookahead.inject(model)                          # inject it into the model
```
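For reference, the update that Lookahead performs (per the paper linked in the class docstring below): the inner optimizer takes $k$ fast steps on the fast weights $\theta$, after which the slow weights $\phi$ are interpolated toward them and the fast weights are reset to the slow weights:

$$\phi \leftarrow \phi + \alpha\,(\theta - \phi), \qquad \theta \leftarrow \phi.$$

With `k=5` and `alpha=0.5` as above, the slow weights move halfway toward the fast weights every 5 batches.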
## Defining Lookahead
```python
from keras import backend as K


class Lookahead(object):
    """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/).
    """

    def __init__(self, k=5, alpha=0.5):
        self.k = k
        self.alpha = alpha
        self.count = 0

    def inject(self, model):
        """Inject the Lookahead algorithm for the given model.
        The following code is modified from keras's _make_train_function method.
        See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497
        """
        if not hasattr(model, 'train_function'):
            raise RuntimeError('You must compile your model before using it.')
        model._check_trainable_weights_consistency()
        if model.train_function is None:
            inputs = (model._feed_inputs +
                      model._feed_targets +
                      model._feed_sample_weights)
            if model._uses_dynamic_learning_phase():
                inputs += [K.learning_phase()]
            fast_params = model._collected_trainable_weights
            with K.name_scope('training'):
                with K.name_scope(model.optimizer.__class__.__name__):
                    training_updates = model.optimizer.get_updates(
                        params=fast_params,
                        loss=model.total_loss)
                slow_params = [K.variable(p) for p in fast_params]
                fast_updates = (model.updates +
                                training_updates +
                                model.metrics_updates)
                slow_updates, copy_updates = [], []
                for p, q in zip(fast_params, slow_params):
                    slow_updates.append(K.update(q, q + self.alpha * (p - q)))
                    copy_updates.append(K.update(p, q))
                # Gets loss and metrics. Updates weights at each call.
                fast_train_function = K.function(
                    inputs,
                    [model.total_loss] + model.metrics_tensors,
                    updates=fast_updates,
                    name='fast_train_function',
                    **model._function_kwargs)

                def F(inputs):
                    self.count += 1
                    R = fast_train_function(inputs)
                    if self.count % self.k == 0:
                        K.batch_get_value(slow_updates)
                        K.batch_get_value(copy_updates)
                    return R

                model.train_function = F
```
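Below is a minimal end-to-end sketch of the workflow. The toy model and random data are illustrative assumptions, not part of the original code; any compiled Keras model works the same way.

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# A small regression model, purely for illustration.
model = Sequential([Dense(32, activation='relu', input_shape=(8,)),
                    Dense(1)])
model.compile(optimizer=Adam(1e-3), loss='mse')  # any inner optimizer

lookahead = Lookahead(k=5, alpha=0.5)  # slow weights updated every 5 batches
lookahead.inject(model)                # replaces model.train_function

x = np.random.randn(256, 8)
y = np.random.randn(256, 1)
model.fit(x, y, epochs=2, batch_size=32)
```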
LazyOptimizer usage:

```python
model.compile(
    loss='mse',
    optimizer=LazyOptimizer(Adam(1e-3), embedding_layers)
)
```
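The motivation for a lazy variant: with a momentum-based optimizer such as Adam, an embedding row is updated even when its gradient in the current batch is zero, because the running moments are non-zero:

$$m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t,\quad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2,\quad \theta_t = \theta_{t-1} - \eta\,\frac{\hat m_t}{\sqrt{\hat v_t}+\epsilon}.$$

Even with $g_t = 0$, $m_t$ stays non-zero as long as $m_{t-1}$ is, so rows for unsampled words keep drifting. LazyOptimizer avoids this by masking the learning rate to zero for embedding rows whose gradient is entirely zero.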
## Defining LazyOptimizer
```python
from keras.optimizers import Optimizer
import keras.backend as K


class LazyOptimizer(Optimizer):
    """Inherits the Optimizer class and wraps the original optimizer
    to produce a corresponding lazy optimizer.
    (Not only LazyAdam; any momentum-based optimizer can get a lazy version.)
    # Arguments
        optimizer: an instance of a keras optimizer (supporting
            all keras optimizers currently available);
        embedding_layers: all Embedding layers you want to update sparsely.
    # Returns
        a new keras optimizer.
    """
    def __init__(self, optimizer, embedding_layers=None, **kwargs):
        super(LazyOptimizer, self).__init__(**kwargs)
        self.optimizer = optimizer
        self.embeddings = []
        if embedding_layers is not None:
            for l in embedding_layers:
                self.embeddings.append(
                    l.trainable_weights[0]
                )
        with K.name_scope(self.__class__.__name__):
            for attr in self.optimizer.get_config():
                if not hasattr(self, attr):
                    value = getattr(self.optimizer, attr)
                    setattr(self, attr, value)
        self.optimizer.get_gradients = self.get_gradients
        self._cache_grads = {}

    def get_gradients(self, loss, params):
        """Cache the gradients to avoid recomputing them (improves efficiency).
        """
        _params = []
        for p in params:
            if (loss, p) not in self._cache_grads:
                _params.append(p)
        _grads = super(LazyOptimizer, self).get_gradients(loss, _params)
        for p, g in zip(_params, _grads):
            self._cache_grads[(loss, p)] = g
        return [self._cache_grads[(loss, p)] for p in params]

    def get_updates(self, loss, params):
        # Only for initialization
        self.optimizer.get_updates(loss, params)
        # Common (dense) updates
        dense_params = [p for p in params if p not in self.embeddings]
        self.updates = self.optimizer.get_updates(loss, dense_params)
        # Sparse updates
        sparse_params = self.embeddings
        sparse_grads = self.get_gradients(loss, sparse_params)
        sparse_flags = [
            K.all(K.not_equal(g, 0), axis=-1, keepdims=True)
            for g in sparse_grads
        ]
        original_lr = self.optimizer.lr
        for f, p in zip(sparse_flags, sparse_params):
            self.optimizer.lr = original_lr * K.cast(f, 'float32')
            # Update only embedding rows whose gradient is not zero
            # (a zero gradient very likely means the word was not sampled).
            self.updates.extend(
                self.optimizer.get_updates(loss, [p])
            )
        self.optimizer.lr = original_lr
        return self.updates

    def get_config(self):
        config = self.optimizer.get_config()
        return config
```
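A minimal sketch of using LazyOptimizer with an Embedding layer. The toy model and data here are illustrative assumptions, not part of the original code; note that `embedding_layers` takes the layer objects themselves, not their weight tensors.

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense
from keras.optimizers import Adam

emb = Embedding(input_dim=10000, output_dim=64, input_length=20)
model = Sequential([emb,
                    GlobalAveragePooling1D(),
                    Dense(1, activation='sigmoid')])

model.compile(
    loss='binary_crossentropy',
    optimizer=LazyOptimizer(Adam(1e-3), embedding_layers=[emb])
)

x = np.random.randint(0, 10000, size=(256, 20))  # token ids
y = np.random.randint(0, 2, size=(256, 1))
model.fit(x, y, epochs=2, batch_size=32)
```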
AccumOptimizer usage:

```python
opt = AccumOptimizer(Adam(), 10)  # 10 is the number of accumulation steps
model.compile(loss='mse', optimizer=opt)
model.fit(x_train, y_train, epochs=10, batch_size=10)
```
## Defining AccumOptimizer
```python
from keras.optimizers import Optimizer
import keras.backend as K


class AccumOptimizer(Optimizer):
    """Inherits the Optimizer class and wraps the original optimizer
    to implement gradient accumulation.
    # Arguments
        optimizer: an instance of a keras optimizer (supporting
            all keras optimizers currently available);
        steps_per_update: the number of gradient accumulation steps.
    # Returns
        a new keras optimizer.
    """
    def __init__(self, optimizer, steps_per_update=1, **kwargs):
        super(AccumOptimizer, self).__init__(**kwargs)
        self.optimizer = optimizer
        with K.name_scope(self.__class__.__name__):
            self.steps_per_update = steps_per_update
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.cond = K.equal(self.iterations % self.steps_per_update, 0)
            self.lr = self.optimizer.lr
            self.optimizer.lr = K.switch(self.cond, self.optimizer.lr, 0.)
            for attr in ['momentum', 'rho', 'beta_1', 'beta_2']:
                if hasattr(self.optimizer, attr):
                    value = getattr(self.optimizer, attr)
                    setattr(self, attr, value)
                    setattr(self.optimizer, attr, K.switch(self.cond, value, 1 - 1e-7))
            for attr in self.optimizer.get_config():
                if not hasattr(self, attr):
                    value = getattr(self.optimizer, attr)
                    setattr(self, attr, value)
            # Override the original get_gradients method so that it returns
            # the accumulated gradients instead.
            def get_gradients(loss, params):
                return [ag / self.steps_per_update for ag in self.accum_grads]
            self.optimizer.get_gradients = get_gradients

    def get_updates(self, loss, params):
        self.updates = [
            K.update_add(self.iterations, 1),
            K.update_add(self.optimizer.iterations, K.cast(self.cond, 'int64')),
        ]
        # Gradient accumulation
        self.accum_grads = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        grads = self.get_gradients(loss, params)
        for g, ag in zip(grads, self.accum_grads):
            self.updates.append(K.update(ag, K.switch(self.cond, ag * 0, ag + g)))
        # Inherit the updates of the original optimizer
        self.updates.extend(self.optimizer.get_updates(loss, params)[1:])
        self.weights.extend(self.optimizer.weights)
        return self.updates

    def get_config(self):
        iterations = K.eval(self.iterations)
        K.set_value(self.iterations, 0)
        config = self.optimizer.get_config()
        K.set_value(self.iterations, iterations)
        return config
```
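A usage sketch (the toy model and the numbers are illustrative assumptions): with `steps_per_update=10` and `batch_size=10`, weights are only changed once every 10 batches, using the averaged accumulated gradient, so the effective batch size is roughly 100 while memory usage stays at that of a batch of 10.

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

model = Sequential([Dense(16, activation='relu', input_shape=(4,)),
                    Dense(1)])

# Effective batch size ~= batch_size * steps_per_update = 10 * 10 = 100.
opt = AccumOptimizer(Adam(1e-3), steps_per_update=10)
model.compile(loss='mse', optimizer=opt)

x = np.random.randn(1000, 4)
y = np.random.randn(1000, 1)
model.fit(x, y, epochs=10, batch_size=10)
```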
References:
- Lookahead
- LazyOptimizer
- Keras实现两个优化器 (Implementing two optimizers in Keras)
- 用时间换取效果:Keras梯度累积优化器 (Trading time for performance: a Keras gradient-accumulation optimizer)
- AccumOptimizer
- keras inject#497
- keras/optimizers.py