Deep Learning 32: 自己写的keras的一个callbacks函数,解决keras中不能在每个epoch实时显示学习速率learning rate的问题...

最新推荐文章于 2022-04-11 23:12:14 发布

weixin_30335353

最新推荐文章于 2022-04-11 23:12:14 发布

阅读量468

点赞数

文章标签：人工智能 python

原文链接：http://www.cnblogs.com/dmzhuo/p/6215805.html

版权

一.问题:

keras在训练时不会在每个epoch实时显示当前的学习速率(learning rate),调试起来很不方便。写这个callback,实际上也是为了调试并解决下面这篇文章里的问题:Deep Learning 31: 不同版本的keras,对同样的代码,得到不同结果的原因总结

二.解决方法

1.把下面代码加入keras文件callbacks.py中:

 1 class DisplayLearningRate(Callback):
 2     '''Display Learning rate .
 3     '''
 4     def __init__(self):
 5         super(DisplayLearningRate, self).__init__()
 6 
 7     def on_epoch_begin(self, epoch, logs={}):
 8         assert hasattr(self.model.optimizer, 'lr'), \
 9             'Optimizer must have a "lr" attribute.'
10         lr_now = K.get_value(self.model.optimizer.lr)
11 
12         print('Epoch %05d: Learning rate is  %s' % (epoch, lr_now))

2.应用方法如下:

# Train the model with the learning-rate display callback and a checkpoint
# callback attached; the value returned by fit() is kept in `history`.
history = model.fit(X_train,
    Y_train,
    batch_size=batch_size,
    nb_epoch=nb_epoch,
    show_accuracy=False,
    verbose=2,
    validation_data=(X_test, Y_test),
    callbacks = [
        keras.callbacks.DisplayLearningRate(),
        keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, mode='auto'), # saves the model to filepath after an epoch; NOTE(review): with save_best_only=True presumably only when val_loss improves - confirm
        # keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')  # disabled: would stop training once the monitored value stops improving, i.e. `patience` epochs after early stopping is triggered (e.g. when the loss no longer drops compared with the previous epoch)
    ])

三.总结

按照上面的方法试了之后发现,每个epoch显示的learning rate都是一样的。原因是这样读到的optimizer.lr只是最开始初始化时的learning rate:优化器在get_updates里用 lr * 1 / (1 + decay * iterations) 临时算出衰减后的学习速率,每次更新后并没有把结果写回optimizer.lr,所以才会这样。那么要怎么样才能实时显示每个epoch的学习速率呢? 我觉得应该按照optimizer生成updates时用的同一个公式,把当前的学习速率重新算出来再显示。

四.最终办法

 1 # set the decay as 1e-1 to see the Ir change between epochs.
 2 sgd = SGD(lr=0.1, decay=1e-1, momentum=0.9, nesterov=True)
 3 model.compile(loss='categorical_crossentropy',
 4               optimizer=sgd,
 5               metrics=['accuracy'])
 6 class LossHistory(Callback):
 7     def on_epoch_begin(self, batch, logs={}):
 8         lr = self.lr * (1. / (1. + self.decay * self.iterations))
 9         print('Ir:', lr)
10 history=LossHistory()
11 model.fit(X_train, Y_train,
12           batch_size= batch_size,
13           nb_epoch= nb_epoch,
14           callbacks= [history])

参考：http://stackoverflow.com/questions/40144805/print-learning-rate-evary-epoch-in-sgd

下面我把keras==0.3.3和keras==1.2.0这两个版本的optimizer.py分别贴出来:

keras==0.3.3时的optimizer.py如下:

  1 from __future__ import absolute_import
  2 from . import backend as K
  3 import numpy as np
  4 from .utils.generic_utils import get_from_module
  5 from six.moves import zip
  6 
  7 
  8 def clip_norm(g, c, n):
  9     if c > 0:
 10         g = K.switch(n >= c, g * c / n, g)
 11     return g
 12 
 13 
 14 def kl_divergence(p, p_hat):
 15     return p_hat - p + p * K.log(p / p_hat)
 16 
 17 
 18 class Optimizer(object):
 19     '''Abstract optimizer base class.
 20 
 21     Note: this is the parent class of all optimizers, not an actual optimizer
 22     that can be used for training models.
 23 
 24     All Keras optimizers support the following keyword arguments:
 25 
 26         clipnorm: float >= 0. Gradients will be clipped
 27             when their L2 norm exceeds this value.
 28         clipvalue: float >= 0. Gradients will be clipped
 29             when their absolute value exceeds this value.
 30     '''
 31     def __init__(self, **kwargs):
 32         self.__dict__.update(kwargs)
 33         self.updates = []
 34 
 35     def get_state(self):
 36         return [K.get_value(u[0]) for u in self.updates]
 37 
 38     def set_state(self, value_list):
 39         assert len(self.updates) == len(value_list)
 40         for u, v in zip(self.updates, value_list):
 41             K.set_value(u[0], v)
 42 
 43     def get_updates(self, params, constraints, loss):
 44         raise NotImplementedError
 45 
 46     def get_gradients(self, loss, params):
 47         grads = K.gradients(loss, params)
 48         if hasattr(self, 'clipnorm') and self.clipnorm > 0:
 49             norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
 50             grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
 51         if hasattr(self, 'clipvalue') and self.clipvalue > 0:
 52             grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
 53         return grads
 54 
 55     def get_config(self):
 56         return {"name": self.__class__.__name__}
 57 
 58 
class SGD(Optimizer):
    '''Stochastic gradient descent, with support for momentum,
    decay, and Nesterov momentum.

    # Arguments
        lr: float >= 0. Learning rate.
        momentum: float >= 0. Parameter updates momentum.
        decay: float >= 0. Learning rate decay over each update.
        nesterov: boolean. Whether to apply Nesterov momentum.
    '''
    def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False,
                 *args, **kwargs):
        super(SGD, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (this is how
        # the plain `nesterov` flag gets stored); lr, momentum and decay are
        # then replaced by backend variables below.
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.momentum = K.variable(momentum)
        self.decay = K.variable(decay)

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of (variable, new_value) update pairs.
        grads = self.get_gradients(loss, params)
        # Decayed learning rate. It is only a local expression: self.lr is
        # never the target of an update in this method, so reading self.lr
        # always gives the base value.
        lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))
        # The first update increments the iteration counter.
        self.updates = [(self.iterations, self.iterations + 1.)]

        for p, g, c in zip(params, grads, constraints):
            m = K.variable(np.zeros(K.get_value(p).shape))  # momentum
            v = self.momentum * m - lr * g  # velocity
            self.updates.append((m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        # Backend variables are read back and converted to Python floats.
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "momentum": float(K.get_value(self.momentum)),
                "decay": float(K.get_value(self.decay)),
                "nesterov": self.nesterov}
102 
103 
class RMSprop(Optimizer):
    '''RMSProp optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    This optimizer is usually a good choice for recurrent
    neural networks.

    # Arguments
        lr: float >= 0. Learning rate.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.
    '''
    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs):
        super(RMSprop, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (epsilon stays
        # a plain value); lr and rho are replaced by backend variables below.
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.rho = K.variable(rho)

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of (variable, new_value) update pairs.
        grads = self.get_gradients(loss, params)
        # One zero-initialised accumulator per parameter, same shape.
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, c in zip(params, grads, accumulators, constraints):
            # update accumulator
            new_a = self.rho * a + (1 - self.rho) * K.square(g)
            self.updates.append((a, new_a))

            # epsilon is added inside the square root here
            new_p = p - self.lr * g / K.sqrt(new_a + self.epsilon)
            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        # lr and rho are backend variables and are read back as floats.
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "rho": float(K.get_value(self.rho)),
                "epsilon": self.epsilon}
143 
144 
class Adagrad(Optimizer):
    '''Adagrad optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
        epsilon: float >= 0.
    '''
    def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs):
        super(Adagrad, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (epsilon stays
        # a plain value); lr is replaced by a backend variable below.
        self.__dict__.update(locals())
        self.lr = K.variable(lr)

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of (variable, new_value) update pairs.
        grads = self.get_gradients(loss, params)
        # One zero-initialised accumulator per parameter, same shape.
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, c in zip(params, grads, accumulators, constraints):
            new_a = a + K.square(g)  # update accumulator
            self.updates.append((a, new_a))
            # epsilon is added inside the square root here
            new_p = p - self.lr * g / K.sqrt(new_a + self.epsilon)
            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        # lr is a backend variable and is read back as a float.
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "epsilon": self.epsilon}
176 
177 
class Adadelta(Optimizer):
    '''Adadelta optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate. It is recommended to leave it at the default value.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
    '''
    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, *args, **kwargs):
        super(Adadelta, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (rho and
        # epsilon stay plain values); lr is replaced by a backend variable.
        self.__dict__.update(locals())
        self.lr = K.variable(lr)

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of (variable, new_value) update pairs.
        grads = self.get_gradients(loss, params)
        # Two zero-initialised accumulators per parameter: one for squared
        # gradients, one for squared updates.
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        delta_accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, d_a, c in zip(params, grads, accumulators,
                                   delta_accumulators, constraints):
            # update accumulator
            new_a = self.rho * a + (1 - self.rho) * K.square(g)
            self.updates.append((a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)

            new_p = p - self.lr * update
            self.updates.append((p, c(new_p)))  # apply constraints

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
            self.updates.append((d_a, new_d_a))
        return self.updates

    def get_config(self):
        # Only lr is a backend variable; rho and epsilon are returned as stored.
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "rho": self.rho,
                "epsilon": self.epsilon}
225 
226 
class Adam(Optimizer):
    '''Adam optimizer.

    Default parameters follow those provided in the original paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
                 *args, **kwargs):
        super(Adam, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (epsilon stays
        # a plain value); lr, beta_1 and beta_2 are replaced by backend
        # variables below.
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of (variable, new_value) update pairs.
        grads = self.get_gradients(loss, params)
        # The first update increments the iteration counter.
        self.updates = [(self.iterations, self.iterations+1.)]

        # Step size with the bias-correction factors computed from t.
        t = self.iterations + 1
        lr_t = self.lr * K.sqrt(1 - K.pow(self.beta_2, t)) / (1 - K.pow(self.beta_1, t))

        for p, g, c in zip(params, grads, constraints):
            # zero init of moment
            m = K.variable(np.zeros(K.get_value(p).shape))
            # zero init of velocity
            v = K.variable(np.zeros(K.get_value(p).shape))

            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1 - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append((m, m_t))
            self.updates.append((v, v_t))
            self.updates.append((p, c(p_t)))  # apply constraints
        return self.updates

    def get_config(self):
        # Backend variables are read back and converted to Python floats.
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "beta_1": float(K.get_value(self.beta_1)),
                "beta_2": float(K.get_value(self.beta_2)),
                "epsilon": self.epsilon}
277 
278 
class Adamax(Optimizer):
    '''Adamax optimizer from Adam paper's Section 7. It is a variant
     of Adam based on the infinity norm.

    Default parameters follow those provided in the paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
                 *args, **kwargs):
        super(Adamax, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (epsilon stays
        # a plain value); lr, beta_1 and beta_2 are replaced by backend
        # variables below.
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of (variable, new_value) update pairs.
        grads = self.get_gradients(loss, params)
        # The first update increments the iteration counter.
        self.updates = [(self.iterations, self.iterations+1.)]

        # Step size corrected with the beta_1 term only.
        t = self.iterations + 1
        lr_t = self.lr / (1 - K.pow(self.beta_1, t))

        for p, g, c in zip(params, grads, constraints):
            # zero init of 1st moment
            m = K.variable(np.zeros(K.get_value(p).shape))
            # zero init of exponentially weighted infinity norm
            u = K.variable(np.zeros(K.get_value(p).shape))

            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
            u_t = K.maximum(self.beta_2 * u, K.abs(g))
            p_t = p - lr_t * m_t / (u_t + self.epsilon)

            self.updates.append((m, m_t))
            self.updates.append((u, u_t))
            self.updates.append((p, c(p_t)))  # apply constraints
        return self.updates

    def get_config(self):
        # Backend variables are read back and converted to Python floats.
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "beta_1": float(K.get_value(self.beta_1)),
                "beta_2": float(K.get_value(self.beta_2)),
                "epsilon": self.epsilon}
330 
331 
# Lower-case aliases: each name below is bound to the optimizer class of the
# same name, so the classes are also reachable under these names in the
# module globals.
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
339 
340 
def get(identifier, kwargs=None):
    '''Resolve `identifier` against this module's globals.

    Delegates to `get_from_module` with the label 'optimizer',
    `instantiate=True` and the given `kwargs`, and returns its result.
    '''
    namespace = globals()
    resolved = get_from_module(identifier, namespace, 'optimizer',
                               instantiate=True, kwargs=kwargs)
    return resolved

View Code

keras==1.2.0时的optimizer.py如下:

  1 from __future__ import absolute_import
  2 from . import backend as K
  3 from .utils.generic_utils import get_from_module
  4 from six.moves import zip
  5 
  6 
  7 def clip_norm(g, c, n):
  8     if c > 0:
  9         g = K.switch(n >= c, g * c / n, g)
 10     return g
 11 
 12 
 13 def optimizer_from_config(config, custom_objects={}):
 14     all_classes = {
 15         'sgd': SGD,
 16         'rmsprop': RMSprop,
 17         'adagrad': Adagrad,
 18         'adadelta': Adadelta,
 19         'adam': Adam,
 20         'adamax': Adamax,
 21         'nadam': Nadam,
 22         'tfoptimizer': TFOptimizer,
 23     }
 24     class_name = config['class_name']
 25     if class_name in custom_objects:
 26         cls = custom_objects[class_name]
 27     else:
 28         if class_name.lower() not in all_classes:
 29             raise ValueError('Optimizer class not found:', class_name)
 30         cls = all_classes[class_name.lower()]
 31     return cls.from_config(config['config'])
 32 
 33 
 34 class Optimizer(object):
 35     '''Abstract optimizer base class.
 36 
 37     Note: this is the parent class of all optimizers, not an actual optimizer
 38     that can be used for training models.
 39 
 40     All Keras optimizers support the following keyword arguments:
 41 
 42         clipnorm: float >= 0. Gradients will be clipped
 43             when their L2 norm exceeds this value.
 44         clipvalue: float >= 0. Gradients will be clipped
 45             when their absolute value exceeds this value.
 46     '''
 47     def __init__(self, **kwargs):
 48         allowed_kwargs = {'clipnorm', 'clipvalue'}
 49         for k in kwargs:
 50             if k not in allowed_kwargs:
 51                 raise TypeError('Unexpected keyword argument '
 52                                 'passed to optimizer: ' + str(k))
 53         self.__dict__.update(kwargs)
 54         self.updates = []
 55         self.weights = []
 56 
 57     def get_updates(self, params, constraints, loss):
 58         raise NotImplementedError
 59 
 60     def get_gradients(self, loss, params):
 61         grads = K.gradients(loss, params)
 62         if hasattr(self, 'clipnorm') and self.clipnorm > 0:
 63             norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
 64             grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
 65         if hasattr(self, 'clipvalue') and self.clipvalue > 0:
 66             grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
 67         return grads
 68 
 69     def set_weights(self, weights):
 70         '''Sets the weights of the optimizer, from Numpy arrays.
 71 
 72         Should only be called after computing the gradients
 73         (otherwise the optimizer has no weights).
 74 
 75         # Arguments
 76             weights: a list of Numpy arrays. The number
 77                 of arrays and their shape must match
 78                 number of the dimensions of the weights
 79                 of the optimizer (i.e. it should match the
 80                 output of `get_weights`).
 81         '''
 82         params = self.weights
 83         weight_value_tuples = []
 84         param_values = K.batch_get_value(params)
 85         for pv, p, w in zip(param_values, params, weights):
 86             if pv.shape != w.shape:
 87                 raise ValueError('Optimizer weight shape ' +
 88                                  str(pv.shape) +
 89                                  ' not compatible with '
 90                                  'provided weight shape ' + str(w.shape))
 91             weight_value_tuples.append((p, w))
 92         K.batch_set_value(weight_value_tuples)
 93 
 94     def get_weights(self):
 95         '''Returns the current weights of the optimizer,
 96         as a list of numpy arrays.
 97         '''
 98         return K.batch_get_value(self.weights)
 99 
100     def get_config(self):
101         config = {}
102         if hasattr(self, 'clipnorm'):
103             config['clipnorm'] = self.clipnorm
104         if hasattr(self, 'clipvalue'):
105             config['clipvalue'] = self.clipvalue
106         return config
107 
108     @classmethod
109     def from_config(cls, config):
110         return cls(**config)
111 
112 
class SGD(Optimizer):
    '''Stochastic gradient descent, with support for momentum,
    learning rate decay, and Nesterov momentum.

    # Arguments
        lr: float >= 0. Learning rate.
        momentum: float >= 0. Parameter updates momentum.
        decay: float >= 0. Learning rate decay over each update.
        nesterov: boolean. Whether to apply Nesterov momentum.
    '''
    def __init__(self, lr=0.01, momentum=0., decay=0.,
                 nesterov=False, **kwargs):
        super(SGD, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (this is how
        # the plain `nesterov` flag gets stored); lr, momentum and decay are
        # then replaced by backend variables below.
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.momentum = K.variable(momentum)
        self.decay = K.variable(decay)
        # Plain-Python copy of the decay passed in (attribute name spelled
        # `inital_decay` in this code); used to decide whether decay is on.
        self.inital_decay = decay

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of update ops.
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        # The decay factor and the iteration-counter increment are only
        # added when a positive decay was passed to __init__.
        # NOTE(review): `lr *=` presumably yields a new symbolic expression
        # rather than modifying self.lr - confirm for the backend in use.
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates .append(K.update_add(self.iterations, 1))

        # momentum
        shapes = [K.get_variable_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        # Optimizer state exposed through get_weights / set_weights.
        self.weights = [self.iterations] + moments
        for p, g, m in zip(params, grads, moments):
            v = self.momentum * m - lr * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        # Own settings merged over the base-class config; on a key clash
        # the entries from `config` win because they come last.
        config = {'lr': float(K.get_value(self.lr)),
                  'momentum': float(K.get_value(self.momentum)),
                  'decay': float(K.get_value(self.decay)),
                  'nesterov': self.nesterov}
        base_config = super(SGD, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
170 
171 
class RMSprop(Optimizer):
    '''RMSProp optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values
    (except the learning rate, which can be freely tuned).

    This optimizer is usually a good choice for recurrent
    neural networks.

    # Arguments
        lr: float >= 0. Learning rate.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.
        decay: float >= 0. Learning rate decay over each update.
    '''
    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0.,
                 **kwargs):
        super(RMSprop, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (epsilon stays
        # a plain value); lr, rho and decay are replaced by backend
        # variables below.
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.rho = K.variable(rho)
        self.decay = K.variable(decay)
        # Plain-Python copy of the decay passed in (attribute name spelled
        # `inital_decay` in this code); used to decide whether decay is on.
        self.inital_decay = decay
        self.iterations = K.variable(0.)

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of update ops.
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        accumulators = [K.zeros(shape) for shape in shapes]
        # Optimizer state exposed through get_weights / set_weights.
        self.weights = accumulators
        self.updates = []

        lr = self.lr
        # The decay factor and the iteration-counter increment are only
        # added when a positive decay was passed to __init__.
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        for p, g, a in zip(params, grads, accumulators):
            # update accumulator
            new_a = self.rho * a + (1. - self.rho) * K.square(g)
            self.updates.append(K.update(a, new_a))
            # epsilon is added outside the square root here
            new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        # Own settings merged over the base-class config; on a key clash
        # the entries from `config` win because they come last.
        config = {'lr': float(K.get_value(self.lr)),
                  'rho': float(K.get_value(self.rho)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(RMSprop, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
230 
231 
class Adagrad(Optimizer):
    '''Adagrad optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
        epsilon: float >= 0.
        decay: float >= 0. Learning rate decay over each update.

    # References
        - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
    '''
    def __init__(self, lr=0.01, epsilon=1e-8, decay=0., **kwargs):
        super(Adagrad, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (epsilon stays
        # a plain value); lr and decay are replaced by backend variables.
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.decay = K.variable(decay)
        # Plain-Python copy of the decay passed in (attribute name spelled
        # `inital_decay` in this code); used to decide whether decay is on.
        self.inital_decay = decay
        self.iterations = K.variable(0.)

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of update ops.
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        accumulators = [K.zeros(shape) for shape in shapes]
        # Optimizer state exposed through get_weights / set_weights.
        self.weights = accumulators
        self.updates = []

        lr = self.lr
        # The decay factor and the iteration-counter increment are only
        # added when a positive decay was passed to __init__.
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        for p, g, a in zip(params, grads, accumulators):
            new_a = a + K.square(g)  # update accumulator
            self.updates.append(K.update(a, new_a))
            # epsilon is added outside the square root here
            new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        # Own settings merged over the base-class config; on a key clash
        # the entries from `config` win because they come last.
        config = {'lr': float(K.get_value(self.lr)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adagrad, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
282 
283 
class Adadelta(Optimizer):
    '''Adadelta optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
            It is recommended to leave it at the default value.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.
        decay: float >= 0. Learning rate decay over each update.

    # References
        - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
    '''
    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0.,
                 **kwargs):
        super(Adadelta, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (rho and
        # epsilon stay plain values); lr and decay are replaced by backend
        # variables below.
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.decay = K.variable(decay)
        # Plain-Python copy of the decay passed in (attribute name spelled
        # `inital_decay` in this code); used to decide whether decay is on.
        self.inital_decay = decay
        self.iterations = K.variable(0.)

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of update ops.
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        # Two zero-initialised accumulators per parameter: one for squared
        # gradients, one for squared updates.
        accumulators = [K.zeros(shape) for shape in shapes]
        delta_accumulators = [K.zeros(shape) for shape in shapes]
        # Optimizer state exposed through get_weights / set_weights.
        self.weights = accumulators + delta_accumulators
        self.updates = []

        lr = self.lr
        # The decay factor and the iteration-counter increment are only
        # added when a positive decay was passed to __init__.
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
            # update accumulator
            new_a = self.rho * a + (1. - self.rho) * K.square(g)
            self.updates.append(K.update(a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)

            new_p = p - lr * update
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
            self.updates.append(K.update(d_a, new_d_a))
        return self.updates

    def get_config(self):
        # Own settings merged over the base-class config; on a key clash
        # the entries from `config` win because they come last.
        config = {'lr': float(K.get_value(self.lr)),
                  'rho': self.rho,
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adadelta, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
348 
349 
class Adam(Optimizer):
    '''Adam optimizer.

    Default parameters follow those provided in the original paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
        decay: float >= 0. Learning rate decay over each update.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, decay=0., **kwargs):
        super(Adam, self).__init__(**kwargs)
        # Copy every local name of __init__ onto the instance (epsilon stays
        # a plain value); lr, beta_1, beta_2 and decay are replaced by
        # backend variables below.
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)
        self.decay = K.variable(decay)
        # Plain-Python copy of the decay passed in (attribute name spelled
        # `inital_decay` in this code); used to decide whether decay is on.
        self.inital_decay = decay

    def get_updates(self, params, constraints, loss):
        # Builds and returns the list of update ops.
        grads = self.get_gradients(loss, params)
        # Unlike the decay factor below, the iteration counter is always
        # incremented, because t depends on it.
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))

        # Step size with the bias-correction factors computed from t.
        t = self.iterations + 1
        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        # Optimizer state exposed through get_weights / set_weights.
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            new_p = p_t
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        # Own settings merged over the base-class config; on a key clash
        # the entries from `config` win because they come last.
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adam, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
414 
415 
class Adamax(Optimizer):
    '''Adamax optimizer from Adam paper's Section 7. It is a variant
     of Adam based on the infinity norm.

    Default parameters follow those provided in the paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
        decay: float. Learning rate decay; only applied when it is > 0,
            as `lr / (1 + decay * iterations)`.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, decay=0., **kwargs):
        super(Adamax, self).__init__(**kwargs)
        # Keep only the attribute that is read later as a plain value.
        # A blanket `self.__dict__.update(locals())` would additionally
        # store `self.self` (a reference cycle) and `self.kwargs`.
        self.epsilon = epsilon
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)
        self.decay = K.variable(decay)
        # Plain Python copy of `decay`, compared with 0 in `get_updates`.
        self.inital_decay = decay

    def get_updates(self, params, constraints, loss):
        '''Build the update operations for one Adamax step.

        # Arguments
            params: list of variables to optimize.
            constraints: mapping from a parameter to a callable that is
                applied to its updated value.
            loss: tensor whose gradients with respect to `params` are
                obtained through `self.get_gradients`.

        # Returns
            `self.updates`: the iteration-counter increment followed by,
            for every parameter, the updates of its first moment, its
            infinity-norm accumulator and the parameter itself.
        '''
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.inital_decay > 0:
            # `lr = lr * ...` rather than `lr *= ...`: the augmented form
            # would dispatch to an in-place operator whenever the backend
            # variable type defines one. `self.lr` is not part of the
            # updates, so the decayed rate only exists as this expression.
            lr = lr * (1. / (1. + self.decay * self.iterations))

        t = self.iterations + 1
        # Learning rate with the first-moment bias correction folded in.
        lr_t = lr / (1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        # zero init of 1st moment
        ms = [K.zeros(shape) for shape in shapes]
        # zero init of exponentially weighted infinity norm
        us = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + ms + us

        for p, g, m, u in zip(params, grads, ms, us):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            u_t = K.maximum(self.beta_2 * u, K.abs(g))
            p_t = p - lr_t * m_t / (u_t + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(u, u_t))

            new_p = p_t
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        '''Return the configuration dict; entries defined here take
        precedence over same-named entries of the base class config.
        '''
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adamax, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
484 
485 
class Nadam(Optimizer):
    '''
    Nesterov Adam optimizer: Much like Adam is essentially RMSprop with momentum,
    Nadam is Adam with Nesterov momentum.

    Default parameters follow those provided in the paper.
    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
        schedule_decay: float. Multiplies the step count in the exponent
            of the `0.96 ** (t * schedule_decay)` momentum schedule.

    # References
        - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
        - [On the importance of initialization and momentum in deep learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf)
    '''
    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, schedule_decay=0.004, **kwargs):
        super(Nadam, self).__init__(**kwargs)
        # Keep only the attributes that are read later as plain values.
        # A blanket `self.__dict__.update(locals())` would additionally
        # store `self.self` (a reference cycle) and `self.kwargs`.
        self.epsilon = epsilon
        self.schedule_decay = schedule_decay
        self.iterations = K.variable(0.)
        # Running product of the momentum schedule, starting at 1.
        self.m_schedule = K.variable(1.)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        '''Build the update operations for one Nadam step.

        # Arguments
            params: list of variables to optimize.
            constraints: mapping from a parameter to a callable that is
                applied to its updated value.
            loss: tensor whose gradients with respect to `params` are
                obtained through `self.get_gradients`.

        # Returns
            `self.updates`: the iteration-counter increment, the momentum
            schedule update and, for every parameter, the updates of its
            first moment, its second moment and the parameter itself.
        '''
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        t = self.iterations + 1

        # Due to the recommendations in [2], i.e. warming momentum schedule
        momentum_cache_t = self.beta_1 * (1. - 0.5 * (K.pow(0.96, t * self.schedule_decay)))
        momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * (K.pow(0.96, (t + 1) * self.schedule_decay)))
        m_schedule_new = self.m_schedule * momentum_cache_t
        m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
        # Expressed through `K.update` like every other update below,
        # instead of a raw `(variable, new_value)` tuple.
        self.updates.append(K.update(self.m_schedule, m_schedule_new))

        shapes = [K.get_variable_shape(p) for p in params]
        # Zero-initialized first (ms) and second (vs) moment accumulators.
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]

        # `self.m_schedule` is not listed here, only the counter and moments.
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            # the following equations given in [1]
            g_prime = g / (1. - m_schedule_new)
            m_t = self.beta_1 * m + (1. - self.beta_1) * g
            m_t_prime = m_t / (1. - m_schedule_next)
            v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
            v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
            m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
            new_p = p_t

            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        '''Return the configuration dict; entries defined here take
        precedence over same-named entries of the base class config.
        '''
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'epsilon': self.epsilon,
                  'schedule_decay': self.schedule_decay}
        base_config = super(Nadam, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
564 
565 
class TFOptimizer(Optimizer):
    '''Wrapper that exposes a native TensorFlow optimizer through the
    optimizer interface used in this module.

    # Arguments
        optimizer: object providing `compute_gradients(loss, params)` and
            `apply_gradients(grads, global_step=...)`.
    '''

    def __init__(self, optimizer):
        # NOTE(review): the base class constructor is not called here;
        # confirm that this is intentional.
        self.optimizer = optimizer
        self.iterations = K.variable(0.)
        self.updates = []

    def get_updates(self, params, constraints, loss):
        '''Return the update operations produced by the wrapped optimizer.

        # Arguments
            params: list of variables to optimize.
            constraints: must be empty; weight constraints are rejected.
            loss: tensor to minimize.

        # Returns
            `self.updates`, a one-element list holding the operation
            returned by `apply_gradients`.

        # Raises
            ValueError: if `constraints` is non-empty.
        '''
        if constraints:
            raise ValueError('TF optimizers do not support '
                             'weights constraints. Either remove '
                             'all weights constraints in your model, '
                             'or use a Keras optimizer.')
        grads = self.optimizer.compute_gradients(loss, params)
        opt_update = self.optimizer.apply_gradients(
            grads, global_step=self.iterations)
        # Rebuild the list on every call, as the other optimizers in this
        # module do. Appending to the existing list would make a second
        # call return the previous apply operation as well.
        self.updates = [opt_update]
        return self.updates

    @property
    def weights(self):
        # The state of the wrapped optimizer is not exposed.
        raise NotImplementedError

    def get_config(self):
        # Serializing the wrapped optimizer is not supported.
        raise NotImplementedError

    def from_config(self, config):
        # Rebuilding from a config is not supported.
        raise NotImplementedError
594 
595 
# Lower-case aliases of the optimizer classes.
# NOTE(review): presumably these let `get` resolve lower-case string
# identifiers through `globals()` -- confirm against `get_from_module`.
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam
604 
605 
def get(identifier, kwargs=None):
    '''Resolve `identifier` to an optimizer instance.

    # Arguments
        identifier: either a `tf.train.Optimizer` instance (only
            recognized when the backend is TensorFlow) or any value
            accepted by `get_from_module`.
        kwargs: passed through to `get_from_module`.

    # Returns
        A `TFOptimizer` wrapping `identifier` when it is a TensorFlow
        optimizer instance; otherwise the result of looking
        `identifier` up in this module's globals with
        `instantiate=True`.
    '''
    running_on_tf = K.backend() == 'tensorflow'
    if running_on_tf:
        # TensorFlow is only imported when it is the active backend.
        import tensorflow as tf
        is_native_optimizer = isinstance(identifier, tf.train.Optimizer)
        if is_native_optimizer:
            # Wrap TF optimizer instances
            return TFOptimizer(identifier)

    # Instantiate a Keras optimizer
    return get_from_module(identifier,
                           globals(),
                           'optimizer',
                           instantiate=True,
                           kwargs=kwargs)