总结一下目前所学的关于keras.callbacks中的回调函数。
参考https://github.com/ChihebTrabelsi/deep_complex_networks
- 回调函数是一组在训练的特定阶段被调用的函数集,你可以使用回调函数来观察训练过程中网络内部的状态和统计信息。
- 虽然我们称之为回调“函数”,但事实上Keras的回调函数是一个类
keras.callbacks内置类函数
keras.callbacks.Callback()
这是回调函数的抽象类,定义新的回调函数必须继承自该类
属性和参数见https://keras.io/zh/callbacks/
keras.callbacks.LearningRateScheduler(schedule)
该回调函数是用于动态设置学习率
参数:
● schedule:函数,该函数以epoch号为参数(从0算起的整数),返回一个新学习率(浮点数)
示例:
def schedule(epoch):
    """Return the learning rate for *epoch* (0-based integer).

    Piecewise-constant schedule:
      [0, 10)    -> 0.01
      [10, 100)  -> 0.1
      [100, 120) -> 0.01
      [120, 150) -> 0.001
      [150, inf) -> 0.0001

    Logs the rate once at the first epoch of each segment. Intended for
    use with ``keras.callbacks.LearningRateScheduler``.
    """
    # (segment start epoch, learning rate), in increasing order of start.
    segments = [(0, 0.01), (10, 0.1), (100, 0.01), (120, 0.001), (150, 0.0001)]
    # Default to the first segment's rate; this also covers epoch < 0,
    # where the original code left `lrate` unbound and raised NameError.
    lrate = segments[0][1]
    for start, rate in segments:
        if epoch >= start:
            lrate = rate
    # Announce the new rate only when a segment boundary is crossed.
    if epoch in (0, 10, 100, 120, 150):
        L.getLogger("train").info("Current learning rate value is "+str(lrate))
    return lrate
# Learning-rate scheduler callback: Keras invokes schedule(epoch) at the
# start of every epoch and applies the returned rate to the optimizer.
scheduler = LearningRateScheduler(schedule)
自定义的回调函数,继承keras.callbacks.Callback()
TestErrorCallback(Callback)
评价在每一轮末在测试集上的性能表现
# Also evaluate performance on test set at each epoch end.
#
class TestErrorCallback(Callback):
    """Evaluate the model on a held-out test set at the end of every epoch.

    Args:
        test_data: ``(x, y)`` tuple fed to ``model.evaluate``.

    Accumulates per-epoch test metrics in ``loss_history`` and
    ``acc_history`` and logs train/val/test metrics together.
    """

    def __init__(self, test_data):
        # Bug fix: the base-class initializer was never called, so Keras'
        # Callback bookkeeping state was left uninitialized.
        super(TestErrorCallback, self).__init__()
        self.test_data = test_data
        self.loss_history = []  # test loss, one entry per epoch
        self.acc_history = []   # test accuracy, one entry per epoch

    def on_epoch_end(self, epoch, logs=None):
        # Avoid the mutable default argument; Keras always passes a dict.
        logs = logs or {}
        x, y = self.test_data
        L.getLogger("train").info("Epoch {:5d} Evaluating on test set...".format(epoch+1))
        test_loss, test_acc = self.model.evaluate(x, y, verbose=0)
        L.getLogger("train").info("                                            complete.")
        self.loss_history.append(test_loss)
        self.acc_history.append(test_acc)
        L.getLogger("train").info("Epoch {:5d} train_loss: {}, train_acc: {}, val_loss: {}, val_acc: {}, test_loss: {}, test_acc: {}".format(
            epoch+1,
            logs["loss"], logs["acc"],
            logs["val_loss"], logs["val_acc"],
            test_loss, test_acc))
LrDivisor(Callback)
连续若干轮验证集监测指标不变,则降低学习率
# LrDivisor. To use:
#
# lrDivisorCb = LrDivisor(patience = float(50000),
# division_cst = 10.0,
# verbose = 1,
# epoch_checkpoints = {75})
#
class LrDivisor(Callback):
    """Divide the learning rate when val_acc plateaus or at fixed epochs.

    Args:
        patience: number of consecutive epochs with (nearly) unchanged
            ``val_acc`` before the rate is divided.
        division_cst: divisor applied to the learning rate.
        epsilon: tolerance used to decide "unchanged" val_acc.
        verbose: if > 0, log every division.
        epoch_checkpoints: 1-based epoch numbers at which the rate is
            divided unconditionally. (Mutable default kept for interface
            compatibility; it is never mutated.)
    """

    def __init__(self, patience=float(50000), division_cst=10.0, epsilon=1e-03, verbose=1, epoch_checkpoints={41, 61}):
        # Bug fix: the original called super(Callback, self).__init__(),
        # which resolves to object.__init__ and SKIPS Callback.__init__.
        super(LrDivisor, self).__init__()
        self.patience = patience          # epochs without improvement tolerated
        self.checkpoints = epoch_checkpoints
        self.wait = 0                     # consecutive plateau epochs seen so far
        self.previous_score = 0.
        self.division_cst = division_cst
        self.epsilon = epsilon            # fuzz factor for "unchanged" comparison
        self.verbose = verbose
        self.iterations = 0               # total batches seen across training

    def on_batch_begin(self, batch, logs=None):
        self.iterations += 1

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current_score = logs.get('val_acc')
        divide = False
        if (epoch + 1) in self.checkpoints:
            # Unconditional division at the configured milestone epochs.
            divide = True
        elif (current_score >= self.previous_score - self.epsilon and current_score <= self.previous_score + self.epsilon):
            # val_acc within +/- epsilon of last epoch: count it as a plateau.
            self.wait += 1
            if self.wait == self.patience:
                divide = True
        else:
            self.wait = 0
        if divide:
            # NOTE(review): `.get_value()` on the lr variable is Theano-era
            # Keras API; K.get_value(...) would be backend-agnostic.
            K.set_value(self.model.optimizer.lr, self.model.optimizer.lr.get_value() / self.division_cst)
            self.wait = 0
            if self.verbose > 0:
                L.getLogger("train").info("Current learning rate is divided by"+str(self.division_cst) + ' and his values is equal to: ' + str(self.model.optimizer.lr.get_value()))
        self.previous_score = current_score
TrainValHistory(Callback)
把每一轮训练的性能记录下来
#
# Keep a history of the validation performance.
#
class TrainValHistory(Callback):
    """Record per-epoch training and validation loss/accuracy.

    After training, ``train_loss``/``train_acc``/``val_loss``/``val_acc``
    each hold one value per completed epoch.
    """

    def __init__(self):
        # Bug fix: base-class initializer was never called.
        super(TrainValHistory, self).__init__()
        self.train_loss = []
        self.train_acc = []
        self.val_loss = []
        self.val_acc = []

    def on_epoch_end(self, epoch, logs=None):
        # Avoid the mutable default argument; Keras always passes a dict.
        logs = logs or {}
        self.train_loss.append(logs.get('loss'))
        self.train_acc.append(logs.get('acc'))
        self.val_loss.append(logs.get('val_loss'))
        self.val_acc.append(logs.get('val_acc'))
SaveLastModel(Callback)
每一个epoch保存一个checkpoint,保留一个最新记录的checkpoint-ModelChkpt.hdf5
#
# Save checkpoints.
#
class SaveLastModel(Callback):
    """Periodically checkpoint the model and keep a stable 'latest' link.

    Every ``period`` epochs, saves the model architecture (YAML) and
    weights+state (HDF5) under ``<workdir>/chkpts``, stamps the HDF5 file
    with the epoch to resume from, and atomically repoints the symlink
    ``ModelChkpt.hdf5`` at the newest checkpoint.
    """

    def __init__(self, workdir, period=10):
        # Bug fix: base-class initializer was never called.
        super(SaveLastModel, self).__init__()
        self.workdir = workdir
        self.chkptsdir = os.path.join(self.workdir, "chkpts")  # checkpoint directory
        if not os.path.isdir(self.chkptsdir):
            os.mkdir(self.chkptsdir)
        self.period_of_epochs = period
        # Symlink that always points at the most recent checkpoint.
        self.linkFilename = os.path.join(self.chkptsdir, "ModelChkpt.hdf5")

    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.period_of_epochs == 0:
            # Filenames
            baseHDF5Filename = "ModelChkpt{:06d}.hdf5".format(epoch+1)
            baseYAMLFilename = "ModelChkpt{:06d}.yaml".format(epoch+1)
            hdf5Filename = os.path.join(self.chkptsdir, baseHDF5Filename)
            yamlFilename = os.path.join(self.chkptsdir, baseYAMLFilename)
            # YAML: architecture only.
            yamlModel = self.model.to_yaml()
            with open(yamlFilename, "w") as yamlFile:
                yamlFile.write(yamlModel)
            # HDF5: full model; stamp it with the epoch to resume from.
            KM.save_model(self.model, hdf5Filename)
            with H.File(hdf5Filename, "r+") as f:
                f.require_dataset("initialEpoch", (), "uint64", True)[...] = int(epoch+1)
                f.flush()  # force data to disk before we repoint the link
            # Symlink to new HDF5 file, then atomically rename and replace.
            tmpLink = self.linkFilename + ".rename"
            # Robustness fix: a stale temp link left by a crashed run would
            # make os.symlink raise FileExistsError; remove it first.
            if os.path.lexists(tmpLink):
                os.remove(tmpLink)
            os.symlink(baseHDF5Filename, tmpLink)
            os.rename(tmpLink, self.linkFilename)
            # Print
            L.getLogger("train").info("Saved checkpoint to {:s} at epoch {:5d}".format(hdf5Filename, epoch+1))
SaveBestModel(Callback)
保存目前最佳模型
#
# Save record-best models.
#
class SaveBestModel(Callback):
    """Save the model whenever it sets a new accuracy record.

    Keeps YAML (architecture) and HDF5 (full model) snapshots under
    ``<workdir>/best``, named with epoch, accuracy and loss, and stamps
    the HDF5 file with the epoch to resume from.

    NOTE(review): despite the original ``val_``-prefixed local names,
    this reads ``logs['loss']`` / ``logs['acc']`` — the TRAINING metrics,
    not validation ones. Behavior is preserved here; confirm whether
    ``val_loss``/``val_acc`` were actually intended.
    """

    def __init__(self, workdir):
        # Bug fix: base-class initializer was never called.
        super(SaveBestModel, self).__init__()
        self.workdir = workdir
        self.bestdir = os.path.join(self.workdir, "best")
        if not os.path.isdir(self.bestdir):
            os.mkdir(self.bestdir)
        self.best_acc = 0             # best accuracy seen so far
        self.best_loss = +np.inf      # loss at the best-accuracy epoch

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        # Renamed from val_loss/val_acc: these are the training metrics.
        curr_loss = logs['loss']
        curr_acc = logs['acc']
        if curr_acc > self.best_acc:
            self.best_acc = curr_acc
            self.best_loss = curr_loss
            # Filenames
            hdf5Filename = os.path.join(self.bestdir, "Bestmodel_{:06d}_{:.4f}_{:.4f}.hdf5".format(epoch+1, curr_acc, curr_loss))
            yamlFilename = os.path.join(self.bestdir, "Bestmodel_{:06d}_{:.4f}_{:.4f}.yaml".format(epoch+1, curr_acc, curr_loss))
            # YAML: architecture only.
            yamlModel = self.model.to_yaml()
            with open(yamlFilename, "w") as yamlFile:
                yamlFile.write(yamlModel)
            # HDF5: full model; stamp it with the epoch to resume from.
            KM.save_model(self.model, hdf5Filename)
            with H.File(hdf5Filename, "r+") as f:
                f.require_dataset("initialEpoch", (), "uint64", True)[...] = int(epoch+1)
                f.flush()
            # Print
            L.getLogger("train").info("Saved best model to {:s} at epoch {:5d}".format(hdf5Filename, epoch+1))