# 关于warm_up学习率

| dataset      | number | batch | warm_up_iter        | warm_up_epoch         |
|--------------|--------|-------|---------------------|-----------------------|
| COCO17       | 118287 | 8*8   | 4000                | 4000*64/118287 = 2.16 |
| paiti_v1v2v3 | 22720  | 8*8   | 2.16*22720/64 = 766 | 2.16                  |
• 关于warm-up：tf的models里面提到warm-up为5 epoch，所以上面计算的2.16 epoch相对合理。
warmup: Run a 5 epoch warmup to the initial lr.

• warm_up核心代码
    learning_rate = cfg.learning_rate
boundaries = cfg.lr_steps	# _C.lr_steps = [65000, 68000]
gamma = cfg.lr_gamma	#_C.lr_gamma = 0.1
step_num = len(cfg.lr_steps)
# values = [lr*0.1^0, lr*0.1^1, lr*0.1^2]
values = [learning_rate * (gamma**i) for i in range(step_num + 1)]	#_C.lr_gamma = 0.1

optimizer = fluid.optimizer.Momentum(
learning_rate=exponential_with_warmup_decay(
learning_rate=learning_rate,
boundaries=boundaries,
values=values,
warmup_iter=cfg.warm_up_iter,
warmup_factor=cfg.warm_up_factor),
regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
momentum=cfg.momentum)

def exponential_with_warmup_decay(learning_rate, boundaries, values,
warmup_iter, warmup_factor):
global_step = lr_scheduler._decay_step_counter()

lr = fluid.layers.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="learning_rate")

warmup_iter_var = fluid.layers.fill_constant(
shape=[1], dtype='float32', value=float(warmup_iter), force_cpu=True)

with control_flow.Switch() as switch:
with switch.case(global_step < warmup_iter_var):
alpha = global_step / warmup_iter_var
# factor range:  [warmup_factor, alpha_final] -> [warmup_factor, 1]
factor = warmup_factor * (1 - alpha) + alpha
# decayed_lr: [lr*warmup_factor, lr]，在这里即实现了warm_up
decayed_lr = learning_rate * factor
fluid.layers.assign(decayed_lr, lr)

for i in range(len(boundaries)):	# len(boundaries))=2
boundary_val = fluid.layers.fill_constant(
shape=[1],
dtype='float32',
value=float(boundaries[i]),
force_cpu=True)
value_var = fluid.layers.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
fluid.layers.assign(value_var, lr)

last_value_var = fluid.layers.fill_constant(
shape=[1], dtype='float32', value=float(values[len(values) - 1]))
with switch.default():
fluid.layers.assign(last_value_var, lr)

return lr


`