神经机器翻译工具Nematus
程序运行流程分析
nematus/nmt.py/train (程序入口,从此函数开始分析)
- 1.1 读取源语言和目标语言词汇表
# 获取设置的超参数参数
model_options = locals().copy()
print 'Model options:',model_options
# 加载字典,并且反转
worddicts = [None]*len(dictionaries)
worddicts_r = [None]*len(dictionaries)
for ii,dd in enumerate(dictionaries):
worddicts[ii] = load_dict(dd)
worddicts_r[ii] = dict()
for kk,vv in worddicts[ii].iteritems():
worddicts_r[ii][vv] = kk
# 若词汇总大小未设置,则给定默认值为词汇表大小
if n_words_src is None:
n_words_src = len(worddicts[0])
model_options['n_words_src'] = n_words_src
if n_words_tgt is None:
n_words_tgt = len(worddicts[1])
model_options['n_words_tgt'] = n_words_tgt
- 1.2 加载训练集和开发集
# 加载数据
print 'Loading data ...'
train = TextIterator(datasets[0],datasets[1],
dictionaries[0],dictionaries[1],
n_words_source=n_words_src,
n_words_target=n_words_tgt,
batch_size=batch_size,
maxlen=maxlen,
shuffle_each_epoch=shuffle_each_epoch,
sort_by_length=sort_by_length,
maxibatch_size=maxibatch_size)
valid = TextIterator(valid_datasets[0], valid_datasets[1],
dictionaries[0], dictionaries[1],
n_words_source=n_words_src, n_words_target=n_words_tgt,
batch_size=valid_batch_size,
maxlen=maxlen)
- 1.3 初始化模型参数 init_params(model_options)
# 初始化模型参数
print 'Init parameters ...'
params = init_params(model_options)
- 1.4 重载模型,调用 load_params(saveto, params)
# 重新载入模型,当程序意外中断的时候,可以继续运行代码
if reload_ and os.path.exists(saveto):
print 'Reloading model parameters'
params = load_params(saveto,params)
- 1.5 把网络中的参数变为共享变量,变成共享变量后参数才可以更新 init_theano_params(params)
# 把网络中的W,b 变为共享变量
tparams = init_theano_params(params)
- 1.6 建立模型,即搭建计算图,定义网络前向传播过程并定义损失函数 build_model(tparams, model_options)
# 建立模型
print 'Building model ...'
trng,use_noise,x,x_mask,y,y_mask,\
opt_ret, cost, ctx, tt, _ = build_model(tparams,model_options)
inps = [x, x_mask, y, y_mask]
- 1.7 建立采样器,用于测试过程
#建立采样器
if validFreq or sampleFreq:
print 'Building sampler ...'
f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)
- 1.8 正则化操作
- 权重正则化,应用 $l_2$ 正则化
- 注意力权重(alpha)正则化:当 alpha_c > 0 且解码器类型不以 simple 结尾时启用
- MAP 正则化:当 map_decay_c > 0 时,对参数与其初始值(重载模型时即为加载的参数值)之差施加 $l_2$ 惩罚
- 权重正则化,应用 $l_2$ 正则化
# apply L2 regularization on weights
if decay_c > 0.:
decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
weight_decay = 0.
for kk, vv in tparams.iteritems():
weight_decay += (vv ** 2).sum()
weight_decay *= decay_c
cost += weight_decay #加上正则项
# regularize the alpha weights
if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
alpha_reg = alpha_c * (
(tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] -
opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
cost += alpha_reg
# apply L2 regularisation to loaded model (map training)
if map_decay_c > 0:
map_decay_c = theano.shared(numpy.float32(map_decay_c), name="map_decay_c")
weight_map_decay = 0.
for kk, vv in tparams.iteritems():
init_value = theano.shared(vv.get_value(), name= kk + "_init")
weight_map_decay += ((vv -init_value) ** 2).sum()
weight_map_decay *= map_decay_c
cost += weight_map_decay
- 1.9 计算损失函数关于网络中各个参数的梯度
print 'Computing gradient...',
grads = tensor.grad(cost, wrt=itemlist(tparams))
print 'Done'
- 1.10 应用 梯度裁剪 策略
# apply gradient clipping here
if clip_c > 0.:
g2 = 0.
for g in grads:
g2 += (g**2).sum()
new_grads = []
for g in grads:
new_grads.append(tensor.switch(g2 > (clip_c**2),
g / tensor.sqrt(g2) * clip_c,
g))
grads = new_grads
- 1.11 定义学习率标量,并建立优化器,使用优化器更新模型参数
# compile the optimizer, the actual computational graph is compiled here
lr = tensor.scalar(name='lr')
print 'Building optimizers...',
f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, profile=profile)
print 'Done'
- 1.12 开始优化过程…
重载历史,包括:更新次数 uidx,历史错误率 history_errs
#开始优化
print 'Optimization'
best_p = None
bad_counter = 0
uidx = 0
estop = False
history_errs = []
# reload history
if reload_ and os.path.exists(saveto):
rmodel = numpy.load(saveto)
history_errs = list(rmodel['history_errs'])
if 'uidx' in rmodel:
uidx = rmodel['uidx']
if validFreq == -1:
validFreq = len(train[0])/batch_size
if saveFreq == -1:
saveFreq = len(train[0])/batch_size
if sampleFreq == -1:
sampleFreq = len(train[0])/batch_size
valid_err = None
从此处开始优化过程…
- max_epochs: 表示最大的 epochs 次数
- prepare_data: 准备数据。输入 x 为列表,列表长度为 batch_size,其中每个元素为一个句子中各个词的 id 号
假设输入 $x$ 为:
则 prepare_data(x, y, maxlen=maxlen, …) 输出的 $x$ 为:
- 在上面这个例子中,矩阵中的红色0,代表词汇表中的 eos,即句尾结束符。即每个句子后面加上句尾结束符
- 函数 prepare_data 输出 x_mask为:
$$
\text{x\_mask} =
\begin{bmatrix}
1 & 1 & 1 \\
1 & 1 & 1 \\
1 & 1 & 1 \\
1 & 1 & 1 \\
1 & 1 & 1 \\
1 & 0 & 1 \\
1 & 0 & 1 \\
0 & 0 & 1
\end{bmatrix}
$$
- x_mask 的作用:GRU 网络输出隐状态后,把每个句子最后一个时刻的隐状态一直复制下去。隐状态组成一个三维数组 (x_len, batch_size, dim),因此该三维数组的最底层保存了每个句子最后一个时刻的隐状态,具体见下图。上述矩阵中红色的 1 表示:每个句子结尾的 eos 经过 GRU 输出的隐状态也要保留。
- 计算损失值和梯度,并把梯度复制到共享变量中(此时参数尚未更新):cost = f_grad_shared(x, x_mask, y, y_mask)
- 以学习率 lrate 对模型参数执行一次更新:f_update(lrate)
for eidx in xrange(max_epochs):
n_samples = 0
for x, y in train:
n_samples += len(x)
uidx += 1
use_noise.set_value(1.)
# 准备数据用于训练
x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
n_words_src=n_words_src,
n_words=n_words_tgt)
#长度小于 maxlen 的值的句子为 0
if x is None:
print 'Minibatch with zero sample under length ', maxlen
uidx -= 1
continue
ud_start = time.time()
# compute cost, grads and copy grads to shared variables
cost = f_grad_shared(x, x_mask, y, y_mask) #参数更新后,损失值
# do the update on parameters
f_update(lrate) #更新学习率
- 显示Epoch(Epoch次数), Update(更新次数), Cost(损失值), UD(执行一次更新的时间)
- 保存网络最优参数(最优参数存放在 best_p 中),并且保存 history_errs 和更新次数 uidx
- 保存当前迭代次数对应的模型参数
ud = time.time() - ud_start
# check for bad numbers, usually we remove non-finite elements
# and continue training - but not done here
if numpy.isnan(cost) or numpy.isinf(cost):
print 'NaN detected'
return 1., 1., 1.
# verbose
if numpy.mod(uidx, dispFreq) == 0:
print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
# save the best model so far, in addition, save the latest model
# into a separate file with the iteration number for external eval
if numpy.mod(uidx, saveFreq) == 0:
print 'Saving the best model...', #保存模型最优参数
if best_p is not None:
params = best_p
else:
params = unzip_from_theano(tparams)
numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)
print 'Done'
# save with uidx
if not overwrite:
print 'Saving the model at iteration {}...'.format(uidx),
saveto_uidx = '{}.iter{}.npz'.format(
os.path.splitext(saveto)[0], uidx)
numpy.savez(saveto_uidx, history_errs=history_errs,
uidx=uidx, **unzip_from_theano(tparams))
print 'Done'
- 产生当前模型参数下,翻译的结果样例
# generate some samples with the model and display them
if sampleFreq and numpy.mod(uidx, sampleFreq) == 0:
# FIXME: random selection?
for jj in xrange(numpy.minimum(5, x.shape[1])):
stochastic = True
sample, score, sample_word_probs, alignment = gen_sample([f_init], [f_next],
x[:, jj][:, None],
trng=trng, k=1,
maxlen=30,
stochastic=stochastic,
argmax=False,
suppress_unk=False)
print 'Source ', jj, ': ',
for vv in x[:,jj]:
if vv == 0:
break
if vv in worddicts_r[0]:
print worddicts_r[0][vv],
else:
print 'UNK'
print
print 'Truth ', jj, ' : ',
for vv in y[:, jj]:
if vv == 0:
break
if vv in worddicts_r[-1]:
print worddicts_r[-1][vv],
else:
print 'UNK',
print
print 'Sample ', jj, ': ',
if stochastic:
ss = sample
else:
score = score / numpy.array([len(s) for s in sample])
ss = sample[score.argmin()]
for vv in ss:
if vv == 0:
break
if vv in worddicts_r[-1]:
print worddicts_r[-1][vv],
else:
print 'UNK',
print
- 验证模型,使用 Early Stop 机制防止模型过拟合
# validate model on validation set and early stop if necessary
if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
use_noise.set_value(0.)
valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
model_options, valid)
valid_err = valid_errs.mean()
history_errs.append(valid_err)
if uidx == 0 or valid_err <= numpy.array(history_errs).min():
best_p = unzip_from_theano(tparams)
bad_counter = 0
#early stop关键所在
if len(history_errs) > patience and valid_err >= \
numpy.array(history_errs)[:-patience].min():
bad_counter += 1
if bad_counter > patience:
print 'Early Stop!'
estop = True
break
if numpy.isnan(valid_err):
ipdb.set_trace()
print 'Valid ', valid_err
if external_validation_script:
print "Calling external validation script"
print 'Saving model...',
params = unzip_from_theano(tparams)
#每次验证的时候,也会保存 uidx
numpy.savez(saveto +'.dev', history_errs=history_errs, uidx=uidx, **params)
json.dump(model_options, open('%s.dev.npz.json' % saveto, 'wb'), indent=2)
print 'Done'
p = Popen([external_validation_script])
- 验证是否达到最大的更新次数,若达到停止更新。输出验证集上最终的误差,保存最优的模型。
# finish after this many updates
if uidx >= finish_after:
print 'Finishing after %d iterations!' % uidx
estop = True
break
print 'Seen %d samples' % n_samples
if estop:
break
if best_p is not None:
zip_to_theano(best_p, tparams)
if valid:
use_noise.set_value(0.)
valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
model_options, valid)
valid_err = valid_errs.mean()
print 'Valid ', valid_err
if best_p is not None:
params = copy.copy(best_p)
else:
params = unzip_from_theano(tparams)
numpy.savez(saveto, zipped_params=best_p,
history_errs=history_errs,
uidx=uidx,
**params)
return valid_err