The Neural Machine Translation Tool Nematus
An Analysis of the Tricks in Nematus
1. How does build_model work? How does build_encoder work? How is the target-side sentence processed?
- build_model builds the full model: first the encoder, then the decoder
- Build the encoder: x,ctx = build_encoder(tparams,options,trng,use_noise,x_mask,sampling=False)
# build a training model
def build_model(tparams, options):
    """
    @function: build the model
    """
    opt_ret = dict()
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')
    # build the encoder
    x, ctx = build_encoder(tparams, options, trng, use_noise, x_mask, sampling=False)
- Initialize the decoder's initial hidden state $s_0$
n_samples = x.shape[1]
n_timesteps_trg = y.shape[0]
if options['use_dropout']:
retain_probability_emb = 1-options['dropout_embedding']
retain_probability_hidden = 1-options['dropout_hidden']
retain_probability_target = 1-options['dropout_target']
if options['model_version'] < 0.1:
scaled = False
else:
scaled = True
rec_dropout_d = shared_dropout_layer((5, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
emb_dropout_d = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
ctx_dropout_d = shared_dropout_layer((4, n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)
target_dropout = shared_dropout_layer((n_timesteps_trg, n_samples, 1), use_noise, trng, retain_probability_target, scaled)
target_dropout = tensor.tile(target_dropout, (1,1,options['dim_word']))
else:
rec_dropout_d = theano.shared(numpy.array([1.]*5, dtype='float32'))
emb_dropout_d = theano.shared(numpy.array([1.]*2, dtype='float32'))
ctx_dropout_d = theano.shared(numpy.array([1.]*4, dtype='float32'))
    # the decoder is built from here on
    # mean of the context (across time) will be used to initialize the decoder rnn
    ctx_mean = (ctx*x_mask[:,:,None]).sum(0) / x_mask.sum(0)[:,None]  ### trick: masked mean over time
    # or you can use the last state of forward+backward encoder rnns
    # ctx_mean = concatenate([proj[0][-1],projr[0][-1]],axis=proj[0].ndim-2)
    if options['use_dropout']:
        ctx_mean *= shared_dropout_layer((n_samples,2*options['dim']),use_noise,trng,retain_probability_hidden,scaled)
    # initial decoder state
    init_state = fflayer(tparams,ctx_mean,options,
                         prefix='ff_state',activ='tanh')
- Here the target-side sentence is processed: the start symbol (eos) is effectively prepended by shifting the sequence one step to the right
# word embedding (target), we will shift the target sequence one time step
# to the right. This is done because of the bi-gram connections in the
# readout and decoder rnn. The first target will be all zeros and we will
# not condition on the last output.
emb = tparams['Wemb_dec'][y.flatten()]
emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    # note this trick: the first time step is set to all zeros, which amounts to prepending the start symbol (eos) to the target sentence
emb_shifted = tensor.zeros_like(emb)
emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
emb = emb_shifted
if options['use_dropout']:
emb *= target_dropout
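The effect of the shift is easy to check with a small NumPy sketch (toy shapes, illustrative only):

import numpy as np

emb = np.arange(2*3*4, dtype='float32').reshape(2, 3, 4)  # (n_timesteps_trg, n_samples, dim_word), toy values
emb_shifted = np.zeros_like(emb)
emb_shifted[1:] = emb[:-1]    # same effect as tensor.set_subtensor(emb_shifted[1:], emb[:-1])
print(emb_shifted[0])         # all zeros: the decoder's first input is the "start symbol"
print((emb_shifted[1] == emb[0]).all())  # True: step j is conditioned on y_{j-1}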
- Enter the decoder RNN. Its outputs are proj_h: the decoder hidden states $s_j$; ctxs: the context vectors $c_j$; opt_ret['dec_alphas']: the alignment weights $\alpha_{ij}$
# decoder - pass through the decoder conditional gru with attention
proj = gru_cond_layer(tparams, emb, options,
prefix='decoder',
mask=y_mask, context=ctx,
context_mask=x_mask,
one_step=False,
init_state=init_state,
emb_dropout=emb_dropout_d,
ctx_dropout=ctx_dropout_d,
rec_dropout=rec_dropout_d,
profile=profile)
# hidden states of the decoder gru
proj_h = proj[0]
# weighted averages of context, generated by attention module
ctxs = proj[1]
if options['use_dropout']:
proj_h *= shared_dropout_layer((n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
emb *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
ctxs *= shared_dropout_layer((n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)
# weights (alignment matrix) #####LIUCAN: this is where the attention vector is.
opt_ret['dec_alphas'] = proj[2]
- Here we compute $t_j = \tanh(U_0 s_{j-1} + V_0 E y_{j-1} + C_0 c_j)$, with the dimensions:
- $U_0 \to (m,n)$, $s_{j-1} \to (n,1)$
- $V_0 \to (m,m)$, $E \to (m,K_y)$, $y_{j-1} \to (K_y,1)$
- $C_0 \to (m,2n)$, $c_j \to (2n,1)$
# compute word probabilities
logit_lstm = fflayer(tparams, proj_h, options,
prefix='ff_logit_lstm', activ='linear')
logit_prev = fflayer(tparams, emb, options,
prefix='ff_logit_prev', activ='linear')
logit_ctx = fflayer(tparams, ctxs, options,
prefix='ff_logit_ctx', activ='linear')
logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
if options['use_dropout']:
logit *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_hidden, scaled)
    # produce t_j, which is later used to obtain the quality vector
tt = logit
- Here we compute the conditional probability of the target word $y_j$:
$$p(y_j \mid \{y_1,\dots,y_{j-1}\}, x) = \frac{\exp(y_j^T W_o t_j)}{\sum_{k=1}^{K_y} \exp(y_k^T W_o t_j)}$$
- $W_o \to (K_y, m)$
    logit = fflayer(tparams, logit, options,
                    prefix='ff_logit', activ='linear')  # ff_logit_b is not needed during joint training
logit_shp = logit.shape
probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1],
logit_shp[2]])) # (70*80,40000)
- Here the cost function is defined. Training maximizes the average log-likelihood
$$\max_\theta \frac{1}{N}\sum_{n=1}^{N} \log p_\theta(y^{(n)} \mid x^{(n)})$$
which gives the cost
$$cost = -\frac{1}{N}\sum_{n=1}^{N}\sum_{j=1}^{T_y} \log p\big(y_j^{(n)} \mid \{y_1^{(n)},\dots,y_{j-1}^{(n)}\},\, x^{(n)}\big)$$
# cost
y_flat = y.flatten() # (70*80,1)
y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words_tgt'] + y_flat # (70*80,1)
cost = -tensor.log(probs.flatten()[y_flat_idx]) # (70*80, 1)
cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)  # this yields the (negative) log-likelihood of each sentence
return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, ctx, tt, proj_h
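The indexing trick in the cost is worth unpacking: probs is flattened into one long vector, and y_flat_idx addresses the probability of each gold token directly. A minimal NumPy sketch with a hypothetical vocabulary size:

import numpy as np

n_words_tgt = 5                            # toy target vocabulary size
probs = np.random.rand(6, n_words_tgt)     # (n_timesteps*n_samples, vocab)
probs /= probs.sum(1, keepdims=True)       # rows sum to 1, as after softmax
y_flat = np.array([2, 0, 4, 1, 3, 2])      # flattened gold token ids
y_flat_idx = np.arange(y_flat.shape[0]) * n_words_tgt + y_flat
picked = probs.flatten()[y_flat_idx]       # probability assigned to each gold token
assert np.allclose(picked, probs[np.arange(6), y_flat])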
- build_encoder: building the encoder
# bidirectional RNN encoder: takes input x (optionally with a mask) and produces a sequence of context vectors
def build_encoder(tparams, options, trng, use_noise, x_mask=None, sampling=False):
    x = tensor.matrix('x', dtype='int64')
    x.tag.test_value = (numpy.random.rand(5, 10)*100).astype('int64')
    # for the backward rnn, we just need to invert x
    xr = x[::-1]  # note: this reverses time, unlike xr = x[:,::-1]
    if x_mask is None:  # at test time
        xr_mask = None
    else:
        xr_mask = x_mask[::-1]
    # number of time steps and number of samples
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]
    # whether to use dropout
if options['use_dropout']:
retain_probability_emb = 1-options['dropout_embedding']
retain_probability_hidden = 1-options['dropout_hidden']
retain_probability_source = 1-options['dropout_source']
if sampling:
if options['model_version'] < 0.1:
rec_dropout = theano.shared(numpy.array([retain_probability_hidden]*2, dtype='float32'))
rec_dropout_r = theano.shared(numpy.array([retain_probability_hidden]*2, dtype='float32'))
emb_dropout = theano.shared(numpy.array([retain_probability_emb]*2, dtype='float32'))
emb_dropout_r = theano.shared(numpy.array([retain_probability_emb]*2, dtype='float32'))
source_dropout = theano.shared(numpy.float32(retain_probability_source))
else:
rec_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
rec_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
emb_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
emb_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
source_dropout = theano.shared(numpy.float32(1.))
else:
if options['model_version'] < 0.1:
scaled = False
else:
scaled = True
rec_dropout = shared_dropout_layer((2, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
rec_dropout_r = shared_dropout_layer((2, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
emb_dropout_r = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
source_dropout = shared_dropout_layer((n_timesteps, n_samples, 1), use_noise, trng, retain_probability_source, scaled)
source_dropout = tensor.tile(source_dropout, (1,1,options['dim_word']))
else:
rec_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
rec_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
emb_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
emb_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
# word embedding for forward rnn (source)
    emb = tparams['Wemb'][x.flatten()]  # note the difference here
emb = emb.reshape([n_timesteps,n_samples,options['dim_word']])
if options['use_dropout']:
        emb *= source_dropout  # dropout is applied here
proj = gru_layer(tparams,emb,options,
prefix='encoder',
mask=x_mask,
emb_dropout=emb_dropout,
rec_dropout=rec_dropout,
profile=profile)
# word embedding for backward rnn (source)
embr = tparams['Wemb'][xr.flatten()]
embr = embr.reshape([n_timesteps,n_samples,options['dim_word']])
if options['use_dropout']:
if sampling:
embr *= source_dropout
else:
embr *= source_dropout[::-1]
projr = gru_layer(tparams,embr,options,
prefix='encoder_r',
mask=xr_mask,
emb_dropout=emb_dropout_r,
rec_dropout=rec_dropout,
profile=profile)
#context will be the concatenation of forward and backward rnns
ctx = concatenate([proj[0],projr[0][::-1]],axis=proj[0].ndim-1)
return x,ctx
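The final concatenation can be checked with NumPy (toy shapes): the backward states arrive in reversed time order, so they are flipped back before being glued onto the forward states along the feature axis.

import numpy as np

T, B, d = 4, 2, 3                  # toy: time steps, batch size, hidden dim
h_fwd = np.random.rand(T, B, d)    # proj[0]: forward hidden states
h_bwd = np.random.rand(T, B, d)    # projr[0]: backward states, reversed in time
ctx = np.concatenate([h_fwd, h_bwd[::-1]], axis=-1)
print(ctx.shape)                   # (4, 2, 6): each position sees both directions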
2. The early stopping mechanism?
- Every 10,000 updates, compute the loss on the validation set and append it to the history_errs list. patience is set to 10: the current validation error is compared with the minimum of all errors except the most recent patience ones; if valid_err >= numpy.array(history_errs)[:-patience].min(), then bad_counter += 1, and once bad_counter > patience the updates stop: Early Stop!
valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
model_options, valid)
valid_err = valid_errs.mean()
history_errs.append(valid_err)
if uidx == 0 or valid_err <= numpy.array(history_errs).min():
best_p = unzip_from_theano(tparams)
bad_counter = 0
#the key to early stopping
if len(history_errs) > patience and valid_err >= \
numpy.array(history_errs)[:-patience].min():
bad_counter += 1
if bad_counter > patience:
print 'Early Stop!'
estop = True
break
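Stripped of the Theano machinery, the patience logic reduces to a few lines of plain Python (a minimal sketch; validation_errors is a hypothetical stream of validation-set losses):

history_errs = []
patience, bad_counter = 10, 0
for uidx, valid_err in enumerate(validation_errors):
    history_errs.append(valid_err)
    if uidx == 0 or valid_err <= min(history_errs):
        bad_counter = 0    # new best model: reset the counter
    if len(history_errs) > patience and valid_err >= min(history_errs[:-patience]):
        bad_counter += 1   # no improvement over the best seen before the last patience checks
        if bad_counter > patience:
            print('Early Stop!')
            break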
3. A detailed look at the decoder in Nematus?
The decoder uses a single-layer unidirectional RNN whose basic unit is the GRU; it could of course be swapped for an LSTM.
The LSTM is computed as:
$$\begin{aligned} f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f) \\ i_t &= \sigma(W_i x_t + U_i h_{t-1} + b_i) \\ o_t &= \sigma(W_o x_t + U_o h_{t-1} + b_o) \\ \tilde{C}_t &= \tanh(W_c x_t + U_c h_{t-1} + b_c) \\ C_t &= f_t * C_{t-1} + i_t * \tilde{C}_t \\ h_t &= o_t * \tanh(C_t) \end{aligned}$$
The GRU is computed as:
$$\begin{aligned} z_t &= \sigma(W_z x_t + U_z h_{t-1} + b_z) \\ r_t &= \sigma(W_r x_t + U_r h_{t-1} + b_r) \\ \tilde{h}_t &= \tanh(W_h x_t + U_h [r_t * h_{t-1}] + b_h) \\ h_t &= (1-z_t) * h_{t-1} + z_t * \tilde{h}_t \end{aligned}$$
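These equations translate directly into NumPy; below is a minimal single-step GRU sketch with toy dimensions (all names here are illustrative, not Nematus code):

import numpy as np

def sigmoid(a):
    return 1. / (1. + np.exp(-a))

def gru_step(x_t, h_prev, Wz, Uz, bz, Wr, Ur, br, Wh, Uh, bh):
    # one GRU step, following the equations above
    z = sigmoid(np.dot(Wz, x_t) + np.dot(Uz, h_prev) + bz)  # update gate
    r = sigmoid(np.dot(Wr, x_t) + np.dot(Ur, h_prev) + br)  # reset gate
    h_tilde = np.tanh(np.dot(Wh, x_t) + np.dot(Uh, r * h_prev) + bh)
    return (1. - z) * h_prev + z * h_tilde

m, n = 3, 4                        # toy input and hidden dimensions
rng = np.random.RandomState(0)
Wz, Wr, Wh = (rng.randn(n, m) for _ in range(3))
Uz, Ur, Uh = (rng.randn(n, n) for _ in range(3))
bz, br, bh = (np.zeros(n) for _ in range(3))
h = gru_step(rng.randn(m), np.zeros(n), Wz, Uz, bz, Wr, Ur, br, Wh, Uh, bh)
print(h.shape)                     # (4,)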
- state_below_: computes the gate inputs $W_z x_t + b_z$ and $W_r x_t + b_r$ in one concatenated product
- state_belowx: computes $W_h x_t + b_h$
def gru_cond_layer(tparams, state_below, options, prefix='gru',
mask=None, context=None, one_step=False,
init_memory=None, init_state=None,
context_mask=None, emb_dropout=None,
rec_dropout=None, ctx_dropout=None,
profile=False,
**kwargs):
"""
@function:解码器GRU层的计算
"""
assert context, 'Context must be provided'
if one_step:
assert init_state, 'previous state must be provided'
nsteps = state_below.shape[0]
if state_below.ndim == 3:
n_samples = state_below.shape[1]
else:
n_samples = 1
# mask
if mask is None:
mask = tensor.alloc(1., state_below.shape[0], 1)
dim = tparams[pp(prefix, 'Wcx')].shape[1]
# initial/previous state
if init_state is None:
init_state = tensor.alloc(0., n_samples, dim)
# projected context
assert context.ndim == 3, \
'Context must be 3-d: #annotation x #sample x dim'
pctx_ = tensor.dot(context*ctx_dropout[0], tparams[pp(prefix, 'Wc_att')]) +\
tparams[pp(prefix, 'b_att')]
def _slice(_x, n, dim):
if _x.ndim == 3:
return _x[:, :, n*dim:(n+1)*dim]
return _x[:, n*dim:(n+1)*dim]
# projected x
state_belowx = tensor.dot(state_below*emb_dropout[0], tparams[pp(prefix, 'Wx')]) +\
tparams[pp(prefix, 'bx')]
state_below_ = tensor.dot(state_below*emb_dropout[1], tparams[pp(prefix, 'W')]) +\
tparams[pp(prefix, 'b')]
- _step_slice: the loop body
- theano.scan(): implements the recurrence
Arguments of theano.scan:
- sequences: at every step, one slice (the current row) of each of these tensors is fed to the step function
- outputs_info: the recurrent outputs, updated and returned at each step (the hidden state $s_{j-1}$, the context vector $c_j$, and the alignment weights $\alpha_{ij}$)
- non_sequences: passed in as constants and never updated
- First GRU transition: $\tilde{s}_{j-1} = \mathrm{GRU}_1(s_{j-1}, E y_{j-1})$
def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_, rec_dropout, ctx_dropout,
U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
U_n1, Ux_n1, b_n1, bx_n1):
preact1 = tensor.dot(h_*rec_dropout[0], U)
preact1 += x_
preact1 = tensor.nnet.sigmoid(preact1)
        r1 = _slice(preact1, 0, dim)   # reset gate
        u1 = _slice(preact1, 1, dim)   # update gate
        preactx1 = tensor.dot(h_*rec_dropout[1], Ux)  # compute h_tilde
        preactx1 *= r1
        preactx1 += xx_
        h1 = tensor.tanh(preactx1)     # the hidden state at the current step
        h1 = u1 * h_ + (1. - u1) * h1
        h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_
- Attention energies: $e_{ij} = v_a^T \tanh(W_a \tilde{s}_{j-1} + U_a h_i)$, normalized as $\alpha_{ij} = \frac{\exp(e_{ij})}{\sum_{k=1}^{T_x} \exp(e_{kj})}$
- Context vector: $c_j = \sum_{i=1}^{T_x} \alpha_{ij} h_i$
- Second GRU transition: $s_j = \mathrm{GRU}_2(\tilde{s}_{j-1}, c_j)$
(Two figures illustrating the conditional GRU with attention are omitted here; see https://arxiv.org/abs/1610.05011.)
# attention
pstate_ = tensor.dot(h1*rec_dropout[2], W_comb_att)
pctx__ = pctx_ + pstate_[None, :, :]
#pctx__ += xc_
pctx__ = tensor.tanh(pctx__)
alpha = tensor.dot(pctx__*ctx_dropout[1], U_att)+c_tt
alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
        alpha = tensor.exp(alpha - alpha.max(0, keepdims=True))  # subtract the max for numerical stability; differs from dl4mt
if context_mask:
alpha = alpha * context_mask
alpha = alpha / alpha.sum(0, keepdims=True)
ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context
preact2 = tensor.dot(h1*rec_dropout[3], U_n1)+b_n1
preact2 += tensor.dot(ctx_*ctx_dropout[2], Wc)
preact2 = tensor.nnet.sigmoid(preact2)
r2 = _slice(preact2, 0, dim)
u2 = _slice(preact2, 1, dim)
preactx2 = tensor.dot(h1*rec_dropout[4], Ux_n1)+bx_n1
preactx2 *= r2
preactx2 += tensor.dot(ctx_*ctx_dropout[3], Wcx)
h2 = tensor.tanh(preactx2)
h2 = u2 * h1 + (1. - u2) * h2
h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
return h2, ctx_, alpha.T # pstate_, preact, preactx, r, u
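The alpha computation above is a numerically stable, masked softmax: subtracting the column-wise max prevents overflow in exp, and the source mask zeroes out padded positions before normalization. In NumPy (toy shapes, T_x source positions by batch):

import numpy as np

e = np.random.randn(5, 2) * 50                 # attention energies, (T_x, batch); deliberately large
mask = np.array([[1, 1, 1, 1, 0],
                 [1, 1, 0, 0, 0]], dtype='float32').T   # source padding mask, (T_x, batch)
alpha = np.exp(e - e.max(0, keepdims=True))    # stable exponentiation
alpha *= mask                                  # padded positions get zero weight
alpha /= alpha.sum(0, keepdims=True)           # normalize over source positions
print(alpha.sum(0))                            # [1. 1.]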
- Call theano.scan() to run the recurrence
seqs = [mask, state_below_, state_belowx]
#seqs = [mask, state_below_, state_belowx, state_belowc]
_step = _step_slice
shared_vars = [tparams[pp(prefix, 'U')],
tparams[pp(prefix, 'Wc')],
tparams[pp(prefix, 'W_comb_att')],
tparams[pp(prefix, 'U_att')],
tparams[pp(prefix, 'c_tt')],
tparams[pp(prefix, 'Ux')],
tparams[pp(prefix, 'Wcx')],
tparams[pp(prefix, 'U_n1')],
tparams[pp(prefix, 'Ux_n1')],
tparams[pp(prefix, 'b_n1')],
tparams[pp(prefix, 'bx_n1')]]
if one_step:
rval = _step(*(seqs + [init_state, None, None, pctx_, context, rec_dropout, ctx_dropout] +
shared_vars))
else:
rval, updates = theano.scan(_step,
sequences=seqs,
                                    outputs_info=[init_state,  # the initial values themselves are not returned
tensor.alloc(0., n_samples,
context.shape[2]),
tensor.alloc(0., n_samples,
context.shape[0])],
non_sequences=[pctx_, context, rec_dropout, ctx_dropout]+shared_vars,
name=pp(prefix, '_layers'),
n_steps=nsteps,
profile=profile,
strict=True)
return rval
- The context argument receives the encoder annotations $h_i$
# decoder - pass through the decoder conditional gru with attention
proj = gru_cond_layer(tparams, emb, options,
prefix='decoder',
mask=y_mask, context=ctx,
context_mask=x_mask,
one_step=False,
init_state=init_state,
emb_dropout=emb_dropout_d,
ctx_dropout=ctx_dropout_d,
rec_dropout=rec_dropout_d,
profile=profile)
4. Gradient clipping?
Gradient clipping is introduced to deal with exploding (and, indirectly, vanishing) gradients. If the weights are updated too aggressively in a single iteration, the loss easily diverges. Intuitively, gradient clipping keeps each weight update within a reasonable range.
The details:
- 1. Set a threshold clip_gradient in the solver.
- 2. After the forward and backward passes we have a gradient diff for each weight. Instead of applying these gradients directly, first compute the sum of squares of all weight gradients, sumsq_diff. If sumsq_diff > clip_gradient, compute a scaling factor scale_factor = clip_gradient / sumsq_diff, which lies in (0,1): the larger sumsq_diff is, the smaller the factor.
- 3. Multiply every weight gradient by this scaling factor; the result is the gradient actually used for the update.
This guarantees that, within one update, the sum of squared gradients of all weights stays within the preset range clip_gradient.
# apply gradient clipping here
if clip_c > 0.:
g2 = 0.
for g in grads:
g2 += (g**2).sum()
new_grads = []
for g in grads:
new_grads.append(tensor.switch(g2 > (clip_c**2),
g / tensor.sqrt(g2) * clip_c,
g))
grads = new_grads
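The snippet clips by the global L2 norm: if ||g|| > clip_c, every gradient is rescaled by clip_c/||g||. The same logic in NumPy (a minimal sketch):

import numpy as np

def clip_by_global_norm(grads, clip_c):
    # rescale a list of gradient arrays so their global L2 norm is at most clip_c
    g2 = sum((g**2).sum() for g in grads)
    if g2 > clip_c**2:
        scale = clip_c / np.sqrt(g2)
        grads = [g * scale for g in grads]
    return grads

grads = [np.full((2, 2), 10.), np.full(3, 10.)]
clipped = clip_by_global_norm(grads, clip_c=1.)
print(np.sqrt(sum((g**2).sum() for g in clipped)))   # 1.0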
5. A comparison of neural-network optimizers: SGD, Momentum, Adam, RMSprop, Adagrad?
References
- An overview of gradient descent optimization algorithms http://ruder.io/optimizing-gradient-descent/
- Andrew Ng, Neural Networks and Deep Learning https://study.163.com/my#/smarts
- Deep Learning, Ian Goodfellow
6. Parameter regularization?
# apply L2 regularization on weights
if decay_c > 0.:
decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
weight_decay = 0.
for kk, vv in tparams.iteritems():
weight_decay += (vv ** 2).sum()
weight_decay *= decay_c
    cost += weight_decay  # add the regularization term to the cost
7. Beam search?
References
- The beam search algorithm in seq2seq https://zhuanlan.zhihu.com/p/28048246
- Who can explain the beam search procedure in seq2seq? https://www.zhihu.com/question/54356960
- Beam Search https://www.cnblogs.com/xxey/p/4277181.html
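For reference, the core of beam search fits in a few lines. A minimal sketch, assuming a hypothetical step_probs(prefix) that returns the next-token log-probabilities given a prefix, and an integer eos id:

import numpy as np

def beam_search(step_probs, eos, beam_size=5, max_len=50):
    # keep the beam_size best prefixes by accumulated log-probability
    beams = [([], 0.0)]
    finished = []
    for _ in range(max_len):
        candidates = []
        for prefix, score in beams:
            logp = step_probs(prefix)              # hypothetical model call
            for tok in np.argsort(logp)[-beam_size:]:
                candidates.append((prefix + [int(tok)], score + logp[tok]))
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for prefix, score in candidates[:beam_size]:
            (finished if prefix[-1] == eos else beams).append((prefix, score))
        if not beams:
            break
    return max(finished + beams, key=lambda c: c[1])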
8. The attention mechanism? How to draw the word-alignment matrix with Nematus?
The word-alignment weights are stored in opt_ret['dec_alphas'], with size (y_maxlen, batch_size, x_maxlen).
# decoder - pass through the decoder conditional gru with attention
proj = gru_cond_layer(tparams, emb, options,
prefix='decoder',
mask=y_mask, context=ctx,
context_mask=x_mask,
one_step=False,
init_state=init_state,
emb_dropout=emb_dropout_d,
ctx_dropout=ctx_dropout_d,
rec_dropout=rec_dropout_d,
profile=profile)
# hidden states of the decoder gru
proj_h = proj[0]
# weighted averages of context, generated by attention module
ctxs = proj[1]
if options['use_dropout']:
proj_h *= shared_dropout_layer((n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
emb *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
ctxs *= shared_dropout_layer((n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)
# weights (alignment matrix) #####LIUCAN: this is where the attention vector is.
opt_ret['dec_alphas'] = proj[2]
- Plotting the word-alignment matrix
import codecs
import numpy
import matplotlib.pyplot as plt

def get_data(source, target, alignment):
with codecs.open(source,'r',encoding='utf8') as fp:
src = fp.readlines()
with codecs.open(target,'r',encoding='utf8') as fp:
trg = fp.readlines()
align = []
with open(alignment) as fp:
align_data = []
for lines in fp:
lines = lines.strip()
if lines != "":
align_data.append(map(lambda x:float(x), lines.split('\t')))
else:
align.append(align_data)
align_data = []
for i in range(len(src)):
align_matrix = numpy.array(align[i])
src_sentence = src[i].strip().split()
trg_sentence = trg[i].strip().split()
show_matrix(align_matrix, src_sentence, trg_sentence)
def show_matrix(align_matrix, source, target):
"""
@function:画出词对齐矩阵
"""
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['xtick.direction'] = 'out'
plt.rcParams['ytick.direction'] = 'out'
source = source + [u'</s>']
target = target + [u'</s>']
print 'source:',source
print 'target:',target
fig, ax = plt.subplots()
width = 10
#ax.spines['right'].set_visible(False)
#ax.spines['bottom'].set_visible(False)
ax.xaxis.set_ticks_position('top')
#ax.spines['top'].set_position(('data',0))
ax.yaxis.set_ticks_position('left')
#ax.spines['left'].set_position(('data',0))
align_shape = align_matrix.shape
indx = numpy.arange(align_shape[1])
indy = numpy.arange(align_shape[0])
    scale_ = 10  # size of each cell block in the rendered image
out_matrix = numpy.ones([scale_*align_shape[0],scale_*align_shape[1]])
for j in range(align_shape[0]):
for k in range(align_shape[1]):
out_matrix[j*scale_:(j+1)*scale_,k*scale_:(k+1)*scale_] *= align_matrix[j,k]
#ax.pcolor(out_matrix)
ax.imshow(out_matrix, plt.cm.gray)
ax.set_xticks(indx*width+5)
ax.set_xticklabels(source, fontdict={'size':10, 'rotation':90})
ax.set_yticks(indy*width+5)
ax.set_yticklabels(target, fontdict={'size':10})
plt.show()
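A typical invocation (file names are hypothetical; the alignment file is the tab-separated dump written at translation time, with sentences separated by blank lines):

get_data('test.src', 'test.trg', 'test.alignment')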
9. Initialization of the decoder hidden state $s_0$?
# the decoder is built from here on
# mean of the context (across time) will be used to initialize the decoder rnn
ctx_mean = (ctx*x_mask[:,:,None]).sum(0) / x_mask.sum(0)[:,None]  ### trick: masked mean over time
# or you can use the last state of forward+backward encoder rnns
# ctx_mean = concatenate([proj[0][-1],projr[0][-1]],axis=proj[0].ndim-2)
if options['use_dropout']:
ctx_mean *= shared_dropout_layer((n_samples,2*options['dim']),use_noise,trng,retain_probability_hidden,scaled)
# initial decoder state
init_state = fflayer(tparams,ctx_mean,options,
prefix='ff_state',activ='tanh')
10. How does Nematus save models?
- numpy.savez saves the parameter variables into a binary .npz file
if numpy.mod(uidx, saveFreq) == 0:
print 'Saving the best model...',
if best_p is not None:
params = best_p
else:
params = unzip_from_theano(tparams)
numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)
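Loading the model back is symmetric (a minimal sketch; 'model.npz' is a hypothetical path standing in for saveto above):

import json
import numpy

saveto = 'model.npz'
model_options = json.load(open('%s.json' % saveto))
archive = numpy.load(saveto)
params = {k: archive[k] for k in archive.files if k not in ('history_errs', 'uidx')}
print(sorted(params.keys())[:5])   # parameter names, e.g. 'Wemb', 'decoder_U', ...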
11. How do you turn a 2-D array into a 3-D one? And other array tricks (see the sketch after this list)
- x_mask → [70,80]
- x_mask[:,:,None] → [70,80,1]
- (ctx*x_mask[:,:,None]).sum(0) → [80, 2*dim]
- x_mask.sum(0)[:,None] → [80,1]
- xr = x[::-1] reverses the array along its first (time) axis
- emb = tparams['Wemb'][x.flatten()] works like tf.nn.embedding_lookup in TensorFlow
- ctx = concatenate([proj[0],projr[0][::-1]],axis=proj[0].ndim-1) concatenates the forward and backward hidden states
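The masked mean used to initialize the decoder combines these tricks; with toy shapes in NumPy:

import numpy as np

T, B, d2 = 3, 2, 4                 # toy: time, batch, 2*dim
ctx = np.random.rand(T, B, d2)
x_mask = np.array([[1., 1.],
                   [1., 0.],
                   [1., 0.]])      # the second sentence has length 1
ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]
print(ctx_mean.shape)              # (2, 4): one mean context vector per sentence
assert np.allclose(ctx_mean[1], ctx[0, 1])   # padded steps are excluded from the mean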
12. Where does Nematus apply dropout?
- Dropout on word embeddings
# word embedding for forward rnn (source)
emb = tparams['Wemb'][x.flatten()]  # note the difference here
emb = emb.reshape([n_timesteps,n_samples,options['dim_word']])
if options['use_dropout']:
emb *= source_dropout
- Dropout on the GRU hidden states
#state_below is the input word embeddings
# input to the gates, concatenated
state_below_ = tensor.dot(state_below*emb_dropout[0],tparams[pp(prefix,'W')]) + \
tparams[pp(prefix,'b')]
#input to compute the hidden state proposal
state_belowx = tensor.dot(state_below*emb_dropout[1],tparams[pp(prefix,'Wx')]) + \
tparams[pp(prefix,'bx')]
def _step_slice(m_, x_, xx_, h_, U, Ux, rec_dropout):
    preact = tensor.dot(h_*rec_dropout[0], U)  # computes U*h
    preact += x_
    # reset and update gates
    r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
    u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
# compute the hidden state proposal
    preactx = tensor.dot(h_*rec_dropout[1], Ux)  # computes h_tilde
preactx = preactx*r
preactx = preactx + xx_
#hidden state proposal
h = tensor.tanh(preactx)
    # leaky integrate and obtain next hidden state
h = u*h_ + (1.-u)*h
h = m_[:,None]*h + (1.-m_)[:,None]*h_
return h
# prepare scan arguments
seqs = [mask,state_below_,state_belowx]
init_states = [tensor.alloc(0.,n_samples,dim)]
_step = _step_slice
shared_vars = [tparams[pp(prefix,'U')],
tparams[pp(prefix,'Ux')],
rec_dropout]
rval,updates = theano.scan(_step,
sequences=seqs,
outputs_info=init_states,
non_sequences=shared_vars,
name=pp(prefix,'_layers'),
n_steps=nsteps,
profile=profile,
strict=True)
rval = [rval]
- Dropout on the mean hidden state
# mean of the context (across time) will be used to initialize the decoder rnn
ctx_mean = (ctx*x_mask[:,:,None]).sum(0) / x_mask.sum(0)[:,None]
# or you can use the last state of forward+backward encoder rnns
# ctx_mean = concatenate([proj[0][-1],projr[0][-1]],axis=proj[0].ndim-2)
if options['use_dropout']:
ctx_mean *= shared_dropout_layer((n_samples,2*options['dim']),use_noise,trng,retain_probability_hidden,scaled)
- Dropout on hidden states, word embeddings, and context vectors
if options['use_dropout']:
proj_h *= shared_dropout_layer((n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
emb *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
ctxs *= shared_dropout_layer((n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)
- Dropout before computing the word probabilities
# compute word probabilities
logit_lstm = fflayer(tparams, proj_h, options,
prefix='ff_logit_lstm', activ='linear')
logit_prev = fflayer(tparams, emb, options,
prefix='ff_logit_prev', activ='linear')
logit_ctx = fflayer(tparams, ctxs, options,
prefix='ff_logit_ctx', activ='linear')
logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
if options['use_dropout']:
logit *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_hidden, scaled)
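shared_dropout_layer itself is never shown in this post; the following is a minimal sketch of what it plausibly does (inverted dropout, assuming the same theano/numpy imports as the snippets above): when use_noise is on, sample a Bernoulli mask, divided by the retain probability in the scaled variant so that nothing needs rescaling at test time; when use_noise is off, return 1 (scaled) or the retain probability (unscaled).

# a sketch of a Nematus-style shared dropout mask (inverted dropout)
def shared_dropout_layer(shape, use_noise, trng, value, scaled=True):
    if scaled:
        # train: Bernoulli(value) mask / value; test: constant 1
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32') / value,
            theano.shared(numpy.float32(1.)))
    else:
        # train: plain Bernoulli mask; test: scale activations by value
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32'),
            theano.shared(numpy.float32(value)))
    return proj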