The Neural Machine Translation Tool Nematus
An Analysis of the Tricks in Nematus
1. How does build_model work? How does build_encoder work? How is the target-side sentence processed?
- build_model builds the full model: first the encoder, then the decoder
- Build the encoder: x,ctx = build_encoder(tparams,options,trng,use_noise,x_mask,sampling=False)
# build a training model
def build_model(tparams, options):
    """
    @function: build the model
    """
    opt_ret = dict()
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')
    # build the encoder
    x, ctx = build_encoder(tparams, options, trng, use_noise, x_mask, sampling=False)
- Initialize the decoder's initial hidden state $s_0$
n_samples = x.shape[1]
n_timesteps_trg = y.shape[0]
if options['use_dropout']:
retain_probability_emb = 1-options['dropout_embedding']
retain_probability_hidden = 1-options['dropout_hidden']
retain_probability_target = 1-options['dropout_target']
if options['model_version'] < 0.1:
scaled = False
else:
scaled = True
rec_dropout_d = shared_dropout_layer((5, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
emb_dropout_d = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
ctx_dropout_d = shared_dropout_layer((4, n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)
target_dropout = shared_dropout_layer((n_timesteps_trg, n_samples, 1), use_noise, trng, retain_probability_target, scaled)
target_dropout = tensor.tile(target_dropout, (1,1,options['dim_word']))
else:
rec_dropout_d = theano.shared(numpy.array([1.]*5, dtype='float32'))
emb_dropout_d = theano.shared(numpy.array([1.]*2, dtype='float32'))
ctx_dropout_d = theano.shared(numpy.array([1.]*4, dtype='float32'))
    # the decoder is built from here on
    # mean of the context (across time) will be used to initialize the decoder rnn
    ctx_mean = (ctx*x_mask[:,:,None]).sum(0) / x_mask.sum(0)[:,None]  ### trick: masked mean over time
    # or you can use the last state of forward+backward encoder rnns
    # ctx_mean = concatenate([proj[0][-1],projr[0][-1]],axis=proj[0].ndim-2)
    if options['use_dropout']:
        ctx_mean *= shared_dropout_layer((n_samples,2*options['dim']),use_noise,trng,retain_probability_hidden,scaled)
    # initial decoder state
    init_state = fflayer(tparams,ctx_mean,options,
                         prefix='ff_state',activ='tanh')
- Here the target-side sentence is processed: the start symbol (eos) is effectively prepended by shifting the sequence one step to the right
# word embedding (target), we will shift the target sequence one time step
# to the right. This is done because of the bi-gram connections in the
# readout and decoder rnn. The first target will be all zeros and we will
# not condition on the last output.
emb = tparams['Wemb_dec'][y.flatten()]
emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    # note this trick: the first time step is set to all zeros, which amounts to prepending the start symbol (eos) to the target sentence
emb_shifted = tensor.zeros_like(emb)
emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
emb = emb_shifted
if options['use_dropout']:
emb *= target_dropout
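The effect of the shift is easy to check with a small NumPy sketch (toy shapes, illustrative only):

import numpy as np

emb = np.arange(2*3*4, dtype='float32').reshape(2, 3, 4)  # (n_timesteps_trg, n_samples, dim_word), toy values
emb_shifted = np.zeros_like(emb)
emb_shifted[1:] = emb[:-1]    # same effect as tensor.set_subtensor(emb_shifted[1:], emb[:-1])
print(emb_shifted[0])         # all zeros: the decoder's first input is the "start symbol"
print((emb_shifted[1] == emb[0]).all())  # True: step j is conditioned on y_{j-1}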
- Enter the decoder RNN. Its outputs are proj_h: the decoder hidden states $s_j$; ctxs: the context vectors $c_j$; opt_ret['dec_alphas']: the alignment weights $\alpha_{ij}$
# decoder - pass through the decoder conditional gru with attention
proj = gru_cond_layer(tparams, emb, options,
prefix='decoder',
mask=y_mask, context=ctx,
context_mask=x_mask,
one_step=False,
init_state=init_state,
emb_dropout=emb_dropout_d,
ctx_dropout=ctx_dropout_d,
rec_dropout=rec_dropout_d,
profile=profile)
# hidden states of the decoder gru
proj_h = proj[0]
# weighted averages of context, generated by attention module
ctxs = proj[1]
if options['use_dropout']:
proj_h *= shared_dropout_layer((n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
emb *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
ctxs *= shared_dropout_layer((n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)
# weights (alignment matrix) #####LIUCAN: this is where the attention vector is.
opt_ret['dec_alphas'] = proj[2]
- Here we compute $t_j = \tanh(U_0 s_{j-1} + V_0 E y_{j-1} + C_0 c_j)$, with the dimensions:
- $U_0 \to (m,n)$, $s_{j-1} \to (n,1)$
- $V_0 \to (m,m)$, $E \to (m,K_y)$, $y_{j-1} \to (K_y,1)$
- $C_0 \to (m,2n)$, $c_j \to (2n,1)$
# compute word probabilities
logit_lstm = fflayer(tparams, proj_h, options,
prefix='ff_logit_lstm', activ='linear')
logit_prev = fflayer(tparams, emb, options,
prefix='ff_logit_prev', activ='linear')
logit_ctx = fflayer(tparams, ctxs, options,
prefix='ff_logit_ctx', activ='linear')
logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
if options['use_dropout']:
logit *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_hidden, scaled)
    # produce t_j, which is later used to obtain the quality vector
tt = logit
- Here we compute the conditional probability of the target word $y_j$:
$$p(y_j \mid \{y_1,\dots,y_{j-1}\}, x) = \frac{\exp(y_j^T W_o t_j)}{\sum_{k=1}^{K_y} \exp(y_k^T W_o t_j)}$$
- $W_o \to (K_y, m)$
    logit = fflayer(tparams, logit, options,
                    prefix='ff_logit', activ='linear')  # ff_logit_b is not needed during joint training
logit_shp = logit.shape
probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1],
logit_shp[2]])) # (70*80,40000)
- Here the cost function is defined. Training maximizes the average log-likelihood
$$\max_\theta \frac{1}{N}\sum_{n=1}^{N} \log p_\theta(y^{(n)} \mid x^{(n)})$$
which gives the cost
$$cost = -\frac{1}{N}\sum_{n=1}^{N}\sum_{j=1}^{T_y} \log p\big(y_j^{(n)} \mid \{y_1^{(n)},\dots,y_{j-1}^{(n)}\},\, x^{(n)}\big)$$
# cost
y_flat = y.flatten() # (70*80,1)
y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words_tgt'] + y_flat # (70*80,1)
cost = -tensor.log(probs.flatten()[y_flat_idx]) # (70*80, 1)
cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)  # this yields the (negative) log-likelihood of each sentence
return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, ctx, tt, proj_h
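The indexing trick in the cost is worth unpacking: probs is flattened into one long vector, and y_flat_idx addresses the probability of each gold token directly. A minimal NumPy sketch with a hypothetical vocabulary size:

import numpy as np

n_words_tgt = 5                            # toy target vocabulary size
probs = np.random.rand(6, n_words_tgt)     # (n_timesteps*n_samples, vocab)
probs /= probs.sum(1, keepdims=True)       # rows sum to 1, as after softmax
y_flat = np.array([2, 0, 4, 1, 3, 2])      # flattened gold token ids
y_flat_idx = np.arange(y_flat.shape[0]) * n_words_tgt + y_flat
picked = probs.flatten()[y_flat_idx]       # probability assigned to each gold token
assert np.allclose(picked, probs[np.arange(6), y_flat])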
- build_encoder: building the encoder
# bidirectional RNN encoder: takes input x (optionally with a mask) and produces a sequence of context vectors
def build_encoder(tparams, options, trng, use_noise, x_mask=None, sampling=False):
    x = tensor.matrix('x', dtype='int64')
    x.tag.test_value = (numpy.random.rand(5, 10)*100).astype('int64')
    # for the backward rnn, we just need to invert x
    xr = x[::-1]  # note: this reverses time, unlike xr = x[:,::-1]
    if x_mask is None:  # at test time
        xr_mask = None
    else:
        xr_mask = x_mask[::-1]
    # number of time steps and number of samples
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]
    # whether to use dropout
if options['use_dropout']:
retain_probability_emb = 1-options['dropout_embedding']
retain_probability_hidden = 1-options['dropout_hidden']
retain_probability_source = 1-options['dropout_source']
if sampling:
if options['model_version'] < 0.1:
rec_dropout = theano.shared(numpy.array([retain_probability_hidden]*2, dtype='float32'))
rec_dropout_r = theano.shared(numpy.array([retain_probability_hidden]*2, dtype='float32'))
emb_dropout = theano.shared(numpy.array([retain_probability_emb]*2, dtype='float32'))
emb_dropout_r = theano.shared(numpy.array([retain_probability_emb]*2, dtype='float32'))
source_dropout = theano.shared(numpy.float32(retain_probability_source))
else:
rec_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
rec_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
emb_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
emb_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
source_dropout = theano.shared(numpy.float32(1.))
else:
if options['model_version'] < 0.1:
scaled = False
else:
scaled = True
rec_dropout = shared_dropout_layer((2, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
rec_dropout_r = shared_dropout_layer((2, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
emb_dropout_r = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
source_dropout = shared_dropout_layer((n_timesteps, n_samples, 1), use_noise, trng, retain_probability_source, scaled)
source_dropout = tensor.tile(source_dropout, (1,1,options['dim_word']))
else:
rec_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
rec_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
emb_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
emb_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
# word embedding for forward rnn (source)
    emb = tparams['Wemb'][x.flatten()]  # note the difference here
emb = emb.reshape([n_timesteps,n_samples,options['dim_word']])
if options['use_dropout']:
        emb *= source_dropout  # dropout is applied here
proj = gru_layer(tparams,emb,options,
prefix='encoder',
mask=x_mask,
emb_dropout=emb_dropout,
rec_dropout=rec_dropout,
profile=profile)
# word embedding for backward rnn (source)
embr = tparams['Wemb'][xr.flatten()]
embr = embr.reshape([n_timesteps,n_samples,options['dim_word']])
if options['use_dropout']:
if sampling:
embr *= source_dropout
else:
embr *= source_dropout[::-1]
projr = gru_layer(tparams,embr,options,
prefix='encoder_r',
mask=xr_mask,
emb_dropout=emb_dropout_r,
rec_dropout=rec_dropout,
profile=profile)
#context will be the concatenation of forward and backward rnns
ctx = concatenate([proj[0],projr[0][::-1]],axis=proj[0].ndim-1)
return x,ctx
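The final concatenation can be checked with NumPy (toy shapes): the backward states arrive in reversed time order, so they are flipped back before being glued onto the forward states along the feature axis.

import numpy as np

T, B, d = 4, 2, 3                  # toy: time steps, batch size, hidden dim
h_fwd = np.random.rand(T, B, d)    # proj[0]: forward hidden states
h_bwd = np.random.rand(T, B, d)    # projr[0]: backward states, reversed in time
ctx = np.concatenate([h_fwd, h_bwd[::-1]], axis=-1)
print(ctx.shape)                   # (4, 2, 6): each position sees both directions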
2. The early stopping mechanism?
- Every 10,000 updates, compute the loss on the validation set and append it to the history_errs list. patience is set to 10: the current validation error is compared with the minimum of all errors except the most recent patience ones; if valid_err >= numpy.array(history_errs)[:-patience].min(), then bad_counter += 1, and once bad_counter > patience the updates stop: Early Stop!
valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
model_options, valid)
valid_err = valid_errs.mean()
history_errs.append(valid_err)
if uidx == 0 or valid_err <= numpy.array(history_errs).min():
best_p = unzip_from_theano(tparams)
bad_counter = 0
#the key to early stopping
if len(history_errs) > patience and valid_err >= \
numpy.array(history_errs)[:-patience].min():
bad_counter += 1
if bad_counter > patience:
print 'Early Stop!'
estop = True
break
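Stripped of the Theano machinery, the patience logic reduces to a few lines of plain Python (a minimal sketch; validation_errors is a hypothetical stream of validation-set losses):

history_errs = []
patience, bad_counter = 10, 0
for uidx, valid_err in enumerate(validation_errors):
    history_errs.append(valid_err)
    if uidx == 0 or valid_err <= min(history_errs):
        bad_counter = 0    # new best model: reset the counter
    if len(history_errs) > patience and valid_err >= min(history_errs[:-patience]):
        bad_counter += 1   # no improvement over the best seen before the last patience checks
        if bad_counter > patience:
            print('Early Stop!')
            break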
3. A detailed look at the decoder in Nematus?
The decoder uses a single-layer unidirectional RNN whose basic unit is the GRU; it could of course be swapped for an LSTM.
The LSTM is computed as:
$$\begin{aligned} f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f) \\ i_t &= \sigma(W_i x_t + U_i h_{t-1} + b_i) \\ o_t &= \sigma(W_o x_t + U_o h_{t-1} + b_o) \\ \tilde{C}_t &= \tanh(W_c x_t + U_c h_{t-1} + b_c) \\ C_t &= f_t * C_{t-1} + i_t * \tilde{C}_t \\ h_t &= o_t * \tanh(C_t) \end{aligned}$$
The GRU is computed as:
$$\begin{aligned} z_t &= \sigma(W_z x_t + U_z h_{t-1} + b_z) \\ r_t &= \sigma(W_r x_t + U_r h_{t-1} + b_r) \\ \tilde{h}_t &= \tanh(W_h x_t + U_h [r_t * h_{t-1}] + b_h) \\ h_t &= (1-z_t) * h_{t-1} + z_t * \tilde{h}_t \end{aligned}$$
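These equations translate directly into NumPy; below is a minimal single-step GRU sketch with toy dimensions (all names here are illustrative, not Nematus code):

import numpy as np

def sigmoid(a):
    return 1. / (1. + np.exp(-a))

def gru_step(x_t, h_prev, Wz, Uz, bz, Wr, Ur, br, Wh, Uh, bh):
    # one GRU step, following the equations above
    z = sigmoid(np.dot(Wz, x_t) + np.dot(Uz, h_prev) + bz)  # update gate
    r = sigmoid(np.dot(Wr, x_t) + np.dot(Ur, h_prev) + br)  # reset gate
    h_tilde = np.tanh(np.dot(Wh, x_t) + np.dot(Uh, r * h_prev) + bh)
    return (1. - z) * h_prev + z * h_tilde

m, n = 3, 4                        # toy input and hidden dimensions
rng = np.random.RandomState(0)
Wz, Wr, Wh = (rng.randn(n, m) for _ in range(3))
Uz, Ur, Uh = (rng.randn(n, n) for _ in range(3))
bz, br, bh = (np.zeros(n) for _ in range(3))
h = gru_step(rng.randn(m), np.zeros(n), Wz, Uz, bz, Wr, Ur, br, Wh, Uh, bh)
print(h.shape)                     # (4,)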
- state_below_: computes the gate inputs $W_z x_t + b_z$ and $W_r x_t + b_r$ in one concatenated product
- state_belowx: computes $W_h x_t + b_h$
def gru_cond_layer(tparams, state_below, options, prefix='gru',
mask=None, context=None, one_step=False,
init_memory=None, init_state=None,
context_mask=None, emb_dropout=None,
rec_dropout=None, ctx_dropout=None,
profile=False,
**kwargs):
"""
@function:解码器GRU层的计算
"""
assert context, 'Context must be provided'
if one_step:
assert init_state, 'previous state must be provided'
nsteps = state_below.shape[0]
if state_below.ndim == 3:
n_samples = state_below.shape[1]
else:
n_samples = 1
# mask
if mask is None:
mask = tensor.alloc(1., state_below.shape[0], 1)
dim = tparams[pp(prefix, 'Wcx')].shape[1]
# initial/previous state
if init_state is None:
init_state = tensor.alloc(0., n_samples, dim)
# projected context
assert context.ndim == 3, \
'Context must be 3-d: #annotation x #sample x dim'
pctx_ = tensor.dot(context*ctx_dropout[0], tparams[pp(prefix, 'Wc_att')]) +\
tparams[pp(prefix, 'b_att')]
def _slice(_x, n, dim):
if _x.ndim == 3:
return _x[:, :, n*dim:(n+1)*dim]
return _x[:, n*dim:(n+1)*dim]
# projected x
state_belowx = tensor.dot(state_below*emb_dropout[0], tparams[pp(prefix, 'Wx')]) +\
tparams[pp(prefix, 'bx')]
state_below_ = tensor.dot(state_below*emb_dropout[1], tparams[pp(prefix, 'W')]) +\
tparams[pp(prefix, 'b')]
- _step_slice: the loop body
- theano.scan(): implements the recurrence
Arguments of theano.scan:
- sequences: at every step, one slice (the current row) of each of these tensors is fed to the step function
- outputs_info: the recurrent outputs, updated and returned at each step (the hidden state $s_{j-1}$, the context vector $c_j$, and the alignment weights $\alpha_{ij}$)
- non_sequences: passed in as constants and never updated
- First GRU transition: $\tilde{s}_{j-1} = \mathrm{GRU}_1(s_{j-1}, E y_{j-1})$
def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_, rec_dropout, ctx_dropout,
U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
U_n1, Ux_n1, b_n1, bx_n1):
preact1 = tensor.dot(h_*rec_dropout[0], U)
preact1 += x_
preact1 = tensor.nnet.sigmoid(preact1)
        r1 = _slice(preact1, 0, dim)   # reset gate
        u1 = _slice(preact1, 1, dim)   # update gate
        preactx1 = tensor.dot(h_*rec_dropout[1], Ux)  # compute h_tilde
        preactx1 *= r1
        preactx1 += xx_
        h1 = tensor.tanh(preactx1)     # the hidden state at the current step
        h1 = u1 * h_ + (1. - u1) * h1
        h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_
- Attention energies: $e_{ij} = v_a^T \tanh(W_a \tilde{s}_{j-1} + U_a h_i)$, normalized as $\alpha_{ij} = \frac{\exp(e_{ij})}{\sum_{k=1}^{T_x} \exp(e_{kj})}$
- Context vector: $c_j = \sum_{i=1}^{T_x} \alpha_{ij} h_i$
- Second GRU transition: $s_j = \mathrm{GRU}_2(\tilde{s}_{j-1}, c_j)$
(Two figures illustrating the conditional GRU with attention are omitted here; see https://arxiv.org/abs/1610.05011.)
# attention
pstate_ = tensor.dot(h1*rec_dropout[2], W_comb_att)
pctx__ = pctx_ + pstate_[None, :, :]
#pctx__ += xc_
pctx__ = tensor.tanh(pctx__)
alpha = tensor.dot(pctx__*ctx_dropout[1], U_att)+c_tt
alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
        alpha = tensor.exp(alpha - alpha.max(0, keepdims=True))  # subtract the max for numerical stability; differs from dl4mt
if context_mask:
alpha = alpha * context_mask
alpha = alpha / alpha.sum(0, keepdims=True)
ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context
preact2 = tensor.dot(h1*rec_dropout[3], U_n1)+b_n1
preact2 += tensor.dot(ctx_*ctx_dropout[2], Wc)
preact2 = tensor.nnet.sigmoid(preact2)
r2 = _slice(preact2, 0, dim)
u2 = _slice(preact2, 1, dim)
preactx2 = tensor.dot(h1*rec_dropout[4], Ux_n1)+bx_n1
preactx2 *= r2
preactx2 += tensor.dot(ctx_*ctx_dropout[3], Wcx)
h2 = tensor.tanh(preactx2)
h2 = u2 * h1 + (1. - u2) * h2
h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
return h2, ctx_, alpha.T # pstate_, preact, preactx, r, u
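The alpha computation above is a numerically stable, masked softmax: subtracting the column-wise max prevents overflow in exp, and the source mask zeroes out padded positions before normalization. In NumPy (toy shapes, T_x source positions by batch):

import numpy as np

e = np.random.randn(5, 2) * 50                 # attention energies, (T_x, batch); deliberately large
mask = np.array([[1, 1, 1, 1, 0],
                 [1, 1, 0, 0, 0]], dtype='float32').T   # source padding mask, (T_x, batch)
alpha = np.exp(e - e.max(0, keepdims=True))    # stable exponentiation
alpha *= mask                                  # padded positions get zero weight
alpha /= alpha.sum(0, keepdims=True)           # normalize over source positions
print(alpha.sum(0))                            # [1. 1.]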
- Call theano.scan() to run the recurrence
seqs = [mask, state_below_, state_belowx]
#seqs = [mask, state_below_, state_belowx, state_belowc]
_step = _step_slice
shared_vars = [tparams[pp(prefix, 'U')],
tparams[pp(prefix, 'Wc')],
tparams[pp(prefix, 'W_comb_att')],
tparams[pp(prefix, 'U_att')],
tparams[pp(prefix, 'c_tt')],
tparams[pp(prefix, 'Ux')],
tparams[pp(prefix, 'Wcx')],
tparams[pp(prefix, 'U_n1')],
tparams[pp(prefix, 'Ux_n1')],
tparams[pp(prefix, 'b_n1')],
tparams[pp(prefix, 'bx_n1')]]
if one_step:
rval = _step(*(seqs + [init_state, None, None, pctx_, context, rec_dropout, ctx_dropout] +
shared_vars))
else:
rval, updates = theano.scan(_step,
sequences=seqs,
                                    outputs_info=[init_state,  # the initial values themselves are not returned
tensor.alloc(0., n_samples,
context.shape[2]),
tensor.alloc(0., n_samples,
context.shape[0])],
non_sequences=[pctx_, context, rec_dropout, ctx_dropout]+shared_vars,
name=pp(prefix, '_layers'),
n_steps=nsteps,
profile=profile,
strict=True)
return rval
- The context argument receives the encoder annotations $h_i$
# decoder - pass through the decoder conditional gru with attention
proj = gru_cond_layer(tparams, emb, options,
prefix='decoder',
mask=y_mask, context=ctx,
context_mask=x_mask,
one_step=False,
init_state=init_state,
emb_dropout=emb_dropout_d,
ctx_dropout=ctx_dropout_d,
rec_dropout=rec_dropout_d,
profile=profile)
4. Gradient clipping?
Gradient clipping is introduced to deal with exploding (and, indirectly, vanishing) gradients. If the weights are updated too aggressively in a single iteration, the loss easily diverges. Intuitively, gradient clipping keeps each weight update within a reasonable range.
The details:
- 1. Set a threshold clip_gradient in the solver.
- 2. After the forward and backward passes we have a gradient diff for each weight. Instead of applying these gradients directly, first compute the sum of squares of all weight gradients, sumsq_diff. If sumsq_diff > clip_gradient, compute a scaling factor scale_factor = clip_gradient / sumsq_diff, which lies in (0,1): the larger sumsq_diff is, the smaller the factor.
- 3. Multiply every weight gradient by this scaling factor; the result is the gradient actually used for the update.
This guarantees that, within one update, the sum of squared gradients of all weights stays within the preset range clip_gradient.
# apply gradient clipping here
if clip_c > 0.:
g2 = 0.
for g in grads:
g2 += (g**2).sum()
new_grads = []
for g in grads:
new_grads.append(tensor.switch(g2 > (clip_c**2),
g / tensor.sqrt(g2) * clip_c,
g))
grads = new_grads
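The snippet clips by the global L2 norm: if ||g|| > clip_c, every gradient is rescaled by clip_c/||g||. The same logic in NumPy (a minimal sketch):

import numpy as np

def clip_by_global_norm(grads, clip_c):
    # rescale a list of gradient arrays so their global L2 norm is at most clip_c
    g2 = sum((g**2).sum() for g in grads)
    if g2 > clip_c**2:
        scale = clip_c / np.sqrt(g2)
        grads = [g * scale for g in grads]
    return grads

grads = [np.full((2, 2), 10.), np.full(3, 10.)]
clipped = clip_by_global_norm(grads, clip_c=1.)
print(np.sqrt(sum((g**2).sum() for g in clipped)))   # 1.0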
5. A comparison of neural-network optimizers: SGD, Momentum, Adam, RMSprop, Adagrad?
References
- An overview of gradient descent optimization algorithms http://ruder.io/optimizing-gradient-descent/
- Andrew Ng, Neural Networks and Deep Learning https://study.163.com/my#/smarts
- Deep Learning, Ian Goodfellow
6. Parameter regularization?
# apply L2 regularization on weights
if decay_c > 0.:
decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
weight_decay = 0.
for kk, vv in tparams.iteritems():
weight_decay += (vv ** 2).sum()
weight_decay *= decay_c
    cost += weight_decay  # add the regularization term to the cost
7. Beam search?
References
- The beam search algorithm in seq2seq https://zhuanlan.zhihu.com/p/28048246
- Who can explain the beam search procedure in seq2seq? https://www.zhihu.com/question/54356960
- Beam Search https://www.cnblogs.com/xxey/p/4277181.html
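For reference, the core of beam search fits in a few lines. A minimal sketch, assuming a hypothetical step_probs(prefix) that returns the next-token log-probabilities given a prefix, and an integer eos id:

import numpy as np

def beam_search(step_probs, eos, beam_size=5, max_len=50):
    # keep the beam_size best prefixes by accumulated log-probability
    beams = [([], 0.0)]
    finished = []
    for _ in range(max_len):
        candidates = []
        for prefix, score in beams:
            logp = step_probs(prefix)              # hypothetical model call
            for tok in np.argsort(logp)[-beam_size:]:
                candidates.append((prefix + [int(tok)], score + logp[tok]))
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for prefix, score in candidates[:beam_size]:
            (finished if prefix[-1] == eos else beams).append((prefix, score))
        if not beams:
            break
    return max(finished + beams, key=lambda c: c[1])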
8. The attention mechanism? How to draw the word-alignment matrix with Nematus?
The word-alignment weights are stored in opt_ret['dec_alphas'], with size (y_maxlen, batch_size, x_maxlen).
# decoder - pass through the decoder conditional gru with attention
proj = gru_cond_layer(tparams, emb, options,
prefix='decoder',
mask=y_mask, context=ctx,
context_mask=x_mask,
one_step=False,
init_state=init_state,
emb_dropout=emb_dropout_d,
ctx_dropout=ctx_dropout_d,
rec_dropout=rec_dropout_d,
profile=profile)
# hidden states of the decoder gru
proj_h = proj[0]
# weighted averages of context, generated by attention module
ctxs = proj[1]
if options['use_dropout']:
proj_h *= shared_dropout_layer((n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
emb *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
ctxs *= shared_dropout_layer((n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)
# weights (alignment matrix) #####LIUCAN: this is where the attention vector is.
opt_ret['dec_alphas'] = proj[2]
- Plotting the word-alignment matrix
import codecs
import numpy
import matplotlib.pyplot as plt

def get_data(source, target, alignment):
with codecs.open(source,'r',encoding='utf8') as fp:
src = fp.readlines()
with codecs.open(target,'r',encoding='utf8') as fp:
trg = fp.readlines()
align = []
with open(alignment) as fp:
align_data = []
for lines in fp:
lines = lines.strip()
if lines != "":
align_data.append(map(lambda x:float(x), lines.split('\t')))
else:
align.append(align_data)
align_data = []
for i in range(len(src)):
align_matrix = numpy.array(align[i])
src_sentence = src[i].strip().split()
trg_sentence = trg[i].strip().split()
show_matrix(align_matrix, src_sentence, trg_sentence)
def show_matrix(align_matrix, source, target):
"""
@function:画出词对齐矩阵
"""
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['xtick.direction'] = 'out'
plt.rcParams['ytick.direction'] = 'out'
source = source + [u'</s>']
target = target + [u'</s>']
print 'source:',source
print 'target:',target
fig, ax = plt.subplots()
width = 10
#ax.spines['right'].set_visible(False)
#ax.spines['bottom'].set_visible(False)
ax.xaxis.set_ticks_position('top')
#ax.spines['top'].set_position(('data',0))
ax.yaxis.set_ticks_position('left')
#ax.spines['left'].set_position(('data',0))
align_shape = align_matrix.shape
indx = numpy.arange(align_shape[1])
indy = numpy.arange(align_shape[0])
    scale_ = 10  # size of each cell block in the rendered image
out_matrix = numpy.ones([scale_*align_shape[0],scale_*align_shape[1]])
for j in range(align_shape[0]):
for k in range(align_shape[1]):
out_matrix[j*scale_:(j+1)*scale_,k*scale_:(k+1)*scale_] *= align_matrix[j,k]
#ax.pcolor(out_matrix)
ax.imshow(out_matrix, plt.cm.gray)
ax.set_xticks(indx*width+5)
ax.set_xticklabels(source, fontdict={'size':10, 'rotation':90})
ax.set_yticks(indy*width+5)
ax.set_yticklabels(target, fontdict={'size':10})
plt.show()
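A typical invocation (file names are hypothetical; the alignment file is the tab-separated dump written at translation time, with sentences separated by blank lines):

get_data('test.src', 'test.trg', 'test.alignment')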
9. Initialization of the decoder hidden state $s_0$?
# the decoder is built from here on
# mean of the context (across time) will be used to initialize the decoder rnn
ctx_mean = (ctx*x_mask[:,:,None]).sum(0) / x_mask.sum(0)[:,None]  ### trick: masked mean over time
# or you can use the last state of forward+backward encoder rnns
# ctx_mean = concatenate([proj[0][-1],projr[0][-1]],axis=proj[0].ndim-2)
if options['use_dropout']:
ctx_mean *= shared_dropout_layer((n_samples,2*options['dim']),use_noise,trng,retain_probability_hidden,scaled)
# initial decoder state
init_state = fflayer(tparams,ctx_mean,options,
prefix='ff_state',activ='tanh')
10. How does Nematus save models?
- numpy.savez saves the parameter variables into a binary .npz file
if numpy.mod(uidx, saveFreq) == 0:
print 'Saving the best model...',
if best_p is not None:
params = best_p
else:
params = unzip_from_theano(tparams)
numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)
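Loading the model back is symmetric (a minimal sketch; 'model.npz' is a hypothetical path standing in for saveto above):

import json
import numpy

saveto = 'model.npz'
model_options = json.load(open('%s.json' % saveto))
archive = numpy.load(saveto)
params = {k: archive[k] for k in archive.files if k not in ('history_errs', 'uidx')}
print(sorted(params.keys())[:5])   # parameter names, e.g. 'Wemb', 'decoder_U', ...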
11. How do you turn a 2-D array into a 3-D one? And other array tricks (see the sketch after this list)
- x_mask → [70,80]
- x_mask[:,:,None] → [70,80,1]
- (ctx*x_mask[:,:,None]).sum(0) → [80, 2*dim]
- x_mask.sum(0)[:,None] → [80,1]
- xr = x[::-1] reverses the array along its first (time) axis
- emb = tparams['Wemb'][x.flatten()] works like tf.nn.embedding_lookup in TensorFlow
- ctx = concatenate([proj[0],projr[0][::-1]],axis=proj[0].ndim-1) concatenates the forward and backward hidden states
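The masked mean used to initialize the decoder combines these tricks; with toy shapes in NumPy:

import numpy as np

T, B, d2 = 3, 2, 4                 # toy: time, batch, 2*dim
ctx = np.random.rand(T, B, d2)
x_mask = np.array([[1., 1.],
                   [1., 0.],
                   [1., 0.]])      # the second sentence has length 1
ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]
print(ctx_mean.shape)              # (2, 4): one mean context vector per sentence
assert np.allclose(ctx_mean[1], ctx[0, 1])   # padded steps are excluded from the mean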
12. Where does Nematus apply dropout?
- Dropout on word embeddings
# word embedding for forward rnn (source)
emb = tparams['Wemb'][x.flatten()]  # note the difference here
emb = emb.reshape([n_timesteps,n_samples,options['dim_word']])
if options['use_dropout']:
emb *= source_dropout
- Dropout on the GRU hidden states
#state_below is the input word embeddings
# input to the gates, concatenated
state_below_ = tensor.dot(state_below*emb_dropout[0],tparams[pp(prefix,'W')]) + \
tparams[pp(prefix,'b')]
#input to compute the hidden state proposal
state_belowx = tensor.dot(state_below*emb_dropout[1],tparams[pp(prefix,'Wx')]) + \
tparams[pp(prefix,'bx')]
def _step_slice(m_, x_, xx_, h_, U, Ux, rec_dropout):
    preact = tensor.dot(h_*rec_dropout[0], U)  # computes U*h
    preact += x_
    # reset and update gates
    r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
    u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
# compute the hidden state proposal
    preactx = tensor.dot(h_*rec_dropout[1], Ux)  # computes h_tilde
preactx = preactx*r
preactx = preactx + xx_
#hidden state proposal
h = tensor.tanh(preactx)
    # leaky integrate and obtain next hidden state
h = u*h_ + (1.-u)*h
h = m_[:,None]*h + (1.-m_)[:,None]*h_
return h
# prepare scan arguments
seqs = [mask,state_below_,state_belowx]
init_states = [tensor.alloc(0.,n_samples,dim)]
_step = _step_slice
shared_vars = [tparams[pp(prefix,'U')],
tparams[pp(prefix,'Ux')],
rec_dropout]
rval,updates = theano.scan(_step,
sequences=seqs,
outputs_info=init_states,
non_sequences=shared_vars,
name=pp(prefix,'_layers'),
n_steps=nsteps,
profile=profile,
strict=True)
rval = [rval]
- Dropout on the mean hidden state
# mean of the context (across time) will be used to initialize the decoder rnn
ctx_mean = (ctx*x_mask[:,:,None]).sum(0) / x_mask.sum(0)[:,None]
# or you can use the last state of forward+backward encoder rnns
# ctx_mean = concatenate([proj[0][-1],projr[0][-1]],axis=proj[0].ndim-2)
if options['use_dropout']:
ctx_mean *= shared_dropout_layer((n_samples,2*options['dim']),use_noise,trng,retain_probability_hidden,scaled)
- Dropout on hidden states, word embeddings, and context vectors
if options['use_dropout']:
proj_h *= shared_dropout_layer((n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
emb *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
ctxs *= shared_dropout_layer((n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)
- Dropout before computing the word probabilities
# compute word probabilities
logit_lstm = fflayer(tparams, proj_h, options,
prefix='ff_logit_lstm', activ='linear')
logit_prev = fflayer(tparams, emb, options,
prefix='ff_logit_prev', activ='linear')
logit_ctx = fflayer(tparams, ctxs, options,
prefix='ff_logit_ctx', activ='linear')
logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
if options['use_dropout']:
logit *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_hidden, scaled)
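shared_dropout_layer itself is never shown in this post; the following is a minimal sketch of what it plausibly does (inverted dropout, assuming the same theano/numpy imports as the snippets above): when use_noise is on, sample a Bernoulli mask, divided by the retain probability in the scaled variant so that nothing needs rescaling at test time; when use_noise is off, return 1 (scaled) or the retain probability (unscaled).

# a sketch of a Nematus-style shared dropout mask (inverted dropout)
def shared_dropout_layer(shape, use_noise, trng, value, scaled=True):
    if scaled:
        # train: Bernoulli(value) mask / value; test: constant 1
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32') / value,
            theano.shared(numpy.float32(1.)))
    else:
        # train: plain Bernoulli mask; test: scale activations by value
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32'),
            theano.shared(numpy.float32(value)))
    return proj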