Adversarial Learning for Neural Dialogue Generation 代码分析

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/Mr_tyting/article/details/80324880

作为一名 NLPlayer 初学者,或者是一名道行不是很深的 NLPlayer,很有必要细细地品读相关的优秀论文。但是如果只是读论文,而不去探索实际代码的实现,可能无法提高代码能力,也比较难深刻理解论文中的一些细节。所以在读完论文后,详细地分析其开源代码、了解整个实现过程非常有必要。

那么代码需要分析到什么程度呢?我的实习经历告诉我要分析到函数级别

本次要分析的论文代码链接:dialogue-gan

下面咱们就从程序的入口,随着程序运行的流程分析整个过程。

获取数据集

获取词表

def create_vocabulary(vocabulary_path, data_path_list, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Build a frequency-ranked vocabulary file from the given corpus files.

  The corpus files are expected to hold one sentence per line. Nothing is
  done when ``vocabulary_path`` already exists.

  Args:
    vocabulary_path: file the vocabulary is written to, one word per line
      (the counts themselves are not stored).
    data_path_list: corpus files to scan, e.g. the ``.answer`` and ``.query``
      training files.
    max_vocabulary_size: upper bound on the number of entries written; the
      ``_START_VOCAB`` entries count towards it.
    tokenizer: callable turning a line into a list of tokens; when falsy,
      ``basic_tokenizer`` is used instead.
    normalize_digits: when true, every match of ``_DIGIT_RE`` inside a token
      is replaced by "0" before counting.
  """
  if gfile.Exists(vocabulary_path):
    return

  print("Creating vocabulary %s from disc_data %s" % (vocabulary_path, data_path_list))
  vocab = {}
  for data_path in data_path_list:  # Walk over the answer and query files.
    with gfile.GFile(data_path, mode="r") as f:
      for counter, line in enumerate(f, start=1):
        if counter % 100000 == 0:
          print("  processing line %d" % counter)
        line = tf.compat.as_str_any(line)
        # Split the line into a list of tokens.
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = _DIGIT_RE.sub("0", w) if normalize_digits else w
          # Count how often each (possibly digit-normalized) word occurs.
          vocab[word] = vocab.get(word, 0) + 1

  # Special tokens first, then the words by descending frequency.
  vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
  # Keep only the first max_vocabulary_size entries.
  if len(vocab_list) > max_vocabulary_size:
    vocab_list = vocab_list[:max_vocabulary_size]
  with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
    for w in vocab_list:
      vocab_file.write(w + "\n")

好了词表获取完了,存在vocab_path 内。然后我们还需要对词表中的每个词与某一整数对应,以符合模型的输入输出。

def initialize_vocabulary(vocabulary_path):
  """Load a vocabulary file and build both lookup directions.

  Args:
    vocabulary_path: vocabulary file with one word per line.

  Returns:
    A pair ``(vocab, rev_vocab)``: ``vocab`` is a dict mapping each word to
    its line index, ``rev_vocab`` is the list of words in file order.

  Raises:
    ValueError: if ``vocabulary_path`` does not exist.
  """
  if not gfile.Exists(vocabulary_path):
    # Format the path into the message; passing it as a second argument
    # would leave the "%s" placeholder unfilled.
    raise ValueError("Vocabulary file %s not found." % vocabulary_path)

  with gfile.GFile(vocabulary_path, mode="r") as f:
    rev_vocab = [line.strip() for line in f.readlines()]
  # Number the words in file order.
  vocab = {word: index for index, word in enumerate(rev_vocab)}
  return vocab, rev_vocab

这样就获得了词表rev_vocab 和词的映射表vocab

获取训练、开发数据集

def prepare_chitchat_data(data_dir, vocabulary, vocabulary_size, tokenizer=None):
  """Convert the chitchat train/dev corpora into token-id files.

  Args:
    data_dir: directory holding ``chitchat.train.*`` and ``chitchat.dev.*``.
    vocabulary: word-to-id mapping handed on to ``data_to_token_ids``.
    vocabulary_size: vocabulary size, embedded in the id-file names.
    tokenizer: optional tokenizer handed on to ``data_to_token_ids``.

  Returns:
    Tuple of id-file paths in the order
    ``(train query, train answer, dev query, dev answer)``.
  """
  train_prefix = os.path.join(data_dir, "chitchat.train")
  dev_prefix = os.path.join(data_dir, "chitchat.dev")

  def _to_ids(prefix, side):
    # Convert "<prefix>.<side>" and return the path of the id file written.
    ids_path = prefix + (".ids%d." % vocabulary_size) + side
    data_to_token_ids(prefix + "." + side, ids_path, vocabulary, tokenizer)
    return ids_path

  # Training data: answers first, then queries.
  train_answer_ids = _to_ids(train_prefix, "answer")
  train_query_ids = _to_ids(train_prefix, "query")

  # Development data, in the same order.
  dev_answer_ids = _to_ids(dev_prefix, "answer")
  dev_query_ids = _to_ids(dev_prefix, "query")

  return (train_query_ids, train_answer_ids, dev_query_ids, dev_answer_ids)

上面这个函数就是获取训练数据集、开发数据集并根据上面所得的映射表将其映射成Integers

我们获取了训练数据集和开发数据集,但是 answer 和 query 是分开的,需要将其成对地打包起来,并根据 query、answer 的长度将其存入不同的 buckets 中。

def read_data(config, source_path, target_path, max_size=None):
    """Read parallel token-id files and sort the pairs into length buckets.

    Args:
        config: object whose ``buckets`` attribute is a list of
            ``(query_size, answer_size)`` pairs, e.g.
            ``[(5, 10), (10, 15), (20, 25), (40, 50)]``.
        source_path: id file holding the queries, one per line.
        target_path: id file holding the answers, one per line.
        max_size: maximum number of pairs to read; a falsy value reads all.

    Returns:
        One list per bucket, each holding ``[source_ids, target_ids]`` pairs.
        A pair goes into the first bucket strictly larger than both of its
        sides; pairs fitting no bucket are dropped.
    """
    data_set = [[] for _ in config.buckets]
    with gfile.GFile(source_path, mode="r") as source_file, \
            gfile.GFile(target_path, mode="r") as target_file:
        pairs_read = 0
        while True:
            source = source_file.readline()
            target = target_file.readline()
            if not (source and target):
                break  # One of the files is exhausted.
            if max_size and pairs_read >= max_size:
                break  # Requested number of pairs reached.

            pairs_read += 1
            if pairs_read % 100000 == 0:
                print("  reading disc_data line %d" % pairs_read)
                sys.stdout.flush()

            source_ids = [int(tok) for tok in source.split()]
            target_ids = [int(tok) for tok in target.split()]
            target_ids.append(data_utils.EOS_ID)

            # Index of the first bucket that fits, or None.
            fitting = (idx for idx, (src_cap, tgt_cap) in enumerate(config.buckets)
                       if len(source_ids) < src_cap and len(target_ids) < tgt_cap)
            bucket_id = next(fitting, None)
            if bucket_id is not None:
                data_set[bucket_id].append([source_ids, target_ids])
    return data_set

将上面一系列操作串联起来,就得到了模型训练所需要的数据了。

def prepare_data(gen_config):
    """Create the vocabulary and load the bucketed train/dev sets.

    Args:
        gen_config: generator configuration; ``train_dir``, ``vocab_size``
            and ``max_train_data_size`` are read here, and the object is
            handed on to ``read_data``.

    Returns:
        Tuple ``(vocab, rev_vocab, dev_set, train_set)``.
    """
    data_dir = gen_config.train_dir
    vocab_size = gen_config.vocab_size

    # Build the vocabulary from both sides of the training corpus (the call
    # is a no-op when the file already exists), then load it.
    train_prefix = os.path.join(data_dir, "chitchat.train")
    corpus_files = [train_prefix + ".answer", train_prefix + ".query"]
    vocab_path = os.path.join(data_dir, "vocab%d.all" % vocab_size)
    data_utils.create_vocabulary(vocab_path, corpus_files, vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    print("Preparing Chitchat gen_data in %s" % data_dir)
    id_paths = data_utils.prepare_chitchat_data(data_dir, vocab, vocab_size)
    train_query, train_answer, dev_query, dev_answer = id_paths

    # Read the id files into buckets; only the training set is size-limited.
    size_limit = gen_config.max_train_data_size
    print("Reading development and training gen_data (limit: %d)." % size_limit)
    dev_set = read_data(gen_config, dev_query, dev_answer)
    train_set = read_data(gen_config, train_query, train_answer, size_limit)

    return vocab, rev_vocab, dev_set, train_set

构建生成模型

获取数据后,就要构建生成模型了。代码中的生成模型是采用seq2seq 的带有attention 机制的模型。

我们分部分的看代码中的Seq2Seq 模型的__init__ 函数,__init__ 函数内有attention 机制:

def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
    """Build the attention seq2seq graph for one encoder/decoder input pair.

    ``do_decode`` is forwarded as ``feed_previous``; every other setting is
    taken from the enclosing scope.
    """
    model_kwargs = dict(
        num_encoder_symbols=source_vocab_size,
        num_decoder_symbols=target_vocab_size,
        # NOTE(review): the original note calls this the number of GRU hidden
        # units -- confirm against the cell construction.
        embedding_size=emb_dim,
        # Per the original note, an optional linear map applied on output.
        output_projection=output_projection,
        feed_previous=do_decode,
        mc_search=self.mc_search,
        dtype=dtype,
    )
    return rl_seq2seq.embedding_attention_seq2seq(
        encoder_inputs, decoder_inputs, cell, **model_kwargs)

我们再来看看这个embedding_attention_seq2seq 具体是怎么做的?

先看看Encoder 部分:

# Encoder.
encoder_cell = tf.contrib.rnn.EmbeddingWrapper(
    cell, embedding_classes=num_encoder_symbols,
    embedding_size=embedding_size)
encoder_outputs, encoder_state = tf.contrib.rnn.static_rnn(
    encoder_cell, encoder_inputs, dtype=dtype)
    ## encoder_outputs: the encoder's per-step outputs (iterated one by one below).
    ## encoder_state: the encoder's final state.
    ## NOTE(review): the original note gives the shapes as
    ## [batch_size, query_size, emb_size] and [batch_size, num_layers, emb_size] -- TODO confirm.

# First calculate a concatenation of encoder outputs to put attention on.
top_states = [array_ops.reshape(e, [-1, 1, cell.output_size])## cell.output_size is emb_size per the original note
    ┆   ┆   ┆ for e in encoder_outputs]
## In effect the per-step outputs are concatenated along axis 1.
attention_states = array_ops.concat(top_states, 1)## original note: shape is [-1, query_size, emb_size]

以上其实就是将 encoder 的输出 concat 起来作为 attention_states,然后再结合 decoder 做 attention:

# Run the attention decoder on top of the encoder results.
outputs, state = embedding_attention_decoder(
  decoder_inputs,## the answer tokens
  encoder_state,## encoder final state, used as the decoder's initial state
  attention_states,## built above from the encoder outputs
  cell,
  num_decoder_symbols,## config.answer_vocab_size per the original note
  embedding_size,
  num_heads=num_heads,
  output_size=output_size,## config.answer_vocab_size per the original note
  output_projection=output_projection,
  feed_previous=feed_previous,## if true, each decoder input is the previous step's output; otherwise decoder_inputs (the ground-truth data) is used
  initial_state_attention=initial_state_attention,
  mc_search=mc_search,## original note: with Monte Carlo (tree) search the next decoder input is chosen differently
  scope=scope)

具体来看看 embedding_attention_decoder 方法是如何运作的?

if output_size is None:
output_size = cell.output_size
if output_projection is not None:
proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
proj_biases.get_shape().assert_is_compatible_with([num_symbols])

with variable_scope.variable_scope(
  scope or "embedding_attention_decoder", dtype=dtype) as scope:

embedding = variable_scope.get_variable("embedding",
    ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆   [num_symbols, embedding_size])

loop_function = None 
if feed_previous == True:
    loop_function = _argmax_or_mcsearch(embedding, output_projection, update_embedding_for_previous, mc_search)

## 需要注意下_argmax_or_mcsearch函数,如果mc_search 为true,则decode_output出的分布中进行tf.multinomial采样出一个token作为当前步的真正输出,反正就用math_ops.argmax方法选取概率最大的token作为输出。
emb_inp = [
    embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs]
return attention_decoder(
    emb_inp,## decoder_input 做完wordEmbedding后
    initial_state,##encoder final state
    attention_states,## encoder 每一步的output 做concat结果
    cell,
    output_size=output_size,
    num_heads=num_heads,##这个参数值得注意下,就是选择多个不同的权重来做attention
    loop_function=loop_function,
    initial_state_attention=initial_state_attention,
    scope=scope)

我们再仔细看看 attention_decoder 这个函数,是不是觉得有点蒙圈?我感觉这个实现代码的可读性太差了:虽然代码中有大量的注释,但是代码结构一环套一环,让人读起来不顺畅,战线拉得太长,读到后面忘了前面。

attention_decoder 中首先计算 $W_1 h_t$,也就是 attention 中的 encoder_feature:

## Reshape the encoder outputs to [-1, attn_length, 1, attn_size].
hidden = array_ops.reshape(
    attention_states, [-1, attn_length, 1, attn_size])## -1 stands for batch_size per the original note
hidden_features = []
v = []
attention_vec_size = attn_size  # Size of query vectors for attention.
for a in xrange(num_heads):
  ## k plays the role of W1 in the attention formula; each head gets its own k.
  ## In the convolution below k is the filter; its shape
  ## [1, 1, attn_size, attention_vec_size] reads as [h, w, in_channels, out_channels].
  k = variable_scope.get_variable("AttnW_%d" % a,
    ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆ [1, 1, attn_size, attention_vec_size])
  ## The 1x1 convolution with stride 1 slides k over the tensor of length
  ## attn_length, width 1 and attn_size channels; per the original note this
  ## amounts to computing W1 * h_t at every position.
  hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
  v.append(
    ┆ variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

state = initial_state

要特别注意理解上面的卷积操作。
上面得到了num_head 个不同的encoder_feature

好了,我们再结合 decode_state 计算 decode_feature 的值,然后再求 attention 的值,在 attention_decoder 代码中是这样写的:

def attention(query):
  """Compute one attention read per head for the given decoder state.

  Args:
    query: the decoder state; a nested structure is flattened and its parts
      are concatenated along axis 1.

  Returns:
    A list with one tensor per head, each reshaped to [-1, attn_size].
  """
  if nest.is_sequence(query):  # Tuple-like state: merge its parts.
    parts = nest.flatten(query)
    for part in parts:  # Every part with a known rank must be rank 2.
      rank = part.get_shape().ndims
      if rank:
        assert rank == 2
    query = array_ops.concat(parts, 1)

  reads = []  # Attention reads, one per head.
  for head in xrange(num_heads):
    # Each head pairs its own encoder features with its own projection of
    # the decoder state.
    with variable_scope.variable_scope("Attention_%d" % head):
      # Decoder-side feature, reshaped so it can be added to the encoder
      # features.
      projected = linear(query, attention_vec_size, True)
      projected = array_ops.reshape(projected, [-1, 1, 1, attention_vec_size])
      # Attention mask is a softmax of
      # v^T * tanh(encoder_features + decoder_features).
      scores = math_ops.reduce_sum(
          v[head] * math_ops.tanh(hidden_features[head] + projected), [2, 3])
      weights = nn_ops.softmax(scores)
      # Attention-weighted sum of the encoder outputs.
      weighted = array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden
      context = math_ops.reduce_sum(weighted, [1, 2])
      reads.append(array_ops.reshape(context, [-1, attn_size]))
  return reads

以上会得到num_head 个不同的attention 值。我们再看是怎么实做的,接着看attention_decoder 中的代码。

outputs = []
prev = None
batch_attn_size = array_ops.stack([batch_size, attn_size])
attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
    ┆   ┆for _ in xrange(num_heads)]
for a in attns:  # Ensure the second shape of attention vectors is set.
  a.set_shape([None, attn_size])
if initial_state_attention:
  ## 利用encoder的final state 作为decoder的初始状态,计算attention值。
  attns = attention(initial_state)
for i, inp in enumerate(decoder_inputs):## 这个decoder_input就是true_data
  if i > 0:
    variable_scope.get_variable_scope().reuse_variables()
  # If loop_function is set, we use it instead of decoder_inputs.
  if loop_function is not None and prev is not None:
    with variable_scope.variable_scope("loop_function", reuse=True):
    ## 要特别注意这部分,如果prev不为空,则表示train 生成器,我们需要避免exposure bias问题
    ## 生成器的输入均是上一步的输出,并且上一步的采用loop_function里的采样
    ## 在pretain 阶段,prev为None,生成器的input就是用的true_data
    ┆ inp = loop_function(prev, i)
  # Merge input and previous attentions into one vector of the right size.
  input_size = inp.get_shape().with_rank(2)[1]
  if input_size.value is None:
    raise ValueError("Could not infer input size from input: %s" % inp.name)
  x = linear([inp] + attns, input_size, True)
  # Run the RNN.
  cell_output, state = cell(x, state)
  # Run the attention mechanism.
  if i == 0 and initial_state_attention:
    with variable_scope.variable_scope(variable_scope.get_variable_scope(),
    ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆  reuse=True):
    ┆ attns = attention(state)
  else:
    attns = attention(state)

  with variable_scope.variable_scope("AttnOutputProjection"):
    output = linear([cell_output] + attns, output_size, True)
  if loop_function is not None:
    prev = output
  outputs.append(output)

  return outputs, state

需要注意下,上面代码中attention 计算方式:

  • decoder 本步的输入 input,结合上一步的 attention 矩阵,作为本次 decoder_input。
cell_output, new_state = cell(linear(input, prev_attn), prev_state)
  • 然后计算本次的attention 矩阵。
new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
  • 然后再计算本步的最终输出(这一步类似 pointer-generator network)
output = linear(cell_output, new_attn)
  • 如果prev不为None,则loop_function也必不为None,则表示不是在pretrain生成器的阶段,而是在train的阶段,这个时候为了避免exposure_bias问题,decoder_input 均为上一步的输出
if loop_function is not None:
    prev = output

attention_decoder 最终返回decoder 的所有步的输出的概率分布outputs,和decoder 最后的状态state

好了,embedding_attention_seq2seq 子方法都看完了,在回到embedding_attention_seq2seq 方法中:

# Excerpt from the end of embedding_attention_seq2seq: build both decoder
# variants and let feed_previous choose between them.
outputs_and_state = control_flow_ops.cond(feed_previous,
    ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆ lambda: decoder(True),## training stage, per the author's note
    ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆ lambda: decoder(False))## pre-training stage, per the author's note
outputs_len = len(decoder_inputs)  # Outputs length same as decoder inputs.
# Everything after the first outputs_len entries is the decoder state.
state_list = outputs_and_state[outputs_len:]
state = state_list[0]
if nest.is_sequence(encoder_state):
  # Rebuild the nested state using encoder_state's structure.
  state = nest.pack_sequence_as(structure=encoder_state,
    ┆   ┆   ┆   ┆   ┆   ┆   ┆   flat_sequence=state_list)
return outputs_and_state[:outputs_len], state, encoder_state

好了,embedding_attention_seq2seq 方法分析完了,其返回了 decoder 所有步的输出 outputs_and_state[:outputs_len]、decoder 的最后状态 state,以及 encoder 的最后一个状态 encoder_state。

再返回到Seq2SeqModel 类中:

# Feeds for inputs.
self.encoder_inputs = []
self.decoder_inputs = []
self.target_weights = []
for i in xrange(self.buckets[-1][0]):  # Last bucket is the biggest one.
    self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
# One decoder placeholder more than the largest bucket, because the targets
# below are the decoder inputs shifted by one.
for i in xrange(self.buckets[-1][1] + 1):
    self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
    self.target_weights.append(tf.placeholder(dtype, shape=[None], name="weight{0}".format(i)))
# One reward placeholder per bucket.
self.reward = [tf.placeholder(tf.float32, name="reward_%i" % i) for i in range(len(self.buckets))]

# Our targets are decoder inputs shifted by one.
targets = [self.decoder_inputs[i + 1] for i in xrange(len(self.decoder_inputs) - 1)]

# Build outputs, losses and encoder states for the buckets; forward_only is
# forwarded to seq2seq_f as its do_decode argument.
self.outputs, self.losses, self.encoder_state = rl_seq2seq.model_with_buckets(
    self.encoder_inputs, self.decoder_inputs, targets, self.target_weights,
    self.buckets, source_vocab_size, self.batch_size,
    lambda x, y: seq2seq_f(x, y, tf.where(self.forward_only, True, False)),
    output_projection=output_projection, softmax_loss_function=softmax_loss_function)

我们首先注意到传给 model_with_buckets 的参数里有个 seq2seq_f 方法,其实也就是上面的 embedding_attention_seq2seq:如果 forward_only 为真,则 embedding_attention_seq2seq 中的 feed_previous 为真,反之为假。

我们来看看 model_with_buckets 这个方法具体是怎么做的。算了,这个方法太简单了,简短解说:就是计算并返回所有 buckets 的 decoder_outputs、losses(reward)和 encoder_states。

接着看Seq2Seq 类代码:

if not forward_only:## build the training ops that minimize the loss
    with tf.name_scope("gradient_descent"):
    ┆   self.gradient_norms = []
    ┆   self.updates = []
    ┆   self.aj_losses = []
        # Only trainable variables whose name contains name_scope are updated.
    ┆   self.gen_params = [p for p in tf.trainable_variables() if name_scope in p.name]
    ┆   #opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    ┆   opt = tf.train.AdamOptimizer()
    ┆   for b in xrange(len(self.buckets)):
            # Shift the reward by reward_bias, then scale the bucket loss by it
            # only when up_reward is true; otherwise the plain loss is used.
    ┆   ┆   self.reward[b] = self.reward[b] - reward_bias
    ┆   ┆   adjusted_loss = tf.cond(self.up_reward,
    ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆ lambda:tf.multiply(self.losses[b], self.reward[b]),
    ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆   ┆ lambda: self.losses[b])

    ┆   ┆   self.aj_losses.append(adjusted_loss)
    ┆   ┆   gradients = tf.gradients(adjusted_loss, self.gen_params)
            # Clip by global norm before applying the update.
    ┆   ┆   clipped_gradients, norm = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
    ┆   ┆   self.gradient_norms.append(norm)
    ┆   ┆   self.updates.append(opt.apply_gradients(
    ┆   ┆   ┆   zip(clipped_gradients, self.gen_params), global_step=self.global_step))

# The saver covers every global variable whose name contains name_scope.
self.gen_variables = [k for k in tf.global_variables() if name_scope in k.name]
self.saver = tf.train.Saver(self.gen_variables)

我们需要注意 forward_only 和 up_reward 这两个参数:

  • forward_only 为 true 时,只计算 MLE loss,不执行 train_op
  • forward_only 为 false 且 up_reward 为 true 时,优化带有 reward 的 loss
  • forward_only 为 false 且 up_reward 为 false 时,优化不带 reward 的 loss

至此,一个Seq2Seq model 就构建完毕了。

pretrain 生成器

首先我们需要看看 seq2seq 模型的 step 方法,简短解说:该方法用一批训练数据来训练生成器,具体是不是只计算 loss,要看参数 forward_only 和 up_reward(这两个参数上面有说明),然后该方法返回:

# Excerpt from the end of Seq2SeqModel.step: the order of the returned
# triple depends on forward_only.
if not forward_only:
    # Gradient norm (with reward), loss (without reward), out_logits -- per the original note.
    return outputs[1], outputs[2], outputs[0]  
else:
    # encoder_state, loss, outputs.
    return outputs[0], outputs[1], outputs[2:]  

然后我们在看生成器的train部分主代码

## Create the seq2seq model.
model = create_model(sess, gen_config, forward_only=False, name_scope=gen_config.name_model)

## The training pairs were put into buckets by query/answer length earlier.
## len(train_set[b]) is the number of (query, answer) pairs in bucket b.
train_bucket_sizes = [len(train_set[b]) for b in xrange(len(gen_config.buckets))]

train_total_size = float(sum(train_bucket_sizes))## total number of (query, answer) training pairs
# Cumulative share of the data up to and including each bucket.
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
    ┆   ┆   ┆   ┆   ┆  for i in xrange(len(train_bucket_sizes))]
while True:
    # Choose a bucket according to disc_data distribution. We pick a random number
    # in [0, 1] and use the corresponding interval in train_buckets_scale.
    random_number_01 = np.random.random_sample()
    ## In other words: train on a randomly chosen bucket.
    bucket_id = min([i for i in xrange(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01])

    # Get a batch and make a step.
    start_time = time.time()
    encoder_inputs, decoder_inputs, target_weights, batch_source_encoder, batch_source_decoder = model.get_batch(
    ┆   train_set, bucket_id, gen_config.batch_size)

    ## Train the generator; per the original note this uses the MLE loss with
    ## the ground-truth data as decoder input.
    _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=False)

    # Running averages over one checkpoint interval.
    step_time += (time.time() - start_time) / gen_config.steps_per_checkpoint
    loss += step_loss / gen_config.steps_per_checkpoint
    current_step += 1

好了,我们用MLE loss 的方式pretrain 了生成器。

构建判别器结构

简短解说:这部分就是分别把 query 和 answer(生成器生成的,或 true_data)喂给两个 RNN,这两个 RNN 分别得到两个 final state,然后这两个 final state 作为一个 context input 再喂给一个二分类的 RNN。嗯,代码也是这么简单。

pretrain 判别器

首先用上面已经 pretrain 好的生成器来生成一批假数据(answers by generator),注意这里 forward_only 为 true。

按照和上面相同的处理,将 query、answer、gen_answer 按照长度存储在不同的 bucket 中。然后再对每个成对的 query、answer 打标签:如果 answer 为 true_data,则对应的 label 为 1;如果 answer 为生成器生成的,则其 label 为 0。

然后就pretrain 二分类的判别器了,代码太简单了,没什么好讲的。

未完待续

阅读更多
换一批

没有更多推荐了,返回首页