1. base.py
The format of a raw training line is: label, entity-1 start/end index, entity-2 start/end index, then the sentence. For example: 3 12 12 15 15 the system as described above has its greatest application in an arrayed configuration of antenna elements
After conversion, the data is a list whose elements are [label, entity1, entity2, sentence], where entity1 is a pair [entity1 start position, entity1 end position] (likewise for entity2).
PAD_WORD = "<pad>"
# RawExample is the namedtuple's name; the four fields that follow are its attributes
RawExample = namedtuple('RawExample', 'label entity1 entity2 sentence')
PositionPair = namedtuple('PosPair', 'first last')
FLAGS = tf.app.flags.FLAGS # load FLAGS.word_dim
def load_raw_data(filename):
'''load raw data from text file,
return: a list of Raw_Example
'''
data = []
with open(filename) as f:
for line in f:
words = line.strip().split(' ')
sent = words[5:]
n = len(sent)
# growing max_len dynamically like this may not be ideal
if FLAGS.max_len < n:
FLAGS.max_len = n
label = int(words[0])
entity1 = PositionPair(int(words[1]), int(words[2]))
entity2 = PositionPair(int(words[3]), int(words[4]))
example = RawExample(label, entity1, entity2, sent)
data.append(example)
print(FLAGS.max_len)
return data
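For the sample line shown at the top of this section, load_raw_data produces the following (a worked example; 'train.txt' is a hypothetical file name):
# data = load_raw_data('train.txt')
# data[0] == RawExample(
#     label=3,
#     entity1=PositionPair(first=12, last=12),   # sentence[12] == 'configuration'
#     entity2=PositionPair(first=15, last=15),   # sentence[15] == 'elements'
#     sentence=['the', 'system', 'as', 'described', 'above', 'has', 'its', 'greatest',
#               'application', 'in', 'an', 'arrayed', 'configuration', 'of',
#               'antenna', 'elements'])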
Write every token that appears in the training and test data, plus <pad>, into vocab_file, one word per line.
def build_vocab(raw_train_data, raw_test_data, vocab_file):
'''collect words in sentence'''
if not os.path.exists(vocab_file):
vocab = set()
for example in raw_train_data + raw_test_data:
for w in example.sentence:
vocab.add(w)
with open(vocab_file, 'w') as f:
for w in sorted(list(vocab)):
f.write('%s\n' % w)
f.write('%s\n' % PAD_WORD)
Load the pre-trained word embedding; embed is a 2-D matrix of shape word_num * embedding_size.
def _load_embedding(embed_file, words_file):
embed = np.load(embed_file)
words2id = {}
words = _load_vocab(words_file)
for id, w in enumerate(words):
words2id[w] = id
return embed, words2id
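_load_vocab is not included in this excerpt; a minimal sketch, assuming the words file stores one token per line and that line order defines the ids, would be:
def _load_vocab(vocab_file):
    '''sketch (not the original code): read one token per line, keeping file order'''
    words = []
    with open(vocab_file) as f:
        for line in f:
            words.append(line.strip())
    return words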
Trim the pre-trained embedding: for each word in the vocabulary built from the training and test sets, append its pre-trained vector to a list. An unknown word (<unk>) gets a random vector drawn from np.random.normal(0, 0.1, [FLAGS.word_dim]); the last word, <pad>, gets the zero vector.
def trim_embeddings(vocab_file,
pretrain_embed_file,
pretrain_words_file,
trimed_embed_file): # in essence this just handles unknown words
'''trim unnecessary words from original pre-trained word embedding
Args:
vocab_file: a file of tokens in train and test data
pretrain_embed_file: file name of the original pre-trained embedding
pretrain_words_file: file name of the words list w.r.t the embed
trimed_embed_file: file name of the trimmed embedding
'''
if not os.path.exists(trimed_embed_file):
pretrain_embed, pretrain_words2id = _load_embedding(
pretrain_embed_file,
pretrain_words_file)
word_embed=[]
vocab = _load_vocab(vocab_file)
for w in vocab:
if w in pretrain_words2id:
id = pretrain_words2id[w]
word_embed.append(pretrain_embed[id])
else:
vec = np.random.normal(0,0.1,[FLAGS.word_dim])
word_embed.append(vec)
pad_id = -1
word_embed[pad_id] = np.zeros([FLAGS.word_dim])
word_embed = np.asarray(word_embed)
np.save(trimed_embed_file, word_embed.astype(np.float32))
word_embed, vocab2id = _load_embedding(trimed_embed_file, vocab_file)
return word_embed, vocab2id
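Since build_vocab writes <pad> on the last line of vocab_file, the zero vector stored at pad_id = -1 lands exactly on the row that vocab2id maps <pad> to. A quick sanity check (hypothetical usage):
# word_embed, vocab2id = trim_embeddings(FLAGS.vocab_file,
#                                        FLAGS.senna_embed50_file,
#                                        FLAGS.senna_words_file,
#                                        FLAGS.trimmed_embed50_file)
# assert (word_embed[vocab2id[PAD_WORD]] == 0).all()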
This function uses a nested helper function.
def _lexical_feature(raw_example): # lexical features (the entity words and their neighbors)
# nested function begins
def _entity_context(e_idx, sent):
''' return [w(e), w(e-1), w(e+1)]: the entity word plus its left and right neighbors
'''
context = []
context.append(sent[e_idx]) # the entity word itself
if e_idx >= 1:
context.append(sent[e_idx-1])
else:
context.append(sent[e_idx])
if e_idx < len(sent)-1:
context.append(sent[e_idx+1])
else:
context.append(sent[e_idx])
return context
# end of nested function
e1_idx = raw_example.entity1.first
e2_idx = raw_example.entity2.first
context1 = _entity_context(e1_idx, raw_example.sentence) # a list of strings: [entity word, word to its left, word to its right]
context2 = _entity_context(e2_idx, raw_example.sentence)
# ignore WordNet hypernyms in paper
lexical = context1 + context2
return lexical
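For the sample sentence above (entity 1 = 'configuration' at index 12, entity 2 = 'elements' at index 15, which is the last word), the lexical feature works out to:
# context1 = ['configuration', 'arrayed', 'of']
# context2 = ['elements', 'antenna', 'elements']   # no right neighbor, so the entity word is repeated
# lexical  = ['configuration', 'arrayed', 'of', 'elements', 'antenna', 'elements']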
For each word in the sentence, compute its relative position (bucketed distance) to entity 1 and to entity 2.
def _position_feature(raw_example):
def distance(n):
'''convert a relative distance to a non-negative bucket id:
(-inf, -60) -> 0, [-60, 60] -> n + 61, (60, +inf) -> 122
'''
# FIXME: FLAGS.pos_num
if n < -60:
return 0
elif n >= -60 and n <= 60:
return n + 61
return 122
e1_idx = raw_example.entity1.first
e2_idx = raw_example.entity2.first
position1 = []
position2 = []
length = len(raw_example.sentence)
for i in range(length):
position1.append(distance(i - e1_idx))
position2.append(distance(i - e2_idx))
return position1, position2
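A few sample values of distance make the bucketing clear:
# distance(-61) == 0     # more than 60 words to the left of the entity
# distance(-60) == 1
# distance(0)   == 61    # the entity word itself
# distance(60)  == 121
# distance(61)  == 122   # more than 60 words to the right
# 123 bucket ids in total, matching pos_num (123) used in cnn_model.py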
def build_sequence_example(raw_example):
'''build tf.train.SequenceExample from Raw_Example
context features : lexical, rid, direction (mtl)
sequence features: sentence, position1, position2
Args:
raw_example : type Raw_Example
Returns:
tf.train.SequenceExample
'''
ex = tf.train.SequenceExample()
lexical = _lexical_feature(raw_example)
ex.context.feature['lexical'].int64_list.value.extend(lexical)
rid = raw_example.label
ex.context.feature['rid'].int64_list.value.append(rid)
for word_id in raw_example.sentence:
word = ex.feature_lists.feature_list['sentence'].feature.add()
word.int64_list.value.append(word_id)
position1, position2 = _position_feature(raw_example)
for pos_val in position1:
pos = ex.feature_lists.feature_list['position1'].feature.add()
pos.int64_list.value.append(pos_val)
for pos_val in position2:
pos = ex.feature_lists.feature_list['position2'].feature.add()
pos.int64_list.value.append(pos_val)
return ex
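read_tfrecord_to_batch is not shown in this excerpt; when reading the records back, the context and sequence features would be parsed roughly as follows (a sketch assuming TF 1.x and the feature names used above):
def _parse_example(serialized):
    '''sketch: invert build_sequence_example for one serialized record'''
    context_features = {
        'lexical': tf.FixedLenFeature([6], tf.int64),
        'rid': tf.FixedLenFeature([], tf.int64),
    }
    sequence_features = {
        'sentence': tf.FixedLenSequenceFeature([], tf.int64),
        'position1': tf.FixedLenSequenceFeature([], tf.int64),
        'position2': tf.FixedLenSequenceFeature([], tf.int64),
    }
    context, sequence = tf.parse_single_sequence_example(
        serialized,
        context_features=context_features,
        sequence_features=sequence_features)
    return (context['lexical'], context['rid'],
            sequence['sentence'], sequence['position1'], sequence['position2'])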
def inputs():
raw_train_data = load_raw_data(FLAGS.train_file)
raw_test_data = load_raw_data(FLAGS.test_file)
build_vocab(raw_train_data, raw_test_data, FLAGS.vocab_file)
if FLAGS.word_dim == 50:
word_embed, vocab2id = trim_embeddings(
FLAGS.vocab_file,
FLAGS.senna_embed50_file,
FLAGS.senna_words_file,
FLAGS.trimmed_embed50_file)
elif FLAGS.word_dim == 300:
word_embed, vocab2id = trim_embeddings(
FLAGS.vocab_file,
FLAGS.google_embed300_file,
FLAGS.google_words_file,
FLAGS.trimmed_embed300_file)
# map words to ids
map_words_to_id(raw_train_data, vocab2id)
map_words_to_id(raw_test_data, vocab2id)
# convert raw data to TFRecord format data, and write to file
train_record = FLAGS.train_record
test_record = FLAGS.test_record
maybe_write_tfrecord(raw_train_data, train_record)
maybe_write_tfrecord(raw_test_data, test_record)
pad_value = vocab2id[PAD_WORD]
train_data = read_tfrecord_to_batch(train_record,
FLAGS.num_epochs, FLAGS.batch_size,
pad_value, shuffle=True)
test_data = read_tfrecord_to_batch(test_record,
FLAGS.num_epochs, 2717,
pad_value, shuffle=False)
return train_data, test_data, word_embed
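maybe_write_tfrecord is also omitted from the excerpt; conceptually it just serializes each SequenceExample into the record file if that file does not yet exist (a sketch under that assumption):
def maybe_write_tfrecord(raw_data, filename):
    '''sketch: write one serialized SequenceExample per raw example'''
    if not os.path.exists(filename):
        writer = tf.python_io.TFRecordWriter(filename)
        for raw_example in raw_data:
            example = build_sequence_example(raw_example)
            writer.write(example.SerializeToString())
        writer.close()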
2. cnn_model.py
class CNNModel(BaseModel):
'''
Relation Classification via Convolutional Deep Neural Network
http://www.aclweb.org/anthology/C14-1220
'''
def __init__(self, word_embed, data, word_dim,
pos_num, pos_dim, num_relations,
keep_prob, num_filters,
lrn_rate, is_train):
# input data
lexical, rid, sentence, pos1, pos2 = data
# embedding initialization
w_trainable = True if FLAGS.word_dim==50 else False
word_embed = tf.get_variable('word_embed',
initializer=word_embed,
dtype=tf.float32,
trainable=w_trainable)
pos1_embed = tf.get_variable('pos1_embed', shape=[pos_num, pos_dim]) #123 * 5
pos2_embed = tf.get_variable('pos2_embed', shape=[pos_num, pos_dim])
# # embedding lookup
lexical = tf.nn.embedding_lookup(word_embed, lexical) # batch_size, 6, word_dim
lexical = tf.reshape(lexical, [-1, 6*word_dim])
self.labels = tf.one_hot(rid, num_relations) # batch_size, num_relations
sentence = tf.nn.embedding_lookup(word_embed, sentence) # batch_size, max_len, word_dim
pos1 = tf.nn.embedding_lookup(pos1_embed, pos1) # batch_size, max_len, pos_dim
pos2 = tf.nn.embedding_lookup(pos2_embed, pos2) # batch_size, max_len, pos_dim
# cnn model
sent_pos = tf.concat([sentence, pos1, pos2], axis=2)
if is_train:
sent_pos = tf.nn.dropout(sent_pos, keep_prob)
feature = cnn_forward('cnn', sent_pos, lexical, num_filters)
feature_size = feature.shape.as_list()[1]
self.feature = feature
if is_train:
feature = tf.nn.dropout(feature, keep_prob)
# Map the features to 19 classes
logits, loss_l2 = linear_layer('linear_cnn', feature,
feature_size, num_relations,
is_regularize=True)
prediction = tf.nn.softmax(logits)
prediction = tf.argmax(prediction, axis=1)
accuracy = tf.equal(prediction, tf.argmax(self.labels, axis=1))
accuracy = tf.reduce_mean(tf.cast(accuracy, tf.float32))
loss_ce = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=logits))
self.logits = logits
self.prediction = prediction
self.accuracy = accuracy
self.loss = loss_ce + 0.01*loss_l2
if not is_train:
return
# global_step = tf.train.get_or_create_global_step()
global_step = tf.Variable(0, trainable=False, name='step', dtype=tf.int32)
optimizer = tf.train.AdamOptimizer(lrn_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):# for batch_norm
self.train_op = optimizer.minimize(self.loss, global_step)
self.global_step = global_step
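cnn_forward and linear_layer live in the shared base-model code and are not shown here. In outline, cnn_forward runs a 1-D convolution over the word+position representation, max-pools over time, and concatenates the result with the lexical feature (a sketch only; the kernel size and layer API are assumptions, not the repository's exact implementation):
def cnn_forward(name, sent_pos, lexical, num_filters):
    '''sketch: convolution + max-over-time pooling, then concat with the lexical feature'''
    with tf.variable_scope(name):
        # sent_pos: [batch, max_len, word_dim + 2*pos_dim]
        conv = tf.layers.conv1d(sent_pos, num_filters, kernel_size=3,
                                padding='same', activation=tf.nn.relu)
        pooled = tf.reduce_max(conv, axis=1)          # [batch, num_filters]
        return tf.concat([lexical, pooled], axis=1)   # [batch, 6*word_dim + num_filters]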
3. train.py
The goal of this function is to profile TensorFlow runtime performance; see: https://walsvid.github.io/2017/03/25/profiletensorflow/#fn_2
def trace_runtime(sess, m_train):
'''
trace runtime bottleneck using timeline api
navigate to the URL 'chrome://tracing' in a Chrome web browser,
click the 'Load' button and locate the timeline file.
'''
run_metadata=tf.RunMetadata() # run metadata: records timing, memory usage, etc. for the training ops
options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
from tensorflow.python.client import timeline
trace_file = open('timeline.ctf.json', 'w')
fetches = [m_train.train_op, m_train.loss, m_train.accuracy]
_, loss, acc = sess.run(fetches,
options=options,
run_metadata=run_metadata)
trace = timeline.Timeline(step_stats=run_metadata.step_stats)
trace_file.write(trace.generate_chrome_trace_format())
trace_file.close()
def train(sess, m_train, m_valid):
n = 1
best = .0
best_step = n
start_time = time.time()
orig_begin_time = start_time
fetches = [m_train.train_op, m_train.loss, m_train.accuracy]
while True:
try:
_, loss, acc = sess.run(fetches)
epoch = n // 80
if n % 80 == 0:
now = time.time()
duration = now - start_time
start_time = now
v_acc = sess.run(m_valid.accuracy)
if best < v_acc:
best = v_acc
best_step = n
m_train.save(sess, best_step)
print("Epoch %d, loss %.2f, acc %.2f %.4f, time %.2f" %
(epoch, loss, acc, v_acc, duration))
sys.stdout.flush()
n += 1
except tf.errors.OutOfRangeError:
break
duration = time.time() - orig_begin_time
duration /= 3600
print('Done training, best_step: %d, best_acc: %.4f' % (best_step, best))
print('duration: %.2f hours' % duration)
sys.stdout.flush()
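The driver code that builds m_train and m_valid (presumably two model instances built from inputs(), one in training mode and one in evaluation mode) and opens the session is not part of this excerpt; the call sequence is roughly (hypothetical sketch):
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     sess.run(tf.local_variables_initializer())   # needed when num_epochs is set on the input pipeline
#     train(sess, m_train, m_valid)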