1. base.py
The format of a raw training line is: label, entity-1 start/end index, entity-2 start/end index, then the sentence. For example: 3 12 12 15 15 the system as described above has its greatest application in an arrayed configuration of antenna elements
After conversion, the data is a list whose elements are [label, entity1, entity2, sentence], where entity1 is a pair [entity1 start position, entity1 end position] (likewise for entity2).
PAD_WORD = "<pad>"
# RawExample is the namedtuple's name; the four fields that follow are its attributes
RawExample = namedtuple('RawExample', 'label entity1 entity2 sentence')
PositionPair = namedtuple('PosPair', 'first last')
FLAGS = tf.app.flags.FLAGS # load FLAGS.word_dim
def load_raw_data(filename):
'''load raw data from text file,
return: a list of Raw_Example
'''
data = []
with open(filename) as f:
for line in f:
words = line.strip().split(' ')
sent = words[5:]
n = len(sent)
# growing max_len dynamically like this may not be ideal
if FLAGS.max_len < n:
FLAGS.max_len = n
label = int(words[0])
entity1 = PositionPair(int(words[1]), int(words[2]))
entity2 = PositionPair(int(words[3]), int(words[4]))
example = RawExample(label, entity1, entity2, sent)
data.append(example)
print(FLAGS.max_len)
return data
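For the sample line shown at the top of this section, load_raw_data produces the following (a worked example; 'train.txt' is a hypothetical file name):
# data = load_raw_data('train.txt')
# data[0] == RawExample(
#     label=3,
#     entity1=PositionPair(first=12, last=12),   # sentence[12] == 'configuration'
#     entity2=PositionPair(first=15, last=15),   # sentence[15] == 'elements'
#     sentence=['the', 'system', 'as', 'described', 'above', 'has', 'its', 'greatest',
#               'application', 'in', 'an', 'arrayed', 'configuration', 'of',
#               'antenna', 'elements'])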
Write every token that appears in the training and test data, plus <pad>, into vocab_file, one word per line.
def build_vocab(raw_train_data, raw_test_data, vocab_file):
'''collect words in sentence'''
if not os.path.exists(vocab_file):
vocab = set()
for example in raw_train_data + raw_test_data:
for w in example.sentence:
vocab.add(w)
with open(vocab_file, 'w') as f:
for w in sorted(list(vocab)):
f.write('%s\n' % w)
f.write('%s\n' % PAD_WORD)
Load the pre-trained word embedding; embed is a 2-D matrix of shape word_num * embedding_size.
def _load_embedding(embed_file, words_file):
embed = np.load(embed_file)
words2id = {}
words = _load_vocab(words_file)
for id, w in enumerate(words):
words2id[w] = id
return embed, words2id
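_load_vocab is not included in this excerpt; a minimal sketch, assuming the words file stores one token per line and that line order defines the ids, would be:
def _load_vocab(vocab_file):
    '''sketch (not the original code): read one token per line, keeping file order'''
    words = []
    with open(vocab_file) as f:
        for line in f:
            words.append(line.strip())
    return words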
Trim the pre-trained embedding: for each word in the vocabulary built from the training and test sets, append its pre-trained vector to a list. An unknown word (<unk>) gets a random vector drawn from np.random.normal(0, 0.1, [FLAGS.word_dim]); the last word, <pad>, gets the zero vector.
def trim_embeddings(vocab_file,
pretrain_embed_file,
pretrain_words_file,
trimed_embed_file): # in essence this just handles unknown words
'''trim unnecessary words from original pre-trained word embedding
Args:
vocab_file: a file of tokens in train and test data
pretrain_embed_file: file name of the original pre-trained embedding
pretrain_words_file: file name of the words list w.r.t the embed
trimed_embed_file: file name of the trimmed embedding
'''
if not os.path.exists(trimed_embed_file):
pretrain_embed, pretrain_words2id = _load_embedding(
pretrain_embed_file,
pretrain_words_file)
word_embed=[]
vocab = _load_vocab(vocab_file)
for w in vocab:
if w in pretrain_words2id:
id = pretrain_words2id[w]
word_embed.append(pretrain_embed[id])
else:
vec = np.random.normal(0,0.1,[FLAGS.word_dim])
word_embed.append(vec)
pad_id = -1
word_embed[pad_id] = np.zeros([FLAGS.word_dim])
word_embed = np.asarray(word_embed)
np.save(trimed_embed_file, word_embed.astype(np.float32))
word_embed, vocab2id = _load_embedding(trimed_embed_file, vocab_file)
return word_embed, vocab2id
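Since build_vocab writes <pad> on the last line of vocab_file, the zero vector stored at pad_id = -1 lands exactly on the row that vocab2id maps <pad> to. A quick sanity check (hypothetical usage):
# word_embed, vocab2id = trim_embeddings(FLAGS.vocab_file,
#                                        FLAGS.senna_embed50_file,
#                                        FLAGS.senna_words_file,
#                                        FLAGS.trimmed_embed50_file)
# assert (word_embed[vocab2id[PAD_WORD]] == 0).all()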
This function uses a nested helper function.
def _lexical_feature(raw_example): # lexical features (the entity words and their neighbors)
# nested function begins
def _entity_context(e_idx, sent):
''' return [w(e), w(e-1), w(e+1)]: the entity word plus its left and right neighbors
'''
context = []
context.append(sent[e_idx]) # the entity word itself
if e_idx >= 1:
context.append(sent[e_idx-1])
else:
context.append(sent[e_idx])
if e_idx < len(sent)-1:
context.append(sent[e_idx+1])
else:
context.append(sent[e_idx])
return context
# end of nested function
e1_idx = raw_example.entity1.first
e2_idx = raw_example.entity2.first
context1 = _entity_context(e1_idx, raw_example.sentence) # a list of strings: [entity word, word to its left, word to its right]
context2 = _entity_context(e2_idx, raw_example.sentence)
# ignore WordNet hypernyms in paper
lexical = context1 + context2
return lexical
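For the sample sentence above (entity 1 = 'configuration' at index 12, entity 2 = 'elements' at index 15, which is the last word), the lexical feature works out to:
# context1 = ['configuration', 'arrayed', 'of']
# context2 = ['elements', 'antenna', 'elements']   # no right neighbor, so the entity word is repeated
# lexical  = ['configuration', 'arrayed', 'of', 'elements', 'antenna', 'elements']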
For each word in the sentence, compute its relative position (bucketed distance) to entity 1 and to entity 2.
def _position_feature(raw_example):
def distance(n):
'''convert a relative distance to a non-negative bucket id:
(-inf, -60) -> 0, [-60, 60] -> n + 61, (60, +inf) -> 122
'''
# FIXME: FLAGS.pos_num
if n < -60:
return 0
elif n >= -60 and n <= 60:
return n + 61
return 122
e1_idx = raw_example.entity1.first
e2_idx = raw_example.entity2.first
position1 = []
position2 = []
length = len(raw_example.sentence)
for i in range(length):
position1.append(distance(i - e1_idx))
position2.append(distance(i - e2_idx))
return position1, position2
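A few sample values of distance make the bucketing clear:
# distance(-61) == 0     # more than 60 words to the left of the entity
# distance(-60) == 1
# distance(0)   == 61    # the entity word itself
# distance(60)  == 121
# distance(61)  == 122   # more than 60 words to the right
# 123 bucket ids in total, matching pos_num (123) used in cnn_model.py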
def build_sequence_example(raw_example):
'''build tf.train.SequenceExample from Raw_Example
context features : lexical, rid, direction (mtl)
sequence features: sentence, position1, position2
Args:
raw_example : type Raw_Example
Returns:
tf.train.SequenceExample
'''
ex = tf.train.SequenceExample()
lexical = _lexical_feature(raw_example)
ex.context.feature['lexical'].int64_list.value.extend(lexical)
rid = raw_example.label
ex.context.feature['rid'].int64_list.value.append(rid)
for word_id in raw_example.sentence:
word = ex.feature_lists.feature_list['sentence'].feature.add()
word.int64_list.value.append(word_id)
position1, position2 = _position_feature(raw_example)
for pos_val in position1:
pos = ex.feature_lists.feature_list['position1'].feature.add()
pos.int64_list.value.append(pos_val)
for pos_val in position2:
pos = ex.feature_lists.feature_list['position2'].feature.add()
pos.int64_list.value.append(pos_val)
return ex
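read_tfrecord_to_batch is not shown in this excerpt; when reading the records back, the context and sequence features would be parsed roughly as follows (a sketch assuming TF 1.x and the feature names used above):
def _parse_example(serialized):
    '''sketch: invert build_sequence_example for one serialized record'''
    context_features = {
        'lexical': tf.FixedLenFeature([6], tf.int64),
        'rid': tf.FixedLenFeature([], tf.int64),
    }
    sequence_features = {
        'sentence': tf.FixedLenSequenceFeature([], tf.int64),
        'position1': tf.FixedLenSequenceFeature([], tf.int64),
        'position2': tf.FixedLenSequenceFeature([], tf.int64),
    }
    context, sequence = tf.parse_single_sequence_example(
        serialized,
        context_features=context_features,
        sequence_features=sequence_features)
    return (context['lexical'], context['rid'],
            sequence['sentence'], sequence['position1'], sequence['position2'])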
def inputs():
raw_train_data = load_raw_data(FLAGS.train_file)
raw_test_data = load_raw_data(FLAGS.test_file)
build_vocab(raw_train_data, raw_test_data, FLAGS.vocab_file)
if FLAGS.word_dim == 50:
word_embed, vocab2id = trim_embeddings(
FLAGS.vocab_file,
FLAGS.senna_embed50_file,
FLAGS.senna_words_file,
FLAGS.trimmed_embed50_file)
elif FLAGS.word_dim == 300:
word_embed, vocab2id = trim_embeddings(
FLAGS.vocab_file,
FLAGS.google_embed300_file,
FLAGS.google_words_file,
FLAGS.trimmed_embed300_file)
# map words to ids
map_words_to_id(raw_train_data, vocab2id)
map_words_to_id(raw_test_data, vocab2id)
# convert raw data to TFRecord format data, and write to file
train_record = FLAGS.train_record
test_record = FLAGS.test_record
maybe_write_tfrecord(raw_train_data, train_record)
maybe_write_tfrecord(raw_test_data, test_record)
pad_value = vocab2id[PAD_WORD]
train_data = read_tfrecord_to_batch(train_record,
FLAGS.num_epochs, FLAGS.batch_size,
pad_value, shuffle=True)
test_data = read_tfrecord_to_batch(test_record,
FLAGS.num_epochs, 2717,
pad_value, shuffle=False)
return train_data, test_data, word_embed
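maybe_write_tfrecord is also omitted from the excerpt; conceptually it just serializes each SequenceExample into the record file if that file does not yet exist (a sketch under that assumption):
def maybe_write_tfrecord(raw_data, filename):
    '''sketch: write one serialized SequenceExample per raw example'''
    if not os.path.exists(filename):
        writer = tf.python_io.TFRecordWriter(filename)
        for raw_example in raw_data:
            example = build_sequence_example(raw_example)
            writer.write(example.SerializeToString())
        writer.close()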
2. cnn_model.py
class CNNModel(BaseModel):
'''
Relation Classification via Convolutional Deep Neural Network
http://www.aclweb.org/anthology/C14-1220
'''
def __init__(self, word_embed, data, word_dim,
pos_num, pos_dim, num_relations,
keep_prob, num_filters,
lrn_rate, is_train):
# input data
lexical, rid, sentence, pos1, pos2 = data
# embedding initialization
w_trainable = True if FLAGS.word_dim==50 else False
word_embed = tf.get_variable('word_embed',
initializer=word_embed,
dtype=tf.float32,
trainable=w_trainable)
pos1_embed = tf.get_variable('pos1_embed', shape=[pos_num, pos_dim]) #123 * 5
pos2_embed = tf.get_variable('pos2_embed', shape=[pos_num, pos_dim])
# # embedding lookup
lexical = tf.nn.embedding_lookup(word_embed, lexical) # batch_size, 6, word_dim
lexical = tf.reshape(lexical, [-1, 6*word_dim])
self.labels = tf.one_hot(rid, num_relations) # batch_size, num_relations
sentence = tf.nn.embedding_lookup(word_embed, sentence) # batch_size, max_len, word_dim
pos1 = tf.nn.embedding_lookup(pos1_embed, pos1) # batch_size, max_len, pos_dim
pos2 = tf.nn.embedding_lookup(pos2_embed, pos2) # batch_size, max_len, pos_dim
# cnn model
sent_pos = tf.concat([sentence, pos1, pos2], axis=2)
if is_train:
sent_pos = tf.nn.dropout(sent_pos, keep_prob)
feature = cnn_forward('cnn', sent_pos, lexical, num_filters)
feature_size = feature.shape.as_list()[1]
self.feature = feature
if is_train:
feature = tf.nn.dropout(feature, keep_prob)
# Map the features to 19 classes
logits, loss_l2 = linear_layer('linear_cnn', feature,
feature_size, num_relations,
is_regularize=True)
prediction = tf.nn.softmax(logits)
prediction = tf.argmax(prediction, axis=1)
accuracy = tf.equal(prediction, tf.argmax(self.labels, axis=1))
accuracy = tf.reduce_mean(tf.cast(accuracy, tf.float32))
loss_ce = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=logits))
self.logits = logits
self.prediction = prediction
self.accuracy = accuracy
self.loss = loss_ce + 0.01*loss_l2
if not is_train:
return
# global_step = tf.train.get_or_create_global_step()
global_step = tf.Variable(0, trainable=False, name='step', dtype=tf.int32)
optimizer = tf.train.AdamOptimizer(lrn_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):# for batch_norm
self.train_op = optimizer.minimize(self.loss, global_step)
self.global_step = global_step
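cnn_forward and linear_layer live in the shared base-model code and are not shown here. In outline, cnn_forward runs a 1-D convolution over the word+position representation, max-pools over time, and concatenates the result with the lexical feature (a sketch only; the kernel size and layer API are assumptions, not the repository's exact implementation):
def cnn_forward(name, sent_pos, lexical, num_filters):
    '''sketch: convolution + max-over-time pooling, then concat with the lexical feature'''
    with tf.variable_scope(name):
        # sent_pos: [batch, max_len, word_dim + 2*pos_dim]
        conv = tf.layers.conv1d(sent_pos, num_filters, kernel_size=3,
                                padding='same', activation=tf.nn.relu)
        pooled = tf.reduce_max(conv, axis=1)          # [batch, num_filters]
        return tf.concat([lexical, pooled], axis=1)   # [batch, 6*word_dim + num_filters]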
3. train.py
The goal of this function is to profile TensorFlow runtime performance; see: https://walsvid.github.io/2017/03/25/profiletensorflow/#fn_2
def trace_runtime(sess, m_train):
'''
trace runtime bottleneck using timeline api
navigate to the URL 'chrome://tracing' in a Chrome web browser,
click the 'Load' button and locate the timeline file.
'''
run_metadata=tf.RunMetadata() # run metadata: records timing, memory usage, etc. for the training ops
options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
from tensorflow.python.client import timeline
trace_file = open('timeline.ctf.json', 'w')
fetches = [m_train.train_op, m_train.loss, m_train.accuracy]
_, loss, acc = sess.run(fetches,
options=options,
run_metadata=run_metadata)
trace = timeline.Timeline(step_stats=run_metadata.step_stats)
trace_file.write(trace.generate_chrome_trace_format())
trace_file.close()
def train(sess, m_train, m_valid):
n = 1
best = .0
best_step = n
start_time = time.time()
orig_begin_time = start_time
fetches = [m_train.train_op, m_train.loss, m_train.accuracy]
while True:
try:
_, loss, acc = sess.run(fetches)
epoch = n // 80
if n % 80 == 0:
now = time.time()
duration = now - start_time
start_time = now
v_acc = sess.run(m_valid.accuracy)
if best < v_acc:
best = v_acc
best_step = n
m_train.save(sess, best_step)
print("Epoch %d, loss %.2f, acc %.2f %.4f, time %.2f" %
(epoch, loss, acc, v_acc, duration))
sys.stdout.flush()
n += 1
except tf.errors.OutOfRangeError:
break
duration = time.time() - orig_begin_time
duration /= 3600
print('Done training, best_step: %d, best_acc: %.4f' % (best_step, best))
print('duration: %.2f hours' % duration)
sys.stdout.flush()
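The driver code that builds m_train and m_valid (presumably two model instances built from inputs(), one in training mode and one in evaluation mode) and opens the session is not part of this excerpt; the call sequence is roughly (hypothetical sketch):
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     sess.run(tf.local_variables_initializer())   # needed when num_epochs is set on the input pipeline
#     train(sess, m_train, m_valid)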