重要参考
用Bi-GRU和字向量做端到端的中文关系抽取(作者:羊肉泡馍与糖蒜)
代码链接:https://github.com/crownpku/Information-Extraction-Chinese/tree/master/RE_BGRU_2ATT
上述代码“主要是基于清华的开源项目thunlp/TensorFlow-NRE(https://github.com/thunlp/OpenNRE)开发”
模型代码
数据处理:initial.py
传入训练数据与测试数据所在的路径,并自定义处理后的训练数据与测试数据的存放路径。
以下代码使用的是自行训练的字向量,训练方式见【NLP】使用Word2Vec训练字向量。
import numpy as np
import os
# embedding the position
def pos_embed(x, maxlen=60):
    """Map a signed relative offset to a non-negative position id.

    Args:
        x: Offset of a token relative to an entity start (token index minus
            entity index).
        maxlen: Largest absolute offset that gets its own id.  The default
            of 60 reproduces the original hard-coded window of -60..+60.

    Returns:
        0 when ``x < -maxlen``, ``2 * maxlen + 2`` when ``x > maxlen``, and
        ``x + maxlen + 1`` otherwise (1..121 for the default window).
    """
    if x < -maxlen:
        return 0
    if x > maxlen:
        return 2 * maxlen + 2
    return x + maxlen + 1
# find the index of x in y, if x not in y, return -1
def find_index(x, y):
    """Return the index of the first element of ``y`` equal to ``x``, or -1 if none is."""
    matches = (position for position, candidate in enumerate(y) if candidate == x)
    return next(matches, -1)
# reading data
def _read_embedding_file(path):
    """Read a word2vec-style text file and return ``(word2id, vec)``.

    The first line is a header whose second field is the vector dimension.
    Every following non-blank line is ``token v1 v2 ...``.  Two extra rows
    drawn from a normal distribution (mean 0, std 0.05) are appended for the
    'UNK' and 'BLANK' tokens, in that order.
    """
    word2id = {}
    rows = []
    with open(path, encoding='utf-8') as f:
        dim = int(f.readline().strip().split()[1])
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            word2id[fields[0]] = len(word2id)
            rows.append([float(value) for value in fields[1:]])
    word2id['UNK'] = len(word2id)
    word2id['BLANK'] = len(word2id)
    rows.append(np.random.normal(size=dim, loc=0, scale=0.05))
    rows.append(np.random.normal(size=dim, loc=0, scale=0.05))
    return word2id, np.array(rows, dtype=np.float32)


def _read_relation2id(path):
    """Read ``relation id`` lines into a dict mapping relation name to int id."""
    relation2id = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if fields:
                relation2id[fields[0]] = int(fields[1])
    return relation2id


def _encode_sentence(sentence, en1, en2, word2id, fixlen):
    """Encode one sentence as ``fixlen`` triples ``[word_id, pos_e1, pos_e2]``.

    Characters beyond ``fixlen`` are dropped, missing positions are padded
    with 'BLANK', and characters absent from ``word2id`` map to 'UNK'.
    An entity that does not occur in the sentence is treated as position 0.
    """
    en1pos = max(sentence.find(en1), 0)
    en2pos = max(sentence.find(en2), 0)
    unk = word2id['UNK']
    blank = word2id['BLANK']
    encoded = []
    for i in range(fixlen):
        word = word2id.get(sentence[i], unk) if i < len(sentence) else blank
        encoded.append([word, pos_embed(i - en1pos), pos_embed(i - en2pos)])
    return encoded


def _to_ndarray(items):
    """Convert ``items`` to an ndarray, falling back to a 1-D object array.

    Bags hold different numbers of sentences, and recent NumPy releases raise
    ValueError for such ragged input instead of building an object array.
    """
    try:
        return np.array(items)
    except ValueError:
        ragged = np.empty(len(items), dtype=object)
        for idx, item in enumerate(items):
            ragged[idx] = item
        return ragged


def init():
    """Convert the raw text corpora into the ``.npy`` files used for training.

    Reads ``./origin_data/token_vec_100.txt``, ``ywp_relation2id.txt``,
    ``ywp_train.txt`` and ``ywp_test.txt`` (whitespace-separated lines of
    ``entity1 entity2 relation sentence``) and writes, under ``./data``:
    ``vec.npy``, ``train_x.npy``, ``train_y.npy``, ``testall_x.npy``,
    ``testall_y.npy`` plus the index files ``ywp_train_q&a.txt`` and
    ``ywp_test_q&a.txt``.

    Raises:
        KeyError: if a line names an unknown relation and the relation file
            has no 'NA' entry.
        IndexError: if a non-blank data line has fewer than four fields.
    """
    print('reading word embedding data...')
    word2id, vec = _read_embedding_file('./origin_data/token_vec_100.txt')

    print('reading relation to id')
    relation2id = _read_relation2id('./origin_data/ywp_relation2id.txt')
    num_relations = len(relation2id)

    # length of sentence is 70
    fixlen = 70

    # {entity pair: [[sentences for label 1], [sentences for label 2], ...]}
    train_sen = {}
    # {entity pair: [label1, label2, ...]}, each label a one-hot vector
    train_ans = {}

    print('reading train data...')
    with open('./origin_data/ywp_train.txt', 'r', encoding='utf-8') as f:
        for line in f:
            content = line.strip().split()
            if not content:
                continue
            en1, en2 = content[0], content[1]
            # Unknown relation names fall back to 'NA' (looked up lazily).
            if content[2] in relation2id:
                relation = relation2id[content[2]]
            else:
                relation = relation2id['NA']

            label = [0] * num_relations
            label[relation] = 1

            # Sentences of the same entity pair are grouped per distinct label.
            tup = (en1, en2)
            if tup not in train_sen:
                train_sen[tup] = [[]]
                train_ans[tup] = [label]
                label_tag = 0
            else:
                label_tag = find_index(label, train_ans[tup])
                if label_tag == -1:
                    train_ans[tup].append(label)
                    label_tag = len(train_ans[tup]) - 1
                    train_sen[tup].append([])

            train_sen[tup][label_tag].append(
                _encode_sentence(content[3], en1, en2, word2id, fixlen))

    print('reading test data ...')
    # {entity pair: [sentence 1, sentence 2, ...]}
    test_sen = {}
    # {entity pair: N-hot label vector over all relations seen for the pair}
    test_ans = {}
    with open('./origin_data/ywp_test.txt', 'r', encoding='utf-8') as f:
        for line in f:
            content = line.strip().split()
            if not content:
                continue
            en1, en2 = content[0], content[1]
            if content[2] in relation2id:
                relation = relation2id[content[2]]
            else:
                relation = relation2id['NA']

            tup = (en1, en2)
            if tup not in test_sen:
                test_sen[tup] = []
                label = [0] * num_relations
                label[relation] = 1
                test_ans[tup] = label
            else:
                test_ans[tup][relation] = 1

            test_sen[tup].append(
                _encode_sentence(content[3], en1, en2, word2id, fixlen))

    train_x = []
    train_y = []
    test_x = []
    test_y = []

    os.makedirs("data", exist_ok=True)

    print('organizing train data')
    with open('./data/ywp_train_q&a.txt', 'w', encoding='utf-8') as f:
        temp = 0
        for pair in train_sen:
            if len(train_ans[pair]) != len(train_sen[pair]):
                print('ERROR')
            for bag, label in zip(train_sen[pair][:len(train_ans[pair])], train_ans[pair]):
                train_x.append(bag)
                train_y.append(label)
                f.write(str(temp) + '\t' + pair[0] + '\t' + pair[1] + '\t' + str(np.argmax(label)) + '\n')
                temp += 1

    print('organizing test data')
    with open('./data/ywp_test_q&a.txt', 'w', encoding='utf-8') as f:
        temp = 0
        for pair in test_sen:
            test_x.append(test_sen[pair])
            test_y.append(test_ans[pair])
            tempstr = ''.join(
                str(j) + '\t' for j, flag in enumerate(test_ans[pair]) if flag != 0)
            f.write(str(temp) + '\t' + pair[0] + '\t' + pair[1] + '\t' + tempstr + '\n')
            temp += 1

    np.save('./data/vec.npy', vec)
    np.save('./data/train_x.npy', _to_ndarray(train_x))
    np.save('./data/train_y.npy', np.array(train_y))
    np.save('./data/testall_x.npy', _to_ndarray(test_x))
    np.save('./data/testall_y.npy', np.array(test_y))
def _split_bags(bags):
    """Split bags of ``[word, pos1, pos2]`` triples into three parallel nested lists."""
    words, pos1, pos2 = [], [], []
    for bag in bags:
        words.append([[token[0] for token in sentence] for sentence in bag])
        pos1.append([[token[1] for token in sentence] for sentence in bag])
        pos2.append([[token[2] for token in sentence] for sentence in bag])
    return words, pos1, pos2


def _pack_bags(items):
    """Return ``items`` as an ndarray; ragged input becomes a 1-D object array.

    Recent NumPy releases raise ValueError for ragged nested lists instead of
    silently creating an object array.
    """
    try:
        return np.array(items)
    except ValueError:
        packed = np.empty(len(items), dtype=object)
        for idx, item in enumerate(items):
            packed[idx] = item
        return packed


def seperate():
    """Split the packed train/test inputs into word, pos1 and pos2 files.

    Reads ``./data/train_x.npy`` and ``./data/testall_x.npy`` and writes
    ``./data/{train,testall}_{word,pos1,pos2}.npy``.  Each output keeps the
    bag -> sentence -> token nesting of its input.
    """
    print('reading training data')
    # Bags of different sizes are stored as pickled object arrays, which
    # np.load refuses by default; these files are produced locally by init().
    x_train = np.load('./data/train_x.npy', allow_pickle=True)

    print('seprating train data')
    train_word, train_pos1, train_pos2 = _split_bags(x_train)
    np.save('./data/train_word.npy', _pack_bags(train_word))
    np.save('./data/train_pos1.npy', _pack_bags(train_pos1))
    np.save('./data/train_pos2.npy', _pack_bags(train_pos2))

    print('seperating test all data')
    x_test = np.load('./data/testall_x.npy', allow_pickle=True)
    test_word, test_pos1, test_pos2 = _split_bags(x_test)
    np.save('./data/testall_word.npy', _pack_bags(test_word))
    np.save('./data/testall_pos1.npy', _pack_bags(test_pos1))
    np.save('./data/testall_pos2.npy', _pack_bags(test_pos2))
# get answer metric for PR curve evaluation
def getans():
    """Flatten the test labels (minus column 0) into ``./data/allans.npy``.

    Column 0 of every label row is dropped before flattening
    (presumably the 'NA' class -- confirm against the relation id file).
    """
    labels = np.load('./data/testall_y.npy')
    trimmed = [row[1:] for row in labels]
    flattened = np.reshape(trimmed, (-1))
    np.save('./data/allans.npy', flattened)
def get_metadata():
    """Write the token of every embedding row to ``./data/metadata.tsv``.

    Reads ``./origin_data/token_vec_100.txt``, skips its header line and
    writes the first whitespace-separated field of each remaining line, one
    per output line.  Blank lines are skipped rather than treated as end of
    file, so a stray empty line no longer truncates the metadata.
    """
    with open('./origin_data/token_vec_100.txt', encoding='utf-8') as source, \
            open('./data/metadata.tsv', 'w', encoding='utf-8') as target:
        source.readline()  # header line, not a token
        for line in source:
            fields = line.split()
            if not fields:
                continue
            target.write(fields[0] + '\n')
if __name__ == "__main__":
    # Run the preprocessing steps in dependency order only when executed as a
    # script, so importing this module has no side effects: seperate() and
    # getans() read the .npy files that init() writes.
    init()
    seperate()
    getans()
    get_metadata()
模型训练与评估:train_GRU.py
最后一段代码,涉及到保存自己训练的模型。原文件夹提供了作者训练好的模型,其中的向量维度为12。
而我定义的向量维度并非12,使用原模型运行,报错“Assign requires shapes of both tensors to match”(如下图),即输入的向量维度与模型中的不同。
因此,删除原模型,重新训练自己的模型并保存。
![](https://img-blog.csdnimg.cn/20210328103841695.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L1lXUF8yMDE2,size_16,color_FFFFFF,t_70)
train_GRU.py完整代码:
含自定义迭代次数:
for one_epoch in range(1000):
import tensorflow as tf
import numpy as np
import time
import datetime
import os
import network
from tensorflow.contrib.tensorboard.plugins import projector
# Command-line flags; `summary_dir` is the base directory under which main()
# creates the 'train_loss' summary writer.
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('summary_dir', '.', 'path to store summary')
def main(_):
    """Train ``network.GRU`` on the preprocessed bags and save a checkpoint.

    Reads ``./data/vec.npy`` and ``./data/train_{y,word,pos1,pos2}.npy``,
    optimises ``m.final_loss`` with Adam (learning rate 0.0005) for 1000
    passes over the shuffled training bags and saves the checkpoint
    ``./model/ATT_GRU_model``.  Merged summaries are written to
    ``<FLAGS.summary_dir>/train_loss``.

    Args:
        _: Unused positional argument supplied by ``tf.app.run``.
    """
    # the path to save models
    save_path = './model/'
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(save_path, exist_ok=True)

    print('reading wordembedding')
    wordembedding = np.load('./data/vec.npy')

    print('reading training data')
    train_y = np.load('./data/train_y.npy')
    # Bags hold different numbers of sentences, so these files may contain
    # pickled object arrays, which np.load rejects unless allow_pickle is set.
    train_word = np.load('./data/train_word.npy', allow_pickle=True)
    train_pos1 = np.load('./data/train_pos1.npy', allow_pickle=True)
    train_pos2 = np.load('./data/train_pos2.npy', allow_pickle=True)

    settings = network.Settings()
    settings.vocab_size = len(wordembedding)
    print("train_y[0]", len(train_y[0]))
    settings.num_classes = len(train_y[0])

    with tf.Graph().as_default():
        # The context manager installs the session as default and closes it on exit.
        with tf.Session() as sess:
            initializer = tf.contrib.layers.xavier_initializer()
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                m = network.GRU(is_training=True, word_embeddings=wordembedding, settings=settings)
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(0.0005)

            train_op = optimizer.minimize(m.final_loss, global_step=global_step)
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver(max_to_keep=None)

            merged_summary = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train_loss', sess.graph)

            def train_step(word_batch, pos1_batch, pos2_batch, y_batch, big_num):
                """Run one optimisation step on a batch of ``big_num`` bags."""
                # total_shape[i] is the offset of bag i's first sentence in the
                # flattened sentence lists; the final entry is the total count.
                total_shape = []
                total_num = 0
                total_word = []
                total_pos1 = []
                total_pos2 = []
                print("len(word_batch)", len(word_batch))
                for bag_word, bag_pos1, bag_pos2 in zip(word_batch, pos1_batch, pos2_batch):
                    total_shape.append(total_num)
                    total_num += len(bag_word)
                    total_word.extend(bag_word)
                    total_pos1.extend(bag_pos1)
                    total_pos2.extend(bag_pos2)
                total_shape.append(total_num)

                feed_dict = {
                    m.total_shape: np.array(total_shape),
                    m.input_word: np.array(total_word),
                    m.input_pos1: np.array(total_pos1),
                    m.input_pos2: np.array(total_pos2),
                    m.input_y: y_batch,
                }

                temp, step, loss, accuracy, summary, l2_loss, final_loss = sess.run(
                    [train_op, global_step, m.total_loss, m.accuracy, merged_summary, m.l2_loss, m.final_loss],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                accuracy = np.reshape(np.array(accuracy), (big_num))
                acc = np.mean(accuracy)
                summary_writer.add_summary(summary, step)

                tempstr = "{}: step {}, softmax_loss {:g}, acc {:g}".format(time_str, step, loss, acc)
                print(tempstr)

            print("settings.num_epochs", settings.num_epochs)
            # The number of passes is fixed at 1000 here; settings.num_epochs
            # is only printed above.
            for one_epoch in range(1000):
                temp_order = list(range(len(train_word)))
                np.random.shuffle(temp_order)
                # Only complete batches are used; a trailing partial batch is dropped.
                for i in range(len(temp_order) // settings.big_num):
                    temp_input = temp_order[i * settings.big_num:(i + 1) * settings.big_num]
                    # Kept as plain lists: bags differ in size and train_step
                    # only iterates over them.
                    temp_word = [train_word[k] for k in temp_input]
                    temp_pos1 = [train_pos1[k] for k in temp_input]
                    temp_pos2 = [train_pos2[k] for k in temp_input]
                    temp_y = np.array([train_y[k] for k in temp_input])

                    # Skip batches with more than 1500 sentences in total.
                    num = sum(len(single_word) for single_word in temp_word)
                    if num > 1500:
                        print('out of range')
                        continue

                    train_step(temp_word, temp_pos1, temp_pos2, temp_y, settings.big_num)
                    current_step = tf.train.global_step(sess, global_step)
                    print("current_step", current_step)

                # NOTE(review): the published listing lost its indentation, so
                # the original save frequency is ambiguous; saving once per
                # epoch overwrites the same checkpoint and leaves the final
                # saved weights unchanged. Confirm the intended frequency.
                print('saving model')
                path = saver.save(sess, save_path + 'ATT_GRU_model')
                tempstr = 'have saved model to ' + path
                print(tempstr)
if __name__ == "__main__":
    # Hand control to TensorFlow's launcher, which parses flags and calls main().
    tf.app.run(main=main)
模型使用:test_GRU.py
与源代码不同,修改后的test_GRU.py代码中包含数据的读写。
如下,逐行读取文件内容并处理(分割两个实体词与关系词),将处理后的内容写入另一个文件。
# Excerpt from main() in test_GRU.py (loop indentation was lost in this listing).
# Each input line is split on whitespace into entity 1, entity 2 and the sentence.
infile = open(
'D:\\Asian elephant\\biye\\Spatial relation extraction\\Information-Extraction-Chinese-master\\RE_BGRU_2ATT\\origin_data\\ywp_use2.txt',
encoding='utf-8')
for orgline in infile:
print(orgline)
en1, en2, sentence = orgline.split()
# The result file is opened in append mode and the entity pair is written as a
# header line ahead of the predictions.
ywpoutfile = open(
'D:\\Asian elephant\\biye\\Spatial relation extraction\\Information-Extraction-Chinese-master\\RE_BGRU_2ATT\\origin_data\\ywp_use_result2.txt',
'a')
ywpoutfile.write('\n' + en1 + ' ' + en2 + '\n')
如下,将预测后的实体关系写入文件。
# Excerpt from main() in test_GRU.py (loop indentation was lost in this listing).
prob, accuracy = test_step(test_word, test_pos1, test_pos2, test_y)
# Reshape to one row of num_classes values and take that row.
prob = np.reshape(np.array(prob), (1, test_settings.num_classes))[0]
# Indices of the three largest probabilities, highest first.
top3_id = prob.argsort()[-3:][::-1]
for n, rel_id in enumerate(top3_id):
ywpoutfile.write("No." + str(n + 1) + ": " + id2relation[rel_id] + ", Probability is " + str(
prob[rel_id]) + '\n')
test_GRU.py完整代码:
from pprint import pprint
import tensorflow as tf
import numpy as np
import time
import datetime
import os
import network
from sklearn.metrics import average_precision_score
# Handle to the TensorFlow command-line flags (not referenced by the code visible in this script).
FLAGS = tf.app.flags.FLAGS
import warnings
# Suppress all warnings for the rest of the run.
warnings.filterwarnings(action='ignore')
# embedding the position
def pos_embed(x, maxlen=60):
    """Map a signed relative offset to a non-negative position id.

    Args:
        x: Offset of a token relative to an entity start (token index minus
            entity index).
        maxlen: Largest absolute offset that gets its own id.  The default
            of 60 reproduces the original hard-coded window of -60..+60.

    Returns:
        0 when ``x < -maxlen``, ``2 * maxlen + 2`` when ``x > maxlen``, and
        ``x + maxlen + 1`` otherwise (1..121 for the default window).
    """
    if x < -maxlen:
        return 0
    if x > maxlen:
        return 2 * maxlen + 2
    return x + maxlen + 1
def main_for_evaluation():
    """Score the full test set with a saved checkpoint and print the PR-curve area.

    Reads ``./data/vec.npy``, ``./data/testall_{y,word,pos1,pos2}.npy`` and
    ``./data/allans.npy``.  For every iteration in ``testlist`` it restores
    the checkpoint, collects the per-class probabilities (column 0 dropped),
    saves them to ``./out/allprob_iter_<iter>.npy`` and prints the average
    precision against ``allans``.
    """
    pathname = "./model/ATT_GRU_model"

    wordembedding = np.load('./data/vec.npy')

    # The test tensors do not depend on the checkpoint, so load them once.
    # Bags differ in size, so the word/pos files may be pickled object arrays.
    test_y = np.load('./data/testall_y.npy')
    test_word = np.load('./data/testall_word.npy', allow_pickle=True)
    test_pos1 = np.load('./data/testall_pos1.npy', allow_pickle=True)
    test_pos2 = np.load('./data/testall_pos2.npy', allow_pickle=True)
    allans = np.load('./data/allans.npy')

    test_settings = network.Settings()
    # Derived from the data rather than hard-coded, mirroring the training script.
    test_settings.vocab_size = len(wordembedding)
    test_settings.num_classes = len(test_y[0])
    # NOTE(review): 5561 is presumably the number of test bags in the author's
    # data set; only complete batches of this size are evaluated, so adjust it
    # to your own test set.
    test_settings.big_num = 5561
    big_num = test_settings.big_num

    # np.save does not create missing directories.
    os.makedirs('./out', exist_ok=True)

    with tf.Graph().as_default():
        # The context manager installs the session as default and closes it on exit.
        with tf.Session() as sess:

            def test_step(word_batch, pos1_batch, pos2_batch, y_batch):
                """Run the model on one batch of bags; return ``(prob, accuracy)``."""
                # total_shape[i] is the offset of bag i's first sentence in the
                # flattened sentence lists; the final entry is the total count.
                total_shape = []
                total_num = 0
                total_word = []
                total_pos1 = []
                total_pos2 = []
                for bag_word, bag_pos1, bag_pos2 in zip(word_batch, pos1_batch, pos2_batch):
                    total_shape.append(total_num)
                    total_num += len(bag_word)
                    total_word.extend(bag_word)
                    total_pos1.extend(bag_pos1)
                    total_pos2.extend(bag_pos2)
                total_shape.append(total_num)

                feed_dict = {
                    mtest.total_shape: np.array(total_shape),
                    mtest.input_word: np.array(total_word),
                    mtest.input_pos1: np.array(total_pos1),
                    mtest.input_pos2: np.array(total_pos2),
                    mtest.input_y: y_batch,
                }
                loss, accuracy, prob = sess.run(
                    [mtest.loss, mtest.accuracy, mtest.prob], feed_dict)
                return prob, accuracy

            with tf.variable_scope("model"):
                mtest = network.GRU(is_training=False, word_embeddings=wordembedding, settings=test_settings)

            names_to_vars = {v.op.name: v for v in tf.global_variables()}
            saver = tf.train.Saver(names_to_vars)

            # testlist = range(1000, 1800, 100)
            testlist = [9000]

            for model_iter in testlist:
                # NOTE(review): this restores "<pathname><iter>", e.g.
                # ./model/ATT_GRU_model9000 -- confirm that a checkpoint with
                # that exact name exists.
                saver.restore(sess, pathname + str(model_iter))

                time_str = datetime.datetime.now().isoformat()
                print(time_str)
                print('Evaluating all test data and save data for PR curve')

                allprob = []
                for i in range(len(test_word) // big_num):
                    batch = slice(i * big_num, (i + 1) * big_num)
                    prob, _ = test_step(test_word[batch], test_pos1[batch],
                                        test_pos2[batch], test_y[batch])
                    prob = np.reshape(np.array(prob), (big_num, test_settings.num_classes))
                    # Column 0 is dropped, matching how allans.npy is built.
                    allprob.extend(single_prob[1:] for single_prob in prob)
                allprob = np.reshape(np.array(allprob), (-1))

                print('saving all test result...')
                np.save('./out/allprob_iter_' + str(model_iter) + '.npy', allprob)

                # caculate the pr curve area
                average_precision = average_precision_score(allans, allprob)
                print('PR curve area:' + str(average_precision))
def main(_):
    """Predict the three most probable relations for each line of the input file.

    Every non-blank input line must hold exactly three whitespace-separated
    fields: entity 1, entity 2 and the sentence.  For each line the entity
    pair and the top-3 relations with their probabilities are appended to the
    result file.

    The vocabulary and relation table are read from ``./origin_data`` (the
    same relative location the preprocessing script uses) instead of a
    machine-specific absolute path, and the model dimensions are derived from
    them rather than hard-coded.

    Args:
        _: Unused positional argument supplied by ``tf.app.run``.

    Raises:
        ValueError: if a non-blank input line does not have exactly three fields.
    """
    # If you retrain the model, please remember to change the path to your own model below:
    pathname = "./model/ATT_GRU_model"

    wordembedding = np.load('./data/vec.npy')

    print('reading word embedding data...')
    # Only the token -> id mapping is needed here; the vectors come from vec.npy.
    word2id = {}
    with open('./origin_data/token_vec_100.txt', encoding='utf-8') as f:
        f.readline()  # header line, not a token
        for line in f:
            fields = line.strip().split()
            if not fields:
                continue
            word2id[fields[0]] = len(word2id)
    word2id['UNK'] = len(word2id)
    word2id['BLANK'] = len(word2id)

    print('reading relation to id')
    relation2id = {}
    id2relation = {}
    with open('./origin_data/ywp_relation2id.txt', 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                continue
            relation2id[fields[0]] = int(fields[1])
            id2relation[int(fields[1])] = fields[0]

    test_settings = network.Settings()
    test_settings.vocab_size = len(wordembedding)
    # The label vector fed below has one slot per relation, so the class count
    # must equal the size of the relation table.
    test_settings.num_classes = len(relation2id)
    test_settings.big_num = 1

    # length of sentence is 70
    fixlen = 70

    with tf.Graph().as_default():
        # The context manager installs the session as default and closes it on exit.
        with tf.Session() as sess:

            def test_step(word_batch, pos1_batch, pos2_batch, y_batch):
                """Run the model on one batch of bags; return ``(prob, accuracy)``."""
                # total_shape[i] is the offset of bag i's first sentence in the
                # flattened sentence lists; the final entry is the total count.
                total_shape = []
                total_num = 0
                total_word = []
                total_pos1 = []
                total_pos2 = []
                for bag_word, bag_pos1, bag_pos2 in zip(word_batch, pos1_batch, pos2_batch):
                    total_shape.append(total_num)
                    total_num += len(bag_word)
                    total_word.extend(bag_word)
                    total_pos1.extend(bag_pos1)
                    total_pos2.extend(bag_pos2)
                total_shape.append(total_num)

                feed_dict = {
                    mtest.total_shape: np.array(total_shape),
                    mtest.input_word: np.array(total_word),
                    mtest.input_pos1: np.array(total_pos1),
                    mtest.input_pos2: np.array(total_pos2),
                    mtest.input_y: y_batch,
                }
                loss, accuracy, prob = sess.run(
                    [mtest.loss, mtest.accuracy, mtest.prob], feed_dict)
                return prob, accuracy

            with tf.variable_scope("model"):
                mtest = network.GRU(is_training=False, word_embeddings=wordembedding, settings=test_settings)

            names_to_vars = {v.op.name: v for v in tf.global_variables()}
            saver = tf.train.Saver(names_to_vars)
            saver.restore(sess, pathname)

            unk = word2id['UNK']
            blank = word2id['BLANK']

            # Open the result file once and close both files deterministically.
            # NOTE(review): the result file uses the platform default encoding,
            # as before; consider encoding='utf-8' if relation names fail to encode.
            with open('…….txt', encoding='utf-8') as infile, \
                    open('…….txt', 'a') as ywpoutfile:
                for orgline in infile:
                    print(orgline)
                    fields = orgline.split()
                    if not fields:
                        continue
                    en1, en2, sentence = fields
                    ywpoutfile.write('\n' + en1 + ' ' + en2 + '\n')

                    # An entity missing from the sentence is treated as position 0.
                    en1pos = max(sentence.find(en1), 0)
                    en2pos = max(sentence.find(en2), 0)

                    # Encoding test x: one bag containing this single sentence.
                    words = []
                    rel_e1 = []
                    rel_e2 = []
                    for i in range(fixlen):
                        if i < len(sentence):
                            words.append(word2id.get(sentence[i], unk))
                        else:
                            words.append(blank)
                        rel_e1.append(pos_embed(i - en1pos))
                        rel_e2.append(pos_embed(i - en2pos))
                    test_word = np.array([[words]])
                    test_pos1 = np.array([[rel_e1]])
                    test_pos2 = np.array([[rel_e2]])

                    # Encoding test y: a placeholder label with class 0 set.
                    label = [0] * len(relation2id)
                    label[0] = 1
                    test_y = np.array([label])

                    prob, accuracy = test_step(test_word, test_pos1, test_pos2, test_y)
                    prob = np.reshape(np.array(prob), (1, test_settings.num_classes))[0]
                    # Indices of the three largest probabilities, highest first.
                    top3_id = prob.argsort()[-3:][::-1]
                    for n, rel_id in enumerate(top3_id):
                        ywpoutfile.write("No." + str(n + 1) + ": " + id2relation[rel_id] + ", Probability is " + str(
                            prob[rel_id]) + '\n')
if __name__ == "__main__":
    # Hand control to TensorFlow's launcher, which parses flags and calls main().
    tf.app.run(main=main)