1. Task Introduction
In knowledge graph or entity extraction projects, you often run into mentions that share the same surface name but refer to different entities, as in the table below:
Sentence 1 | Sentence 2 | Entity | Label |
---|---|---|---|
I really like the TV series *The Myth* (《神话》) that Hu Ge starred in. | *The Myth* (《神话》) sung by Han Hong and Sun Nan is a beautiful song. | 神话 | 0 |
I really like the movies that Andy Lau (刘德华) stars in. | Andy Lau (刘德华) is an idol for those of us born in the 90s. | 刘德华 | 1 |
Entity linking is the task of deciding whether mentions such as "神话" or "刘德华" refer to the same entity in different contexts: label 1 means the two mentions denote the same entity, label 0 means they do not.
2. Data Processing
2.1 Reading the Data
```python
import json
import random
import re

def get_index(text, entity):
    """Return the inclusive [start, end] character span of the first
    occurrence of entity in text, or [0, 0] if it is absent."""
    for i in range(len(text)):
        if text[i:i + len(entity)] == entity:
            return [i, i + len(entity) - 1]
    return [0, 0]
```
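For example, the returned span is inclusive at both ends:

```python
print(get_index('我特别喜欢刘德华演的电影。', '刘德华'))  # -> [5, 7]
print(get_index('没有出现', '刘德华'))                    # -> [0, 0], i.e. "not found"
```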
```python
def read_file(path):
    """
    :param path: path to the data file (JSON)
    :return: samples shaped like [text1, text2, [start1, end1], [start2, end2], label]
    """
    data1 = json.load(open(path, 'r', encoding='utf-8'))
    all_data = []
    for entity, item in data1.items():
        # item maps each sense of the entity name to a list of sentences.
        sents = list(item.values())
        # Negative pair (label 0): one sentence from each of two different senses.
        sent1 = re.sub(' ', '', random.choice(sents[0]))
        sent2 = re.sub(' ', '', random.choice(sents[1]))
        # Keep the pair only if the entity was actually found in both sentences.
        if get_index(sent1, entity)[1] and get_index(sent2, entity)[1]:
            f_ = [sent1, sent2, get_index(sent1, entity), get_index(sent2, entity), 0]
            all_data.append(f_)
        # Positive pairs (label 1): two sentences from the same sense.
        for ss in sents:
            if len(ss) > 1:
                s1 = re.sub(' ', '', ss[0])
                s2 = re.sub(' ', '', ss[1])
                if get_index(s1, entity)[1] and get_index(s2, entity)[1]:
                    z_ = [s1, s2, get_index(s1, entity), get_index(s2, entity), 1]
                    all_data.append(z_)
    return all_data
```
Each element of `all_data` is thus `[sentence1, sentence2, entity1 span, entity2 span, label]`.
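`read_file` implies a particular input layout that the post never shows: each entity name maps to one sentence list per sense of that name. A minimal sketch of that assumed format (hypothetical example, not from the original):

```python
# Assumed layout of data/data1.txt:
# entity name -> {sense id -> sentences mentioning the entity in that sense}
{
    "神话": {
        "tv_series": ["我特别喜欢胡歌演的《神话》这个电视剧。"],
        "song": ["韩红和孙楠演唱的《神话》特别好听。", "《神话》这首歌我听了很多遍。"]
    }
}
```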
2.2 Data Generator
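The generator below relies on several globals that the original code never defines (`token`, `max_len`, and later `num_words` and `batch_size`). A minimal setup, assuming a character-level tokenizer and the sequence length of 64 used by the model's input layers:

```python
import random
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_len = 64       # assumed: matches the Input(shape=(64,)) layers below
num_words = 5000   # assumed vocabulary cap for the embedding layer
batch_size = 64    # assumed: matches the generator's default
token = Tokenizer(num_words=num_words, char_level=True)
```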
```python
class data_generator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            random.shuffle(idxs)
            T1, T2, T1_index, T2_index, L = [], [], [], [], []
            for i in idxs:
                data = self.data[i]
                text1 = data[0]
                text2 = data[1]
                t1_index = data[2]
                t2_index = data[3]
                label = [data[4]]
                # Extend the tokenizer's vocabulary with any unseen characters.
                # (Fitting once over the whole corpus before training would be
                # the more usual approach.)
                token.fit_on_texts(text1)
                token.fit_on_texts(text2)
                # texts_to_sequences on a string returns one id list per
                # character; np.ravel flattens them into a single id sequence.
                text1_ids = np.ravel(token.texts_to_sequences(text1))
                text2_ids = np.ravel(token.texts_to_sequences(text2))
                T1.append(text1_ids)
                T2.append(text2_ids)
                T1_index.append(t1_index)
                T2_index.append(t2_index)
                L.append(label)
                if len(T1) == self.batch_size or i == idxs[-1]:
                    T1 = np.array(pad_sequences(T1, maxlen=max_len, padding='post'))
                    T2 = np.array(pad_sequences(T2, maxlen=max_len, padding='post'))
                    T1_index = np.array(T1_index)
                    T2_index = np.array(T2_index)
                    L = np.array(L)
                    # Yields ([sentence1, sentence2, entity1 span, entity2 span], labels)
                    yield [T1, T2, T1_index, T2_index], L
                    T1, T2, T1_index, T2_index, L = [], [], [], [], []
```
3. Building the Model
The model below is built with plain Keras layers; a pretrained encoder such as BERT could be substituted for the embedding.
```python
import keras
from keras import layers
import numpy as np
import tensorflow.compat.v1 as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
```
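The two `Lambda` layers below call `extrac_entity`, which the original post never defines. A minimal sketch of one plausible implementation, assuming the entity vector is the average of the per-token representations at the span's start and end positions:

```python
def extrac_entity(args):
    # x:   (batch, seq_len, dim) per-token representations
    # ids: (batch, 2) inclusive [start, end] span of the entity
    x, ids = args
    ids = tf.cast(ids, tf.int32)
    batch_idx = tf.range(tf.shape(x)[0])
    # Pick out the token vector at the start and at the end of the span.
    start = tf.gather_nd(x, tf.stack([batch_idx, ids[:, 0]], axis=1))
    end = tf.gather_nd(x, tf.stack([batch_idx, ids[:, 1]], axis=1))
    # Average the two boundary vectors into one fixed-size entity vector.
    return (start + end) / 2
```

Averaging the boundary vectors keeps the output shape fixed regardless of entity length; mean-pooling over the whole span would be an alternative.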
```python
# Inputs: the two token-id sequences
input1 = layers.Input(shape=(max_len,), name='sentence_1')
input2 = layers.Input(shape=(max_len,), name='sentence_2')
# Shared embedding for both sentences
share_embedding = layers.Embedding(num_words, 256, input_length=max_len)
x1 = share_embedding(input1)
x2 = share_embedding(input2)
# Per-token projection
x1 = layers.Dense(32)(x1)
x2 = layers.Dense(32)(x2)
# # Optional recurrent encoder (return_sequences=True keeps the per-token
# # outputs that extrac_entity needs):
# x1 = layers.LSTM(64, return_sequences=True)(x1)
# x2 = layers.LSTM(64, return_sequences=True)(x2)
# Extract entity 1
input3 = layers.Input(shape=(2,), name='entity1_ids')
x1 = layers.Lambda(extrac_entity)([x1, input3])
# Extract entity 2
input4 = layers.Input(shape=(2,), name='entity2_ids')
x2 = layers.Lambda(extrac_entity)([x2, input4])
# Compare the two entity vectors and classify
x = layers.concatenate([x1, x2])
x = layers.Dense(16)(x)
output = layers.Dense(units=2, activation='softmax', name='softmax')(x)

model = keras.models.Model([input1, input2, input3, input4], output)
model.compile(
    loss=keras.losses.sparse_categorical_crossentropy,
    optimizer=keras.optimizers.Adam(2e-5),
    metrics=['accuracy']
)
model.summary()
# keras.utils.plot_model(model, 'pic/4.png', show_shapes=True)
```
*(Model architecture diagram)*
4. Training
```python
all_data = read_file('data/data1.txt')
train_data = all_data  # no train/validation split in the original
train_data_generator = data_generator(train_data, batch_size=batch_size)

model.fit_generator(
    train_data_generator.__iter__(),
    steps_per_epoch=len(train_data_generator),
    epochs=30
)
model.save_weights('data/best_model.weights')
```
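To query the trained model on a new sentence pair, the inputs have to be encoded the same way the generator encodes them. A minimal sketch (hypothetical helper, not from the original post):

```python
def predict_pair(s1, s2, entity):
    # Encode both sentences character by character with the training tokenizer.
    ids1 = pad_sequences([np.ravel(token.texts_to_sequences(s1))],
                         maxlen=max_len, padding='post')
    ids2 = pad_sequences([np.ravel(token.texts_to_sequences(s2))],
                         maxlen=max_len, padding='post')
    span1 = np.array([get_index(s1, entity)])
    span2 = np.array([get_index(s2, entity)])
    probs = model.predict([ids1, ids2, span1, span2])[0]
    return int(np.argmax(probs))  # 1: same entity, 0: different entities

print(predict_pair('我特别喜欢刘德华演的电影。', '刘德华是我们90后的偶像。', '刘德华'))
```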