Entity Alignment / Entity Linking Model (Keras)

1. Task Introduction

In knowledge-graph or entity-extraction projects, you often run into mentions that share the same surface name but refer to different entities, as in the table below:

| Sentence 1 | Sentence 2 | Entity | Label |
| --- | --- | --- | --- |
| I really like the TV drama 《神话》 that 胡歌 (Hu Ge) starred in. | 《神话》 sung by 韩红 (Han Hong) and 孙楠 (Sun Nan) sounds wonderful. | 神话 | 0 |
| I really like the movies 刘德华 (Andy Lau) acts in. | 刘德华 (Andy Lau) is an idol for us post-90s. | 刘德华 | 1 |

Entity linking is the task of deciding whether a mention such as “神话” or “刘德华” refers to the same entity in two different contexts (label 1 = same entity, label 0 = different entities).

2. Data Processing

2.1 Reading the Data

import json
import random
import re

def get_index(text, entity):
    # Inclusive [start, end] character span of entity in text, or None if absent.
    i = text.find(entity)
    if i == -1:
        return None
    return [i, i + len(entity) - 1]


def read_file(path):
    '''
    :param path: path to the JSON data file
    :return: [[sent1, sent2, [start1, end1], [start2, end2], label], ...]
    '''
    data1 = json.load(open(path, 'r', encoding='utf-8'))
    all_data = []
    for entity, item in data1.items():
        # Each value groups the entity's sentences by sense/meaning;
        # this assumes every entity has at least two sense groups.
        sents = list(item.values())
        # Negative pair (label 0): one sentence from each of two different senses.
        sent1 = re.sub(' ', '', random.choice(sents[0]))
        sent2 = re.sub(' ', '', random.choice(sents[1]))
        idx1, idx2 = get_index(sent1, entity), get_index(sent2, entity)
        if idx1 and idx2:
            all_data.append([sent1, sent2, idx1, idx2, 0])
        # Positive pairs (label 1): two sentences from the same sense.
        for ss in sents:
            if len(ss) > 1:
                s1 = re.sub(' ', '', ss[0])
                s2 = re.sub(' ', '', ss[1])
                i1, i2 = get_index(s1, entity), get_index(s2, entity)
                if i1 and i2:
                    all_data.append([s1, s2, i1, i2, 1])
    return all_data

all_data = [[sentence 1, sentence 2, entity-1 span, entity-2 span, label], …]
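The original post does not show the layout of data/data1.txt; the code in read_file implies a JSON object mapping each entity to its sense groups, each group being a list of sentences. A sketch of that assumed format, with a quick span check:

# Hypothetical input illustrating the format read_file expects
# (an assumption inferred from the code): entity -> {sense name: [sentences]}.
sample = {
    "神话": {
        "电视剧": ["我特别喜欢胡歌演的《神话》这个电视剧。", "《神话》这部剧很经典。"],
        "歌曲": ["韩红和孙楠演唱的《神话》特别好听。", "《神话》这首歌旋律很美。"],
    }
}
# Spans are inclusive character offsets:
print(get_index("韩红和孙楠演唱的《神话》特别好听。", "神话"))  # [9, 10]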

2.2 Data Generator

class data_generator:
    # Relies on module-level `token`, `max_len`, `np` and `pad_sequences`
    # (defined in section 3).
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while 1:
            idxs = list(range(len(self.data)))
            random.shuffle(idxs)
            T1, T2, T1_index, T2_index, L = [], [], [], [], []
            for i in idxs:
                text1, text2, t1_index, t2_index, label = self.data[i]
                # `token` must be fitted on the whole corpus beforehand; fitting it
                # here (as the original did) would mutate the vocabulary mid-training.
                # This also assumes every character is in the vocabulary and every
                # sentence fits in max_len, otherwise the entity spans would shift.
                text1_ids = token.texts_to_sequences([text1])[0]
                text2_ids = token.texts_to_sequences([text2])[0]
                T1.append(text1_ids)
                T2.append(text2_ids)
                T1_index.append(t1_index)
                T2_index.append(t2_index)
                L.append([label])
                if len(T1) == self.batch_size or i == idxs[-1]:
                    T1 = np.array(pad_sequences(T1, maxlen=max_len, padding='post'))
                    T2 = np.array(pad_sequences(T2, maxlen=max_len, padding='post'))
                    T1_index = np.array(T1_index)
                    T2_index = np.array(T2_index)
                    L = np.array(L)
                    # yields [sentence 1, sentence 2, entity-1 span, entity-2 span], label
                    yield [T1, T2, T1_index, T2_index], L
                    T1, T2, T1_index, T2_index, L = [], [], [], [], []
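A quick smoke test of the generator (assuming all_data from section 2.1 and a tokenizer already fitted as in section 4):

gen = data_generator(all_data, batch_size=4)
[T1, T2, T1_idx, T2_idx], labels = next(iter(gen))
print(T1.shape, T1_idx.shape, labels.shape)  # (4, max_len) (4, 2) (4, 1)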

3. Building the Model

The model below is built with plain Keras layers; a BERT encoder could be used instead.

import keras
from keras import layers
import numpy as np
import tensorflow.compat.v1 as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Hyperparameters and the shared tokenizer (concrete values are assumptions;
# the original post does not show them).
max_len = 64        # must match the Input shapes below
num_words = 6000    # embedding vocabulary size; the tokenizer caps indices to it
batch_size = 64
token = Tokenizer(num_words=num_words, char_level=True)

# Inputs: the two sentences as padded character-id sequences
input1 = layers.Input(shape=(max_len,), name='sentence_1')
input2 = layers.Input(shape=(max_len,), name='sentence_2')

# Shared embedding for both sentences
share_embedding = layers.Embedding(num_words, 256, input_length=max_len)
x1 = share_embedding(input1)
x2 = share_embedding(input2)

# Per-token dense projection
x1 = layers.Dense(32)(x1)
x2 = layers.Dense(32)(x2)

# # Optional: a recurrent encoder instead of the plain projection
# # (return_sequences=True keeps per-token vectors for span extraction)
# x1 = layers.LSTM(64, return_sequences=True)(x1)
# x2 = layers.LSTM(64, return_sequences=True)(x2)
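# `extrac_entity` is not defined in the original post. The sketch below is an
# assumed implementation: mean-pool the token vectors inside the inclusive
# [start, end] span given by the entity-index input.
def extrac_entity(args):
    x, ids = args                                  # x: (batch, max_len, dim); ids: (batch, 2)
    ids = tf.cast(ids, tf.int32)
    positions = tf.range(tf.shape(x)[1])[None, :]  # (1, max_len)
    mask = tf.logical_and(positions >= ids[:, :1], positions <= ids[:, 1:2])
    mask = tf.cast(mask, x.dtype)[:, :, None]      # (batch, max_len, 1), 1 inside the span
    span_sum = tf.reduce_sum(x * mask, axis=1)
    span_len = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
    return span_sum / span_len                     # (batch, dim)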

# Extract entity 1's span representation
input3 = layers.Input(shape=(2,), name='entity1_ids')
x1 = layers.Lambda(extrac_entity)([x1, input3])
# Extract entity 2's span representation
input4 = layers.Input(shape=(2,), name='entity2_ids')
x2 = layers.Lambda(extrac_entity)([x2, input4])

# Combine the two span vectors and classify same (1) vs. different (0)
x = layers.concatenate([x1, x2])
x = layers.Dense(16)(x)
output = layers.Dense(units=2, activation='softmax', name='softmax')(x)

model = keras.models.Model([input1, input2, input3, input4], output)
model.compile(
    loss=keras.losses.sparse_categorical_crossentropy,
    optimizer=keras.optimizers.Adam(2e-5),
    metrics=['accuracy']
)
model.summary()
# keras.utils.plot_model(model, 'pic/4.png', show_shapes=True)

Model diagram: (image omitted)

4. Training

all_data = read_file('data/data1.txt')
# Fit the character tokenizer once on the full corpus before training
# (the original fitted it inside the generator, mutating the vocabulary mid-run).
token.fit_on_texts([d[0] for d in all_data] + [d[1] for d in all_data])
train_data = all_data
train_data_generator = data_generator(train_data, batch_size=batch_size)
model.fit_generator(
    train_data_generator.__iter__(),
    steps_per_epoch=len(train_data_generator),
    epochs=30
)
model.save_weights('data/best_model.weights')
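At inference time a new pair is encoded exactly as the generator does. A minimal sketch; the helper predict_pair is hypothetical, not part of the original post:

def predict_pair(sent1, span1, sent2, span2):
    """Return [P(different), P(same)] for two entity mentions."""
    s1 = pad_sequences(token.texts_to_sequences([sent1]), maxlen=max_len, padding='post')
    s2 = pad_sequences(token.texts_to_sequences([sent2]), maxlen=max_len, padding='post')
    return model.predict([s1, s2, np.array([span1]), np.array([span2])])[0]

sent_a = '我特别喜欢胡歌演的《神话》这个电视剧。'
sent_b = '韩红和孙楠演唱的《神话》特别好听。'
probs = predict_pair(sent_a, get_index(sent_a, '神话'),
                     sent_b, get_index(sent_b, '神话'))
print(probs)  # index 1 is the probability that both mentions are the same entity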