A Database-Backed Knowledge Graph Question Answering System

CCKS 2020: COVID-19 Knowledge Graph Construction and Question Answering Evaluation, Task 4: COVID-19 Knowledge Graph Question Answering

Link: https://www.biendata.xyz/competition/ccks_2020_7_4/evaluation/


Initialize the random seed and import the required packages
import os
import re
import math
import torch
import random
import pickle
import numpy as np
import codecs as cs
import pandas as pd
import torch.nn as nn
import time
import datetime
from transformers import WEIGHTS_NAME

# 设置随机种子.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

PATH = './data/'
CSV_PATH = "./data/csv/"
PICKLE_PATH = './data/pickle/'
INPUT_PATH = './data/PKUBASE/pkubase-complete-2020/pkubase-complete.txt'


def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    # 返回 hh:mm:ss 形式的时间
    return str(datetime.timedelta(seconds=elapsed_rounded))


def flat_accuracy(preds, labels, attention):
    # `attention` is unused here; the parameter is kept so the signature matches the
    # token-level flat_accuracy used by the NER model further below.
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
Load the training and test sets, and add the extracted triples to the knowledge graph
def LoadCorpus(path):

    def writefile(text):
        corpus = {}
        kb = []
        for i in range(len(text)):
            # 对问题进行预处理
            question = text[i].split('\r\n')[0].split(':')[1]
            question = re.sub('我想知道', '', question)
            question = re.sub('你了解', '', question)
            question = re.sub('请问', '', question)

            answers = text[i].split('\n')[2].split('\t')
            sql = text[i].split('\n')[1]
            sql = re.findall('{.+}', sql)[0]
            elements = re.findall('<.+?>|\".+?\"|\?\D', sql) + re.findall('\".+?\"', sql)
            # items in `elements` wrapped in double quotes may appear twice and need de-duplication
            new_elements = []
            for e in elements:
                if e[0] == '\"':
                    if e not in new_elements:
                        new_elements.append(e)
                else:
                    new_elements.append(e)
            elements = new_elements
            gold_entitys = []
            gold_relations = []
            for j in range(len(elements)):
                if elements[j][0] == '<' or elements[j][0] == '\"':
                    if j % 3 == 1:
                        gold_relations.append(elements[j])
                    else:
                        gold_entitys.append(elements[j])

            for entity in gold_entitys:
                for relation in gold_relations:
                    for answer in answers:
                        kb.append(entity+"\t"+relation+"\t"+answer)

            gold_tuple = tuple(gold_entitys + gold_relations)
            dic = {}
            dic['question'] = question  # 问题字符串
            dic['answer'] = answers  # 问题的答案
            dic['gold_tuple'] = gold_tuple
            dic['gold_entitys'] = gold_entitys
            dic['gold_relations'] = gold_relations
            dic['sql'] = sql
            corpus[i] = dic

        return corpus,kb

    with cs.open(path, 'r', 'utf-8') as fp:
        train_text = fp.read().split('\r\n\r\n')[:-1]
        length = len(train_text)

        #分出训练集和测试集
        train_corpus_length = math.ceil(0.8 * length)

        train_corpus = train_text[0:train_corpus_length]
        test_corpus = train_text[train_corpus_length:]

        corpus,kb1 = writefile(train_corpus)
        pickle.dump(corpus, open(PICKLE_PATH+'corpus_train.pkl', 'wb'))

        corpus,kb2 = writefile(test_corpus)
        pickle.dump(corpus, open(PICKLE_PATH+'corpus_test.pkl', 'wb'))
        kb = kb1 + kb2

        pickle.dump(kb, open(PICKLE_PATH + 'NEW_KB.pkl', 'wb'))
        fp.close()

#LoadCorpus(PATH + 'task1-4_train_2020.txt')
#simlarity_sentences_examples()
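LoadCorpus assumes that each record in task1-4_train_2020.txt consists of three lines (the numbered question, the gold SPARQL query, and the tab-separated answers) separated by a blank line, roughly like the following made-up example:

q1:你知道复旦大学的现任校长是谁吗?
select ?x where { <复旦大学> <校长> ?x . }
<许宁生>

The bracketed items inside the braces are split into gold_entitys and gold_relations, and every (entity, relation, answer) combination is appended to the supplementary knowledge base dumped as NEW_KB.pkl.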
Load the test (validation) questions
def TEST_QUESTIONS(QUESTIONS_PATH):
    questions = []
    with open(QUESTIONS_PATH, encoding="utf-8") as f:
        try:
            while True:
                line = f.readline()

                if line:
                    question = line.split(":")[1][:-1]
                    questions.append(question)
                else:
                    break
        finally:
            f.close()

    # 保存csv文件
    link_data = pd.DataFrame(questions)
    link_data.to_csv(CSV_PATH + "task1-4_valid_2020.questions.csv", index=False, sep='\t')
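TEST_QUESTIONS assumes one question per line in the validation file, each prefixed with an id and a colon, for example (illustrative lines, not taken from the actual file):

q601:新冠肺炎的常见症状有哪些?
q602:磷酸氯喹是由哪家公司生产的?

Only the text after the first colon is kept; the trailing newline is stripped by the [:-1] slice.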
Scoring candidate answers with BERT next-sentence prediction
import pickle
import jieba
from LoadData import *
from enum import Enum
from transformers.data.processors.utils import DataProcessor, InputExample, InputFeatures
from torch.utils.data import TensorDataset, DataLoader
from transformers import (
    AdamW,
    BertTokenizer,
    BertForNextSentencePrediction,
    DataProcessor,
    get_linear_schedule_with_warmup,
    WEIGHTS_NAME,
    CONFIG_NAME
)

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForNextSentencePrediction.from_pretrained("bert-base-chinese")
model = model.to(device)
model.load_state_dict(torch.load('./model/simlarity/pytorch_model.bin'))  # resume from a previously fine-tuned checkpoint; skip this line on a first run
class Split(Enum):
    train = "train"
    dev = "dev"
    test = "test"
# 按照bert的格式输入数据
class NextSentenceProcessor(DataProcessor):
    """Processor for the CoLA data set (GLUE version)."""

    # 问题和答案做打分bert next sentence的训练数据
    def simlarity_sentences_examples(self,PATH):
        def load_data(PATH):

            train_corpus = pickle.load(open(PATH, 'rb'))
            train_questions = [train_corpus[i]['question'] for i in range(len(train_corpus))]
            train_entitys = [train_corpus[i]['gold_entitys'] for i in range(len(train_corpus))]
            train_entitys = [[entity[1:-1] for entity in line] for line in train_entitys]
            train_tuple = [train_corpus[i]['gold_tuple'] for i in range(len(train_corpus))]
            train_answer = [train_corpus[i]['answer'] for i in range(len(train_corpus))]
            return train_questions, train_entitys, train_tuple, train_answer

        Entities_Answers = pickle.load(open(PICKLE_PATH + 'NEW_ENTITY_ANSWER.pkl', 'rb'))
        Entities_Answers_List = [i for i in Entities_Answers.values()]

        #  训练集
        questions, entitys, tuple, train_answer = load_data(PICKLE_PATH + 'corpus_train.pkl')
        sentences = list()

        for i in range(837, len(questions)):  # starts at 837, presumably resuming an earlier run; use 0 to regenerate everything
            print(i)
            size = 2
            if len(tuple[i]) == 1:
                break
            sentence = questions[i] + "\t" + tuple[i][0][1:-1] + "|||" + tuple[i][1][1:-1] + "|||" + train_answer[i][0][1:-1] + "\t" + "0"
            sentences.append(sentence)
            nu = 0
            for p in range(0, len(Entities_Answers_List)):
                if Entities_Answers_List[p][0].find(entitys[i][0]) != -1:

                    if Entities_Answers_List[p][2] != train_answer[i][0][1:-1]:
                        answer = Entities_Answers_List[p]
                        sentence = questions[i] + "\t" + answer[0] + "|||" + answer[1] + "|||" + answer[2] + "\t" + "1"
                        sentences.append(sentence)
                        # print(sentence)
                        nu += 1
                        if nu > 2:
                            break

            neg = np.random.randint(len(Entities_Answers_List), size=size)

            for k in range(0, size):
                n = neg[k]
                answer = Entities_Answers_List[n]
                sentence = questions[i] + "\t" + answer[0] + "|||" + answer[1] + "|||" + answer[2] + "\t" + "1"
                sentences.append(sentence)

        link_data = pd.DataFrame(sentences)
        link_data.to_csv(CSV_PATH + "sentence_simlarity_train.csv", index=False, sep='\t')

        # 测试集
        questions, entitys, tuple, train_answer = load_data(PICKLE_PATH + 'corpus_test.pkl')
        sentences = list()
        for i in range(0, len(questions)):
            size = 2
            if len(tuple[i]) == 1:
                break
            neg = np.random.randint(len(Entities_Answers_List), size=size)

            sentence = questions[i] + "\t" + tuple[i][0][1:-1] + "|||" + tuple[i][1][1:-1] + "|||" + train_answer[i][0][
                                                                                                     1:-1] + "\t" + "0"
            sentences.append(sentence)
            nu = 0
            for p in range(0, len(Entities_Answers_List)):
                if Entities_Answers_List[p][0].find(entitys[i][0]) != -1:
                    if Entities_Answers_List[p][2] != train_answer[i][0][1:-1]:
                        answer = Entities_Answers_List[p]
                        sentence = questions[i] + "\t" + answer[0] + "|||" + answer[1] + "|||" + answer[2] + "\t" + "1"
                        sentences.append(sentence)
                        # print(sentence)
                        nu += 1
                        if nu > 2:
                            break

            for k in range(0, size):
                n = neg[k]
                answer = Entities_Answers_List[n]
                sentence = questions[i] + "\t" + answer[0] + "|||" + answer[1] + "|||" + answer[2] + "\t" + "1"
                sentences.append(sentence)

        link_data = pd.DataFrame(sentences)
        link_data.to_csv(CSV_PATH + "sentence_simlarity_test.csv", index=False, sep='\t')
        print("----------------------------------------finish-------------------------------------------")

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "sentence_simlarity_train.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "sentence_simlarity_test.csv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "sentence_simlarity_test.csv")), "test")

    def get_labels(self):
        """See base class."""
        return [0, 1]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        test_mode = set_type == "test"
        lines = lines[1:]
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[0][1:]  # strip the leading quote pandas adds when a field contains the tab separator
            text_b = line[1]
            # label = None if test_mode else int(line[0])
            label = line[2][:-1]  # strip the matching trailing quote
            # 这里的InputExample是一个非常简单的类,仅仅包含了text_a, text_b和label三个部分
            # https://github.com/huggingface/transformers/blob/master/src/transformers/data/processors/utils.py#L31
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

        return examples

    def convert_examples_to_features(self, examples, tokenizer, max_length=None, label_list=None, output_mode=None):
        if max_length is None:
            max_length = tokenizer.max_len

        # processor = NextSentenceProcessor()
        if label_list is None:
            label_list = self.get_labels()

        if output_mode is None:
            output_mode = "classification"

        label_map = {label: i for i, label in enumerate(label_list)}

        def label_from_example(example):
            if example.label is None:
                return None
            if output_mode == "classification":
                return label_map[int(example.label)]
            elif output_mode == "regression":
                return float(int(example.label))
            raise KeyError(output_mode)

        labels = [label_from_example(example) for example in examples]

        batch_encoding = tokenizer(
            [(example.text_a, example.text_b) for example in examples],
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )

        features = []
        for i in range(len(examples)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}

            # https://github.com/huggingface/transformers/blob/master/src/transformers/data/processors/utils.py#L56
            # InputFeatures当中包含了input_ids, attention_mask, token_type_ids和label四个部分
            feature = InputFeatures(**inputs, label=labels[i])
            features.append(feature)

        return features

    def build_dataset(self, features):
        input_ids = []
        attention_mask = []
        token_type_ids = []
        train_y = []
        for feature in features:
            input_ids.append(feature.input_ids)
            attention_mask.append(feature.attention_mask)
            token_type_ids.append(feature.token_type_ids)
            train_y.append(feature.label)

        input_ids = torch.from_numpy(np.array(input_ids)).long()
        attention_mask = torch.from_numpy(np.array(attention_mask)).long()
        token_type_ids = torch.from_numpy(np.array(token_type_ids)).long()
        train_y = torch.from_numpy(np.array(train_y)).long()
        dataset = TensorDataset(input_ids, attention_mask, token_type_ids, train_y)
        return dataset

    def run(self,PATH, CSV_PATH):

        self.simlarity_sentences_examples(PATH)

        Train_examples = self.get_train_examples(CSV_PATH)
        Test_examples = self.get_test_examples(CSV_PATH)

        Train_features = self.convert_examples_to_features(Train_examples, tokenizer, 150)
        Test_features = self.convert_examples_to_features(Test_examples, tokenizer, 150)

        train_set = self.build_dataset(Train_features)
        test_set = self.build_dataset(Test_features)

        train_dataloader = DataLoader(train_set, batch_size=8, shuffle=True)
        validation_dataloader = DataLoader(test_set, batch_size=8, shuffle=True)

        return train_dataloader, validation_dataloader
output_dir = './model/simlarity2/'
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)

def train():
    processor = NextSentenceProcessor()
    train_dataloader, validation_dataloader = processor.run(PATH,CSV_PATH)

    # AdamW 是一个 huggingface library 的类,'W' 是'Weight Decay fix"的意思。
    optimizer = AdamW(model.parameters(),
                      lr=2e-5,  # args.learning_rate - 默认是 5e-5
                      eps=1e-8  # args.adam_epsilon  - 默认是 1e-8, 是为了防止衰减率分母除到0
                      )

    # bert 推荐 epochs 在2到4之间为好。
    epochs = 2

    # training steps 的数量: [number of batches] x [number of epochs].
    total_steps = len(train_dataloader) * epochs

    # 设计 learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value in run_glue.py
                                                num_training_steps=total_steps)

    # 设置总时间.
    total_t0 = time.time()
    best_val_accuracy = 0

    for epoch_i in range(0, epochs):
        print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))

        # 记录每个 epoch 所用的时间
        t0 = time.time()
        total_train_loss = 0
        total_train_accuracy = 0
        model.train()

        for step, batch in enumerate(train_dataloader):

            # Print the elapsed time every 300 batches.
            if step % 300 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # `batch` holds 4 tensors, in the order used by build_dataset:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: token type ids
            #   [3]: labels
            # Note: the names b_input_type / b_input_mask below are swapped relative to the actual
            # contents, but the positional order passed to the model still matches
            # (input_ids, attention_mask, token_type_ids), so the call works as intended.
            b_input_ids = batch[0].to(device)
            b_input_type = batch[1].to(device)
            b_input_mask = batch[2].to(device)
            b_labels = batch[3].to(device)

            # 清空梯度
            model.zero_grad()

            # forward
            # 参考 https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            loss, logits = model(b_input_ids, b_input_type, b_input_mask, next_sentence_label=b_labels)

            total_train_loss += loss.item()

            # backward 更新 gradients.
            loss.backward()

            # Clip the gradient norm to 1.0 to avoid exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # 更新模型参数
            optimizer.step()

            # 更新 learning rate.
            scheduler.step()

            logit = logits.detach().cpu().numpy()
            label_id = b_labels.to('cpu').numpy()
            attention_mask = b_input_mask.cpu().numpy()

            # 计算training 句子的准确度.
            total_train_accuracy += flat_accuracy(logit, label_id, attention_mask)

        # 计算batches的平均损失.
        avg_train_loss = total_train_loss / len(train_dataloader)
        # 计算训练时间.
        training_time = format_time(time.time() - t0)

        # 训练集的准确率.
        avg_train_accuracy = total_train_accuracy / len(train_dataloader)
        print("  训练准确率: {0:.2f}".format(avg_train_accuracy))
        print("  平均训练损失 loss: {0:.2f}".format(avg_train_loss))
        print("  训练时间: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================

        t0 = time.time()

        # Put the model in evaluation mode; dropout layers behave differently than in training.
        model.eval()

        # 设置参数
        total_eval_accuracy = 0
        total_eval_loss = 0

        for batch in validation_dataloader:
            # `batch` holds 4 tensors, in the order used by build_dataset:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: token type ids
            #   [3]: labels
            # (The variable names below are swapped relative to the contents, as in the training loop.)
            b_input_ids = batch[0].to(device)
            b_input_type = batch[1].to(device)
            b_input_mask = batch[2].to(device)
            b_labels = batch[3].to(device)

            # In evaluation mode, do not update weights or build the computation graph.
            with torch.no_grad():
                # 参考 https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                loss, logits = model(b_input_ids, b_input_type, b_input_mask, next_sentence_label=b_labels)

            # 计算 validation loss.
            total_eval_loss += loss.item()
            logit = logits.detach().cpu().numpy()
            label_id = b_labels.to('cpu').numpy()
            attention_mask = b_input_mask.cpu().numpy()
            # predicty = np.array([[1 if each > 0.5 else 0 for each in line] for line in logit])
            # 计算 validation 句子的准确度.
            total_eval_accuracy += flat_accuracy(logit, label_id, attention_mask)

        # 计算 validation 的准确率.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("")
        print("  测试准确率: {0:.2f}".format(avg_val_accuracy))

        if avg_val_accuracy > best_val_accuracy:
            best_val_accuracy = avg_val_accuracy
            torch.save(model.state_dict(), output_model_file)
            # model.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(output_dir)

        # 计算batches的平均损失.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # 计算validation 时间.
        validation_time = format_time(time.time() - t0)

        print("  平均测试损失 Loss: {0:.2f}".format(avg_val_loss))
        print("  测试时间: {:}".format(validation_time))

    print("训练一共用了 {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))

# 训练模型
#train()
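For reference, here is a minimal sketch (not part of the original post) of how the fine-tuned next-sentence model is meant to be used at inference time: the question is paired with an "entity|||relation|||answer" string, and label 0 means the pair is judged a correct answer path. The same pattern reappears in the prediction step further below; the example question and triple here are made up.

def score_candidate(question, entity, relation, answer):
    """Higher score = more confidence that the triple answers the question (label 0 = correct)."""
    candidate = entity + "|||" + relation + "|||" + answer
    encoding = tokenizer(question, candidate, return_tensors='pt')
    with torch.no_grad():
        logits = model(**encoding.to(device))[0]  # tuple output as in transformers 3.x; shape [1, 2]
    return (logits[0][0] - logits[0][1]).item()

# score_candidate("新型冠状病毒肺炎的传染源是什么?", "新型冠状病毒肺炎", "传染源", "野生动物")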
BERT named entity recognition: extracting entity mentions from the question
from LoadData import *
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW,BertTokenizer, RobertaForTokenClassification,DataProcessor
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

class enity_identifing(nn.Module):
    def __init__(self,embedding_dim):
        super(enity_identifing, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained("RoBERTa_zh_Large_PyTorch")
        self.embed = nn.Embedding(self.tokenizer.vocab_size, embedding_dim)
        self.bert_model = RobertaForTokenClassification.from_pretrained(
            "./RoBERTa_zh_Large_PyTorch/",  # Chinese RoBERTa-large checkpoint
            num_labels=5000,  # size of the per-token output that is fed into the BiLSTM below
            output_attentions=False,  # do not return attention weights
            output_hidden_states=False,  # do not return all hidden states
        ).to(device)
        initrange = 0.1
        self.embed.weight.data.uniform_(-initrange, initrange)
        self.rnn_type = "LSTM"
        self.nhid = 512
        self.rnn = nn.LSTM(5000, self.nhid, batch_first=True, bidirectional=True, dropout=0.5).to(device)  # batch_first so the (batch, seq, feature) BERT output keeps its meaning
        self.output = nn.Linear(2 * self.nhid, 1).to(device)
        self.loss_fn = nn.BCEWithLogitsLoss(reduction='none').to(device)  # per-token loss, so it can be masked with the attention mask in forward()
        self.sig = nn.Sigmoid().to(device)

    def forward(self, inputs, type_ids,mask ,y):
        #  输入bert
        out = self.bert_model(inputs, type_ids, mask)
        #  输入LSTM
        hidden, states = self.rnn(out[0].contiguous())
        logits = self.output(hidden)

        loss = self.loss_fn(logits.squeeze(),y.float())*mask
        loss = (torch.sum(loss) / torch.sum(mask))
        logits = self.sig(logits.squeeze())
        return loss, logits

    def predict(self, inputs, type_ids, mask ):
        out = self.bert_model(inputs, type_ids, mask)
        #  输入LSTM
        hidden, states = self.rnn(out[0].contiguous())
        logits = self.output(hidden)
        logits = self.sig(logits.squeeze())
        return logits

    def restore_entity_from_labels(self,labels, question):
        question = self.tokenizer.convert_ids_to_tokens(question)
        entitys = []
        str = ''
        labels = labels[1:-1]
        question = question[1:-1]
        for i in range(min(len(labels), len(question))):
            if labels[i] == 1:
                str += question[i]
            else:
                if len(str):
                    entitys.append(str)
                    str = ''
        if len(str):
            entitys.append(str)
        return entitys

    def restore_entity_from_labels_on_corpus(self, predicty, questions):
        all_entitys = []
        for i in range(len(predicty)):
            all_entitys.append(self.restore_entity_from_labels(predicty[i], questions[i]))
        return all_entitys


class NER_bert:
    def __init__(self,embedding_dim):
        super(NER_bert, self).__init__()
        self.model = enity_identifing(embedding_dim)
        # 推荐batch_size 为 16 或者 32
        self.batch_size = 8
        self.tokenizer =  self.model.tokenizer
        self.max_seq_len = 40

    def format_time(self,elapsed):
        elapsed_rounded = int(round((elapsed)))
        # 返回 hh:mm:ss 形式的时间
        return str(datetime.timedelta(seconds=elapsed_rounded))

    def find_lcsubstr(self,s1, s2):
        m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)]  # 生成0矩阵,为方便后续计算,比字符串长度多了一列
        mmax = 0  # 最长匹配的长度
        p = 0  # 最长匹配对应在s1中的最后一位
        for i in range(len(s1)):
            for j in range(len(s2)):
                if s1[i] == s2[j]:
                    m[i + 1][j + 1] = m[i][j] + 1
                if m[i + 1][j + 1] > mmax:
                    mmax = m[i + 1][j + 1]
                    p = i + 1
        return s1[p - mmax:p]
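        # Example (illustrative): find_lcsubstr("珠穆朗玛峰", "你知道珠穆朗玛峰有多高吗") returns "珠穆朗玛峰",
        # i.e. the longest contiguous common substring; GetXY uses it to align a gold entity
        # name back onto the question text before labelling.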

    def GetXY(self, questions, entitys):
        X1, X2, X3, Y = [], [], [], []
        for i in range(len(questions)):
            q = questions[i]
            encoded_dict = self.tokenizer(q, max_length=self.max_seq_len, pad_to_max_length=True,
                                     return_tensors='pt')  # 分别是 词索引序列和分块索引序列
            x1, x2, x3 = encoded_dict["input_ids"][0], encoded_dict["token_type_ids"][0], \
                         encoded_dict["attention_mask"][0]
            y = [[0] for j in range(self.max_seq_len)]

            assert len(x1) == len(y)
            for e in entitys[i]:
                # 得到实体名和问题的最长连续公共子串
                e = self.find_lcsubstr(e, q)
                if e in q:
                    begin = q.index(e) + 1
                    end = begin + len(e)
                    if end < self.max_seq_len - 1:
                        for pos in range(begin, end):
                            y[pos] = [1]

            X1.append(x1.tolist())
            X2.append(x2.tolist())
            X3.append(x3.tolist())
            Y.append(y)
        X1 = torch.tensor(X1).long()
        X2 = torch.tensor(X2).long()
        X3 = torch.tensor(X3).long()
        Y = torch.tensor(np.array(Y)).squeeze().long()
        return X1, X2, X3, Y
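        # Example (illustrative): for the question "武汉大学在哪里?" with gold entity "武汉大学",
        # positions 1-4 of y are set to [1] (offset by one for the [CLS] token the tokenizer
        # prepends) and every other position stays [0].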

    def my_dataloader(self,PATH):
        train_corpus = pickle.load(open(PATH + 'corpus_train.pkl', 'rb'))
        train_questions = [train_corpus[i]['question'] for i in range(len(train_corpus))]
        train_entitys = [train_corpus[i]['gold_entitys'] for i in range(len(train_corpus))]
        train_entitys = [[entity[1:-1].split('_')[0] for entity in line] for line in train_entitys]

        test_corpus = pickle.load(open(PATH + 'corpus_test.pkl', 'rb'))
        test_questions = [test_corpus[i]['question'] for i in range(len(test_corpus))]
        test_entitys = [test_corpus[i]['gold_entitys'] for i in range(len(test_corpus))]
        test_entitys = [[entity[1:-1].split('_')[0] for entity in line] for line in test_entitys]


        trainx1, trainx2, trainx3, trainy = self.GetXY(train_questions, train_entitys)
        testx1, testx2, testx3, testy = self.GetXY(test_questions, test_entitys)

        # 把input 放入 TensorDataset。
        train_dataset = TensorDataset(trainx1, trainx2, trainx3, trainy)
        test_dataset = TensorDataset(testx1, testx2, testx3, testy)

        # 为训练数据集和验证数据集设计DataLoaders.
        train_dataloader = DataLoader(
            train_dataset,  # 训练数据.
            sampler=RandomSampler(train_dataset),  # 打乱顺序
            batch_size=self.batch_size
        )

        validation_dataloader = DataLoader(
            test_dataset,  # 验证数据.
            sampler=RandomSampler(test_dataset),  # 打乱顺序
            batch_size=self.batch_size
        )
        return train_dataloader,validation_dataloader

    def flat_accuracy(self, preds, labels, attention):

        scores = (preds * attention == labels * attention)
        rights = 0
        for score in scores:
            if sum(score) == len(labels[0]):
                rights += 1

        return rights / len(labels)

    def eval(self,validation_dataloader):

        # Put the model in evaluation mode; dropout layers behave differently than in training.
        self.model.eval()
        # 设置参数
        t0 = time.time()
        total_eval_accuracy = 0
        total_eval_loss = 0

        for batch in validation_dataloader:
            # `batch` holds 4 tensors, in the order used by my_dataloader / GetXY:
            #   [0]: input ids
            #   [1]: token type ids
            #   [2]: attention masks
            #   [3]: labels
            b_input_ids = batch[0].to(device)
            b_input_type = batch[1].to(device)
            b_input_mask = batch[2].to(device)
            b_labels = batch[3].to(device)

            # In evaluation mode, do not update weights or build the computation graph.
            with torch.no_grad():
                # 参考 https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                loss, logits = self.model(b_input_ids, b_input_type, b_input_mask, b_labels)

            # 计算 validation loss.
            total_eval_loss += loss.item()
            logit = logits.detach().cpu().numpy()
            label_id = b_labels.to('cpu').numpy()
            attention_mask = b_input_mask.cpu().numpy()
            predicty = np.array([[1 if each > 0.5 else 0 for each in line] for line in logit])
            # 计算 validation 句子的准确度.
            total_eval_accuracy += self.flat_accuracy(predicty, label_id, attention_mask)

        # 计算 validation 的准确率.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("")
        print("  测试准确率: {0:.2f}".format(avg_val_accuracy))

        # 计算batches的平均损失.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # 计算validation 时间.
        validation_time = self.format_time(time.time() - t0)
        print("  平均测试损失 Loss: {0:.2f}".format(avg_val_loss))
        print("  测试时间: {:}".format(validation_time))

        return avg_val_accuracy

    def run(self,PATH):
        train_dataloader, validation_dataloader = self.my_dataloader(PATH)
        # bert 推荐 epochs 在2到4之间为好。
        epochs = 4

        # AdamW 是一个 huggingface library 的类,'W' 是'Weight Decay fix"的意思。
        self.optimizer = AdamW(self.model.parameters(),
                          lr=2e-5,  # args.learning_rate - 默认是 5e-5
                          eps=1e-8  # args.adam_epsilon  - 默认是 1e-8, 是为了防止衰减率分母除到0
                          )

        # training steps 的数量: [number of batches] x [number of epochs].
        total_steps = len(train_dataloader) * epochs

        # 设计 learning rate scheduler.
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                    num_warmup_steps=0,  # Default value in run_glue.py
                                                    num_training_steps=total_steps)

        output_dir = './model/ner/'
        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
        training_stats = []
        # 设置总时间.
        total_t0 = time.time()
        best_val_accuracy = 0

        for epoch_i in range(0, epochs):
            print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
            # ========================================
            #               training
            # ========================================
            self.train(train_dataloader)
            # ========================================
            #               Validation
            # ========================================
            avg_val_accuracy = self.eval(validation_dataloader)
            if avg_val_accuracy > best_val_accuracy:
                best_val_accuracy = avg_val_accuracy
                torch.save(self.model.state_dict(), output_model_file)
                self.tokenizer.save_vocabulary(output_dir)


        print("训练一共用了 {:} (h:mm:ss)".format(self.format_time(time.time() - total_t0)))

    def train(self,train_dataloader):

        # 记录每个 epoch 所用的时间
        t0 = time.time()
        total_train_loss = 0
        total_train_accuracy = 0
        self.model.train()

        for step, batch in enumerate(train_dataloader):

            # Print the elapsed time every 100 batches.
            if step % 100 == 0 and not step == 0:
                elapsed = self.format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # `batch` holds 4 tensors, in the order used by my_dataloader / GetXY:
            #   [0]: input ids
            #   [1]: token type ids
            #   [2]: attention masks
            #   [3]: labels
            b_input_ids = batch[0].to(device)
            b_input_type = batch[1].to(device)
            b_input_mask = batch[2].to(device)
            b_labels = batch[3].to(device)

            # 清空梯度
            self.model.zero_grad()

            # forward
            # 参考 https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            loss, logits = self.model(b_input_ids, b_input_type, b_input_mask, b_labels)

            total_train_loss += loss.item()

            # backward 更新 gradients.
            loss.backward()

            # Clip the gradient norm to 1.0 to avoid exploding gradients.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            # 更新模型参数
            self.optimizer.step()

            # 更新 learning rate.
            self.scheduler.step()

            logit = logits.detach().cpu().numpy()
            label_id = b_labels.to('cpu').numpy()
            attention_mask = b_input_mask.cpu().numpy()

            predicty = np.array([[1. if each > 0.5 else 0 for each in line] for line in logit])
            # 计算training 句子的准确度.
            total_train_accuracy += self.flat_accuracy(predicty, label_id, attention_mask)

        # 计算batches的平均损失.
        avg_train_loss = total_train_loss / len(train_dataloader)
        # 计算训练时间.
        training_time = self.format_time(time.time() - t0)

        # 训练集的准确率.
        avg_train_accuracy = total_train_accuracy / len(train_dataloader)
        print("  训练准确率: {0:.2f}".format(avg_train_accuracy))
        print("  平均训练损失 loss: {0:.2f}".format(avg_train_loss))
        print("  训练时间: {:}".format(training_time))

        return total_train_loss,total_train_accuracy

# processor = NER_bert(1000)
# processor.run(PICKLE_PATH)
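The trained NER model is later applied to the validation questions to build the entity-mention pickles used below (e.g. ENTITY_MENTIONS_VALIDATION3.pkl). That driver script is not included in the post; a minimal sketch of the inference step, reusing the interfaces defined above, might look like this:

def extract_mentions(questions, ner):
    # Build inputs with empty gold-entity lists, since no labels are available at inference time.
    x1, x2, x3, _ = ner.GetXY(questions, [[] for _ in questions])
    ner.model.eval()
    with torch.no_grad():
        logits = ner.model.predict(x1.to(device), x2.to(device), x3.to(device))
    # Threshold the per-token probabilities and map the 0/1 labels back to text spans.
    predicty = (logits.cpu().numpy() > 0.5).astype(int).reshape(len(questions), -1)
    return ner.model.restore_entity_from_labels_on_corpus(predicty, x1.tolist())

# ner = NER_bert(1000)
# ner.model.load_state_dict(torch.load('./model/ner/pytorch_model.bin'))
# print(extract_mentions(["新型冠状病毒肺炎的潜伏期是多久?"], ner))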
Extracting candidate predicates (property values) from the question
import pickle
import codecs as cs
import re
import thulac
from LoadData import *

class PropExtractor(object):
    def __init__(self):
        self.prop_dic, self.char_2_prop = self.load_data()
        self.segger = thulac.thulac()

    def load_data(self):

        prop_dic = dict()
        entities_tuples = pickle.load(open(PICKLE_PATH + 'NEW_ENTITI_ANSER.pkl', 'rb'))
        for entity_tuple in entities_tuples:

            property = entity_tuple.split('-')[-1]
            if property in prop_dic:
                prop_dic[property] += 1
            else:
                prop_dic[property] = 1

        char_2_prop = dict()
        for prop in prop_dic:
            if len(prop) < 20:  # 这里设置最大长度,不考虑长度过长的属性值
                chars = set(prop)
                for char in chars:
                    try:
                        char_2_prop[char].append(prop)
                    except:
                        char_2_prop[char] = [prop]

        return prop_dic,char_2_prop


    def extract_properties(self, question):
        '''
        输入一个问题,抽取出所有能和知识库中的属性值匹配的字符串,筛选后返回
        input:
            question : python-str
        output:
            props : python-dic
        '''

        props = {}  # 键为知识库里prop,值为mention
        QUES = question

        # 包含在双引号 书名号里的属性
        mark_props = {}
        elements = re.findall('\".+\"|《.+》', question)
        if len(elements) > 0:
            for e in elements:  # '甲天下', '完美的搜索引擎,'
                if e in self.prop_dic:  # 一般书名号的属性就是需要的属性
                    mark_props[e] = e
                question = re.sub(e, '', question)
        props['mark_props'] = mark_props

        # 时间属性
        time_props = {}
        # 提取年月日
        year_month_day = re.findall('\d+年\d+月\d+日|\d+年\d+月\d+号|\d+\.\d+\.\d+', question)
        for ymd in year_month_day:
            rml_norm = self.TransNormalTime(ymd)
            time_props[rml_norm] = ymd
            question = re.sub(ymd, '', question)
        # 提取月日
        month_day = re.findall('\d+月\d+日|\d+月\d+号|\d+年\d+月', question)
        for ymd in month_day:
            rml_norm = self.TransNormalTime(ymd)
            time_props[rml_norm] = ymd
            question = re.sub(ymd, '', question)
        # 提取年份
        years = re.findall('\d+年', question)
        for ymd in years:
            rml_norm = self.TransNormalTime(ymd)
            time_props[rml_norm] = ymd
            question = re.sub(ymd, '', question)
        props['time_props'] = time_props
        # 数字属性
        digit_props = {}
        elements = re.findall('\d+', question)
        if len(elements) > 0:
            for e in elements:
                if e in self.prop_dic:
                    digit_props[e] = e
        props['digit_props'] = digit_props

        # 其他属性,去重
        other_props = {}
        length = len(question)
        props_ngram = []
        max_len = 0
        for l in range(length, 0, -1):  # 只考虑长度大于1的可匹配属性值
            for i in range(length - l + 1):
                if question[i:i + l] in self.prop_dic:
                    props_ngram.append(question[i:i + l])
                    if len(question[i:i + l]) > max_len:
                        max_len = len(question[i:i + l])

        stop_props = []
        for p in props_ngram:
            for q in props_ngram:
                if p in q and p != q and self.segger.cut(p)[0][1] not in ['ns']:  # 加拿大的,台湾的等问题 p不是地名
                    stop_props.append(p)

        new_props = []  # 去掉包含在更长属性值中的属性值
        for p in props_ngram:
            if p not in stop_props:
                new_props.append(p)

        new_new_props = []  # 去掉长度过于短的属性值
        for p in new_props:
            if len(p) == 1 and self.segger.cut(p)[0][1] in ['n']:  # 单字名词
                new_new_props.append(p)
            elif (len(p) >= (max_len * 0.5) and len(p) != 1) or self.segger.cut(p)[0][1] in ['n',
                                                                                             'ns'] or self.exist_digit(
                    p):  # 长度过短且词性名词比较重要
                new_new_props.append(p)

        for p in new_new_props:
            other_props[p] = p
        props['other_props'] = other_props

        # 模糊匹配得到的属性
        stop_dic = {'有', '的', '是', '在', '上', '哪', '里', '\"', '什', '么', '中', '个'}
        prop2num = {}
        for char in QUES:
            if char in stop_dic:
                continue
            else:
                try:
                    for p in self.char_2_prop[char]:
                        if p in prop2num:
                            prop2num[p] += 1
                        else:
                            prop2num[p] = 1
                except:
                    continue
        sort_props = sorted(prop2num.items(), key=lambda prop2num: prop2num[1], reverse=True)
        top3_props = [key for key, value in sort_props[:3]]  # top3
        fuzzy_props = {}
        for p in top3_props:
            fuzzy_props[p] = p
        props['fuzzy_props'] = fuzzy_props  # 取与问题中匹配字数最多的属性作为候选

        return props

    def extract_subject_properties(self, question):
        '''
        输入一个问题,抽取出所有能和知识库中的属性值匹配的字符串,并将更有可能作为简单问题主语的属性值提取出来
        input:
            question : python-str
        output:
            props : python-dic
        '''
        pred_props = self.extract_properties(question)
        if len(pred_props['mark_props']) != 0:
            subject_props = pred_props['mark_props']
        elif len(pred_props['time_props']) != 0:
            subject_props = pred_props['time_props']
        elif len(pred_props['digit_props']) != 0:
            subject_props = pred_props['digit_props']
        else:
            subject_props = pred_props['other_props']
            subject_props.update(pred_props['fuzzy_props'])
        return subject_props

    def GetProps(self, corpus):
        gold_num = 0
        true_num = 0
        entity_error = []
        irregular = []
        all_props_num = 0.0
        for i in range(len(corpus)):
            question = corpus[i]['question']
            #gold_entitys = corpus[i]['gold_entitys']
            gold_entitys = corpus[i]['entity_mention']
            # 提取gold props
            gold_props = []
            for x in gold_entitys:
                if len(x) == 0:
                    continue
                # gold properties are the gold entities written as quoted literals
                if x[0] == '\"':
                    gold_props.append(x)

            # 得到抽取出的属性字典并保存
            pred_props = self.extract_properties(question)  # 得到的均不包含引号
            corpus[i]['all_props'] = pred_props

            # 得到所有可能的属性corpus[i]['subject_props']
            subject_props = {}
            subject_props.update(pred_props['mark_props'])
            subject_props.update(pred_props['time_props'])
            subject_props.update(pred_props['digit_props'])
            subject_props.update(pred_props['other_props'])
            subject_props.update(pred_props['fuzzy_props'])
            corpus[i]['subject_props'] = subject_props
            all_props_num += len(corpus[i]['subject_props'])

            # 统计该模块抽取唯一主语实体的召回率
            if len(gold_props) == 1 and len(gold_entitys) == 1:
                gold_num += 1
                if_same = self.CheckSame(gold_props, subject_props)  # 判断抽取出的属性值是否完全包括了gold props
                true_num += if_same
                if not if_same:
                    print('主语属性值抽取失败')
                    entity_error.append(i)
                else:
                    print('主语属性值抽取成功')
                print(i, question)
                print(gold_props)
                print(subject_props)
                print('\n')
        if gold_num != 0:
            print('单主语且主语为属性值问题中,能找到所有主语属性值的比例为:%.2f' % (true_num / gold_num))
        else:
            print('单主语且主语为属性值问题中,找不到主语属性值')
        print('平均每个问题属性为:%.2f' % (all_props_num / len(corpus)))
        print(entity_error)
        print(irregular)
        return corpus

    def CheckSame(self, gold_props, pred_props):
        pred_props_list = []
        for p in pred_props:  # 取得是key键
            pred_props_list.append('\"' + p + '\"')
        join_props = set(pred_props_list).intersection(set(gold_props))
        if len(join_props) == len(gold_props):
            return 1
        else:
            return 0

    def exist_digit(self, p):
        '''
        判断字符串中是否存在数字
        '''
        for i in range(10):
            if str(i) in p:
                return 1
        return 0

    def TransNormalTime(self, time):
        digits = re.findall('\d+', time)
        elements = []
        for d in digits:
            if len(d) > 2:
                elements.append(d)
            elif len(d) == 2:
                if int(d[0]) > 3:
                    elements.append('19' + d)
                else:
                    elements.append(d)
            else:
                elements.append('0' + d)
        return '-'.join(elements)
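# Illustrative usage of PropExtractor (behaviour inferred from the code above):
#   pe = PropExtractor()
#   pe.TransNormalTime('1997年7月1日')  ->  '1997-07-01'
#   pe.TransNormalTime('98年3月')       ->  '1998-03'
#   pe.extract_properties(question) returns a dict with the keys 'mark_props', 'time_props',
#   'digit_props', 'other_props' and 'fuzzy_props'; extract_subject_properties(question)
#   keeps only the group most likely to contain the question's subject.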
inputpaths = ['ENTITY_MENTIONS_VALIDATION3.pkl']
outputpaths = ['ALL_MENTIONS_VALIDATION3.pkl']
starttime = time.time()
pe = PropExtractor()
for i in range(0, 1):
    inputpath = inputpaths[i]
    outputpath = outputpaths[i]
    corpus = pickle.load(open(PICKLE_PATH + inputpath, 'rb'))
    corpus = pe.GetProps(corpus)
    print('得到实体mention')
    pickle.dump(corpus, open(PICKLE_PATH + outputpath, 'wb'))
print('耗费时间%.2f秒' % (time.time() - starttime))
Prediction: querying the knowledge base and scoring answers
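The prediction step looks up candidate (entity, property) pairs in a local MySQL table named kb and re-ranks the returned triples with the next-sentence model. The post does not show how this table is built; a plausible schema and loader, inferred from the SELECT statements and the info[0][1] / info[0][2] / info[0][3] accesses below, is sketched here (column names and sizes are assumptions):

import pymysql

def build_kb_table(triples):
    """triples: iterable of (subject, predicate, obj) strings, e.g. ('<复旦大学>', '<校长>', '<许宁生>')."""
    conn = pymysql.connect(host='localhost', user='root', password='123', db='pkubase', charset='utf8')
    cur = conn.cursor()
    # `candidate` stores "'subject-predicate'" without angle brackets, wrapped in single quotes,
    # which is exactly the lookup string that predicate() below builds for its WHERE clause.
    cur.execute("CREATE TABLE IF NOT EXISTS kb ("
                "candidate VARCHAR(512), subject VARCHAR(512), predicate VARCHAR(512), object TEXT)")
    for s, p, o in triples:
        candidate = "'" + s[1:-1] + "-" + p[1:-1] + "'"
        cur.execute("INSERT INTO kb VALUES (%s, %s, %s, %s)", (candidate, s, p, o))
    conn.commit()
    cur.close()
    conn.close()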
from answer_score_bert import *
import pymysql

class Entity_answer():
    def __init__(self):

        self.entity_to_pro = pickle.load(open(PICKLE_PATH + 'ENTITY_TO_PROPERTIES.pkl', 'rb'))
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            password='123',
            db='pkubase',
            charset='utf8',
            # autocommit=True,    # 如果插入数据,, 是否自动提交? 和conn.commit()功能一致。
        )
        # ****python, 必须有一个游标对象, 用来给数据库发送sql语句, 并执行的.
        # 2. 创建游标对象,
        self.cur = self.conn.cursor()

        self.predicates = pickle.load(open(PICKLE_PATH +'PREDICATES.pkl', 'rb'))
        self.words_Frequent_dic = pickle.load(open(PICKLE_PATH + 'WORDS_FREQUENCY.pkl', 'rb'))
        self.stopwords = self.stopwordslist(PATH + 'stop_words.txt')
        self.pass_words = {'是什么', '在哪里', '哪里', '什么', '提出的', '有什么', '国家', '哪个', '所在','哪一年',
                                 '培养出', '为什么', '什么时候', '人', '你知道', '都包括', '是谁', '告诉我', '又叫做', '有', '是'}

    def stopwordslist(self,filepath):
        stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
        return stopwords

    def movestopwords(self,sentence):
        # 去掉常用词
        if sentence not in self.words_Frequent_dic and sentence not in self.entity_to_pro:
            while len(sentence)!=1:

                if sentence[0] in self.stopwords:
                    sentence = sentence[1:]
                else:
                    break
            while len(sentence) != 1:

                if sentence[-1] in self.stopwords:
                    sentence = sentence[:-1]
                else:
                    break
        return sentence


    def predicate(self,data):

        question = data['question']
        Entities = list(data['entity_mention'])
        print(question)
        Predicate_List = jieba.cut_for_search(question.replace("?","").replace("。",""))
        Predicate_List = (" ".join(Predicate_List)).split(" ")
        Properties = list(data['subject_props'])

        # Use jieba segmentation to roughly locate candidate predicates
        Predicates = []

        for e in (Predicate_List + Properties):
            # 保留单个字的谓语
            pass_predicates = ["高"]
            if e in pass_predicates:
                Predicates.append(e)
                continue
            # if e not in self.words_Frequent_dic or len(e) == 1:
            #     continue
            if e not in self.words_Frequent_dic:
                continue
            t = 0

            for pass_words in self.pass_words:
                if pass_words.find(e) != -1:
                    t =1
                    break
            if t == 1:
                continue

            if e in question:
                if e == "属于":
                    e = "类型"
                elif e == "科":
                    e = "科室"
                elif "生" in question and "年" in question:
                    e = "出生日期"
                elif "生" in question and "地" in question:
                    e = "出生地"

            if e not in Predicates:
                for predicate in self.predicates:
                    if predicate.find(e) != -1:
                        Predicates.append(predicate)

        if Predicates == []:
            print("空的!!!!")
            print(Entities)
            print(Predicates)

        start = time.time()
        count = 0
        high_score = 0
        nu_tuples = len(set(Predicates))*len(Entities)
        answer = ""
        finish = 0
        for Entitie in Entities:
            if nu_tuples> 100000000:
                if Entitie not in self.entity_to_pro or question.find(Entitie) == -1:
                    continue

            if Entitie not in self.entity_to_pro or "\"" in Entitie:
                continue

            pros = self.entity_to_pro[Entitie]
            for property in list(set(Predicates)):
                count += 1
                if property in pros:

                    candidate = "\'"+Entitie+"-"+property+"\'"
                    sqli = "SELECT * FROM kb WHERE candidate = " + "\""+candidate+ "\""
                    result = self.cur.execute(sqli)  # 默认不返回查询结果集, 返回数据记录数。
                    if result == 1:

                        info = self.cur.fetchall()  # 3). 获取所有的查询结果
                        subject = info[0][1][1:-1]

                        # Use BERT next-sentence prediction: label 0 means the answer is correct, 1 means it is wrong
                        encoding = tokenizer(question,subject + "|||" + info[0][2][1:-1], return_tensors='pt')
                        logits = model(**encoding.to(device))

                        # 答案正确,进行打分
                        if torch.argmax(logits[0],dim=-1) == 0:
                            # 每次取打分结果最高的, 作为最后的答案
                            if logits[0][0][0] - logits[0][0][1] > high_score:
                                high_score = logits[0][0][0] - logits[0][0][1]
                                answer = info[0][3]

                                for element in answer.split("\t"):

                                    element = element.replace("\t","")[1:-1]
                                    if element in self.entity_to_pro:

                                        pros_secondary = self.entity_to_pro[element]
                                        for pro in pros_secondary:
                                            if pro in list(set(Predicates)):

                                                candidate = "\'" + element.split("_")[0] + "-" + pro + "\'"
                                                sqli = "SELECT * FROM kb WHERE candidate = " + "\"" + candidate + "\""

                                                if 1 == self.cur.execute(sqli):
                                                    info = self.cur.fetchall()
                                                    answer = info[0][3]


        training_time = format_time(time.time() - start)

        # 训练集的准确率.
        print("count: ",count)
        print("时间: {:}".format(training_time))
        print(answer)

        return answer

    def run(self,data):
        print("一共", len(data), "数据")

        # for i in range(1, len(data)):
        for i in range(20, len(data)):
            with open('./data/answer1.txt', 'a', encoding="utf-8") as f:
                print(i + 1)
                t0 = time.time()
                record = self.predicate(data[i])
                f.write(record + "\n")
                print(record)
                print("本条问题耗时 {:} (h:mm:ss)".format(format_time(time.time() - t0)))
        print("预测一共用了 {:} (h:mm:ss)".format(format_time(time.time() - total_time)))
        # 4. 关闭游标
        self.cur.close()
        # 5. 关闭连接
        self.conn.close()
# 导入数据
with open(PICKLE_PATH + 'ALL_MENTIONS_VALIDATION2.pkl', 'rb') as f:
    data = pickle.load(f)

answers = Entity_answer()
total_time = time.time()
answers.run(data)
Evaluation function
from LoadData import *
class Eval(object):
    """
    Entity Linking Evaluation
    """

    def __init__(self, golden_file_path):
        self.golden_file_path = golden_file_path
        self.user_file_path = golden_file_path
        self.tp = 0
        self.fp = 0
        self.total_recall = 0
        self.errno = None


    def micro_f1(self):
        """
        :return: float类型:精确率,召回率,Micro-F1值
        """
        # 文本格式验证
        precision = 0
        recall = 0
        self.tp = 0
        self.fp = 0
        self.precision = 0
        self.recall  = 0
        self.total_recall = 0
        entities_ansewers = pickle.load(open(self.golden_file_path, 'rb'))


        for line in entities_ansewers:
            print(line)
            A = []
            properties = line[3][2].split(" ")
            for i in range(0,len(properties)):
                if len(properties[i]) != 0:
                    A.append(properties[i])


            G = []
            for i in range(0,len(line[1][:])):
                if len(line[1][:][i]) == 0:
                    continue
                else:
                    G.append(line[1][:][i][1:-1])

            c = 0
            for e in list(set(A)):
                if e in G:
                    c += 1
            if len(A) == 0:
                self.precision += 0
            else:
                self.precision += c/len(A)

            if len(G) == 0:
                self.recall += 0
            else:
                self.recall += c / len(G)

        precision = self.precision/len(entities_ansewers)
        recall = self.recall/len(entities_ansewers)

        a = 2 * precision * recall
        b = precision + recall
        if b == 0:
            return 0, 0, 0
        f1 = a / b
        return precision, recall, f1
eval = Eval('./data/pickle/RESULT2.pkl')
prec, recall, f1 = eval.micro_f1()
print(prec, recall, f1)
if eval.errno:
    print(eval.errno)
