CCKS 2020:新冠知识图谱构建与问答评测(四)新冠知识图谱问答评测
链接:https://www.biendata.xyz/competition/ccks_2020_7_4/evaluation/
初始化数据种子引入需要的包
import os
import re
import math
import torch
import random
import pickle
import numpy as np
import codecs as cs
import pandas as pd
import torch.nn as nn
import time
import datetime
from transformers import WEIGHTS_NAME
# 设置随机种子.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
if torch.cuda.is_available():
# Tell PyTorch to use the GPU.
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
PATH = './data/'
CSV_PATH = "./data/csv/"
PICKLE_PATH = './data/pickle/'
INPUT_PATH = './data/PKUBASE/pkubase-complete-2020/pkubase-complete.txt'
def format_time(elapsed):
elapsed_rounded = int(round((elapsed)))
# 返回 hh:mm:ss 形式的时间
return str(datetime.timedelta(seconds=elapsed_rounded))
def flat_accuracy(preds, labels, attention):
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
导入训练和测试集 并且补充新数据到知识图谱中
def LoadCorpus(path):
def writefile(text):
corpus = {}
kb = []
for i in range(len(text)):
# 对问题进行预处理
question = text[i].split('\r\n')[0].split(':')[1]
question = re.sub('我想知道', '', question)
question = re.sub('你了解', '', question)
question = re.sub('请问', '', question)
answers = text[i].split('\n')[2].split('\t')
sql = text[i].split('\n')[1]
sql = re.findall('{.+}', sql)[0]
elements = re.findall('<.+?>|\".+?\"|\?\D', sql) + re.findall('\".+?\"', sql)
# elements中包含创引号的项目可能有重复,需要去重
new_elements = []
for e in elements:
if e[0] == '\"':
if e not in new_elements:
new_elements.append(e)
else:
new_elements.append(e)
elements = new_elements
gold_entitys = []
gold_relations = []
for j in range(len(elements)):
if elements[j][0] == '<' or elements[j][0] == '\"':
if j % 3 == 1:
gold_relations.append(elements[j])
else:
gold_entitys.append(elements[j])
for entity in gold_entitys:
for relation in gold_relations:
for answer in answers:
kb.append(entity+"\t"+relation+"\t"+answer)
gold_tuple = tuple(gold_entitys + gold_relations)
dic = {}
dic['question'] = question # 问题字符串
dic['answer'] = answers # 问题的答案
dic['gold_tuple'] = gold_tuple
dic['gold_entitys'] = gold_entitys
dic['gold_relations'] = gold_relations
dic['sql'] = sql
corpus[i] = dic
return corpus,kb
with cs.open(path, 'r', 'utf-8') as fp:
train_text = fp.read().split('\r\n\r\n')[:-1]
length = len(train_text)
#分出训练集和测试集
train_corpus_length = math.ceil(0.8 * length)
train_corpus = train_text[0:train_corpus_length]
test_corpus = train_text[train_corpus_length:]
corpus,kb1 = writefile(train_corpus)
pickle.dump(corpus, open(PICKLE_PATH+'corpus_train.pkl', 'wb'))
corpus,kb2 = writefile(test_corpus)
pickle.dump(corpus, open(PICKLE_PATH+'corpus_test.pkl', 'wb'))
kb = kb1 + kb2
pickle.dump(kb, open(PICKLE_PATH + 'NEW_KB.pkl', 'wb'))
fp.close()
#LoadCorpus(PATH + 'task1-4_train_2020.txt')
#simlarity_sentences_examples()
导入测试数据
def TEST_QUESTIONS(QUESTIONS_PATH):
questions = []
with open(QUESTIONS_PATH, encoding="utf-8") as f:
try:
while True:
line = f.readline()
if line:
question = line.split(":")[1][:-1]
questions.append(question)
else:
break
finally:
f.close()
# 保存csv文件
link_data = pd.DataFrame(questions)
link_data.to_csv(CSV_PATH + "task1-4_valid_2020.questions.csv", index=False, sep='\t')
bert next sentence 为答案打分
import pickle
import jieba
from LoadData import *
from enum import Enum
from transformers.data.processors.utils import DataProcessor, InputExample, InputFeatures
from torch.utils.data import TensorDataset, DataLoader
from transformers import (
AdamW,
BertTokenizer,
BertForNextSentencePrediction,
DataProcessor,
get_linear_schedule_with_warmup,
WEIGHTS_NAME,
CONFIG_NAME
)
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForNextSentencePrediction.from_pretrained("bert-base-chinese")
model = model.to(device)
model.load_state_dict(torch.load('./model/simlarity/pytorch_model.bin'))
class Split(Enum):
train = "train"
dev = "dev"
test = "test"
# 按照bert的格式输入数据
class NextSentenceProcessor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
# 问题和答案做打分bert next sentence的训练数据
def simlarity_sentences_examples(self,PATH):
def load_data(PATH):
train_corpus = pickle.load(open(PATH, 'rb'))
train_questions = [train_corpus[i]['question'] for i in range(len(train_corpus))]
train_entitys = [train_corpus[i]['gold_entitys'] for i in range(len(train_corpus))]
train_entitys = [[entity[1:-1] for entity in line] for line in train_entitys]
train_tuple = [train_corpus[i]['gold_tuple'] for i in range(len(train_corpus))]
train_answer = [train_corpus[i]['answer'] for i in range(len(train_corpus))]
return train_questions, train_entitys, train_tuple, train_answer
Entities_Answers = pickle.load(open(PICKLE_PATH + 'NEW_ENTITY_ANSWER.pkl', 'rb'))
Entities_Answers_List = [i for i in Entities_Answers.values()]
# 训练集
questions, entitys, tuple, train_answer = load_data(PICKLE_PATH + 'corpus_train.pkl')
sentences = list()
for i in range(837, len(questions)):
print(i)
size = 2
if len(tuple[i]) == 1:
break
sentence = questions[i] + "\t" + tuple[i][0][1:-1] + "|||" + tuple[i][1][1:-1] + "|||" + train_answer[i][0][1:-1] + "\t" + "0"
sentences.append(sentence)
nu = 0
for p in range(0, len(Entities_Answers_List)):
if Entities_Answers_List[p][0].find(entitys[i][0]) != -1:
if Entities_Answers_List[p][2] != train_answer[i][0][1:-1]:
answer = Entities_Answers_List[p]
sentence = questions[i] + "\t" + answer[0] + "|||" + answer[1] + "|||" + answer[2] + "\t" + "1"
sentences.append(sentence)
# print(sentence)
nu += 1
if nu > 2:
break
neg = np.random.randint(len(Entities_Answers_List), size=size)
for k in range(0, size):
n = neg[k]
answer = Entities_Answers_List[n]
sentence = questions[i] + "\t" + answer[0] + "|||" + answer[1] + "|||" + answer[2] + "\t" + "1"
sentences.append(sentence)
link_data = pd.DataFrame(sentences)
link_data.to_csv(CSV_PATH + "sentence_simlarity_train.csv", index=False, sep='\t')
# 测试集
questions, entitys, tuple, train_answer = load_data(PICKLE_PATH + 'corpus_test.pkl')
sentences = list()
for i in range(0, len(questions)):
size = 2
if len(tuple[i]) == 1:
break
neg = np.random.randint(len(Entities_Answers_List), size=size)
sentence = questions[i] + "\t" + tuple[i][0][1:-1] + "|||" + tuple[i][1][1:-1] + "|||" + train_answer[i][0][
1:-1] + "\t" + "0"
sentences.append(sentence)
nu = 0
for p in range(0, len(Entities_Answers_List)):
if Entities_Answers_List[p][0].find(entitys[i][0]) != -1:
if Entities_Answers_List[p][2] != train_answer[i][0][1:-1]:
answer = Entities_Answers_List[p]
sentence = questions[i] + "\t" + answer[0] + "|||" + answer[1] + "|||" + answer[2] + "\t" + "1"
sentences.append(sentence)
# print(sentence)
nu += 1
if nu > 2:
break
for k in range(0, size):
n = neg[k]
answer = Entities_Answers_List[n]
sentence = questions[i] + "\t" + answer[0] + "|||" + answer[1] + "|||" + answer[2] + "\t" + "1"
sentences.append(sentence)
link_data = pd.DataFrame(sentences)
link_data.to_csv(CSV_PATH + "sentence_simlarity_test.csv", index=False, sep='\t')
print("----------------------------------------finish-------------------------------------------")
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "sentence_simlarity_train.csv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "sentence_simlarity_test.csv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "sentence_simlarity_test.csv")), "test")
def get_labels(self):
"""See base class."""
return [0, 1]
def _create_examples(self, lines, set_type):
"""Creates examples for the training, dev and test sets."""
test_mode = set_type == "test"
lines = lines[1:]
examples = []
for (i, line) in enumerate(lines):
guid = "%s-%s" % (set_type, i)
text_a = line[0][1:]
text_b = line[1]
# label = None if test_mode else int(line[0])
label = line[2][:-1]
# 这里的InputExample是一个非常简单的类,仅仅包含了text_a, text_b和label三个部分
# https://github.com/huggingface/transformers/blob/master/src/transformers/data/processors/utils.py#L31
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def convert_examples_to_features(self, examples, tokenizer, max_length=None, label_list=None, output_mode=None):
if max_length is None:
max_length = tokenizer.max_len
# processor = NextSentenceProcessor()
if label_list is None:
label_list = self.get_labels()
if output_mode is None:
output_mode = "classification"
label_map = {label: i for i, label in enumerate(label_list)}
def label_from_example(example):
if example.label is None:
return None
if output_mode == "classification":
return label_map[int(example.label)]
elif output_mode == "regression":
return float(int(example.label))
raise KeyError(output_mode)
labels = [label_from_example(example) for example in examples]
batch_encoding = tokenizer(
[(example.text_a, example.text_b) for example in examples],
max_length=max_length,
padding="max_length",
truncation=True,
)
features = []
for i in range(len(examples)):
inputs = {k: batch_encoding[k][i] for k in batch_encoding}
# https://github.com/huggingface/transformers/blob/master/src/transformers/data/processors/utils.py#L56
# InputFeatures当中包含了input_ids, attention_mask, token_type_ids和label四个部分
feature = InputFeatures(**inputs, label=labels[i])
features.append(feature)
return features
def build_dataset(self, features):
input_ids = []
attention_mask = []
token_type_ids = []
train_y = []
for feature in features:
input_ids.append(feature.input_ids)
attention_mask.append(feature.attention_mask)
token_type_ids.append(feature.token_type_ids)
train_y.append(feature.label)
input_ids = torch.from_numpy(np.array(input_ids)).long()
attention_mask = torch.from_numpy(np.array(attention_mask)).long()
token_type_ids = torch.from_numpy(np.array(token_type_ids)).long()
train_y = torch.from_numpy(np.array(train_y)).long()
dataset = TensorDataset(input_ids, attention_mask, token_type_ids, train_y)
return dataset
def run(self,PATH, CSV_PATH):
self.simlarity_sentences_examples(PATH)
Train_examples = self.get_train_examples(CSV_PATH)
Test_examples = self.get_test_examples(CSV_PATH)
Train_features = self.convert_examples_to_features(Train_examples, tokenizer, 150)
Test_features = self.convert_examples_to_features(Test_examples, tokenizer, 150)
train_set = self.build_dataset(Train_features)
test_set = self.build_dataset(Test_features)
train_dataloader = DataLoader(train_set, batch_size=8, shuffle=True)
validation_dataloader = DataLoader(test_set, batch_size=8, shuffle=True)
return train_dataloader, validation_dataloader
output_dir = './model/simlarity2/'
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
def train():
processor = NextSentenceProcessor()
train_dataloader, validation_dataloader = processor.run(PATH,CSV_PATH)
# AdamW 是一个 huggingface library 的类,'W' 是'Weight Decay fix"的意思。
optimizer = AdamW(model.parameters(),
lr=2e-5, # args.learning_rate - 默认是 5e-5
eps=1e-8 # args.adam_epsilon - 默认是 1e-8, 是为了防止衰减率分母除到0
)
# bert 推荐 epochs 在2到4之间为好。
epochs = 2
# training steps 的数量: [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs
# 设计 learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps=0, # Default value in run_glue.py
num_training_steps=total_steps)
# 设置总时间.
total_t0 = time.time()
best_val_accuracy = 0
for epoch_i in range(0, epochs):
print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
# 记录每个 epoch 所用的时间
t0 = time.time()
total_train_loss = 0
total_train_accuracy = 0
model.train()
for step, batch in enumerate(train_dataloader):
# 每隔40个batch 输出一下所用时间.
if step % 300 == 0 and not step == 0:
elapsed = format_time(time.time() - t0)
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
# `batch` 包括3个 tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
b_input_ids = batch[0].to(device)
b_input_type = batch[1].to(device)
b_input_mask = batch[2].to(device)
b_labels = batch[3].to(device)
# 清空梯度
model.zero_grad()
# forward
# 参考 https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
loss, logits = model(b_input_ids, b_input_type, b_input_mask, next_sentence_label=b_labels)
total_train_loss += loss.item()
# backward 更新 gradients.
loss.backward()
# 减去大于1 的梯度,将其设为 1.0, 以防梯度爆炸.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# 更新模型参数
optimizer.step()
# 更新 learning rate.
scheduler.step()
logit = logits.detach().cpu().numpy()
label_id = b_labels.to('cpu').numpy()
attention_mask = b_input_mask.cpu().numpy()
# 计算training 句子的准确度.
total_train_accuracy += flat_accuracy(logit, label_id, attention_mask)
# 计算batches的平均损失.
avg_train_loss = total_train_loss / len(train_dataloader)
# 计算训练时间.
training_time = format_time(time.time() - t0)
# 训练集的准确率.
avg_train_accuracy = total_train_accuracy / len(train_dataloader)
print(" 训练准确率: {0:.2f}".format(avg_train_accuracy))
print(" 平均训练损失 loss: {0:.2f}".format(avg_train_loss))
print(" 训练时间: {:}".format(training_time))
# ========================================
# Validation
# ========================================
t0 = time.time()
# 设置 model 为valuation 状态,在valuation状态 dropout layers 的dropout rate会不同
model.eval()
# 设置参数
total_eval_accuracy = 0
total_eval_loss = 0
for batch in validation_dataloader:
# `batch` 包括3个 tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
b_input_ids = batch[0].to(device)
b_input_type = batch[1].to(device)
b_input_mask = batch[2].to(device)
b_labels = batch[3].to(device)
# 在valuation 状态,不更新权值,不改变计算图
with torch.no_grad():
# 参考 https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
loss, logits = model(b_input_ids, b_input_type, b_input_mask, next_sentence_label=b_labels)
# 计算 validation loss.
total_eval_loss += loss.item()
logit = logits.detach().cpu().numpy()
label_id = b_labels.to('cpu').numpy()
attention_mask = b_input_mask.cpu().numpy()
# predicty = np.array([[1 if each > 0.5 else 0 for each in line] for line in logit])
# 计算 validation 句子的准确度.
total_eval_accuracy += flat_accuracy(logit, label_id, attention_mask)
# 计算 validation 的准确率.
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
print("")
print(" 测试准确率: {0:.2f}".format(avg_val_accuracy))
if avg_val_accuracy > best_val_accuracy:
best_val_accuracy = avg_val_accuracy
torch.save(model.state_dict(), output_model_file)
# model.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)
# 计算batches的平均损失.
avg_val_loss = total_eval_loss / len(validation_dataloader)
# 计算validation 时间.
validation_time = format_time(time.time() - t0)
print(" 平均测试损失 Loss: {0:.2f}".format(avg_val_loss))
print(" 测试时间: {:}".format(validation_time))
print("训练一共用了 {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
# 训练模型
#train()
bert 命名实体识别 把entity识别出来
from LoadData import *
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW,BertTokenizer, RobertaForTokenClassification,DataProcessor
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
class enity_identifing(nn.Module):
def __init__(self,embedding_dim):
super(enity_identifing, self).__init__()
self.tokenizer = BertTokenizer.from_pretrained("RoBERTa_zh_Large_PyTorch")
self.embed = nn.Embedding(self.tokenizer.vocab_size, embedding_dim)
self.bert_model = RobertaForTokenClassification.from_pretrained(
"./RoBERTa_zh_Large_PyTorch/", # 使用 12-layer 的 BERT 模型.
num_labels=5000, # 多分类任务的输出标签为 len(tag2idx)个.
output_attentions=False, # 不返回 attentions weights.
output_hidden_states=False, # 不返回 all hidden-states.
).to(device)
initrange = 0.1
self.embed.weight.data.uniform_(-initrange, initrange)
self.rnn_type = "LSTM"
self.nhid = 512
self.rnn = nn.LSTM(5000, self.nhid, bidirectional=True, dropout=0.5).to(device)
self.output = nn.Linear(2 * self.nhid, 1).to(device)
self.loss_fn = nn.BCEWithLogitsLoss().to(device)
self.sig = nn.Sigmoid().to(device)
def forward(self, inputs, type_ids,mask ,y):
# 输入bert
out = self.bert_model(inputs, type_ids, mask)
# 输入LSTM
hidden, states = self.rnn(out[0].contiguous())
logits = self.output(hidden)
loss = self.loss_fn(logits.squeeze(),y.float())*mask
loss = (torch.sum(loss) / torch.sum(mask))
logits = self.sig(logits.squeeze())
return loss, logits
def predict(self, inputs, type_ids, mask ):
out = self.bert_model(inputs, type_ids, mask)
# 输入LSTM
hidden, states = self.rnn(out[0].contiguous())
logits = self.output(hidden)
logits = self.sig(logits.squeeze())
return logits
def restore_entity_from_labels(self,labels, question):
question = self.tokenizer.convert_ids_to_tokens(question)
entitys = []
str = ''
labels = labels[1:-1]
question = question[1:-1]
for i in range(min(len(labels), len(question))):
if labels[i] == 1:
str += question[i]
else:
if len(str):
entitys.append(str)
str = ''
if len(str):
entitys.append(str)
return entitys
def restore_entity_from_labels_on_corpus(self, predicty, questions):
all_entitys = []
for i in range(len(predicty)):
all_entitys.append(self.restore_entity_from_labels(predicty[i], questions[i]))
return all_entitys
class NER_bert:
def __init__(self,embedding_dim):
super(NER_bert, self).__init__()
self.model = enity_identifing(embedding_dim)
# 推荐batch_size 为 16 或者 32
self.batch_size = 8
self.tokenizer = self.model.tokenizer
self.max_seq_len = 40
def format_time(self,elapsed):
elapsed_rounded = int(round((elapsed)))
# 返回 hh:mm:ss 形式的时间
return str(datetime.timedelta(seconds=elapsed_rounded))
def find_lcsubstr(self,s1, s2):
m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)] # 生成0矩阵,为方便后续计算,比字符串长度多了一列
mmax = 0 # 最长匹配的长度
p = 0 # 最长匹配对应在s1中的最后一位
for i in range(len(s1)):
for j in range(len(s2)):
if s1[i] == s2[j]:
m[i + 1][j + 1] = m[i][j] + 1
if m[i + 1][j + 1] > mmax:
mmax = m[i + 1][j + 1]
p = i + 1
return s1[p - mmax:p]
def GetXY(self, questions, entitys):
X1, X2, X3, Y = [], [], [], []
for i in range(len(questions)):
q = questions[i]
encoded_dict = self.tokenizer(q, max_length=self.max_seq_len, pad_to_max_length=True,
return_tensors='pt') # 分别是 词索引序列和分块索引序列
x1, x2, x3 = encoded_dict["input_ids"][0], encoded_dict["token_type_ids"][0], \
encoded_dict["attention_mask"][0]
y = [[0] for j in range(self.max_seq_len)]
assert len(x1) == len(y)
for e in entitys[i]:
# 得到实体名和问题的最长连续公共子串
e = self.find_lcsubstr(e, q)
if e in q:
begin = q.index(e) + 1
end = begin + len(e)
if end < self.max_seq_len - 1:
for pos in range(begin, end):
y[pos] = [1]
X1.append(x1.tolist())
X2.append(x2.tolist())
X3.append(x3.tolist())
Y.append(y)
X1 = torch.tensor(X1).long()
X2 = torch.tensor(X2).long()
X3 = torch.tensor(X3).long()
Y = torch.tensor(np.array(Y)).squeeze().long()
return X1, X2, X3, Y
def my_dataloader(self,PATH):
train_corpus = pickle.load(open(PATH + 'corpus_train.pkl', 'rb'))
train_questions = [train_corpus[i]['question'] for i in range(len(train_corpus))]
train_entitys = [train_corpus[i]['gold_entitys'] for i in range(len(train_corpus))]
train_entitys = [[entity[1:-1].split('_')[0] for entity in line] for line in train_entitys]
test_corpus = pickle.load(open(PATH + 'corpus_test.pkl', 'rb'))
test_questions = [test_corpus[i]['question'] for i in range(len(test_corpus))]
test_entitys = [test_corpus[i]['gold_entitys'] for i in range(len(test_corpus))]
test_entitys = [[entity[1:-1].split('_')[0] for entity in line] for line in test_entitys]
trainx1, trainx2, trainx3, trainy = self.GetXY(train_questions, train_entitys)
testx1, testx2, testx3, testy = self.GetXY(test_questions, test_entitys)
# 把input 放入 TensorDataset。
train_dataset = TensorDataset(trainx1, trainx2, trainx3, trainy)
test_dataset = TensorDataset(testx1, testx2, testx3, testy)
# 为训练数据集和验证数据集设计DataLoaders.
train_dataloader = DataLoader(
train_dataset, # 训练数据.
sampler=RandomSampler(train_dataset), # 打乱顺序
batch_size=self.batch_size
)
validation_dataloader = DataLoader(
test_dataset, # 验证数据.
sampler=RandomSampler(test_dataset), # 打乱顺序
batch_size=self.batch_size
)
return train_dataloader,validation_dataloader
def flat_accuracy(self, preds, labels, attention):
scores = (preds * attention == labels * attention)
rights = 0
for score in scores:
if sum(score) == len(labels[0]):
rights += 1
return rights / len(labels)
def eval(self,validation_dataloader):
# 设置 model 为valuation 状态,在valuation状态 dropout layers 的dropout rate会不同
self.model.eval()
# 设置参数
t0 = time.time()
total_eval_accuracy = 0
total_eval_loss = 0
for batch in validation_dataloader:
# `batch` 包括3个 tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
b_input_ids = batch[0].to(device)
b_input_type = batch[1].to(device)
b_input_mask = batch[2].to(device)
b_labels = batch[3].to(device)
# 在valuation 状态,不更新权值,不改变计算图
with torch.no_grad():
# 参考 https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
loss, logits = self.model(b_input_ids, b_input_type, b_input_mask, b_labels)
# 计算 validation loss.
total_eval_loss += loss.item()
logit = logits.detach().cpu().numpy()
label_id = b_labels.to('cpu').numpy()
attention_mask = b_input_mask.cpu().numpy()
predicty = np.array([[1 if each > 0.5 else 0 for each in line] for line in logit])
# 计算 validation 句子的准确度.
total_eval_accuracy += self.flat_accuracy(predicty, label_id, attention_mask)
# 计算 validation 的准确率.
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
print("")
print(" 测试准确率: {0:.2f}".format(avg_val_accuracy))
# 计算batches的平均损失.
avg_val_loss = total_eval_loss / len(validation_dataloader)
# 计算validation 时间.
validation_time = self.format_time(time.time() - t0)
print(" 平均测试损失 Loss: {0:.2f}".format(avg_val_loss))
print(" 测试时间: {:}".format(validation_time))
return avg_val_accuracy
def run(self,PATH):
train_dataloader, validation_dataloader = self.my_dataloader(PATH)
# bert 推荐 epochs 在2到4之间为好。
epochs = 4
# AdamW 是一个 huggingface library 的类,'W' 是'Weight Decay fix"的意思。
self.optimizer = AdamW(self.model.parameters(),
lr=2e-5, # args.learning_rate - 默认是 5e-5
eps=1e-8 # args.adam_epsilon - 默认是 1e-8, 是为了防止衰减率分母除到0
)
# training steps 的数量: [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs
# 设计 learning rate scheduler.
self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
num_warmup_steps=0, # Default value in run_glue.py
num_training_steps=total_steps)
output_dir = './model/ner/'
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
training_stats = []
# 设置总时间.
total_t0 = time.time()
best_val_accuracy = 0
for epoch_i in range(0, epochs):
print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
# ========================================
# training
# ========================================
self.train(train_dataloader)
# ========================================
# Validation
# ========================================
avg_val_accuracy = self.eval(validation_dataloader)
if avg_val_accuracy > best_val_accuracy:
best_val_accuracy = avg_val_accuracy
torch.save(self.model.state_dict(), output_model_file)
self.tokenizer.save_vocabulary(output_dir)
print("训练一共用了 {:} (h:mm:ss)".format(self.format_time(time.time() - total_t0)))
def train(self,train_dataloader):
# 记录每个 epoch 所用的时间
t0 = time.time()
total_train_loss = 0
total_train_accuracy = 0
self.model.train()
for step, batch in enumerate(train_dataloader):
# 每隔40个batch 输出一下所用时间.
if step % 100 == 0 and not step == 0:
elapsed = self.format_time(time.time() - t0)
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
# `batch` 包括3个 tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
b_input_ids = batch[0].to(device)
b_input_type = batch[1].to(device)
b_input_mask = batch[2].to(device)
b_labels = batch[3].to(device)
# 清空梯度
self.model.zero_grad()
# forward
# 参考 https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
loss, logits = self.model(b_input_ids, b_input_type, b_input_mask, b_labels)
total_train_loss += loss.item()
# backward 更新 gradients.
loss.backward()
# 减去大于1 的梯度,将其设为 1.0, 以防梯度爆炸.
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
# 更新模型参数
self.optimizer.step()
# 更新 learning rate.
self.scheduler.step()
logit = logits.detach().cpu().numpy()
label_id = b_labels.to('cpu').numpy()
attention_mask = b_input_mask.cpu().numpy()
predicty = np.array([[1. if each > 0.5 else 0 for each in line] for line in logit])
# 计算training 句子的准确度.
total_train_accuracy += self.flat_accuracy(predicty, label_id, attention_mask)
# 计算batches的平均损失.
avg_train_loss = total_train_loss / len(train_dataloader)
# 计算训练时间.
training_time = self.format_time(time.time() - t0)
# 训练集的准确率.
avg_train_accuracy = total_train_accuracy / len(train_dataloader)
print(" 训练准确率: {0:.2f}".format(avg_train_accuracy))
print(" 平均训练损失 loss: {0:.2f}".format(avg_train_loss))
print(" 训练时间: {:}".format(training_time))
return total_train_loss,total_train_accuracy
# processor = NER_bert(1000)
# processor.run(PICKLE_PATH)
提取有可能的谓语
import pickle
import codecs as cs
import re
import thulac
from LoadData import *
class PropExtractor(object):
def __init__(self):
self.prop_dic, self.char_2_prop = self.load_data()
self.segger = thulac.thulac()
def load_data(self):
prop_dic = dict()
entities_tuples = pickle.load(open(PICKLE_PATH + 'NEW_ENTITI_ANSER.pkl', 'rb'))
for entity_tuple in entities_tuples:
property = entity_tuple.split('-')[-1]
if property in prop_dic:
prop_dic[property] += 1
else:
prop_dic[property] = 1
char_2_prop = dict()
for prop in prop_dic:
if len(prop) < 20: # 这里设置最大长度,不考虑长度过长的属性值
chars = set(prop)
for char in chars:
try:
char_2_prop[char].append(prop)
except:
char_2_prop[char] = [prop]
return prop_dic,char_2_prop
def extract_properties(self, question):
'''
输入一个问题,抽取出所有能和知识库中的属性值匹配的字符串,筛选后返回
input:
question : python-str
output:
props : python-dic
'''
props = {} # 键为知识库里prop,值为mention
QUES = question
# 包含在双引号 书名号里的属性
mark_props = {}
elements = re.findall('\".+\"|《.+》', question)
if len(elements) > 0:
for e in elements: # '甲天下', '完美的搜索引擎,'
if e in self.prop_dic: # 一般书名号的属性就是需要的属性
mark_props[e] = e
question = re.sub(e, '', question)
props['mark_props'] = mark_props
# 时间属性
time_props = {}
# 提取年月日
year_month_day = re.findall('\d+年\d+月\d+日|\d+年\d+月\d+号|\d+\.\d+\.\d+', question)
for ymd in year_month_day:
rml_norm = self.TransNormalTime(ymd)
time_props[rml_norm] = ymd
question = re.sub(ymd, '', question)
# 提取月日
month_day = re.findall('\d+月\d+日|\d+月\d+号|\d+年\d+月', question)
for ymd in month_day:
rml_norm = self.TransNormalTime(ymd)
time_props[rml_norm] = ymd
question = re.sub(ymd, '', question)
# 提取年份
years = re.findall('\d+年', question)
for ymd in years:
rml_norm = self.TransNormalTime(ymd)
time_props[rml_norm] = ymd
question = re.sub(ymd, '', question)
props['time_props'] = time_props
# 数字属性
digit_props = {}
elements = re.findall('\d+', question)
if len(elements) > 0:
for e in elements:
if e in self.prop_dic:
digit_props[e] = e
props['digit_props'] = digit_props
# 其他属性,去重
other_props = {}
length = len(question)
props_ngram = []
max_len = 0
for l in range(length, 0, -1): # 只考虑长度大于1的可匹配属性值
for i in range(length - l + 1):
if question[i:i + l] in self.prop_dic:
props_ngram.append(question[i:i + l])
if len(question[i:i + l]) > max_len:
max_len = len(question[i:i + l])
stop_props = []
for p in props_ngram:
for q in props_ngram:
if p in q and p != q and self.segger.cut(p)[0][1] not in ['ns']: # 加拿大的,台湾的等问题 p不是地名
stop_props.append(p)
new_props = [] # 去掉包含在更长属性值中的属性值
for p in props_ngram:
if p not in stop_props:
new_props.append(p)
new_new_props = [] # 去掉长度过于短的属性值
for p in new_props:
if len(p) == 1 and self.segger.cut(p)[0][1] in ['n']: # 单字名词
new_new_props.append(p)
elif (len(p) >= (max_len * 0.5) and len(p) != 1) or self.segger.cut(p)[0][1] in ['n',
'ns'] or self.exist_digit(
p): # 长度过短且词性名词比较重要
new_new_props.append(p)
for p in new_new_props:
other_props[p] = p
props['other_props'] = other_props
# 模糊匹配得到的属性
stop_dic = {'有', '的', '是', '在', '上', '哪', '里', '\"', '什', '么', '中', '个'}
prop2num = {}
for char in QUES:
if char in stop_dic:
continue
else:
try:
for p in self.char_2_prop[char]:
if p in prop2num:
prop2num[p] += 1
else:
prop2num[p] = 1
except:
continue
sort_props = sorted(prop2num.items(), key=lambda prop2num: prop2num[1], reverse=True)
top3_props = [key for key, value in sort_props[:3]] # top3
fuzzy_props = {}
for p in top3_props:
fuzzy_props[p] = p
props['fuzzy_props'] = fuzzy_props # 取与问题中匹配字数最多的属性作为候选
return props
def extract_subject_properties(self, question):
'''
输入一个问题,抽取出所有能和知识库中的属性值匹配的字符串,并将更有可能作为简单问题主语的属性值提取出来
input:
question : python-str
output:
props : python-dic
'''
pred_props = self.extract_properties(question)
if len(pred_props['mark_props']) != 0:
subject_props = pred_props['mark_props']
elif len(pred_props['time_props']) != 0:
subject_props = pred_props['time_props']
elif len(pred_props['digit_props']) != 0:
subject_props = pred_props['digit_props']
else:
subject_props = pred_props['other_props']
subject_props.update(pred_props['fuzzy_props'])
return subject_props
def GetProps(self, corpus):
gold_num = 0
true_num = 0
entity_error = []
irregular = []
all_props_num = 0.0
for i in range(len(corpus)):
question = corpus[i]['question']
#gold_entitys = corpus[i]['gold_entitys']
gold_entitys = corpus[i]['entity_mention']
# 提取gold props
gold_props = []
for x in gold_entitys:
#print(x)
# if x == "汪炜_(黄山市民营企业家协会二届会长)":
# print("yes")
# print(x)
if len(x) == 0:
continue
if x[0] == '\"':
if x[0] == "汪炜_(黄山市民营企业家协会二届会长)":
print("-----")
gold_props.append(x)
# 得到抽取出的属性字典并保存
pred_props = self.extract_properties(question) # 得到的均不包含引号
corpus[i]['all_props'] = pred_props
# 得到所有可能的属性corpus[i]['subject_props']
subject_props = {}
subject_props.update(pred_props['mark_props'])
subject_props.update(pred_props['time_props'])
subject_props.update(pred_props['digit_props'])
subject_props.update(pred_props['other_props'])
subject_props.update(pred_props['fuzzy_props'])
corpus[i]['subject_props'] = subject_props
all_props_num += len(corpus[i]['subject_props'])
# 统计该模块抽取唯一主语实体的召回率
if len(gold_props) == 1 and len(gold_entitys) == 1:
gold_num += 1
if_same = self.CheckSame(gold_props, subject_props) # 判断抽取出的属性值是否完全包括了gold props
true_num += if_same
if not if_same:
print('主语属性值抽取失败')
entity_error.append(i)
else:
print('主语属性值抽取成功')
print(i, question)
print(gold_props)
print(subject_props)
print('\n')
if gold_num != 0:
print('单主语且主语为属性值问题中,能找到所有主语属性值的比例为:%.2f' % (true_num / gold_num))
else:
print('单主语且主语为属性值问题中,找不到主语属性值')
print('平均每个问题属性为:%.2f' % (all_props_num / len(corpus)))
print(entity_error)
print(irregular)
return corpus
def CheckSame(self, gold_props, pred_props):
pred_props_list = []
for p in pred_props: # 取得是key键
pred_props_list.append('\"' + p + '\"')
join_props = set(pred_props_list).intersection(set(gold_props))
if len(join_props) == len(gold_props):
return 1
else:
return 0
def exist_digit(self, p):
'''
判断字符串中是否存在数字
'''
for i in range(10):
if str(i) in p:
return 1
return 0
def TransNormalTime(self, time):
digits = re.findall('\d+', time)
elements = []
for d in digits:
if len(d) > 2:
elements.append(d)
elif len(d) == 2:
if int(d[0]) > 3:
elements.append('19' + d)
else:
elements.append(d)
else:
elements.append('0' + d)
return '-'.join(elements)
inputpaths = ['ENTITY_MENTIONS_VALIDATION3.pkl']
outputpaths = ['ALL_MENTIONS_VALIDATION3.pkl']
starttime = time.time()
pe = PropExtractor()
for i in range(0, 1):
inputpath = inputpaths[i]
outputpath = outputpaths[i]
corpus = pickle.load(open(PICKLE_PATH + inputpath, 'rb'))
corpus = pe.GetProps(corpus)
print('得到实体mention')
pickle.dump(corpus, open(PICKLE_PATH + outputpath, 'wb'))
print('耗费时间%.2f秒' % (time.time() - starttime))
#### 开始预测
from answer_score_bert import *
import pymysql
class Entity_answer():
    """Retrieve KB answers for linked questions.

    For each question record, candidate (entity, predicate) pairs are looked
    up in the MySQL ``kb`` table and the matched triples are scored with the
    BERT next-sentence model imported from ``answer_score_bert``
    (``tokenizer``/``model``); the highest-scoring object string is returned
    as the answer.
    """

    def __init__(self):
        # entity mention -> properties observed for that entity in the KB
        self.entity_to_pro = pickle.load(open(PICKLE_PATH + 'ENTITY_TO_PROPERTIES.pkl', 'rb'))
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            password='123',
            db='pkubase',
            charset='utf8',
            # autocommit=True,  # only relevant for INSERTs; same effect as conn.commit()
        )
        # Cursor used for all KB lookups; released in run().
        self.cur = self.conn.cursor()
        self.predicates = pickle.load(open(PICKLE_PATH + 'PREDICATES.pkl', 'rb'))
        self.words_Frequent_dic = pickle.load(open(PICKLE_PATH + 'WORDS_FREQUENCY.pkl', 'rb'))
        self.stopwords = self.stopwordslist(PATH + 'stop_words.txt')
        # Question boilerplate that must never be mistaken for a predicate.
        self.pass_words = {'是什么', '在哪里', '哪里', '什么', '提出的', '有什么', '国家', '哪个', '所在', '哪一年',
                           '培养出', '为什么', '什么时候', '人', '你知道', '都包括', '是谁', '告诉我', '又叫做', '有', '是'}

    def stopwordslist(self, filepath):
        """Load one stop word per line from *filepath*."""
        # Fixed: the original left the file handle open; a context manager
        # closes it deterministically.
        with open(filepath, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f.readlines()]

    def movestopwords(self, sentence):
        """Strip leading/trailing stop-word characters from *sentence*.

        Known frequent words and known KB entities are returned untouched;
        the result is never trimmed below a single character.
        """
        if sentence not in self.words_Frequent_dic and sentence not in self.entity_to_pro:
            while len(sentence) != 1:
                if sentence[0] in self.stopwords:
                    sentence = sentence[1:]
                else:
                    break
            while len(sentence) != 1:
                if sentence[-1] in self.stopwords:
                    sentence = sentence[:-1]
                else:
                    break
        return sentence

    def predicate(self, data):
        """Pick the best KB answer for one question record.

        *data* carries 'question', 'entity_mention' and 'subject_props'
        keys (produced by the upstream extraction stages).  Returns the
        answer string, or '' when no candidate scores positively.
        """
        question = data['question']
        Entities = list(data['entity_mention'])
        print(question)
        Predicate_List = jieba.cut_for_search(question.replace("?", "").replace("。", ""))
        Predicate_List = (" ".join(Predicate_List)).split(" ")
        Properties = list(data['subject_props'])
        # Roughly locate predicate candidates from the jieba segmentation.
        Predicates = []
        for e in (Predicate_List + Properties):
            # Single-character predicates we explicitly keep.
            pass_predicates = ["高"]
            if e in pass_predicates:
                Predicates.append(e)
                continue
            if e not in self.words_Frequent_dic:
                continue
            # Skip tokens that are substrings of question boilerplate.
            if any(pw.find(e) != -1 for pw in self.pass_words):
                continue
            if e in question:
                if e == "属于":
                    e = "类型"
                elif e == "科":
                    e = "科室"
                # Fixed: the original `"生" and "年" in question` only tested
                # `"年" in question` because "生" is always truthy.
                elif "生" in question and "年" in question:
                    e = "出生日期"
            else:
                # Fixed: same truthiness bug as above for "生"/"地".
                if "生" in question and "地" in question:
                    e = "出生地"
            if e not in Predicates:
                for predicate in self.predicates:
                    if predicate.find(e) != -1:
                        Predicates.append(predicate)
        if Predicates == []:
            print("空的!!!!")
        print(Entities)
        print(Predicates)
        start = time.time()
        count = 0
        high_score = 0
        predicate_set = set(Predicates)  # hoisted: was rebuilt inside the loops
        nu_tuples = len(predicate_set) * len(Entities)
        answer = ""
        for entity in Entities:
            # For huge candidate spaces, additionally require the mention to
            # literally occur in the question text.
            if nu_tuples > 100000000:
                if entity not in self.entity_to_pro or question.find(entity) == -1:
                    continue
            if entity not in self.entity_to_pro or "\"" in entity:
                continue
            pros = self.entity_to_pro[entity]
            for prop in predicate_set:
                count += 1
                if prop not in pros:
                    continue
                candidate = "\'" + entity + "-" + prop + "\'"
                # Fixed: parameterized query instead of string-built SQL.
                # cursor.execute returns the number of matched rows.
                result = self.cur.execute("SELECT * FROM kb WHERE candidate = %s", (candidate,))
                if result != 1:
                    continue
                info = self.cur.fetchall()
                subject = info[0][1][1:-1]
                # BERT next-sentence prediction: label 0 = plausible answer.
                encoding = tokenizer(question, subject + "|||" + info[0][2][1:-1], return_tensors='pt')
                logits = model(**encoding.to(device))
                if torch.argmax(logits[0], dim=-1) == 0:
                    # Keep the answer with the highest score margin seen so far.
                    if logits[0][0][0] - logits[0][0][1] > high_score:
                        high_score = logits[0][0][0] - logits[0][0][1]
                        answer = info[0][3]
                        # One-hop expansion: if the answer object is itself an
                        # entity carrying a wanted predicate, follow it.
                        for element in answer.split("\t"):
                            element = element.replace("\t", "")[1:-1]
                            if element in self.entity_to_pro:
                                for pro in self.entity_to_pro[element]:
                                    if pro in predicate_set:
                                        candidate = "\'" + element.split("_")[0] + "-" + pro + "\'"
                                        if 1 == self.cur.execute("SELECT * FROM kb WHERE candidate = %s", (candidate,)):
                                            info = self.cur.fetchall()
                                            answer = info[0][3]
        training_time = format_time(time.time() - start)
        print("count: ", count)
        print("时间: {:}".format(training_time))
        print(answer)
        return answer

    def run(self, data):
        """Answer every record in *data* (from index 20 on), append each
        answer to ./data/answer1.txt, then release the DB resources."""
        print("一共", len(data), "数据")
        # for i in range(1, len(data)):
        for i in range(20, len(data)):
            with open('./data/answer1.txt', 'a', encoding="utf-8") as f:
                print(i + 1)
                t0 = time.time()
                # Fixed: call through self instead of the module-level
                # `answers` global the original happened to rely on.
                record = self.predicate(data[i])
                f.write(record)
                print(record)
                print("训练一共用了 {:} (h:mm:ss)".format(format_time(time.time() - t0)))
                # (removed redundant f.close(); the with-block closes the file)
        # NOTE(review): `total_time` is a module-level start timestamp set by
        # the calling script — confirm it is defined before run() is invoked.
        print("训练一共用了 {:} (h:mm:ss)".format(format_time(time.time() - total_time)))
        # Close the cursor first, then the connection.
        self.cur.close()
        self.conn.close()
# Load the linked-mention data produced upstream and run answer prediction.
with open(PICKLE_PATH + 'ALL_MENTIONS_VALIDATION2.pkl', 'rb') as f:
    data = pickle.load(f)
answers = Entity_answer()  # NOTE: Entity_answer internals read this global name
total_time = time.time()   # start timestamp read inside Entity_answer.run
answers.run(data)
#### 测试函数
from LoadData import *
class Eval(object):
    """
    Entity Linking Evaluation

    Compares predicted properties against gold ones sample by sample and
    reports the per-sample-averaged precision, recall and F1.  (The
    historical method name ``micro_f1`` is kept for compatibility.)
    """
    def __init__(self, golden_file_path):
        # Path to the pickled list of evaluation records.
        self.golden_file_path = golden_file_path
        # NOTE(review): the user file defaults to the golden file — presumably
        # meant to be overwritten by callers; confirm.
        self.user_file_path = golden_file_path
        self.tp = 0
        self.fp = 0
        self.total_recall = 0
        self.errno = None

    def micro_f1(self):
        """
        :return: (precision, recall, f1) as floats; (0, 0, 0) when undefined.
        """
        self.tp = 0
        self.fp = 0
        self.precision = 0
        self.recall = 0
        self.total_recall = 0
        # Fixed: close the pickle file via a context manager.
        with open(self.golden_file_path, 'rb') as f:
            entities_ansewers = pickle.load(f)
        # Fixed: guard against ZeroDivisionError on an empty result set.
        if not entities_ansewers:
            return 0, 0, 0
        for line in entities_ansewers:
            print(line)
            # Predicted properties: space-separated tokens, empties dropped.
            predicted = [p for p in line[3][2].split(" ") if p]
            # Gold properties: strip the surrounding quote characters.
            gold = [g[1:-1] for g in line[1] if g]
            hits = sum(1 for p in set(predicted) if p in gold)
            self.precision += hits / len(predicted) if predicted else 0
            self.recall += hits / len(gold) if gold else 0
        precision = self.precision / len(entities_ansewers)
        recall = self.recall / len(entities_ansewers)
        denominator = precision + recall
        if denominator == 0:
            return 0, 0, 0
        return precision, recall, 2 * precision * recall / denominator
# Evaluate the predicted properties in RESULT2.pkl against the gold labels.
# NOTE(review): `eval` shadows the builtin of the same name; consider renaming.
eval = Eval('./data/pickle/RESULT2.pkl')
prec, recall, f1 = eval.micro_f1()
print(prec, recall, f1)
# errno is initialized to None and never set by micro_f1 in the visible code,
# so this branch appears inert here — confirm against other callers.
if eval.errno:
    print(eval.errno)