Entity Labeling
Text entity labeling, used to build training and test sets for NER and similar tasks.
1. Input (raw file: output/data.txt)
The entities involved are:
PER: 克马尔 让娜 佩雷拉
ORG: 家委会 沙特队
LOC: …
2. Output (labeling result: output/labeling.txt):
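Each line of labeling.txt holds one character followed by its BIO tag, with a blank line between sentences. An illustrative fragment (the sentence itself is made up, using the PER example above):

克 B-PER
马 I-PER
尔 I-PER
进 O
球 O
了 O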
3. Labeling code
class LabelingSentences():
    def __init__(self):
        self.perList = self.ner_list('per.txt')  # load PER entities into perList
        self.locList = self.ner_list('loc.txt')  # load LOC entities into locList
        self.orgList = self.ner_list('org.txt')  # load ORG entities into orgList

    def ner_list(self, file):
        nerList = []
        with open(file, 'r', encoding='UTF-8-sig') as f:
            for i in f.readlines():
                i = i.strip('\n').replace(" ", "")
                nerList.append(i)
        return nerList

    def ner(self):
        L_final = []
        all_write = open('output/labeling.txt', 'a', encoding='utf-8')
        with open('output/data.txt', 'r', encoding='UTF-8-sig') as f:
            # label every line of the raw file
            for line in f.readlines():
                if line.split():
                    line = line.strip('\n')
                    line_list = list(line)
                    # per-character tag list for this line, initialized to "O"
                    type_ner = ["O"] * len(line_list)
                    # collect the entities that occur in this line
                    entity_list = {}
                    for per in self.perList:
                        if per in line:
                            entity_list[per] = "PER"
                    for loc in self.locList:
                        if loc in line:
                            entity_list[loc] = "LOC"
                    for org in self.orgList:
                        if org in line:
                            entity_list[org] = "ORG"
                    # mark every occurrence of every entity with B-/I- tags
                    for entity in entity_list.keys():
                        search = str(entity)
                        len_a = len(search)
                        start = 0
                        while True:
                            index = line.find(search, start)
                            if index == -1:
                                break
                            index_s = index
                            index_e = index + len_a
                            for j in range(index_s, index_e):
                                if j == index_s:
                                    type_ner[j] = "B-" + entity_list[entity]
                                else:
                                    type_ner[j] = "I-" + entity_list[entity]
                            start = index + 1
                    # write one "char tag" pair per line, blank line between sentences
                    final_result = []
                    for i in range(0, len(line_list)):
                        final_result.append(' '.join([line_list[i], type_ner[i]]))
                    final_result.append('\n')
                    L_final = L_final + final_result
                    end = '\n'.join(final_result)
                    all_write.write(end)
        all_write.close()
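A minimal usage sketch (assuming per.txt, loc.txt and org.txt sit in the working directory and output/data.txt already exists; this entry point is not part of the original script):

if __name__ == '__main__':
    labeler = LabelingSentences()   # load the three entity lists
    labeler.ner()                   # write char-level BIO tags to output/labeling.txt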
Model Training
Install bert4keras: pip install bert4keras
Environment:
In principle the code is compatible with both Python 2 and Python 3; the experiments here were run with Python 2.7, TensorFlow 1.14+ and Keras 2.3.1 (tested under Keras 2.2.4, 2.3.0, 2.3.1 and tf.keras).
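For example, an environment matching the versions above could be set up roughly like this (the exact pins are only a suggestion, not taken from the original post):

pip install tensorflow==1.14.0 keras==2.3.1 bert4keras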
#! -*- coding: utf-8 -*-
# Chinese named entity recognition with a CRF head
# 数据集 http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# In practice the dev-set F1 reaches about 96.18% and the test-set F1 about 95.35%
import numpy as np
import os
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, ViterbiDecoder, to_array
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense
from keras.models import Model
from tqdm import tqdm
from keras.models import load_model
import joblib  # joblib module (imported here but not actually used below)
maxlen = 256
epochs = 10
batch_size = 32
bert_layers = 12
learning_rate = 1e-5  # the smaller bert_layers is, the larger the learning rate should be
crf_lr_multiplier = 1000  # enlarge the learning rate of the CRF layer when necessary
# BERT/ELECTRA configuration
curPath = os.path.abspath(os.path.dirname(__file__))
config_path = os.path.join(curPath,'model/electra/electra_small/config.json')
checkpoint_path = os.path.join(curPath,'model/electra/electra_small/electra_small')
dict_path = os.path.join(curPath,'model/electra/electra_small/vocab.txt')
def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
        for l in f.split('\n\n'):
            if not l:
                continue
            d, last_flag = [], ''
            for c in l.split('\n'):
                char, this_flag = c.split(' ')
                if this_flag == 'O' and last_flag == 'O':
                    d[-1][0] += char
                elif this_flag == 'O' and last_flag != 'O':
                    d.append([char, 'O'])
                elif this_flag[:1] == 'B':
                    d.append([char, this_flag[2:]])
                else:
                    d[-1][0] += char
                last_flag = this_flag
            D.append(d)
    return D
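# Illustration (hypothetical input, not taken from the original data): for a block such as
#     克 B-PER
#     马 I-PER
#     尔 I-PER
#     说 O
#     话 O
# load_data returns the sentence entry [['克马尔', 'PER'], ['说话', 'O']],
# i.e. consecutive characters are merged into (text, label) segments.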
# Labeled data
train_data = load_data(os.path.join(curPath,'NERdata/sequence_labeling/data.train'))
valid_data = load_data(os.path.join(curPath,'NERdata/sequence_labeling/data.dev'))
test_data = load_data(os.path.join(curPath,'NERdata/sequence_labeling/data.test'))
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Label (category) mapping
labels = ['OS', 'LOC', 'SYS', 'SER']
id2label = dict(enumerate(labels)) #{0: 'OS', 1: 'LOC', 2: 'SYS',3: 'SER'}
label2id = {j: i for i, j in id2label.items()} #{'OS': 0, 'LOC': 1, 'SYS': 2, 'SER':3}
num_labels = len(labels) * 2 + 1
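# Tag-to-id scheme used by the data generator below: O -> 0, B-<label> -> 2*label_id + 1,
# I-<label> -> 2*label_id + 2. With the four labels above this gives B-OS -> 1, I-OS -> 2,
# B-LOC -> 3, I-LOC -> 4, and so on, hence num_labels = 2 * 4 + 1 = 9.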
class data_generator(DataGenerator):
    """Data generator"""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.sample(random):
            token_ids, labels = [tokenizer._token_start_id], [0]
            for w, l in item:
                w_token_ids = tokenizer.encode(w)[0][1:-1]
                if len(token_ids) + len(w_token_ids) < maxlen:
                    token_ids += w_token_ids
                    if l == 'O':
                        labels += [0] * len(w_token_ids)
                    else:
                        B = label2id[l] * 2 + 1
                        I = label2id[l] * 2 + 2
                        labels += ([B] + [I] * (len(w_token_ids) - 1))
                else:
                    break
            token_ids += [tokenizer._token_end_id]
            labels += [0]
            segment_ids = [0] * len(token_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
"""
后面的代码使用的是bert类型的模型,如果你用的是albert,那么前几行请改为:
model = build_transformer_model(
config_path,
checkpoint_path,
model='albert',
)
output_layer = 'Transformer-FeedForward-Norm'
output = model.get_layer(output_layer).get_output_at(bert_layers - 1)
"""
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='electra',
)
output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
output = model.get_layer(output_layer).output
output = Dense(num_labels)(output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output)
model = Model(model.input, output)
model.summary()
model.compile(
    loss=CRF.sparse_loss,
    optimizer=Adam(learning_rate),
    metrics=[CRF.sparse_accuracy]
)
class NamedEntityRecognizer(ViterbiDecoder):
    """Named entity recognizer"""
    def recognize(self, text):
        tokens = tokenizer.tokenize(text)
        while len(tokens) > 512:
            tokens.pop(-2)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        token_ids, segment_ids = to_array([token_ids], [segment_ids])
        nodes = model.predict([token_ids, segment_ids])[0]
        labels = self.decode(nodes)
        entities, starting = [], False
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 2]])
                elif starting:
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False
        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])
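# Usage sketch (illustrative input, assuming trained weights are already in memory):
#     NER.recognize('xxx位于北京')  ->  e.g. [('北京', 'LOC')]
# recognize() returns (entity_text, label) tuples, with labels drawn from `labels` above.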
def evaluate(data):
    """Evaluation: entity-level F1 / precision / recall"""
    X, Y, Z = 1e-10, 1e-10, 1e-10
    print("predict..............")
    for d in tqdm(data):
        text = ''.join([i[0] for i in d])
        R = set(NER.recognize(text))                    # predicted entities
        T = set([tuple(i) for i in d if i[1] != 'O'])   # gold entities
        X += len(R & T)
        Y += len(R)
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
class Evaluator(keras.callbacks.Callback):
    def __init__(self):
        self.best_val_f1 = 0

    def on_epoch_end(self, epoch, logs=None):
        trans = K.eval(CRF.trans)
        NER.trans = trans
        print(NER.trans)
        f1, precision, recall = evaluate(valid_data)
        # save the best weights
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights(os.path.join(curPath, 'output/sequence_labeling.weights'))
        print(
            'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
            (f1, precision, recall, self.best_val_f1)
        )
        f1, precision, recall = evaluate(test_data)
        print(
            'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
            (f1, precision, recall)
        )
if __name__ == '__main__':
    # 1. Model training
    evaluator = Evaluator()
    train_generator = data_generator(train_data, batch_size)
    model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[evaluator]
    )
else:
    # when imported as a module, load the saved weights for prediction
    model.load_weights(os.path.join(curPath, 'output/sequence_labeling.weights'))
    NER.trans = K.eval(CRF.trans)  # sync the loaded CRF transition matrix into the decoder
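Once trained, the weights can be reused from another file: importing this script triggers the else branch above, which rebuilds the model and loads the saved weights. A rough sketch (the module name sequence_labeling is only an assumed filename for this script):

from sequence_labeling import NER   # assumed module name for this script

for entity, label in NER.recognize('待预测的一句话'):   # any input sentence
    print(entity, label)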