NLP项目8-命名实体识别
命名实体识别
- 命名实体识别(Named Entity Recognition,简称 NER):又称作“专名识别”,是指识别文本中具有特定意义的实体,主要包括人名、地名、机构名、专有名词等。
1.分词器
from transformers import AutoTokenizer
# Load the tokenizer published under the 'hfl/rbt6' checkpoint name and print
# its configuration.
tokenizer = AutoTokenizer.from_pretrained('hfl/rbt6') # the name is "hfl" with a lowercase letter L, not the digit 1
print(tokenizer)
BertTokenizerFast(name_or_path='hfl/rbt6', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
2.批编码
# Two sample sentences, already split into one token per list element.
sample_sentences = [
    [
        '海', '钓', 'b', 's', '地', '点', '在', 'x', '门', '与', 'j', '门', '之', '间',
        '的', '海', '域', '。',
    ],
    [
        '这', '座', '依', '山', '傍', '水', '的', '博', '物', '馆', '由', 'g', 'n', 'y',
        'l', '的', '设', '计', '师', '主', '持', '设', '计', ',', 'z', '个', '建', '筑',
        '群', '精', '美', '而', '恢', '宏', '。',
    ],
]

# Encode both sentences in one call. `is_split_into_words=True` tells the
# tokenizer the input is pre-tokenized; `padding=True` pads the shorter
# sentence so both rows of the returned PyTorch tensors have the same length.
tokenizer.batch_encode_plus(
    sample_sentences,
    truncation=True,
    padding=True,
    return_tensors='pt',
    is_split_into_words=True,
)
{'input_ids': tensor([[ 101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032,
7305, 722, 7313, 4638, 3862, 1818, 511, 102, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0],
[ 101, 6821, 2429, 898, 2255, 988, 3717, 4638, 1300, 4289, 7667, 4507,
1744, 1079, 671, 3837, 4638, 6392, 6369, 2360, 712, 2898, 6392, 6369,
8024, 3146, 702, 2456, 5029, 5408, 5125, 5401, 5445, 2612, 2131, 511,
102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
3.数据加载
from datasets import load_from_disk
# Load the dataset that was previously saved under ../data/peoples_daily_ner.
dataset = load_from_disk('../data/peoples_daily_ner')
# Bare expression: in a notebook this displays the splits and their sizes.
dataset
DatasetDict({
train: Dataset({
features: ['id', 'tokens', 'ner_tags'],
num_rows: 20865
})
validation: Dataset({
features: ['id', 'tokens', 'ner_tags'],
num_rows: 2319
})
test: Dataset({
features: ['id', 'tokens', 'ner_tags'],
num_rows: 4637
})
})
dataset['train'][0]
{'id': '0',
'tokens': ['海',
'钓',
'比',
'赛',
'地',
'点',
'在',
'x',
'门',
'与',
'j',
'门',
'之',
'间',
'的',
'海',
'域',
'。'],
'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]}
4.数据集定义和预处理
import torch
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over one split of the on-disk NER corpus.

    Indexing returns a ``(tokens, labels)`` pair: the list of tokens of one
    sentence and the list of integer tag ids, one per token.
    """

    def __init__(self, split):
        """Load ``split`` from disk and drop over-long sentences.

        Args:
            split: Name of the split to load (the notebook uses 'train' and
                'validation').
        """
        # Tag names by id, per the original note:
        # ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
        # NOTE(review): the original note spelled the first tag as '0';
        # presumably the "outside" tag 'O' is meant - confirm against the
        # dataset's feature names.
        dataset = load_from_disk('../data/peoples_daily_ner')[split]

        def f(data):
            # Keep only sentences of at most 512 - 2 tokens, leaving two
            # positions free for the special tokens the tokenizer adds at
            # the start and end of every sequence.
            return len(data['tokens']) <= 512 - 2

        self.dataset = dataset.filter(f)

    def __len__(self):
        """Return the number of sentences left after filtering."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return ``(tokens, ner_tags)`` for the sentence at index ``i``."""
        # Fetch the row once; indexing the underlying dataset twice would
        # materialise the same row twice.
        row = self.dataset[i]
        return row['tokens'], row['ner_tags']
# Build the training split (this rebinds `dataset` to the wrapper class
# instance) and unpack its first sample.
dataset = Dataset('train')
tokens, labels = dataset[0]
# Bare expression: displays the number of samples kept, followed by the first
# sample's tokens and tag ids.
len(dataset), tokens, labels
(20852,
['海',
'钓',
'比',
'赛',
'地',
'点',
'在',
'x',
'门',
'与',
'j',
'门',
'之',
'间',
'的',
'海',
'域',
'。'],
[0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0])
5.重写Collate_fn 批量读取数据
def collate_fn(data):
    """Collate ``(tokens, labels)`` samples into one padded training batch.

    Args:
        data: List of ``(tokens, labels)`` pairs as produced by the dataset.

    Returns:
        A pair ``(inputs, labels)``: the tokenizer's batch encoding (PyTorch
        tensors padded to the longest sequence in the batch) and a
        ``LongTensor`` of tag ids with the same ``[batch, seq_len]`` shape.
    """
    token_lists = [sample[0] for sample in data]
    tag_lists = [sample[1] for sample in data]

    inputs = tokenizer.batch_encode_plus(
        token_lists,
        truncation=True,
        padding=True,
        return_tensors='pt',
        is_split_into_words=True,
    )

    seq_len = inputs['input_ids'].shape[1]

    # Align each tag list with its encoded row: id 7 is placed in the first
    # position (the tokenizer prepends a special token there) and fills every
    # position after the real tags; the result is cut to the padded length.
    aligned = [([7] + tags + [7] * seq_len)[:seq_len] for tags in tag_lists]

    return inputs, torch.LongTensor(aligned)
6.数据加载器 Dataset的Tokens转为Loader的Input_ids
# Training loader: shuffled batches of 16 samples, collated by `collate_fn`;
# an incomplete final batch is dropped.
loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=16,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
)

# Pull a single batch so `inputs` and `labels` are available for the
# inspection cells that follow.
for inputs, labels in loader:
    break

# Bare expression: displays the number of batches per epoch.
len(loader)
1303
tokenizer.decode(inputs['input_ids'][0])
'[CLS] 他 们 傲 慢 地 声 称 : 『 我 们 的 生 产 技 术 是 非 常 不 错 的 , 加 工 水 平 是 s j 上 z 先 进 的 , 产 品 j d 不 会 出 现 r h 问 题 。 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'
labels[0]
tensor([7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7])
for k, v in inputs.items():
print(k, v.shape)
input_ids torch.Size([16, 86])
token_type_ids torch.Size([16, 86])
attention_mask torch.Size([16, 86])
7.加载预训练模型
from transformers import AutoModel
# Load the pretrained encoder for the 'hfl/rbt6' checkpoint and move it to
# `device`.
# NOTE(review): `device` is only assigned further down, in the training
# section; when the notebook is run top to bottom that assignment has to be
# executed before this line, otherwise `device` is undefined here.
pretrained = AutoModel.from_pretrained('hfl/rbt6').to(device)
# Print the total number of parameters of the encoder.
print(sum(i.numel() for i in pretrained.parameters()))
59740416
pretrained(**inputs).last_hidden_state.shape
torch.Size([16, 86, 768])
8.定义下游任务模型
class Model(torch.nn.Module):
    """Token-classification head (GRU + linear layer) on top of the encoder.

    The encoder is the module-level ``pretrained`` object. While fine-tuning
    is switched off it is used as an external, frozen feature extractor;
    while fine-tuning is switched on it is attached as ``self.pretrained`` so
    that it becomes a registered sub-module of this model.
    """

    def __init__(self):
        super().__init__()
        # True while the encoder is trained together with the head.
        self.tuneing = False
        # Holds the encoder only while fine-tuning is enabled.
        self.pretrained = None
        # 768 is the encoder's hidden size; 8 is the number of tag ids
        # (0-6 for the NER tags plus 7 for special/padding positions).
        self.rnn = torch.nn.GRU(768, 768, batch_first=True)
        self.fc = torch.nn.Linear(768, 8)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return per-token class scores of shape ``[batch, seq_len, 8]``.

        The returned scores are raw logits. No softmax is applied here
        because ``torch.nn.CrossEntropyLoss`` (used for training) applies
        log-softmax itself; ``argmax`` over the last dimension gives the same
        predictions either way.
        """
        # The encoder inputs are passed by keyword: the encoder's positional
        # order puts ``attention_mask`` before ``token_type_ids``, so passing
        # them positionally in this method's order would swap the two.
        if self.tuneing:
            out = self.pretrained(input_ids=input_ids,
                                  token_type_ids=token_type_ids,
                                  attention_mask=attention_mask).last_hidden_state
        else:
            # Frozen encoder: no gradients are needed for it, so skip building
            # the autograd graph through it.
            with torch.no_grad():
                out = pretrained(input_ids=input_ids,
                                 token_type_ids=token_type_ids,
                                 attention_mask=attention_mask).last_hidden_state

        out, _ = self.rnn(out)
        return self.fc(out)  # [batch, seq_len, 8]

    def fine_tuneing(self, tuneing):
        """Switch encoder fine-tuning on or off.

        Args:
            tuneing: If truthy, the encoder's parameters become trainable,
                the encoder is put in training mode and attached to this
                model. Otherwise its parameters are frozen, it is put in
                evaluation mode and detached from this model.
        """
        self.tuneing = tuneing
        for param in pretrained.parameters():
            param.requires_grad_(bool(tuneing))

        if tuneing:
            pretrained.train()
            self.pretrained = pretrained
        else:
            pretrained.eval()
            self.pretrained = None
# Instantiate the downstream model and run the sample batch through it.
model = Model()
# Bare expression: displays the shape of the model output for that batch.
model(**inputs).shape
torch.Size([16, 86, 8])
labels.shape
torch.Size([16, 86])
inputs['attention_mask'].shape
torch.Size([16, 86])
inputs['attention_mask']
tensor([[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
...,
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0]])
9.对结果和Label进行变形, 移除Pad
def reshape_and_remove_pad(outs, labels, attention_mask):
    """Flatten batch and sequence dimensions and drop padded positions.

    Args:
        outs: Per-token class scores, ``[batch, lens, num_classes]``.
        labels: Per-token tag ids, ``[batch, lens]``.
        attention_mask: ``[batch, lens]``; positions equal to 1 are kept,
            everything else (padding) is removed.

    Returns:
        ``(outs, labels)`` restricted to the kept positions, with shapes
        ``[kept, num_classes]`` and ``[kept]``.
    """
    # Use the actual size of the class dimension instead of a hard-coded 8,
    # so the helper works for any number of classes.
    outs = outs.reshape(-1, outs.shape[-1])  # -> [batch * lens, num_classes]
    labels = labels.reshape(-1)              # -> [batch * lens]

    keep = attention_mask.reshape(-1) == 1   # True for non-padding positions
    return outs[keep], labels[keep]
reshape_and_remove_pad(torch.randn(2, 3, 8), torch.ones(2, 3), torch.ones(2, 3))
(tensor([[-0.3954, -2.0475, -0.4138, -1.2135, 0.7311, -1.9530, -1.1810, -0.4241],
[-1.4138, -0.6140, -1.0825, -0.0478, 0.6542, 0.7930, -0.4205, 1.3577],
[ 0.2879, -0.8446, -1.2150, -0.2138, 0.3338, -1.6618, 0.7989, 2.1103],
[ 0.1951, -2.4724, 0.2160, 1.4213, -0.8427, 1.7165, -0.1597, 2.1683],
[ 0.6389, -1.1863, -1.0862, -1.4376, -0.2547, 0.0889, 1.0011, -0.3153],
[ 0.7700, -0.8136, 0.8115, 0.4005, 1.7522, 1.6029, -1.5573, 0.8988]]),
tensor([1., 1., 1., 1., 1., 1.]))
10.获取正确数量和总数
def get_correct_and_total_count(labels, outs):
    """Count correct predictions overall and on positions whose label is not 0.

    Args:
        labels: Flat tensor of true tag ids, ``[n]``.
        outs: Class scores per position, ``[n, num_classes]``.

    Returns:
        ``(correct, total, correct_content, total_content)`` where the first
        two count over all positions and the last two only over positions
        whose true label differs from 0.
    """
    predictions = outs.argmax(dim=1)  # [n, num_classes] -> [n]
    hits = predictions == labels
    non_zero = labels != 0            # positions carrying a label other than 0

    correct = hits.sum().item()
    total = len(labels)
    correct_content = hits[non_zero].sum().item()
    total_content = len(labels[non_zero])

    return correct, total, correct_content, total_content
get_correct_and_total_count(torch.ones(16), torch.randn(16, 8))
(1, 16, 1, 16)
11.训练
from transformers import AdamW
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)
inputs
{'input_ids': tensor([[ 101, 800, 812, ..., 0, 0, 0],
[ 101, 679, 1398, ..., 0, 0, 0],
[ 101, 1762, 671, ..., 0, 0, 0],
...,
[ 101, 1079, 1849, ..., 0, 0, 0],
[ 101, 915, 1762, ..., 0, 0, 0],
[ 101, 679, 5307, ..., 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
...,
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0]])}
def train(epochs):
    """Train the module-level ``model`` on ``loader`` and checkpoint it.

    Uses a small learning rate when the encoder is being fine-tuned and a
    larger one when only the downstream head is trained. Every 50 steps the
    loss, the overall accuracy and the accuracy on positions whose label is
    not 0 are printed. The whole model object is saved after each epoch.

    Args:
        epochs: Number of passes over ``loader``.
    """
    lr = 2e-5 if model.tuneing else 5e-4
    # torch.optim.AdamW replaces the deprecated `transformers.AdamW`.
    # NOTE(review): eps and weight_decay are set to what the deprecated class
    # used by default, to keep the optimisation unchanged - confirm.
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=lr,
                                  eps=1e-6,
                                  weight_decay=0.0)
    criterion = torch.nn.CrossEntropyLoss()

    model.to(device)
    model.train()

    for epoch in range(epochs):
        for step, (inputs, labels) in enumerate(loader):
            # Move the batch to the same device as the model.
            input_ids = inputs['input_ids'].to(device)
            token_type_ids = inputs['token_type_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = labels.to(device)

            outs = model(input_ids=input_ids,
                         token_type_ids=token_type_ids,
                         attention_mask=attention_mask)

            # Flatten to [kept, classes] / [kept] and drop padded positions so
            # they do not contribute to the loss or the accuracy.
            outs, labels = reshape_and_remove_pad(outs, labels, attention_mask)

            loss = criterion(outs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if step % 50 == 0:
                counts = get_correct_and_total_count(labels, outs)
                accuracy = counts[0] / counts[1]
                accuracy_content = counts[2] / counts[3]
                print(epoch, step, loss.item(), accuracy, accuracy_content)

        # Checkpoint once per epoch.
        # NOTE(review): the original indentation of this line was lost; the
        # per-epoch placement is an assumption - confirm.
        torch.save(model, '../data/中文命名实体识别.model')
# First run: fresh model with the encoder frozen.
model = Model()
model.fine_tuneing(False) # no fine-tuning: only the downstream-task parameters are trained
train(2)
0 0 2.0735361576080322 0.17672413793103448 0.2608695652173913
0 50 1.45952570438385 0.8118421052631579 0.0
...
1 1250 1.379892349243164 0.8941176470588236 0.2831858407079646
1 1300 1.3760504722595215 0.8979591836734694 0.26229508196721313
print(sum(p.numel() for p in model.parameters()))
3549704
# Second run: fresh model with encoder fine-tuning enabled, so the encoder's
# parameters are trained together with the downstream head.
model = Model()
model.fine_tuneing(True)
train(2)
0 0 2.088731288909912 0.008610086100861008 0.046153846153846156
0 50 1.3765133619308472 0.8982857142857142 0.0
...
1 1250 1.3655085563659668 0.9085144927536232 0.24060150375939848
1 1300 1.362223744392395 0.9118012422360249 0.3106796116504854
print(sum(p.numel() for p in model.parameters()))
63290120
12.模型保存
# Save the whole model object (not just a state dict) to the path that the
# test and predict functions load from.
torch.save(model, '../data/中文命名实体识别.model')
13.测试
def test():
    """Evaluate the saved model on five shuffled validation batches.

    Loads the saved model onto the CPU, runs it without gradient tracking on
    the first five batches of 128 validation samples, and prints the overall
    accuracy followed by the accuracy on positions whose label is not 0.
    """
    model_load = torch.load('../data/中文命名实体识别.model', map_location='cpu')
    model_load.eval()

    loader_test = torch.utils.data.DataLoader(
        dataset=Dataset('validation'),
        batch_size=128,
        collate_fn=collate_fn,
        shuffle=True,
        drop_last=True,
    )

    # Running sums of (correct, total, correct_content, total_content).
    sums = [0, 0, 0, 0]

    # Only the first five batches are evaluated.
    for step, (inputs, labels) in zip(range(5), loader_test):
        print(step)

        with torch.no_grad():
            outs = model_load(**inputs)

        outs, labels = reshape_and_remove_pad(outs, labels,
                                              inputs['attention_mask'])
        counts = get_correct_and_total_count(labels, outs)
        sums = [running + new for running, new in zip(sums, counts)]

    correct, total, correct_content, total_content = sums
    print(correct / total, correct_content / total_content)
test()
0
1
2
3
4
0.8940288610312924 0.27610008628127697
14.预测
def predict():
    """Print gold and predicted tags for one batch of validation sentences.

    For each of the 32 sentences in the batch this prints the decoded
    sentence, then one line for the gold tags and one line for the predicted
    tags. In those lines a position with tag 0 is shown as '·' and any other
    position as the decoded token followed by its tag id. A separator line
    closes each sentence.
    """
    model_load = torch.load('../data/中文命名实体识别.model', map_location='cpu')
    model_load.eval()

    loader_test = torch.utils.data.DataLoader(
        dataset=Dataset('validation'),
        batch_size=32,
        collate_fn=collate_fn,
        shuffle=True,
        drop_last=True,
    )

    # Take just the first batch.
    for inputs, labels in loader_test:
        break

    with torch.no_grad():
        outs = model_load(**inputs).argmax(dim=2)

    # The batch holds exactly 32 rows (batch_size=32 with drop_last=True).
    rows = zip(inputs['attention_mask'], inputs['input_ids'], outs, labels)
    for mask_row, ids_row, out_row, label_row in rows:
        select = mask_row == 1  # drop padded positions
        input_id = ids_row[select]
        out = out_row[select]
        label = label_row[select]

        # Original sentence, with the spaces inserted by decode removed.
        print(tokenizer.decode(input_id).replace(' ', ''))

        # Gold tags first, predictions second.
        for tag in (label, out):
            pieces = [
                '·' if tag_id == 0
                else tokenizer.decode(token_id) + str(tag_id.item())
                for token_id, tag_id in zip(input_id, tag)
            ]
            print(''.join(pieces))

        print('=============================================')
predict()
[CLS]老区的人纯朴但较保守,安于贫穷,许多先进农业技术硬是推广不开,特别在养殖方面,许多人家还是照老一套去做,费了老大劲儿却总是zf无门。[SEP]
[CLS]7··································································[SEP]7
[CLS]7··································································[SEP]7
=============================================
[CLS]60年代前,这里是牧牛的草场,如今,矗立在芳草绿树中的片片建筑群,成为宇航员们生活和训练的地方,更成为关于未来太空探索思想的发源地。[SEP]
[CLS]7··································································[SEP]7
[CLS]7··································································[SEP]7
=============================================
[CLS]我立在峭岩上俯身向下看去,深涧万丈,从下到上刀斫斧砍一般![SEP]
[CLS]7·····························[SEP]7
[CLS]7·····························[SEP]7
=============================================
[CLS]没有思想,就没有创造;没有创造,就没有人类的未来。[SEP]
[CLS]7·························[SEP]7
[CLS]7·························[SEP]7
=============================================
[CLS]如今,他在新单位干得有声有色,被提拔为配电室主任。[SEP]
[CLS]7·························[SEP]7
[CLS]7·························[SEP]7
=============================================
[CLS]去年12月3日这天,对于32岁的张雅茹女士而言,是一个值得庆贺的日子。[SEP]
[CLS]7················张1雅2茹2················[SEP]7
[CLS]7···································[SEP]7
=============================================
[CLS]这位新部长说,不用了,咱们一起到食堂吃吧![SEP]
[CLS]7·····················[SEP]7
[CLS]7·····················[SEP]7
=============================================
[CLS][UNK]我是有信心的[UNK](附图片1张)[SEP]
[CLS]7···············[SEP]7
[CLS]7···············[SEP]7
=============================================
[CLS]6月17日下午3时,雨后天晴,天安门广场一片洁净。[SEP]
[CLS]7···············天5安6门6广6场6·····[SEP]7
[CLS]7·························[SEP]7
=============================================
[CLS]卢亮,北京琉璃河水泥厂新线分厂厂长,他在琉璃河水泥厂工作16年间,有9年时间是在国内外进行培训,是琉璃河水泥厂的挑大梁管理干部。[SEP]
[CLS]7卢1亮2·北3京4琉4璃4河4水4泥4厂4新4线4分4厂4·····琉3璃4河4水4泥4厂4·······················琉3璃4河4水4泥4厂4·········[SEP]7
[CLS]7································································[SEP]7
=============================================