1、命名实体识别(Named Entity Recognition, NER)
NER 是 NLP 的基础任务,指从文本中识别出命名性指称项,为关系抽取等任务做铺垫。狭义上,是识别出人名、地名和组织机构名这三类命名实体。当然,在特定领域中,会相应地定义领域内的各种实体类型。
2、常见的公开数据集
CoNLL 2003(https://www.clips.uantwerpen.be/conll2003/ner/)
CoNLL2003 中,实体被标注为四种类型:
①LOC(Location,地名)
②ORG(organization,组织机构)
③PER(person,人名)
④MISC(miscellaneous,其他)
一条标注数据的组织形式如下:
[word][POS tag][chunk tag][NER tag]
3、标注方法
(1) IOB 标注法
IOB 标注法,是 CoNLL2003 采用的标注法,I 表示 Inside,O 表示 Outside,B 表示 Begin。标注为 I-XXX 的 label,表示这个词位于 XXX 类命名实体的内部(inside);标注为 B-XXX 的 label,表示这个词是一个 XXX 类命名实体的开始;O 表示这个词不属于任何命名实体。
例如:
sequence: Tom hanks is my name
label: B-PER I-PER O O O
(2)BIOES 标注法
这是在 IOB 方法上,扩展出的一个更复杂,但更完备的标注方法。E(end)表示这个词位于一个实体的结束位置,S(single)表示这个词单独构成一个实体。
4、使用 NLTK 实现 NER
import re
import pandas as pd
import nltk
def parse_document(document):
    """Split a raw text document into a list of stripped sentences.

    Args:
        document: The text to split; must be a ``str``.

    Returns:
        A list of sentence strings produced by ``nltk.sent_tokenize``, each
        with leading and trailing whitespace removed.

    Raises:
        ValueError: If ``document`` is not a string.
    """
    # Validate before touching the text: running the newline substitution
    # first makes a non-string input fail with a TypeError, so the intended
    # ValueError below could never be raised.
    if not isinstance(document, str):
        raise ValueError('Document is not string!')
    # Newlines become spaces so hard-wrapped text is tokenized as one stream.
    document = re.sub('\n', ' ', document).strip()
    sentences = nltk.sent_tokenize(document)
    return [sentence.strip() for sentence in sentences]
# Example passage used to demonstrate NLTK's built-in named-entity chunker.
text = """
FIFA was founded in 1904 to oversee international competition among the national associations of Belgium,
Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its
membership now comprises 211 national associations. Member countries must each also be members of one of
the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America
and the Caribbean, Oceania, and South America.
"""

# Split the passage into sentences, then each sentence into word tokens.
sentences = parse_document(text)
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

# POS-tag every token list and feed the tagged tokens to the NE chunker.
tagged_sentences = list(map(nltk.pos_tag, tokenized_sentences))
ne_chunked_sents = list(map(nltk.ne_chunk, tagged_sentences))

# Collect (entity text, entity category) pairs from the chunker output.
named_entities = []
for chunked_sentence in ne_chunked_sents:
    for node in chunked_sentence:
        # Only children exposing a ``label`` attribute are treated as
        # named-entity chunks; everything else is skipped.
        if not hasattr(node, 'label'):
            continue
        words = [leaf[0] for leaf in node.leaves()]
        named_entities.append((' '.join(words), node.label()))

# Drop duplicate (name, category) pairs.
named_entities = list(set(named_entities))

# Tabulate the unique entities and show them.
entity_frame = pd.DataFrame(
    named_entities,
    columns=['Entity Name', 'Entity Type'],
)
print(entity_frame)
5、使用条件随机场CRF
from sklearn_crfsuite import CRF # CRF的具体实现太过复杂,这里我们借助一个外部的库
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Neighbouring tokens influence a token's tag, so the features are the
    previous token, the current token, the next token, and the two adjacent
    bigrams (previous+current, current+next), plus a constant bias term.
    ``"<s>"`` and ``"</s>"`` stand in for the missing neighbour at the start
    and end of the sequence.

    Args:
        sent: Indexable sequence of string tokens.
        i: Position of the token to describe.

    Returns:
        A dict mapping feature names to values.
    """
    current = sent[i]
    at_start = i == 0
    at_end = i == (len(sent) - 1)

    if at_start:
        before = "<s>"
    else:
        before = sent[i - 1]

    if at_end:
        after = "</s>"
    else:
        after = sent[i + 1]

    return {
        'w': current,
        'w-1': before,
        'w+1': after,
        'w-1:w': before + current,
        'w:w+1': current + after,
        'bias': 1,
    }
def sent2features(sent):
    """Return the list of per-token feature dicts for a whole sequence.

    Args:
        sent: Indexable sequence of string tokens.

    Returns:
        One ``word2features`` dict per position, in order.
    """
    positions = range(len(sent))
    return [word2features(sent, position) for position in positions]
class CRFModel(object):
    """Wrapper around ``CRF`` that turns token sequences into feature dicts."""

    def __init__(self,
                 algorithm='lbfgs',
                 c1=0.1,
                 c2=0.1,
                 max_iterations=100,
                 all_possible_transitions=False
                 ):
        """Create the underlying CRF; all arguments are forwarded unchanged."""
        crf_options = {
            'algorithm': algorithm,
            'c1': c1,
            'c2': c2,
            'max_iterations': max_iterations,
            'all_possible_transitions': all_possible_transitions,
        }
        self.model = CRF(**crf_options)

    def train(self, sentences, tag_lists):
        """Fit the model on token sequences and their matching tag sequences."""
        feature_seqs = list(map(sent2features, sentences))
        self.model.fit(feature_seqs, tag_lists)

    def test(self, sentences):
        """Decode: predict a tag sequence for every given token sequence."""
        feature_seqs = list(map(sent2features, sentences))
        return self.model.predict(feature_seqs)
6、BI-LSTM实现 NER
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
class BiLSTM(nn.Module):
    """Bidirectional LSTM tagger that scores every tag for every token."""

    def __init__(self, vocab_size, emb_size, hidden_size, out_size):
        """Build the embedding, BiLSTM and output projection layers.

        Args:
            vocab_size: Size of the vocabulary.
            emb_size: Dimension of the token embeddings.
            hidden_size: Dimension of the LSTM hidden state (per direction).
            out_size: Number of distinct tags.
        """
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.bilstm = nn.LSTM(emb_size, hidden_size,
                              batch_first=True,
                              bidirectional=True)
        # Forward and backward states are concatenated, hence 2 * hidden_size.
        self.lin = nn.Linear(2*hidden_size, out_size)

    def forward(self, sents_tensor, lengths):
        """Compute per-token tag scores.

        Args:
            sents_tensor: Token ids, shape [B, L].
            lengths: True length of each sequence in the batch. They are
                passed to ``pack_padded_sequence`` with its default
                settings, which expects them in descending order.

        Returns:
            Scores of shape [B, L, out_size].
        """
        emb = self.embedding(sents_tensor)  # [B, L, emb_size]
        # Packing lets the LSTM skip the padded positions.
        packed = pack_padded_sequence(emb, lengths, batch_first=True)
        rnn_out, _ = self.bilstm(packed)
        # Unpack back to a padded tensor: [B, L, hidden_size*2]
        rnn_out, _ = pad_packed_sequence(rnn_out, batch_first=True)
        scores = self.lin(rnn_out)  # [B, L, out_size]
        return scores

    def test(self, sents_tensor, lengths, _):
        """Decode by taking the highest-scoring tag at every position.

        The third argument is unused; it is accepted so the signature lines
        up with ``test(sents, lengths, tag2id)`` style callers.

        Returns:
            Tag ids of shape [B, L].
        """
        logits = self.forward(sents_tensor, lengths)  # [B, L, out_size]
        # torch.max over a dim returns (values, indices); only the indices
        # (the predicted tag ids) are needed.
        batch_tagids = torch.max(logits, dim=2)[1]
        return batch_tagids
7、BI-LSTM + CRF 实现 NER
class BiLSTM_CRF(nn.Module):
    """BiLSTM emission scores combined with a learned tag-transition matrix."""

    def __init__(self, vocab_size, emb_size, hidden_size, out_size):
        """Build the BiLSTM encoder and the transition matrix.

        Args:
            vocab_size: Size of the vocabulary.
            emb_size: Dimension of the token embeddings.
            hidden_size: Dimension of the LSTM hidden state.
            out_size: Number of distinct tags.
        """
        super(BiLSTM_CRF, self).__init__()
        # Same BiLSTM model as the one defined for the plain LSTM tagger.
        self.bilstm = BiLSTM(vocab_size, emb_size, hidden_size, out_size)

        # The CRF part amounts to learning one extra transition matrix of
        # shape [out_size, out_size], initialised to a uniform value.
        self.transition = nn.Parameter(
            torch.ones(out_size, out_size) * 1/out_size)

    def forward(self, sents_tensor, lengths):
        """Compute CRF scores of shape [B, L, out_size, out_size].

        Every token gets an [out_size, out_size] matrix whose entry (i, j)
        is the score of having tag i at the previous step and tag j at the
        current step (emission of j plus transition i -> j).
        """
        # [B, L, out_size]
        emission = self.bilstm(sents_tensor, lengths)

        _, _, out_size = emission.size()
        crf_scores = emission.unsqueeze(
            2).expand(-1, -1, out_size, -1) + self.transition.unsqueeze(0)

        return crf_scores

    def test(self, test_sents_tensor, lengths, tag2id):
        """Decode the best tag sequence for each sentence with Viterbi.

        Args:
            test_sents_tensor: Token ids, shape [B, L].
            lengths: True length of each sequence. The ``[:batch_size_t]``
                slicing below treats the first rows as the ones still
                active at a given step, i.e. it relies on descending order.
            tag2id: Mapping from tag to id; must contain ``'<start>'``,
                ``'<end>'`` and ``'<pad>'``.

        Returns:
            A long tensor of decoded tag ids with L-1 columns (the final
            position is taken to be the end token and is not emitted —
            NOTE(review): assumes inputs carry a trailing end token, confirm
            against the data pipeline). Shorter rows are filled with the pad
            id.
        """
        # Imported here because the surrounding file never imports it.
        from itertools import zip_longest

        start_id = tag2id['<start>']
        end_id = tag2id['<end>']
        pad = tag2id['<pad>']

        crf_scores = self.forward(test_sents_tensor, lengths)
        device = crf_scores.device
        # B: batch_size, L: max_len, T: target set size
        B, L, T, _ = crf_scores.size()
        # viterbi[i, j, k]: best score for sentence i with token j tagged k.
        viterbi = torch.zeros(B, L, T, device=device)
        # backpointer[i, j, k]: id of the previous tag on that best path,
        # used for backtracking. Cells that are never written keep end_id.
        backpointer = torch.full(
            (B, L, T), end_id, dtype=torch.long, device=device)
        lengths = torch.LongTensor(lengths).to(device)

        # Forward recursion.
        for step in range(L):
            # Number of sentences that still have a token at this step.
            batch_size_t = (lengths > step).sum().item()
            if step == 0:
                # The tag before the first token can only be start_id.
                viterbi[:batch_size_t, step, :] = \
                    crf_scores[:batch_size_t, step, start_id, :]
                backpointer[:batch_size_t, step, :] = start_id
            else:
                max_scores, prev_tags = torch.max(
                    viterbi[:batch_size_t, step-1, :].unsqueeze(2) +
                    crf_scores[:batch_size_t, step, :, :],     # [B, T, T]
                    dim=1
                )
                viterbi[:batch_size_t, step, :] = max_scores
                backpointer[:batch_size_t, step, :] = prev_tags

        # Backtracking only needs the backpointer table.
        tagids = []  # one list of tag ids per step, last step first
        tags_t = torch.empty(0, dtype=torch.long, device=device)
        for step in range(L-1, 0, -1):
            batch_size_t = (lengths > step).sum().item()
            # Sentences whose last position is this step start their path
            # from end_id; the others continue from the tag found at the
            # following step.
            newly_active = torch.full(
                (batch_size_t - len(tags_t),), end_id,
                dtype=torch.long, device=device)
            current_tags = torch.cat([tags_t, newly_active], dim=0)

            rows = torch.arange(batch_size_t, device=device)
            tags_t = backpointer[rows, step, current_tags]
            tagids.append(tags_t.tolist())

        # tagids holds L-1 lists (L-1 because the end token is excluded),
        # each with that step's tags for the active part of the batch.
        # Restore chronological order and transpose to one row per sentence,
        # filling shorter sentences with the pad id.
        tagids = list(zip_longest(*reversed(tagids), fillvalue=pad))
        tagids = torch.tensor(tagids, dtype=torch.long)

        # Decoded result.
        return tagids
参考资料:
(1)https://www.cnblogs.com/bep-feijin/articles/9650898.html
(2)https://www.cnblogs.com/chen8023miss/p/11446884.html