Notes from my first NER task with fastNLP
Data Processing
Within one task the training, dev and test sets share the same vocabulary and the same set of target labels, so fastNLP wraps the task's DataSet objects, together with their Vocabulary objects, in a single DataBundle. For popular datasets the library already ships ready-made processing pipes; you only need to pass in the corresponding file paths. This walkthrough uses the CoNLL-2003 dataset.
from fastNLP.io import Conll2003NERPipe
paths = {
'train': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.train.txt",
'dev': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.valid.txt",
"test": "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.test.txt"
}
data_bundle = Conll2003NERPipe().process_from_file(paths)
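Printing the resulting DataBundle is a quick way to see what the pipe produced: the three DataSet objects with their processed fields (such as words, target and seq_len) and the vocabularies. A small inspection snippet:
print(data_bundle)                           # summary of datasets and vocabularies
print(data_bundle.get_dataset('train')[:2])  # first two processed instances
print(data_bundle.get_vocab('target'))       # the NER tag vocabulary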
With the DataBundle in place, the next step is to build the embeddings.
from fastNLP.embeddings import get_embeddings, CNNCharEmbedding, StackEmbedding, StaticEmbedding, BertEmbedding
vocab = data_bundle.get_vocab('words')
char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=32)
word_embed = BertEmbedding(vocab=vocab, model_dir_or_name='en', auto_truncate=True)
# word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-300d')
embed = StackEmbedding([word_embed, char_embed])
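StackEmbedding concatenates the outputs of its sub-embeddings along the feature dimension, so the BiLSTM below receives vectors whose size is the sum of the word-level and character-level parts. A quick sanity check:
print(word_embed.embedding_dim, char_embed.embedding_dim, embed.embedding_dim)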
Model Definition
The model is adapted from fastNLP's BaseModel.
import torch.nn as nn
import torch.nn.functional as F
from fastNLP import LossInForward, seq_len_to_mask
from fastNLP.core.const import Const as C
from fastNLP.models import BaseModel
from fastNLP.modules import LSTM, allowed_transitions, ConditionalRandomField
class MyBiLSTMCRF(BaseModel):
r"""
结构为embedding + BiLSTM + FC + Dropout + CRF.
"""
def __init__(self, embed, num_classes, num_layers=1, hidden_size=100, dropout=0.5,
target_vocab=None):
r"""
:param embed: 支持(1)fastNLP的各种Embedding, (2) tuple, 指明num_embedding, dimension, 如(1000, 100)
:param num_classes: 一共多少个类
:param num_layers: BiLSTM的层数
:param hidden_size: BiLSTM的hidden_size,实际hidden size为该值的两倍(前向、后向)
:param dropout: dropout的概率,0为不dropout
:param target_vocab: Vocabulary对象,target与index的对应关系。如果传入该值,将自动避免非法的解码序列。
"""
super().__init__()
self.embed = get_embeddings(embed)
if num_layers > 1:
self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
bidirectional=True,
batch_first=True, dropout=dropout)
else:
self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
bidirectional=True,
batch_first=True)
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(hidden_size * 2, num_classes)
trans = None
if target_vocab is not None:
            assert len(target_vocab) == num_classes, "num_classes should match the size of the target vocabulary."
trans = allowed_transitions(target_vocab.idx2word, include_start_end=True)
self.crf = ConditionalRandomField(num_classes, include_start_end_trans=True, allowed_transitions=trans)
def _forward(self, words, seq_len=None, target=None):
words = self.embed(words)
feats, _ = self.lstm(words, seq_len=seq_len)
feats = self.fc(feats)
feats = self.dropout(feats)
logits = F.log_softmax(feats, dim=-1)
        mask = seq_len_to_mask(seq_len).to(logits.device)  # keep the mask on the same device as the logits
if target is None:
pred, _ = self.crf.viterbi_decode(logits, mask)
return {C.OUTPUT: pred}
else:
loss = self.crf(logits, target, mask).mean()
return {C.LOSS: loss}
def forward(self, words, seq_len, target):
        return self._forward(words, seq_len.cpu(), target)  # lengths must live on CPU for sequence packing
def predict(self, words, seq_len):
return self._forward(words, seq_len.cpu())
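As the docstring notes, embed also accepts a plain (num_embeddings, embedding_dim) tuple, in which case get_embeddings builds a randomly initialized lookup table. A tiny sketch with made-up sizes:
toy_model = MyBiLSTMCRF(embed=(1000, 100), num_classes=9, hidden_size=64)  # hypothetical: 1000-word vocab, 100-dim random embeddings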
Training
import torch
from fastNLP import Trainer, SpanFPreRecMetric
from torch.optim import Adam
model = MyBiLSTMCRF(
embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=128,
dropout=0.4, target_vocab=data_bundle.get_vocab('target')
)
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=2.0e-5)
loss = LossInForward()
device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(
train_data=data_bundle.get_dataset('train'),
dev_data=data_bundle.get_dataset('dev'),
batch_size=64, num_workers=8,
model=model, loss=loss, optimizer=optimizer,
metrics=metric, device=device
)
trainer.train()
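After training finishes, the weights can be persisted with plain PyTorch so a later session can reload them without retraining (file name is arbitrary):
torch.save(model.state_dict(), 'bilstm_crf_conll03.pt')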
Evaluation
from fastNLP import Tester
tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()
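Beyond the aggregate span F1 on the test set, it can be handy to tag a single sentence by hand. A rough sketch (hypothetical sentence; the model is moved to CPU so the inputs, mask and parameters share one device):
model.cpu().eval()
sentence = ['EU', 'rejects', 'German', 'call']
words = torch.LongTensor([[vocab.to_index(w) for w in sentence]])
seq_len = torch.LongTensor([len(sentence)])
with torch.no_grad():
    pred = model.predict(words, seq_len)['pred'][0]  # 'pred' is C.OUTPUT
tags = [data_bundle.get_vocab('target').to_word(int(i)) for i in pred[:len(sentence)]]
print(list(zip(sentence, tags)))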
Complete Code
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastNLP import LossInForward, seq_len_to_mask
from fastNLP import SpanFPreRecMetric
from fastNLP import Tester
from fastNLP import Trainer
from fastNLP.core.const import Const as C
from fastNLP.embeddings import get_embeddings, CNNCharEmbedding, StackEmbedding, StaticEmbedding, BertEmbedding
from fastNLP.io import Conll2003NERPipe
from fastNLP.models import BaseModel
from fastNLP.modules import LSTM, allowed_transitions, ConditionalRandomField
from torch.optim import Adam
# data_bundle = WeiboNERPipe().process_from_file()
paths = {
'train': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.train.txt",
'dev': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.valid.txt",
"test": "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.test.txt"
}
data_bundle = Conll2003NERPipe().process_from_file(paths)
# data_bundle.rename_field('chars', 'words')
class MyBiLSTMCRF(BaseModel):
r"""
结构为embedding + BiLSTM + FC + Dropout + CRF.
"""
def __init__(self, embed, num_classes, num_layers=1, hidden_size=100, dropout=0.5,
target_vocab=None):
r"""
:param embed: 支持(1)fastNLP的各种Embedding, (2) tuple, 指明num_embedding, dimension, 如(1000, 100)
:param num_classes: 一共多少个类
:param num_layers: BiLSTM的层数
:param hidden_size: BiLSTM的hidden_size,实际hidden size为该值的两倍(前向、后向)
:param dropout: dropout的概率,0为不dropout
:param target_vocab: Vocabulary对象,target与index的对应关系。如果传入该值,将自动避免非法的解码序列。
"""
super().__init__()
self.embed = get_embeddings(embed)
if num_layers > 1:
self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
bidirectional=True,
batch_first=True, dropout=dropout)
else:
self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
bidirectional=True,
batch_first=True)
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(hidden_size * 2, num_classes)
trans = None
if target_vocab is not None:
            assert len(target_vocab) == num_classes, "num_classes should match the size of the target vocabulary."
trans = allowed_transitions(target_vocab.idx2word, include_start_end=True)
self.crf = ConditionalRandomField(num_classes, include_start_end_trans=True, allowed_transitions=trans)
def _forward(self, words, seq_len=None, target=None):
words = self.embed(words)
feats, _ = self.lstm(words, seq_len=seq_len)
feats = self.fc(feats)
feats = self.dropout(feats)
logits = F.log_softmax(feats, dim=-1)
        mask = seq_len_to_mask(seq_len).to(logits.device)  # keep the mask on the same device as the logits
if target is None:
pred, _ = self.crf.viterbi_decode(logits, mask)
return {C.OUTPUT: pred}
else:
loss = self.crf(logits, target, mask).mean()
return {C.LOSS: loss}
def forward(self, words, seq_len, target):
        return self._forward(words, seq_len.cpu(), target)  # lengths must live on CPU for sequence packing
def predict(self, words, seq_len):
return self._forward(words, seq_len.cpu())
vocab = data_bundle.get_vocab('words')
char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=32)
word_embed = BertEmbedding(vocab=vocab, model_dir_or_name='en', auto_truncate=True)
# word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-300d')
embed = StackEmbedding([word_embed, char_embed])
model = MyBiLSTMCRF(
embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=128,
dropout=0.4, target_vocab=data_bundle.get_vocab('target')
)
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=2.0e-5)
loss = LossInForward()
device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(
train_data=data_bundle.get_dataset('train'),
dev_data=data_bundle.get_dataset('dev'),
batch_size=64, num_workers=8,
model=model, loss=loss, optimizer=optimizer,
metrics=metric, device=device
)
trainer.train()
tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()