Notes on my first NER task with fastNLP

Data Processing

Within a single task, the training, development, and test sets share the same vocabulary and the same set of target labels, so fastNLP uses a DataBundle to hold a task's multiple DataSet objects together with their Vocabulary objects. For popular datasets the library already ships ready-made processing code, so you only need to pass in the corresponding file paths. This walkthrough uses the CoNLL-2003 dataset.

from fastNLP.io import Conll2003NERPipe
paths = {
    'train': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.train.txt",
    'dev': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.valid.txt",
    "test": "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.test.txt"
}

data_bundle = Conll2003NERPipe().process_from_file(paths)
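
It is worth printing the DataBundle before going further: it summarizes the datasets and vocabularies the pipe produced, and each DataSet can be indexed to inspect one processed instance.

# Quick sanity check: summary of datasets / vocabularies, plus one processed training instance
print(data_bundle)
print(data_bundle.get_dataset('train')[0])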

Once the data is processed, build the embeddings.

from fastNLP.embeddings import get_embeddings, CNNCharEmbedding, StackEmbedding, StaticEmbedding, BertEmbedding

vocab = data_bundle.get_vocab('words')

# Character-level features: 32-dim character embeddings pooled by a CNN into a 64-dim representation per word
char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=32)

# Contextual word representations from a pre-trained English BERT; auto_truncate avoids errors on over-long sentences
word_embed = BertEmbedding(vocab=vocab, model_dir_or_name='en', auto_truncate=True)
# word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-300d')  # static GloVe vectors as a lighter alternative

# Concatenate the word-level and character-level embeddings along the feature dimension
embed = StackEmbedding([word_embed, char_embed])
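
StackEmbedding concatenates its parts, so the combined size is the BERT output size plus the 64 dimensions from the character CNN; the model below reads this value through embed.embedding_dim. A quick check:

# Total input dimension that the BiLSTM will receive (BERT hidden size + 64 from the char CNN)
print(embed.embedding_dim)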

Model Definition

The model is a custom class built on fastNLP's BaseModel.

import torch.nn as nn
import torch.nn.functional as F
from fastNLP.models import BaseModel
from fastNLP.modules import LSTM, allowed_transitions, ConditionalRandomField
from fastNLP.embeddings import get_embeddings
from fastNLP.core.const import Const as C
from fastNLP import LossInForward, seq_len_to_mask

class MyBiLSTMCRF(BaseModel):
    r"""
    结构为embedding + BiLSTM + FC + Dropout + CRF.

    """

    def __init__(self, embed, num_classes, num_layers=1, hidden_size=100, dropout=0.5,
                 target_vocab=None):
        r"""

        :param embed: 支持(1)fastNLP的各种Embedding, (2) tuple, 指明num_embedding, dimension, 如(1000, 100)
        :param num_classes: 一共多少个类
        :param num_layers: BiLSTM的层数
        :param hidden_size: BiLSTM的hidden_size,实际hidden size为该值的两倍(前向、后向)
        :param dropout: dropout的概率,0为不dropout
        :param target_vocab: Vocabulary对象,target与index的对应关系。如果传入该值,将自动避免非法的解码序列。
        """
        super().__init__()
        self.embed = get_embeddings(embed)

        if num_layers > 1:
            self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
                             bidirectional=True,
                             batch_first=True, dropout=dropout)
        else:
            self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
                             bidirectional=True,
                             batch_first=True)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

        trans = None
        if target_vocab is not None:
            assert len(target_vocab) == num_classes, \
                "The number of classes should be the same as the length of the target vocabulary."
            # Restrict the CRF to transitions that are legal for the tagging scheme (e.g. BIO)
            trans = allowed_transitions(target_vocab.idx2word, include_start_end=True)

        self.crf = ConditionalRandomField(num_classes, include_start_end_trans=True, allowed_transitions=trans)

    def _forward(self, words, seq_len=None, target=None):
        words = self.embed(words)                     # (batch_size, seq_len, embed_dim)
        feats, _ = self.lstm(words, seq_len=seq_len)  # (batch_size, seq_len, hidden_size * 2)
        feats = self.fc(feats)
        feats = self.dropout(feats)
        logits = F.log_softmax(feats, dim=-1)
        # Build the padding mask on the same device as the logits
        mask = seq_len_to_mask(seq_len).to(logits.device)
        if target is None:
            # Inference: Viterbi decoding returns the best-scoring tag sequence
            pred, _ = self.crf.viterbi_decode(logits, mask)
            return {C.OUTPUT: pred}
        else:
            # Training: the CRF computes the negative log-likelihood of the gold tags
            loss = self.crf(logits, target, mask).mean()
            return {C.LOSS: loss}

    def forward(self, words, seq_len, target):
        # Called by the Trainer; seq_len is moved to CPU because packed sequences expect CPU lengths
        return self._forward(words, seq_len.cpu(), target)

    def predict(self, words, seq_len):
        # Called by the Tester: no target, so Viterbi decoding is performed
        return self._forward(words, seq_len.cpu())
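
As the docstring notes, embed does not have to be a pre-built fastNLP Embedding; a (num_embeddings, embedding_dim) tuple also works, in which case get_embeddings creates a randomly initialised lookup table. A toy instantiation with made-up sizes:

# Hypothetical example: 1000-word vocabulary, 100-dim random embeddings, 5 target classes;
# no target_vocab, so the CRF allows every transition
toy_model = MyBiLSTMCRF(embed=(1000, 100), num_classes=5)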

Training

import torch
from fastNLP import Trainer, SpanFPreRecMetric
from torch.optim import Adam
model = MyBiLSTMCRF(
    embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=128,
    dropout=0.4, target_vocab=data_bundle.get_vocab('target')
)

# Span-level precision / recall / F1 computed over entity spans rather than per-token accuracy
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=2.0e-5)
# The model already returns its loss in forward(), so LossInForward just passes it through
loss = LossInForward()

# Train on GPU 0 when available, otherwise fall back to the CPU
device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(
    train_data=data_bundle.get_dataset('train'),
    dev_data=data_bundle.get_dataset('dev'),
    batch_size=64, num_workers=8,
    model=model, loss=loss, optimizer=optimizer,
    metrics=metric, device=device
)
trainer.train()

Evaluation

from fastNLP import Tester

tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()
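
To reuse the trained model later, ordinary PyTorch serialisation is enough, since MyBiLSTMCRF is a regular nn.Module; the file name below is just a placeholder.

# Save / reload the trained parameters with standard PyTorch serialisation
torch.save(model.state_dict(), 'bilstm_crf_conll2003.pt')
# model.load_state_dict(torch.load('bilstm_crf_conll2003.pt'))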

Full Code

import torch
import torch.nn as nn
import torch.nn.functional as F
from fastNLP import LossInForward, seq_len_to_mask
from fastNLP import SpanFPreRecMetric
from fastNLP import Tester
from fastNLP import Trainer
from fastNLP.core.const import Const as C
from fastNLP.embeddings import get_embeddings, CNNCharEmbedding, StackEmbedding, StaticEmbedding, BertEmbedding
from fastNLP.io import Conll2003NERPipe
from fastNLP.models import BaseModel
from fastNLP.modules import LSTM, allowed_transitions, ConditionalRandomField
from torch.optim import Adam

# data_bundle = WeiboNERPipe().process_from_file()
paths = {
    'train': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.train.txt",
    'dev': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.valid.txt",
    "test": "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.test.txt"
}

data_bundle = Conll2003NERPipe().process_from_file(paths)


# data_bundle.rename_field('chars', 'words')


class MyBiLSTMCRF(BaseModel):
    r"""
    结构为embedding + BiLSTM + FC + Dropout + CRF.

    """

    def __init__(self, embed, num_classes, num_layers=1, hidden_size=100, dropout=0.5,
                 target_vocab=None):
        r"""

        :param embed: 支持(1)fastNLP的各种Embedding, (2) tuple, 指明num_embedding, dimension, 如(1000, 100)
        :param num_classes: 一共多少个类
        :param num_layers: BiLSTM的层数
        :param hidden_size: BiLSTM的hidden_size,实际hidden size为该值的两倍(前向、后向)
        :param dropout: dropout的概率,0为不dropout
        :param target_vocab: Vocabulary对象,target与index的对应关系。如果传入该值,将自动避免非法的解码序列。
        """
        super().__init__()
        self.embed = get_embeddings(embed)

        if num_layers > 1:
            self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
                             bidirectional=True,
                             batch_first=True, dropout=dropout)
        else:
            self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
                             bidirectional=True,
                             batch_first=True)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

        trans = None
        if target_vocab is not None:
            assert len(target_vocab) == num_classes, \
                "The number of classes should be the same as the length of the target vocabulary."
            # Restrict the CRF to transitions that are legal for the tagging scheme (e.g. BIO)
            trans = allowed_transitions(target_vocab.idx2word, include_start_end=True)

        self.crf = ConditionalRandomField(num_classes, include_start_end_trans=True, allowed_transitions=trans)

    def _forward(self, words, seq_len=None, target=None):
        words = self.embed(words)                     # (batch_size, seq_len, embed_dim)
        feats, _ = self.lstm(words, seq_len=seq_len)  # (batch_size, seq_len, hidden_size * 2)
        feats = self.fc(feats)
        feats = self.dropout(feats)
        logits = F.log_softmax(feats, dim=-1)
        # Build the padding mask on the same device as the logits
        mask = seq_len_to_mask(seq_len).to(logits.device)
        if target is None:
            # Inference: Viterbi decoding returns the best-scoring tag sequence
            pred, _ = self.crf.viterbi_decode(logits, mask)
            return {C.OUTPUT: pred}
        else:
            # Training: the CRF computes the negative log-likelihood of the gold tags
            loss = self.crf(logits, target, mask).mean()
            return {C.LOSS: loss}

    def forward(self, words, seq_len, target):
        return self._forward(words, seq_len.cpu(), target)

    def predict(self, words, seq_len):
        return self._forward(words, seq_len.cpu())


vocab = data_bundle.get_vocab('words')

char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=32)

word_embed = BertEmbedding(vocab=vocab, model_dir_or_name='en', auto_truncate=True)
# word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-300d')

embed = StackEmbedding([word_embed, char_embed])

model = MyBiLSTMCRF(
    embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=128,
    dropout=0.4, target_vocab=data_bundle.get_vocab('target')
)

metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=2.0e-5)
loss = LossInForward()

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(
    train_data=data_bundle.get_dataset('train'),
    dev_data=data_bundle.get_dataset('dev'),
    batch_size=64, num_workers=8,
    model=model, loss=loss, optimizer=optimizer,
    metrics=metric, device=device
)
trainer.train()

tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()