Notes from my first NER task with fastNLP
Data Processing
Within one task the training, dev and test sets share the same vocabulary and the same set of target labels, so fastNLP wraps the task's DataSet objects, together with their Vocabulary objects, in a single DataBundle. For popular datasets the library already ships ready-made processing pipes; you only need to pass in the corresponding file paths. This walkthrough uses the CoNLL-2003 dataset.
from fastNLP.io import Conll2003NERPipe
paths = {
'train': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.train.txt",
'dev': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.valid.txt",
"test": "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.test.txt"
}
data_bundle = Conll2003NERPipe().process_from_file(paths)
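Printing the resulting DataBundle is a quick way to see what the pipe produced: the three DataSet objects with their processed fields (such as words, target and seq_len) and the vocabularies. A small inspection snippet:
print(data_bundle)                           # summary of datasets and vocabularies
print(data_bundle.get_dataset('train')[:2])  # first two processed instances
print(data_bundle.get_vocab('target'))       # the NER tag vocabulary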
With the DataBundle in place, the next step is to build the embeddings.
from fastNLP.embeddings import get_embeddings, CNNCharEmbedding, StackEmbedding, StaticEmbedding, BertEmbedding
vocab = data_bundle.get_vocab('words')
char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=32)
word_embed = BertEmbedding(vocab=vocab, model_dir_or_name='en', auto_truncate=True)
# word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-300d')
embed = StackEmbedding([word_embed, char_embed])
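StackEmbedding concatenates the outputs of its sub-embeddings along the feature dimension, so the BiLSTM below receives vectors whose size is the sum of the word-level and character-level parts. A quick sanity check:
print(word_embed.embedding_dim, char_embed.embedding_dim, embed.embedding_dim)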
Model Definition
The model is adapted from fastNLP's BaseModel.
import torch.nn as nn
import torch.nn.functional as F
from fastNLP import LossInForward, seq_len_to_mask
from fastNLP.core.const import Const as C
from fastNLP.models import BaseModel
from fastNLP.modules import LSTM, allowed_transitions, ConditionalRandomField
class MyBiLSTMCRF(BaseModel):
r"""
结构为embedding + BiLSTM + FC + Dropout + CRF.
"""
def __init__(self, embed, num_classes, num_layers=1, hidden_size=100, dropout=0.5,
target_vocab=None):
r"""
:param embed: 支持(1)fastNLP的各种Embedding, (2) tuple, 指明num_embedding, dimension, 如(1000, 100)
:param num_classes: 一共多少个类
:param num_layers: BiLSTM的层数
:param hidden_size: BiLSTM的hidden_size,实际hidden size为该值的两倍(前向、后向)
:param dropout: dropout的概率,0为不dropout
:param target_vocab: Vocabulary对象,target与index的对应关系。如果传入该值,将自动避免非法的解码序列。
"""
super().__init__()
self.embed = get_embeddings(embed)
if num_layers > 1:
self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
bidirectional=True,
batch_first=True, dropout=dropout)
else:
self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
bidirectional=True,
batch_first=True)
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(hidden_size * 2, num_classes)
trans = None
if target_vocab is not None:
            assert len(target_vocab) == num_classes, "num_classes should match the size of the target vocabulary."
trans = allowed_transitions(target_vocab.idx2word, include_start_end=True)
self.crf = ConditionalRandomField(num_classes, include_start_end_trans=True, allowed_transitions=trans)
def _forward(self, words, seq_len=None, target=None):
words = self.embed(words)
feats, _ = self.lstm(words, seq_len=seq_len)
feats = self.fc(feats)
feats = self.dropout(feats)
logits = F.log_softmax(feats, dim=-1)
        mask = seq_len_to_mask(seq_len).to(logits.device)  # keep the mask on the same device as the logits
if target is None:
pred, _ = self.crf.viterbi_decode(logits, mask)
return {C.OUTPUT: pred}
else:
loss = self.crf(logits, target, mask).mean()
return {C.LOSS: loss}
def forward(self, words, seq_len, target):
        return self._forward(words, seq_len.cpu(), target)  # lengths must live on CPU for sequence packing
def predict(self, words, seq_len):
return self._forward(words, seq_len.cpu())
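As the docstring notes, embed also accepts a plain (num_embeddings, embedding_dim) tuple, in which case get_embeddings builds a randomly initialized lookup table. A tiny sketch with made-up sizes:
toy_model = MyBiLSTMCRF(embed=(1000, 100), num_classes=9, hidden_size=64)  # hypothetical: 1000-word vocab, 100-dim random embeddings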
Training
import torch
from fastNLP import Trainer, SpanFPreRecMetric
from torch.optim import Adam
model = MyBiLSTMCRF(
embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=128,
dropout=0.4, target_vocab=data_bundle.get_vocab('target')
)
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=2.0e-5)
loss = LossInForward()
device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(
train_data=data_bundle.get_dataset('train'),
dev_data=data_bundle.get_dataset('dev'),
batch_size=64, num_workers=8,
model=model, loss=loss, optimizer=optimizer,
metrics=metric, device=device
)
trainer.train()
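After training finishes, the weights can be persisted with plain PyTorch so a later session can reload them without retraining (file name is arbitrary):
torch.save(model.state_dict(), 'bilstm_crf_conll03.pt')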
Evaluation
from fastNLP import Tester
tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()
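Beyond the aggregate span F1 on the test set, it can be handy to tag a single sentence by hand. A rough sketch (hypothetical sentence; the model is moved to CPU so the inputs, mask and parameters share one device):
model.cpu().eval()
sentence = ['EU', 'rejects', 'German', 'call']
words = torch.LongTensor([[vocab.to_index(w) for w in sentence]])
seq_len = torch.LongTensor([len(sentence)])
with torch.no_grad():
    pred = model.predict(words, seq_len)['pred'][0]  # 'pred' is C.OUTPUT
tags = [data_bundle.get_vocab('target').to_word(int(i)) for i in pred[:len(sentence)]]
print(list(zip(sentence, tags)))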
Complete Code
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastNLP import LossInForward, seq_len_to_mask
from fastNLP import SpanFPreRecMetric
from fastNLP import Tester
from fastNLP import Trainer
from fastNLP.core.const import Const as C
from fastNLP.embeddings import get_embeddings, CNNCharEmbedding, StackEmbedding, StaticEmbedding, BertEmbedding
from fastNLP.io import Conll2003NERPipe
from fastNLP.models import BaseModel
from fastNLP.modules import LSTM, allowed_transitions, ConditionalRandomField
from torch.optim import Adam
# data_bundle = WeiboNERPipe().process_from_file()
paths = {
'train': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.train.txt",
'dev': "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.valid.txt",
"test": "/home/zutnlp/data/zutnlp/sequence/CoNLL-2003/eng.test.txt"
}
data_bundle = Conll2003NERPipe().process_from_file(paths)
# data_bundle.rename_field('chars', 'words')
class MyBiLSTMCRF(BaseModel):
r"""
结构为embedding + BiLSTM + FC + Dropout + CRF.
"""
def __init__(self, embed, num_classes, num_layers=1, hidden_size=100, dropout=0.5,
target_vocab=None):
r"""
:param embed: 支持(1)fastNLP的各种Embedding, (2) tuple, 指明num_embedding, dimension, 如(1000, 100)
:param num_classes: 一共多少个类
:param num_layers: BiLSTM的层数
:param hidden_size: BiLSTM的hidden_size,实际hidden size为该值的两倍(前向、后向)
:param dropout: dropout的概率,0为不dropout
:param target_vocab: Vocabulary对象,target与index的对应关系。如果传入该值,将自动避免非法的解码序列。
"""
super().__init__()
self.embed = get_embeddings(embed)
if num_layers > 1:
self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
bidirectional=True,
batch_first=True, dropout=dropout)
else:
self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size,
bidirectional=True,
batch_first=True)
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(hidden_size * 2, num_classes)
trans = None
if target_vocab is not None:
            assert len(target_vocab) == num_classes, "num_classes should match the size of the target vocabulary."
trans = allowed_transitions(target_vocab.idx2word, include_start_end=True)
self.crf = ConditionalRandomField(num_classes, include_start_end_trans=True, allowed_transitions=trans)
def _forward(self, words, seq_len=None, target=None):
words = self.embed(words)
feats, _ = self.lstm(words, seq_len=seq_len)
feats = self.fc(feats)
feats = self.dropout(feats)
logits = F.log_softmax(feats, dim=-1)
        mask = seq_len_to_mask(seq_len).to(logits.device)  # keep the mask on the same device as the logits
if target is None:
pred, _ = self.crf.viterbi_decode(logits, mask)
return {C.OUTPUT: pred}
else:
loss = self.crf(logits, target, mask).mean()
return {C.LOSS: loss}
def forward(self, words, seq_len, target):
        return self._forward(words, seq_len.cpu(), target)  # lengths must live on CPU for sequence packing
def predict(self, words, seq_len):
return self._forward(words, seq_len.cpu())
vocab = data_bundle.get_vocab('words')
char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=32)
word_embed = BertEmbedding(vocab=vocab, model_dir_or_name='en', auto_truncate=True)
# word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-300d')
embed = StackEmbedding([word_embed, char_embed])
model = MyBiLSTMCRF(
embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=128,
dropout=0.4, target_vocab=data_bundle.get_vocab('target')
)
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=2.0e-5)
loss = LossInForward()
device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(
train_data=data_bundle.get_dataset('train'),
dev_data=data_bundle.get_dataset('dev'),
batch_size=64, num_workers=8,
model=model, loss=loss, optimizer=optimizer,
metrics=metric, device=device
)
trainer.train()
tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()