对pytorch官网的bilstm+crf的batch版本的实现+用中文实体识别的数据训练batch版本bilstm+crf

最新推荐文章于 2023-03-30 16:22:26 发布

lzh_zyy

最新推荐文章于 2023-03-30 16:22:26 发布

阅读量3k

点赞数 3

分类专栏： NLP

本文链接：https://blog.csdn.net/baqnliaozhihui/article/details/109244094

版权

NLP 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

代码的git地址(https://github.com/liaozhihui/MY_MODEL/blob/master/batch_bilstm_crf.py)

文章目录

前言
代码的实现
在github的代码中用中文实体识别的数据训练了以下batch版本的bilstm+crf

前言

对pytorch官网的bilstm+crf的batch版本的实现。和每次只向模型传入一句句子不同，一个batch里句子的长短不一样，需要对句子进行padding，并且padding的位置不能计入分数的计算。
代码的git地址

代码的实现

创建一个mask的矩阵，矩阵维度batch_size*max_len，句子的长度内为1，padding的位置为0：

 def mask_maxtric(self,lengths):
     """Build the padding masks for one batch.

     ``lengths[0]`` is used as the padded width of the batch, so the
     lengths are expected to be sorted longest-first.

     :param lengths: sequence lengths of the batch, one int per sentence.
     :return: tuple ``(cre, cre_maxtrix)``.
         ``cre`` has shape ``(batch, max_len)``; row ``i`` holds 1 for the
         first ``lengths[i]`` positions and 0 for the padding, e.g.
         ``[[1,1,1,1,1], [1,1,1,0,0], [1,1,0,0,0]]``.
         ``cre_maxtrix`` has shape ``(batch, max_len, n_tags)``; it is all
         ones, except that from position ``lengths[i] - 1`` onward every row
         is replaced by a one-hot vector on the last tag index.
     """
     # Read the tag map from the instance rather than from a module global.
     n_tags = len(self.tag_to_ix)
     batch_size, max_len = len(lengths), lengths[0]

     cre = torch.zeros((batch_size, max_len))
     cre_maxtrix = torch.ones((batch_size, max_len, n_tags))

     # One-hot row on the last tag index; identical for every sentence,
     # so it is built once outside the loop.
     one = torch.zeros(n_tags)
     one[-1] = 1

     for i, lens in enumerate(lengths):
         cre[i][:lens] = 1
         cre_maxtrix[i][lens-1:] = one

     return cre,cre_maxtrix

在每一个时间步累加分数时，都会先乘以cre[:,i]（i为当前的位置）：如果某句句子的该位置是padding，那么乘以cre[:,i]之后该位置的分数就为0，不会计入总分。


 def _score_sentence(self, feats, tags, lengths=None):
     """Score the provided tag sequences of a padded batch.

     :param feats: emission scores, indexed as ``feats[:, i, :]`` per
         timestep, i.e. ``(batch, max_len, n_tags)``.
     :param tags: tag indices, ``(batch, max_len)``, long dtype.
     :param lengths: true sentence lengths, longest first. When omitted,
         every sentence is treated as spanning all timesteps of ``feats``
         (no padding is masked out).
     :return: 1-D tensor with one score per sentence.
     """
     batch_size = feats.size()[0]
     if lengths is None:
         lengths = [feats.size()[1]] * batch_size

     # Only the 0/1 position mask is needed here.
     cre, _ = self.mask_maxtric(lengths)

     score = torch.zeros((batch_size, 1))
     # Prepend START so that tags[:, i] is the previous tag of step i and
     # tags[:, i + 1] is the current one.
     start = torch.ones((batch_size, 1), dtype=torch.long)*self.tag_to_ix[START_TAG]
     tags = torch.cat([start, tags], dim=1)

     for i in range(lengths[0]):
         step_mask = cre[:,i].unsqueeze(1)
         prev_tag = tags[:,i].unsqueeze(1)
         cur_tag = tags[:, i + 1].unsqueeze(1)

         # transitions[cur] is the row of scores "to cur from *"; pick the
         # column of the previous tag.
         trans_ = torch.gather(self.transitions[tags[:,i + 1]],1,prev_tag)
         emit_ = torch.gather(feats[:,i, :],1,cur_tag)

         # Padded positions are multiplied by 0 and contribute nothing.
         score = score + step_mask * trans_ + step_mask * emit_

     # Because of the prepended START column, index lengths[b] is the last
     # real tag of sentence b.
     last_tags = torch.gather(tags, 1, torch.tensor(lengths).unsqueeze(1))
     last_score = self.transitions[self.tag_to_ix[STOP_TAG], last_tags]
     score = score + last_score
     return score.squeeze(1)

在_viterbi_decode中，用一个矩阵forward_var_table来记录每一步的forward_var，最后利用torch.gather函数按每句句子的真实长度取出其最后一个有效位置的分数；在_forward_alg中也是用同样的方式处理（下面的片段即取自_forward_alg）


 # Excerpt from the forward recursion. `forward_var`, `batch_size`, `feats`
 # and `lengths` are defined before this fragment and are not shown here.
 #
 # forward_var_table keeps the forward variables of every timestep, shape
 # (batch, lengths[0], tagset_size - 1), so that the value at each sentence's
 # own last position can be looked up after the loop.
 # NOTE(review): lengths[0] is used as the maximum length — assumes lengths
 # are sorted longest-first; confirm against the caller.
 forward_var_table = torch.zeros((len(lengths), lengths[0], self.tagset_size - 1))
 # Iterate through the sentence
 for i in range(lengths[0]):
     # Emission scores of timestep i for the whole batch.
     feat = feats[:, i, :]
     alphas_t = []  # The forward tensors at this timestep
     for next_tag in range(self.tagset_size-1):
         # broadcast the emission score: it is the same regardless of
         # the previous tag
         temp = feat[:, next_tag]
         temp = temp.view(batch_size,-1)
         temp = temp.repeat(1,self.tagset_size-1)
         emit_score = temp
         # the ith entry of trans_score is the score of transitioning to
         # next_tag from i
         trans_score = self.transitions[next_tag]

         # The ith entry of next_tag_var is the value for the
         # edge (i -> next_tag) before we do log-sum-exp
         next_tag_var = forward_var + trans_score + emit_score
         # The forward variable for this tag is log-sum-exp of all the
         # scores.
         result = log_sum_exp(next_tag_var)

         alphas_t.append(result)
     # Concatenate the per-tag results along dim 1 to form the new forward_var.
     forward_var = torch.cat(alphas_t,1)

     # Record this timestep; padded timesteps are recorded too and are simply
     # never selected by the gather below.
     forward_var_table[:, i, :] = forward_var
 # For every sentence, build a gather index pointing at position
 # (length - 1), repeated across the tag dimension.
 index = torch.tensor(lengths).reshape(len(lengths), 1, 1).repeat(1, 1, self.tagset_size - 1) - 1
 final_forward_var = torch.gather(forward_var_table, dim=1, index=index)
 # Add the scores of transitioning from every tag to STOP_TAG.
 terminal_var = final_forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]

所有的代码，可直接运行


import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_packed_sequence,pad_sequence,pack_padded_sequence
# Tensors created by the model code below are placed on this device:
# the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# To force CPU execution instead: device = torch.device("cpu")

# NOTE(review): use_gpu is not referenced anywhere else in this listing.
use_gpu = torch.cuda.is_available()
# Fix the global RNG seed so parameter initialisation is repeatable.
torch.manual_seed(1)
# Sizes of the embedding layer and of the BiLSTM output (both directions
# together).
# NOTE(review): presumably passed to BiLSTM_CRF by the training script —
# the listing itself never instantiates the model; confirm.
EMBEDDING_DIM = 64
HIDDEN_DIM = 32
# Special tag names looked up in tag_to_ix, and the vocabulary entry used
# for out-of-vocabulary tokens.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
UNK = "<UNK>"
def argmax(vec):
    """Return, as a Python int, the index of the largest entry along dim 1.

    ``vec`` must have exactly one row, because the index tensor is converted
    with ``.item()``.
    """
    best = torch.max(vec, dim=1)
    return best.indices.item()


def prepare_sequence(seqs, word_to_ix):
    """Encode, sort and pad a batch of token sequences.

    Every token is mapped through ``word_to_ix``; tokens that are missing
    fall back to the index stored under ``UNK``. The encoded sequences are
    ordered longest-first and padded into one batch-first tensor.

    :param seqs: iterable of token sequences.
    :param word_to_ix: mapping from token to integer index.
    :return: tuple ``(idxs, lengths, idx_sort)``.
        ``idxs`` is the padded ``(batch, max_len)`` long tensor (an empty
        list when ``seqs`` is empty), ``lengths`` the sorted sequence
        lengths, and ``idx_sort`` a long tensor such that row ``k`` of
        ``idxs`` is ``seqs[idx_sort[k]]``.
    """
    unk_ix = word_to_ix.get(UNK)
    encoded = [
        torch.tensor([word_to_ix.get(w, unk_ix) for w in seq], dtype=torch.long)
        for seq in seqs
    ]

    # Derive the permutation once and use it for both the reordering and the
    # returned index. Sorting the tensors with sorted() and the lengths with
    # torch.sort() separately can disagree on equal-length sequences, because
    # torch.sort is not stable by default; callers using idx_sort to reorder
    # labels would then pair them with the wrong sentence.
    order = sorted(range(len(encoded)), key=lambda k: len(encoded[k]), reverse=True)
    idxs = [encoded[k] for k in order]
    lengths = [len(t) for t in idxs]
    idx_sort = torch.tensor(order, dtype=torch.long)

    if len(seqs):
        idxs = pad_sequence(idxs,batch_first=True)

    return idxs,lengths,idx_sort


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Row-wise log-sum-exp of a 2-D tensor, returned with shape (rows, 1).

    The maximum of each row is subtracted before exponentiating and added
    back afterwards, which keeps ``exp`` from overflowing.
    """
    row_max = vec.max(dim=1, keepdim=True).values
    shifted = vec - row_max
    return row_max + torch.log(torch.exp(shifted).sum(dim=1, keepdim=True))


class BiLSTM_CRF(nn.Module):
    """Batched BiLSTM-CRF sequence tagger.

    A bidirectional LSTM produces per-token emission scores and a learned
    transition matrix scores tag-to-tag moves. All scoring routines work on
    padded batches; ``lengths`` carries the true sentence lengths and must be
    sorted longest-first, because ``lengths[0]`` is used as the padded width
    and ``pack_padded_sequence`` is called with its default sorted
    requirement.

    The emission layer and the transition matrix are sized
    ``len(tag_to_ix) - 1``.
    NOTE(review): presumably the last index of ``tag_to_ix`` is a padding
    tag that never takes part in scoring — confirm against the data loader.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Create the embedding, BiLSTM, emission layer and transitions.

        :param vocab_size: number of rows of the embedding table.
        :param tag_to_ix: mapping from tag name to index; must contain
            ``START_TAG`` and ``STOP_TAG``.
        :param embedding_dim: size of the token embeddings.
        :param hidden_dim: size of the concatenated BiLSTM output (each
            direction gets ``hidden_dim // 2``).
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size-1)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size-1, self.tagset_size-1))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a random ``(h, c)`` pair of shape ``(2, 2, hidden_dim // 2)``.

        NOTE(review): the LSTM is always called without an initial state in
        this class, so the value stored in ``self.hidden`` is never fed to it.
        """
        return (torch.randn(2, 2, self.hidden_dim // 2,device=device),
                torch.randn(2, 2, self.hidden_dim // 2,device=device))

    def _forward_alg(self, feats,lengths):
        """Compute the log partition function of every sentence in the batch.

        :param feats: emission scores, ``(batch, max_len, n_tags)`` or
            ``(max_len, n_tags)`` for a single sentence.
        :param lengths: true sentence lengths, longest first.
        :return: 1-D tensor with one log-partition value per sentence.
        """
        if len(feats.size()) == 2:
            feats = feats.unsqueeze(0)

        batch_size = feats.size()[0]
        n_tags = self.tagset_size - 1

        # Forward variables in log space; START_TAG has all of the score.
        forward_var = torch.full((batch_size, n_tags), -10000.).to(device)
        forward_var[:, self.tag_to_ix[START_TAG]] = 0

        # Forward variables of every timestep, so each sentence's own last
        # position can be picked out after the loop.
        forward_var_table = torch.zeros((len(lengths), lengths[0], n_tags)).to(device)
        # Iterate through the sentence
        for i in range(lengths[0]):
            feat = feats[:, i, :]
            # next_tag_var[b, nxt, prev] =
            #     forward_var[b, prev] + transitions[nxt, prev] + feat[b, nxt]
            # computed for all tags at once by broadcasting.
            next_tag_var = (forward_var.unsqueeze(1)
                            + self.transitions.unsqueeze(0)
                            + feat.unsqueeze(2))
            # The forward variable of each next tag is the log-sum-exp over
            # all previous tags.
            forward_var = torch.logsumexp(next_tag_var, dim=2)
            forward_var_table[:, i, :] = forward_var

        # Gather, per sentence, the forward variables at position length - 1.
        index = torch.tensor(lengths).reshape(len(lengths), 1, 1).repeat(1, 1, n_tags) - 1
        final_forward_var = torch.gather(forward_var_table, dim=1, index=index.to(device))
        terminal_var = final_forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        # Drop only the gathered time axis; a bare squeeze() would also drop
        # the batch axis when the batch holds a single sentence.
        terminal_var = terminal_var.squeeze(1)
        alpha = torch.logsumexp(terminal_var, dim=1)
        return alpha

    def _get_lstm_features(self, sentence,lengths):
        """Run embedding, BiLSTM and the emission layer.

        :param sentence: padded token indices, ``(batch, max_len)``.
        :param lengths: true sentence lengths, longest first.
        :return: emission scores; ``(batch, max_len, n_tags)`` when the batch
            has more than one sentence, otherwise ``(max_len, n_tags)``.
        """
        sentence = sentence.to(device)
        # Kept so the random number stream is consumed exactly as before;
        # the value is overwritten by the LSTM call below.
        self.hidden = self.init_hidden()
        if len(sentence)>1:
            embeds = self.word_embeds(sentence)
            # Pack so the LSTM skips the padded positions. The LSTM is run
            # once, on the packed input only.
            packed = pack_padded_sequence(embeds, lengths=lengths, batch_first=True)
            lstm_out, self.hidden = self.lstm(packed)
            lstm_out = pad_packed_sequence(lstm_out,batch_first=True)[0]
            lstm_feats = self.hidden2tag(lstm_out)
        else:
            # Single sentence: feed it time-major with a batch of one.
            embeds = self.word_embeds(sentence).view(sentence.size()[1], 1, -1)
            lstm_out, self.hidden = self.lstm(embeds)
            lstm_out = lstm_out.view(sentence.size()[1], self.hidden_dim)
            lstm_feats = self.hidden2tag(lstm_out)

        return lstm_feats

    def _score_sentence(self, feats, tags, lengths):
        """Score the provided (gold) tag sequences.

        :param feats: emission scores, ``(batch, max_len, n_tags)`` or
            ``(max_len, n_tags)`` for a single sentence.
        :param tags: gold tag indices, ``(batch, max_len)`` (or 1-D for a
            single sentence), long dtype.
        :param lengths: true sentence lengths, longest first.
        :return: 1-D tensor with one score per sentence.
        """
        # _get_lstm_features returns 2-D features for a batch of one; give
        # them a batch axis, as _forward_alg and _viterbi_decode do.
        if len(feats.size()) == 2:
            feats = feats.unsqueeze(0)
        # Move the tags before concatenating with the on-device START column.
        tags = tags.to(device)
        if len(tags.size()) == 1:
            tags = tags.unsqueeze(0)

        batch_size = feats.size()[0]
        # Only the 0/1 position mask is needed here.
        cre, _ = self.mask_maxtric(lengths)
        score = torch.zeros((batch_size, 1)).to(device)
        # Prepend START so that tags[:, i] is the previous tag of step i and
        # tags[:, i + 1] is the current one.
        start = torch.ones((batch_size, 1), dtype=torch.long).to(device)*self.tag_to_ix[START_TAG]
        tags = torch.cat([start, tags], dim=1)
        for i in range(lengths[0]):
            feat = feats[:,i, :]
            step_mask = cre[:,i].unsqueeze(1)
            # transitions[cur] is the row "to cur from *"; pick the column of
            # the previous tag.
            trans_ = self.transitions[tags[:,i + 1]]
            trans_ = torch.gather(trans_,1,tags[:,i].unsqueeze(1))
            trans = step_mask * trans_
            idxs = tags[:, i + 1].unsqueeze(1)
            emit_ = torch.gather(feat,1,idxs)
            emit = step_mask * emit_
            # Padded positions were multiplied by 0 and add nothing.
            score = score + trans +emit
        # Because of the prepended START column, index lengths[b] is the last
        # real tag of sentence b.
        last_tags = torch.gather(tags, 1, torch.tensor(lengths,device=device).unsqueeze(1))
        last_score = self.transitions[self.tag_to_ix[STOP_TAG], last_tags]
        score = score + last_score
        return score.squeeze(1)

    def mask_maxtric(self,lengths):
        """Build the padding masks for one batch.

        :param lengths: true sentence lengths, longest first.
        :return: tuple ``(cre, cre_maxtrix)``, both on ``device``.
            ``cre`` has shape ``(batch, max_len)``; row ``i`` holds 1 for the
            first ``lengths[i]`` positions and 0 for the padding, e.g.
            ``[[1,1,1,1,1], [1,1,1,0,0], [1,1,0,0,0]]``.
            ``cre_maxtrix`` has shape ``(batch, max_len, len(tag_to_ix))``;
            it is all ones, except that from position ``lengths[i] - 1``
            onward every row is a one-hot vector on the last tag index.
        """
        n_tags = len(self.tag_to_ix)
        cre = torch.zeros((len(lengths), lengths[0]))
        cre_maxtrix = torch.ones((len(lengths), lengths[0], n_tags))

        # One-hot row on the last tag index; identical for every sentence.
        one = torch.zeros(n_tags)
        one[-1] = 1

        for i, lens in enumerate(lengths):
            cre[i][:lens] = 1
            cre_maxtrix[i][lens-1:] = one

        return cre.to(device),cre_maxtrix.to(device)

    def _viterbi_decode(self, feats, lengths, mode=None):
        """Find the best-scoring tag path of every sentence.

        :param feats: emission scores, ``(batch, max_len, n_tags)`` or
            ``(max_len, n_tags)`` for a single sentence.
        :param lengths: true sentence lengths, longest first.
        :param mode: when falsy only the path scores are computed and the
            returned path list is empty; any truthy value also decodes the
            paths.
        :return: tuple ``(path_score, paths)``. ``path_score`` has shape
            ``(batch, 1)``. ``paths`` holds one tag-index tensor per sentence,
            or ``None`` for a sentence whose backtrace does not end in
            ``START_TAG``.
        """
        if len(feats.size()) == 2:
            feats = feats.unsqueeze(0)

        batch_size = feats.size()[0]
        n_tags = self.tagset_size - 1
        max_len = lengths[0]

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((batch_size, n_tags), -10000.).to(device)
        init_vvars[:, self.tag_to_ix[START_TAG]] = 0
        backpointers = []

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        forward_var_table = torch.zeros((len(lengths), max_len, n_tags)).to(device)
        for i in range(max_len):
            feat = feats[:,i,:]
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step
            for next_tag in range(n_tags):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var +self.transitions[next_tag]
                best_tag_id = torch.argmax(next_tag_var,dim=1).unsqueeze(1)
                bptrs_t.append(best_tag_id)
                best_node_score = torch.gather(next_tag_var, dim=1, index=best_tag_id)

                viterbivars_t.append(best_node_score)

            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = torch.cat(viterbivars_t, 1) + feat
            forward_var_table[:, i, :] = forward_var
            backpointers.append(torch.cat(bptrs_t, 1))

        # Gather, per sentence, the viterbi variables at position length - 1.
        index = torch.tensor(lengths).reshape(len(lengths),1,1).repeat(1,1,n_tags)-1
        final_forward_var = torch.gather(forward_var_table,dim=1,index=index.to(device))

        # Transition to STOP_TAG
        terminal_var = final_forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        terminal_var = terminal_var.squeeze(1)
        best_tag_id = torch.argmax(terminal_var, dim=1).unsqueeze(1)
        path_score = torch.gather(terminal_var, dim=1,index=best_tag_id)

        # Without a mode only the score is returned (the original comment
        # describes this as the training case).
        if not mode:
            return path_score,[]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]

        # (batch, max_len, n_tags) backpointer table ...
        backpointers_mat = torch.cat(backpointers,1).to(device).reshape(feats.size()[0],-1,n_tags)
        # ... with one extra padding column whose value is the padding index
        # n_tags itself, so following it stays on the padding column.
        backpointers_mat = torch.cat([backpointers_mat,torch.ones(len(lengths),max_len,1,device=device).long()*n_tags],-1)

        for i,length in enumerate(lengths):
            # Right-align the valid backpointers of each sentence so that all
            # sentences can be traced back together from the last row. Only
            # the row being moved is cloned, which is enough to avoid reading
            # from memory that is being overwritten.
            backpointers_mat[i][max_len-length:]=backpointers_mat[i][:length].clone()
            # Fill the freed leading rows with the padding index.
            backpointers_mat[i][:max_len - length]=n_tags
        for i in range(backpointers_mat.size()[1]-1,-1,-1):
            bptrs_t = backpointers_mat[:,i,:]
            best_tag_id = torch.gather(bptrs_t, dim=1, index=best_tag_id)
            best_path.append(best_tag_id)

        best_path.reverse()
        best_path = torch.cat(best_path, 1)
        last_paths = []

        for i in range(len(lengths)):
            # Drop the leading padding entries; what remains should start
            # with START_TAG followed by the decoded tags.
            path = best_path[i][max_len-lengths[i]:]
            if path[0] == self.tag_to_ix[START_TAG]:
                last_paths.append(path[1:])
            else:
                last_paths.append(None)

        return path_score, last_paths

    def neg_log_likelihood(self, sentence, tags,lengths):
        """Return the batch mean of (log partition - gold path score).

        :param sentence: padded token indices, ``(batch, max_len)``.
        :param tags: gold tag indices, ``(batch, max_len)``.
        :param lengths: true sentence lengths, longest first.
        """
        feats = self._get_lstm_features(sentence,lengths)
        forward_score = self._forward_alg(feats,lengths)
        gold_score = self._score_sentence(feats, tags,lengths)
        return torch.mean(forward_score - gold_score)

    def forward(self, sentence,lengths,mode=None):  # dont confuse this with _forward_alg above.
        """Compute emission scores and Viterbi-decode them.

        :param sentence: padded token indices, ``(batch, max_len)``.
        :param lengths: true sentence lengths, longest first.
        :param mode: passed to ``_viterbi_decode``; a truthy value requests
            the decoded paths in addition to the scores.
        :return: tuple ``(score, tag_seq)`` as returned by
            ``_viterbi_decode``.
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence,lengths)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats,lengths,mode=mode)
        return score, tag_seq

在github的代码中用中文实体识别的数据训练了以下batch版本的bilstm+crf

代码目录结构

main.py 主程序
batch_bilstm_crf.py 模型文件
NerDataLoader.py 数据处理文件
ResumeNER 中文实体识别的数据

如有问题，欢迎指正,谢谢！后续还会更新更完善的代码

lzh_zyy

关注

3
点赞
踩
11

收藏

觉得还不错? 一键收藏
24
评论
对pytorch官网的bilstm+crf的batch版本的实现+用中文实体识别的数据训练batch版本bilstm+crf

代码的git地址(https://github.com/liaozhihui/MY_MODEL/blob/master/batch_bilstm_crf.py)文章目录前言代码的实现前言对pytorch官网的bilstm+crf的batch版本的实现，和只传入模型一句句子不通，一个batch里句子的长短不一样，需要对句子进行padding，以及要将padding的位置不能加入分数的计算中代码的git地址代码的实现创建一个mask的矩阵，矩阵维度batch_size*max_len，句子的长
复制链接

扫一扫