Fudan NLP Lab nlp-beginner Task 4: Sequence Labeling with LSTM + CRF

After slacking off through finals, it has finally arrived.

I sat down and studied CRFs properly this time. Here is a demo version first.
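For reference, what the torchcrf layer computes (and what the model's forward pass below returns) is the conditional log-likelihood of a linear-chain CRF. The notation here is mine, not from the original assignment:

$$\log p(y \mid x) = \sum_{t=1}^{T} E_t(y_t) + \sum_{t=2}^{T} A_{y_{t-1}, y_t} - \log Z(x)$$

where $E_t(y_t)$ is the emission score from the linear layer on top of the BiLSTM, $A$ is the learned tag-transition matrix (torchcrf additionally learns start/end transition scores), and $Z(x)$ is the partition function summing over all possible tag sequences. Training maximizes this quantity, which is why the training loop negates the model output to get a loss, and prediction uses Viterbi decoding to find the highest-scoring tag path.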

model

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from torchcrf import CRF

class LSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_index, embedding_size, hidden_size, max_length, vectors=None):
        super(LSTM_CRF, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.tag_to_index = tag_to_index
        self.target_size = len(tag_to_index)
        if vectors is None:
            self.embedding = nn.Embedding(vocab_size, embedding_size)
        else:
            # use pretrained word vectors if they are provided
            self.embedding = nn.Embedding.from_pretrained(vectors)
        # hidden_size // 2 per direction, so the BiLSTM output dimension is hidden_size
        self.lstm = nn.LSTM(embedding_size, hidden_size // 2, bidirectional=True)
        self.hidden_to_tag = nn.Linear(hidden_size, self.target_size)
        self.crf = CRF(self.target_size, batch_first=True)
        self.max_length = max_length

    def get_mask(self, length_list):
        # 1 for real tokens, 0 for padding; torchcrf expects a bool mask of shape (batch, max_length)
        mask = []
        for length in length_list:
            mask.append([1] * length + [0] * (self.max_length - length))
        return torch.tensor(mask, dtype=torch.bool)

    def LSTM_Layer(self, sentences, length_list):
        embeds = self.embedding(sentences)
        # pack the padded batch so the LSTM skips padding positions
        packed_sentences = pack_padded_sequence(embeds, lengths=length_list, batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.lstm(packed_sentences)
        # unpack back to a fixed max_length so emissions, targets and mask line up
        result, _ = pad_packed_sequence(lstm_out, batch_first=True, total_length=self.max_length)
        # project BiLSTM outputs to per-tag emission scores: (batch, max_length, target_size)
        feature = self.hidden_to_tag(result)
        return feature

    def CRF_layer(self, input, targets, length_list):
        """Compute the conditional log likelihood of a sequence of tags given emission scores.
        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            tags (`~torch.LongTensor`): Sequence of tags tensor of size
                ``(seq_length, batch_size)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
            reduction: Specifies  the reduction to apply to the output:
                ``none|sum|mean|token_mean``. ``none``: no reduction will be applied.
                ``sum``: the output will be summed over batches. ``mean``: the output will be
                averaged over batches. ``token_mean``: the output will be averaged over tokens.

        Returns:
            `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if
            reduction is ``none``, ``()`` otherwise.
        """
        return self.crf(input, targets, self.get_mask(length_list))

    def forward(self, sentences, length_list, targets):
        x = self.LSTM_Layer(sentences, length_list)
        x = self.CRF_layer(x, targets, length_list)

        return x

    def predict(self, sentences, length_list):
        out = self.LSTM_Layer(sentences, length_list)
        mask = self.get_mask(length_list)
        # Viterbi decoding: returns the best tag-id sequence for each sentence
        return self.crf.decode(out, mask)
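
A quick smoke test of the interface above; the vocabulary, tag set and shapes here are made up purely for illustration and are not the real CoNLL-2003 setup:

import torch
from model import LSTM_CRF

# toy tag set and sizes, only for checking shapes and return types
toy_tags = {'O': 0, 'B-PER': 1, 'I-PER': 2}
model = LSTM_CRF(vocab_size=100, tag_to_index=toy_tags,
                 embedding_size=16, hidden_size=8, max_length=5)

sentences = torch.randint(0, 100, (2, 5))        # (batch, max_length) word ids, already padded
lengths = torch.tensor([5, 3])                    # true lengths before padding
tags = torch.randint(0, 3, (2, 5))                # gold tag ids; padded positions are masked out

log_likelihood = model(sentences, lengths, tags)  # scalar: summed log p(y|x) over the batch
loss = -log_likelihood                            # what the training loop minimizes
best_paths = model.predict(sentences, lengths)    # list of tag-id lists, one per sentence

forward returns the (summed) log-likelihood, so the loss is simply its negation; predict runs Viterbi decoding through torchcrf.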



utils

import torch
from torch.utils.data import DataLoader, Dataset

def read_data(path, length):
    sentences_list = []         # each element is one whole sentence (space-joined words)
    sentences_list_labels = []  # each element is the label sequence of one sentence
    with open(path, 'r', encoding='UTF-8') as f:
        sentence_labels = []    # labels of the words in the current sentence
        sentence = []           # words of the current sentence

        for line in f:
            line = line.strip()
            if not line:        # a blank line marks the end of a sentence
                if sentence:    # guard against consecutive blank lines producing empty sentences
                    sentences_list.append(' '.join(sentence))
                    sentences_list_labels.append(' '.join(sentence_labels))

                    # reset the buffers, ready to read the next sentence
                    sentence = []
                    sentence_labels = []
            else:
                res = line.split()
                assert len(res) == 4
                if res[0] == '-DOCSTART-':
                    continue
                sentence.append(res[0])
                sentence_labels.append(res[3])

        if sentence:            # the file may not end with a blank line; flush the last sentence
            sentences_list.append(' '.join(sentence))
            sentences_list_labels.append(' '.join(sentence_labels))
    return sentences_list[:length], sentences_list_labels[:length]

def build_vocab(sentences_list):
    ret = []
    for sentences in sentences_list:
        ret += [word for word in sentences.split()]
    return list(set(ret))

class mydataset(Dataset):
    def __init__(self, x : torch.Tensor, y : torch.Tensor, length_list):
        self.x = x
        self.y = y
        self.length_list = length_list
    def __getitem__(self, index):
        data = self.x[index]
        labels = self.y[index]
        length = self.length_list[index]
        return data, labels, length
    def __len__(self):
        return len(self.x)

def get_idx(word, d):
    # fall back to the '<unknown>' index for out-of-vocabulary words
    if word in d:
        return d[word]
    return d['<unknown>']

def sentence2vector(sentence, d):
    return [get_idx(word, d) for word in sentence.split()]

def padding(x, max_length, d):
    # pad in place with the '<pad>' index up to max_length
    for i in range(max_length - len(x)):
        x.append(d['<pad>'])
    return x

def get_dataloader(x, y, batch_size):
    word2idx, tag2idx, vocab_size = pre_processing()
    inputs = [sentence2vector(s, word2idx) for s in x]  # convert every sentence into a list of word indices
    targets = [sentence2vector(s, tag2idx) for s in y]

    length_list = [len(sentence) for sentence in inputs]

    # Hard-code a fixed padding length shared by train and test so it matches the
    # max_length the model was built with; 124 is chosen to cover the longest sentence
    # in the data used here.
    max_length = 124

    inputs = torch.tensor([padding(sentence, max_length, word2idx) for sentence in inputs])
    targets = torch.tensor([padding(sentence, max_length, tag2idx) for sentence in targets], dtype=torch.long)

    dataset = mydataset(inputs, targets, length_list)
    dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)

    return dataloader, max_length

def pre_processing():
    x_train, y_train = read_data("data/conll2003/train.txt", 14000)
    x_test, y_test = read_data("data/conll2003/test.txt", 3200)
    d_x = build_vocab(x_train+x_test)
    d_y = build_vocab(y_train+y_test)
    word2idx = {d_x[i]: i for i in range(len(d_x))}
    tag2idx = {d_y[i]: i for i in range(len(d_y))}
    # extra tags appended after the real CoNLL tags
    tag2idx["<START>"] = len(tag2idx)
    tag2idx["<STOP>"] = len(tag2idx)
    # index for out-of-vocabulary words (see get_idx) and for padding
    word2idx['<unknown>'] = len(word2idx)
    word2idx['<pad>'] = len(word2idx)
    tag2idx['<pad>'] = len(tag2idx)
    vocab_size = len(word2idx)
    print(tag2idx)
    return word2idx, tag2idx, vocab_size

def compute_f1(pred, targets, length_list):
    # Token-level micro-averaged F1 over the non-padding positions.
    # The first 9 indices correspond to the original CoNLL tags (including 'O');
    # <START>/<STOP>/<pad> are ignored.
    tp = [0] * 15
    fn = [0] * 15
    fp = [0] * 15
    for i, length in enumerate(length_list):
        for j in range(length):
            a, b = pred[i][j], targets[i][j]
            if (a == b):
                tp[a] += 1
            else:
                fp[a] += 1
                fn[b] += 1
    tps = 0
    fps = 0
    fns = 0
    for i in range(9):          # only the 9 original tag indices, not <START>/<STOP>/<pad>
        tps += tp[i]
        fps += fp[i]
        fns += fn[i]
    if tps == 0:                # avoid division by zero very early in training
        return 0
    p = tps / (tps + fps)
    r = tps / (tps + fns)
    return 2 * p * r / (p + r)
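
A tiny hand-made sanity check of compute_f1 (the tag ids below are arbitrary, not the real CoNLL mapping):

from utils import compute_f1

# one sentence of length 3: two tokens predicted correctly, one wrong
pred    = [[0, 1, 2]]
targets = [[0, 2, 2]]
print(compute_f1(pred, targets, [3]))   # P = R = 2/3, so F1 ≈ 0.667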



main

import numpy as np
from utils import read_data
from utils import get_dataloader
from utils import pre_processing
from model import LSTM_CRF
import time
import torch
import matplotlib.pyplot as plt
from torchtext.vocab import Vectors
from utils import compute_f1

batch_size = 250
embedding_size = 100
hidden_size = 20
epochs = 20
# GloVe vectors are loaded here, but train() is called with vectors=None below,
# so the model actually uses randomly initialized embeddings.
vectors = Vectors('glove.6B.100d.txt',
                  'C:/Users/Mechrevo/Desktop/nlp-beginner/code-for-nlp-beginner-master/Task2-Text Classification (RNN&CNN)/embedding')


def train(model, vocab_size, tag2idx, embedding_size, hidden_size, max_length, vectors=None):
    # `model` is passed in as a class and instantiated here
    model = model(vocab_size, tag2idx, embedding_size, hidden_size, max_length, vectors=vectors)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    start_time = time.time()
    loss_history = []
    print("dataloader length: ", len(train_dataloader))
    model.train()
    f1_history = []
    idx2tag = {value: key for key, value in tag2idx.items()}
    for epoch in range(epochs):
        total_loss = 0.
        f1 = 0
        for idx, (inputs, targets, length_list) in enumerate(train_dataloader):

            model.zero_grad()
            # the model returns the CRF log-likelihood, so the loss is its negation
            loss = (-1) * model(inputs, length_list, targets)
            total_loss += loss.item()
            pred = model.predict(inputs, length_list)
            f1 += compute_f1(pred, targets, length_list)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            if (idx + 1) % 10 == 0:
                # total_loss is reset every 10 batches, so average it over those 10 batches;
                # f1 accumulates over the epoch, so divide by the number of batches so far
                loss_history.append(total_loss / 10)
                f1_history.append(f1 / (idx + 1))
                print("epochs : {}, batch : {}, loss : {}, f1 : {}".format(
                    epoch + 1, (idx + 1) * batch_size, total_loss / 10, f1 / (idx + 1)))
                total_loss = 0

    plt.plot(np.arange(len(loss_history)), np.array(loss_history))
    plt.xlabel('Iterations')
    plt.ylabel('Training Loss')
    plt.title('LSTM+CRF model')
    plt.show()

    plt.plot(np.arange(len(f1_history)), np.array(f1_history))
    plt.title('train f1 scores')
    plt.show()

    model.eval()
    f1 = 0
    total = 0
    with torch.no_grad():
        for idx, (inputs, targets, length_list) in enumerate(test_dataloader):
            pred = model.predict(inputs, length_list)
            # weight each batch's F1 by its number of sentences so the final value is
            # an average over the whole test set (the last batch may be smaller than 32)
            f1 += compute_f1(pred, targets, length_list) * len(length_list)
            total += len(length_list)
    print("f1 score : {}, test size = {}".format(f1 / total, total))

if __name__ == '__main__':
    x_train, y_train = read_data("data/conll2003/train.txt", 14000)
    x_test, y_test = read_data("data/conll2003/test.txt", 3200)
    word2idx, tag2idx, vocab_size = pre_processing()
    train_dataloader, train_max_length = get_dataloader(x_train, y_train, batch_size)
    test_dataloader, test_max_length = get_dataloader(x_test, y_test, 32)
    train(LSTM_CRF, vocab_size, tag2idx, embedding_size, hidden_size, max_length=train_max_length, vectors=None)
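
Note that the GloVe vectors loaded at the top never reach the model, since train() is called with vectors=None. If you want to use them, one way is to build an embedding matrix aligned with word2idx and pass it in instead; this is a sketch under the assumption that the GloVe dimension (100) matches embedding_size, not something the original script does:

# build a (vocab_size, embedding_size) matrix; words missing from GloVe get zero vectors
weights = torch.zeros(len(word2idx), embedding_size)
for word, idx in word2idx.items():
    weights[idx] = vectors[word]
train(LSTM_CRF, vocab_size, tag2idx, embedding_size, hidden_size,
      max_length=train_max_length, vectors=weights)

Keep in mind that nn.Embedding.from_pretrained freezes the embedding weights by default.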


    

Training results:

[Figure: training loss curve ("LSTM+CRF model")]
[Figure: training F1 curve ("train f1 scores")]
A bit more than one point below the best score on Papers with Code (90.94). Probably because I didn't tune the hyperparameters? (Not going to bother, it runs too slowly on my 1660 Ti.)
