After slacking off through finals, it's finally here.
This time I studied CRFs properly; here is a demo version first.
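As a one-line recap (this is the quantity torchcrf's forward pass computes): for a linear-chain CRF, the log likelihood of a tag sequence y given the emission scores is

\log p(y \mid x) = \mathrm{score}(x, y) - \log \sum_{y'} \exp \mathrm{score}(x, y'), \qquad \mathrm{score}(x, y) = \sum_t E_{t, y_t} + \sum_t T_{y_{t-1}, y_t}

where E are the per-token emission scores (here, the BiLSTM outputs) and T is the learned tag-transition matrix (torchcrf additionally learns start and end transition scores). Training minimizes the negative of this log likelihood, and prediction runs Viterbi decoding.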
model
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from torchcrf import CRF


class LSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_index, embedding_size, hidden_size, max_length, vectors=None):
        super(LSTM_CRF, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.tag_to_index = tag_to_index
        self.target_size = len(tag_to_index)
        if vectors is None:
            self.embedding = nn.Embedding(vocab_size, embedding_size)
        else:
            self.embedding = nn.Embedding.from_pretrained(vectors)
        # Bidirectional LSTM: each direction gets hidden_size // 2 units,
        # so the concatenated output size is exactly hidden_size.
        self.lstm = nn.LSTM(embedding_size, hidden_size // 2, bidirectional=True)
        self.hidden_to_tag = nn.Linear(hidden_size, self.target_size)
        self.crf = CRF(self.target_size, batch_first=True)
        self.max_length = max_length

    def get_mask(self, length_list):
        # Boolean mask: True for real tokens, False for padding.
        mask = []
        for length in length_list:
            mask.append([1] * length + [0] * (self.max_length - length))
        return torch.tensor(mask, dtype=torch.bool)

    def LSTM_Layer(self, sentences, length_list):
        embeds = self.embedding(sentences)
        packed_sentences = pack_padded_sequence(embeds, lengths=length_list, batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.lstm(packed_sentences)
        result, _ = pad_packed_sequence(lstm_out, batch_first=True, total_length=self.max_length)
        # Project LSTM states to per-tag emission scores: (batch_size, max_length, target_size).
        feature = self.hidden_to_tag(result)
        return feature

    def CRF_layer(self, emissions, targets, length_list):
        """Compute the conditional log likelihood of a sequence of tags
        given emission scores (see ``torchcrf.CRF.forward``).

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(batch_size, seq_length, num_tags)``, since ``batch_first`` is ``True``.
            targets (`~torch.LongTensor`): Tag tensor of size ``(batch_size, seq_length)``.

        Returns:
            `~torch.Tensor`: The log likelihood, summed over the batch
            (torchcrf's default ``reduction='sum'``).
        """
        return self.crf(emissions, targets, self.get_mask(length_list))

    def forward(self, sentences, length_list, targets):
        x = self.LSTM_Layer(sentences, length_list)
        x = self.CRF_layer(x, targets, length_list)
        return x

    def predict(self, sentences, length_list):
        out = self.LSTM_Layer(sentences, length_list)
        mask = self.get_mask(length_list)
        # Viterbi decoding: returns a list of tag-index lists, one per sentence.
        return self.crf.decode(out, mask)
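A quick smoke test for the model, with made-up sizes (3 sentences, vocabulary of 20 words, 4 tags, max length 5), just to check the shapes line up:

import torch
from model import LSTM_CRF

tag_to_index = {'O': 0, 'B-PER': 1, 'I-PER': 2, '<pad>': 3}
model = LSTM_CRF(vocab_size=20, tag_to_index=tag_to_index,
                 embedding_size=8, hidden_size=6, max_length=5)
sentences = torch.randint(0, 20, (3, 5))   # (batch, max_length) word indices
targets = torch.randint(0, 3, (3, 5))      # (batch, max_length) tag indices
lengths = torch.tensor([5, 3, 2])          # true lengths before padding
loss = -model(sentences, lengths, targets)  # negative log likelihood, a scalar
print(loss.item(), model.predict(sentences, lengths))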
utils
import torch
from torch.utils.data import DataLoader, Dataset


def read_data(path, length):
    sentences_list = []         # each element is a whole sentence
    sentences_list_labels = []  # each element is the label sequence of a whole sentence
    with open(path, 'r', encoding='UTF-8') as f:
        sentence_labels = []  # labels of each word in the current sentence
        sentence = []         # words of the current sentence
        for line in f:
            line = line.strip()
            if not line:  # a blank line marks the end of a sentence
                if sentence:  # guard against consecutive blank lines producing empty sentences
                    sentences_list.append(' '.join(sentence))
                    sentences_list_labels.append(' '.join(sentence_labels))
                    # start fresh lists, ready to read the next sentence
                    sentence = []
                    sentence_labels = []
            else:
                res = line.split()
                assert len(res) == 4
                if res[0] == '-DOCSTART-':
                    continue
                sentence.append(res[0])
                sentence_labels.append(res[3])
        if sentence:  # in case the file does not end with a blank line
            sentences_list.append(' '.join(sentence))  # bug fix: join into a string, like the loop above
            sentences_list_labels.append(' '.join(sentence_labels))
    return sentences_list[:length], sentences_list_labels[:length]
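For context, each non-blank line of a CoNLL-2003 file has four space-separated columns (word, POS tag, chunk tag, NER tag), which is what the assert checks; read_data keeps only columns 0 and 3. The first few lines of the train file look like this:

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC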
def build_vocab(sentences_list):
    ret = []
    for sentences in sentences_list:
        ret += [word for word in sentences.split()]
    return list(set(ret))


class mydataset(Dataset):
    def __init__(self, x: torch.Tensor, y: torch.Tensor, length_list):
        self.x = x
        self.y = y
        self.length_list = length_list

    def __getitem__(self, index):
        data = self.x[index]
        labels = self.y[index]
        length = self.length_list[index]
        return data, labels, length

    def __len__(self):
        return len(self.x)


def get_idx(word, d):
    # bug fix: d[word] raises KeyError for unseen words, so test membership instead
    if word in d:
        return d[word]
    else:
        return d['<unknown>']


def sentence2vector(sentence, d):
    return [get_idx(word, d) for word in sentence.split()]


def padding(x, max_length, d):
    for i in range(max_length - len(x)):
        x.append(d['<pad>'])
    return x


def get_dataloader(x, y, batch_size):
    word2idx, tag2idx, vocab_size = pre_processing()
    inputs = [sentence2vector(s, word2idx) for s in x]  # turn every sentence into an index vector
    targets = [sentence2vector(s, tag2idx) for s in y]
    length_list = [len(sentence) for sentence in inputs]
    # Pad everything to one fixed length shared by the train and test loaders,
    # so that model.get_mask (which uses a single max_length) stays consistent.
    max_length = 124
    inputs = torch.tensor([padding(sentence, max_length, word2idx) for sentence in inputs])
    targets = torch.tensor([padding(sentence, max_length, tag2idx) for sentence in targets], dtype=torch.long)
    dataset = mydataset(inputs, targets, length_list)
    dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)
    return dataloader, max_length
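A round-trip of the helpers above on one fragment; the indices shown are illustrative, since they depend on the order the vocabulary happens to be built in:

word2idx, tag2idx, vocab_size = pre_processing()
vec = sentence2vector('EU rejects German call', word2idx)  # e.g. [4821, 903, 77, 1350]
vec = padding(vec, 124, word2idx)                          # appends <pad> ids up to length 124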
def pre_processing():
    x_train, y_train = read_data("data/conll2003/train.txt", 14000)
    x_test, y_test = read_data("data/conll2003/test.txt", 3200)
    d_x = build_vocab(x_train + x_test)
    d_y = build_vocab(y_train + y_test)
    word2idx = {d_x[i]: i for i in range(len(d_x))}
    tag2idx = {d_y[i]: i for i in range(len(d_y))}
    # Assign the special tag indices dynamically rather than hard-coding 9 and 10,
    # which silently breaks if the data contains a different number of tags.
    # (torchcrf does not actually need <START>/<STOP>; they only pad out target_size.)
    tag2idx["<START>"] = len(tag2idx)
    tag2idx["<STOP>"] = len(tag2idx)
    word2idx['<unknown>'] = len(word2idx)  # fallback index used by get_idx
    word2idx['<pad>'] = len(word2idx)
    tag2idx['<pad>'] = len(tag2idx)
    vocab_size = len(word2idx)
    print(tag2idx)
    return word2idx, tag2idx, vocab_size
def compute_f1(pred, targets, length_list):
    # Token-level micro F1. Counters are indexed by tag id; 15 is just a
    # safe upper bound on the number of tag ids (9 data tags + specials).
    tp = [0] * 15
    fn = [0] * 15
    fp = [0] * 15
    for i, length in enumerate(length_list):
        for j in range(length):
            a, b = pred[i][j], targets[i][j]
            if a == b:
                tp[a] += 1
            else:
                fp[a] += 1
                fn[b] += 1
    # Only the 9 real CoNLL tags (ids 0-8) count towards the score;
    # <START>, <STOP> and <pad> are excluded.
    tps = sum(tp[:9])
    fps = sum(fp[:9])
    fns = sum(fn[:9])
    p = tps / (tps + fps)
    r = tps / (tps + fns)
    return 2 * p * r / (p + r)
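A hand-checkable example of this metric; note that ids 0-8 include the O tag, so the number is not directly comparable to the span-level (conlleval-style) F1 usually reported for CoNLL-2003:

pred = [[0, 1, 2]]                    # predicted tag indices for one sentence
targets = torch.tensor([[0, 1, 1]])   # gold tags
print(compute_f1(pred, targets, [3])) # 2 tp, 1 fp, 1 fn -> micro F1 = 2/3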
main
import numpy as np
from utils import read_data
from utils import get_dataloader
from utils import pre_processing
from utils import compute_f1
from model import LSTM_CRF
import torch
import matplotlib.pyplot as plt
from torchtext.vocab import Vectors

batch_size = 250
embedding_size = 100
hidden_size = 20
epochs = 20
# Pre-trained GloVe vectors. Note: the train() call at the bottom passes
# vectors=None, so these are loaded but never actually used.
vectors = Vectors('glove.6B.100d.txt',
                  'C:/Users/Mechrevo/Desktop/nlp-beginner/code-for-nlp-beginner-master/Task2-Text Classification (RNN&CNN)/embedding')
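To actually use the GloVe vectors, nn.Embedding.from_pretrained would need a weight tensor whose rows line up with word2idx, not the torchtext Vectors object itself. A sketch of the missing alignment step, where build_weight_matrix is a hypothetical helper and not part of the original code:

def build_weight_matrix(word2idx, glove, dim=100):
    # Rows must follow word2idx order; torchtext's Vectors returns a zero
    # vector for out-of-vocabulary words by default.
    weight = torch.zeros(len(word2idx), dim)
    for word, idx in word2idx.items():
        weight[idx] = glove[word]
    return weight

# then, in __main__:
# train(LSTM_CRF, ..., vectors=build_weight_matrix(word2idx, vectors))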
def train(model, vocab_size, tag2idx, embedding_size, hidden_size, max_length, vectors=None):
    model = model(vocab_size, tag2idx, embedding_size, hidden_size, max_length, vectors=vectors)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_history = []
    f1_history = []
    print("dataloader length: ", len(train_dataloader))
    model.train()
    for epoch in range(epochs):
        total_loss = 0.
        f1 = 0
        for idx, (inputs, targets, length_list) in enumerate(train_dataloader):
            model.zero_grad()
            # The model returns the log likelihood, so the loss is its negative.
            loss = (-1) * model(inputs, length_list, targets)
            total_loss += loss.item()
            pred = model.predict(inputs, length_list)
            f1 += compute_f1(pred, targets, length_list)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            if (idx + 1) % 10 == 0:
                # total_loss is reset every 10 batches, so average over the
                # logging window rather than over all batches so far.
                cur_loss = total_loss / 10
                loss_history.append(cur_loss)
                f1_history.append(f1 / (idx + 1))
                total_loss = 0
                print("epochs : {}, batch : {}, loss : {}, f1 : {}".format(
                    epoch + 1, (idx + 1) * batch_size, cur_loss, f1 / (idx + 1)))
    plt.plot(np.arange(len(loss_history)), np.array(loss_history))
    plt.xlabel('Iterations')
    plt.ylabel('Training Loss')
    plt.title('LSTM+CRF model')
    plt.show()
    plt.plot(np.arange(len(f1_history)), np.array(f1_history))
    plt.title('train f1 scores')
    plt.show()

    model.eval()
    f1 = 0
    with torch.no_grad():
        for idx, (inputs, targets, length_list) in enumerate(test_dataloader):
            pred = model.predict(inputs, length_list)
            f1 += compute_f1(pred, targets, length_list)
    # Average the per-batch F1 over the test batches (3200 sentences, batch size 32).
    f1 /= len(test_dataloader)
    print("f1 score : {}, test size = {}".format(f1, 3200))
if __name__ == '__main__':
    x_train, y_train = read_data("data/conll2003/train.txt", 14000)
    x_test, y_test = read_data("data/conll2003/test.txt", 3200)
    word2idx, tag2idx, vocab_size = pre_processing()
    train_dataloader, train_max_length = get_dataloader(x_train, y_train, batch_size)
    test_dataloader, test_max_length = get_dataloader(x_test, y_test, 32)
    train(LSTM_CRF, vocab_size, tag2idx, embedding_size, hidden_size, max_length=train_max_length, vectors=None)
Training results:
A bit more than a point short of the best score on Papers with Code (90.94), probably because I didn't tune the hyperparameters? (Not going to bother; it runs too slowly on my 1660 Ti.)