### Self-Attention

"The animal didn't cross the street because it was too tired"

#### Self-Attention 矩阵的计算

##### 这里我们用一个文本2分类的任务融合 attention机制来解释 multi heads attention理论
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
import random

def set_random_seed(seed):
    """Seed every RNG source (python, numpy, torch CPU/GPU) and force
    deterministic cuDNN so repeated runs are reproducible."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed,
                   torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    # Deterministic convolution algorithms; disable the auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs, then pick the compute device.
set_random_seed(6688)
# BUG FIX: `torch.cuda.is_available` was referenced without calling it, so the
# bound method was always truthy and 'cuda' was selected even on CPU-only hosts.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data_path = 'F:/data/'  # root folder containing train.tsv / dev.tsv / test.tsv


from collections import Counter

def build_vocab(sents, max_words=50000):
    """Build a vocabulary from an iterable of tokens.

    Args:
        sents: iterable of token strings (already tokenized).
        max_words: keep at most this many most-frequent tokens.

    Returns:
        (itos, stoi): index->token list (with "UNK" appended as the last
        entry) and the inverse token->index dict.
    """
    # IDIOM FIX: Counter(iterable) counts at C speed instead of a manual
    # per-token Python loop.
    word_counts = Counter(sents)
    itos = [w for w, _ in word_counts.most_common(max_words)]
    itos.append("UNK")  # out-of-vocabulary bucket, always the last index
    stoi = {w: i for i, w in enumerate(itos)}
    return itos, stoi

# Whitespace tokenizer. NOTE(review): `text` is not defined anywhere in this
# file — presumably the raw corpus string is loaded in an earlier cell; verify.
tokenize = lambda x: x.split()
vob = tokenize(text.lower())
itos, stoi = build_vocab(vob)

# Peek at the five most frequent tokens (notebook-style echo).
itos[0:5]

['1', '0', 'the', ',', 'a']

##### 设计数据集
class Corpus:
    """Load the train/dev/test TSV splits and convert sentences to index lists.

    Each TSV line is expected to be "<sentence>\t<label>" where the label's
    first character is '0' or '1'.
    """

    def __init__(self, data_path, sort_by_len=False):
        self.vocab = vob
        self.sort_by_len = sort_by_len
        self.train_data, self.train_label = self.tokenize(data_path + 'train.tsv')
        self.valid_data, self.valid_label = self.tokenize(data_path + 'dev.tsv')
        self.test_data, self.test_label = self.tokenize(data_path + 'test.tsv')

    def tokenize(self, text_path):
        """Read one TSV file; return (list of index lists, list of int labels)."""
        index_data = []  # per-sample lists of vocab indices
        labels = []
        # BUG FIX: the original body referenced `line` without ever iterating
        # the file; restore the per-line loop.
        with open(text_path) as f:
            for line in f:
                sentence, label = line.split('\t')
                index_data.append(self.sentence_to_index(sentence.lower()))
                labels.append(int(label[0]))
        # BUG FIX: the original sorted index_data alone, silently misaligning
        # every sample with its label. Sort data and labels as pairs, and only
        # when the flag asks for it.
        if self.sort_by_len:
            pairs = sorted(zip(index_data, labels),
                           key=lambda p: len(p[0]), reverse=True)
            index_data = [d for d, _ in pairs]
            labels = [lab for _, lab in pairs]
        return index_data, labels

    def sentence_to_index(self, s):
        """Map each whitespace token to its vocab index, falling back to UNK."""
        unk = stoi["UNK"]
        # dict.get avoids the double lookup of `w in stoi` followed by stoi[w].
        return [stoi.get(w, unk) for w in s.split()]

    def index_to_sentence(self, x):
        """Inverse of sentence_to_index (UNK-mapped tokens stay "UNK")."""
        return ' '.join(itos[i] for i in x)

corpus = Corpus(data_path, sort_by_len=False)

##### 设计batches
def get_minibatches(text_idx, labels, batch_size=64, sort=False):
    """Zip samples with their labels and slice them into batches.

    When `sort` is True the pairs are ordered by sentence length first, so
    each batch holds similarly-sized sentences (less padding downstream).

    Returns:
        (text_idx_batches, label_batches): parallel lists of batches.
    """
    pairs = list(zip(text_idx, labels))
    if sort:
        pairs.sort(key=lambda pair: len(pair[0]))
    text_idx_batches = []
    label_batches = []
    for start in range(0, len(text_idx), batch_size):
        chunk = pairs[start:start + batch_size]
        text_idx_batches.append([sample for sample, _ in chunk])
        label_batches.append([tag for _, tag in chunk])
    return text_idx_batches, label_batches

# Hyper-parameters for batching and the model.
BATCH_SIZE = 256
VOCAB_SIZE = len(itos)
EMBEDDING_SIZE = 256
OUTPUT_SIZE = 1  # single logit -> binary classification via BCEWithLogitsLoss

# Pre-slice each split into fixed-size minibatches (lists of index lists).
train_batches, train_label_batches = get_minibatches(corpus.train_data, corpus.train_label, BATCH_SIZE)
dev_batches, dev_label_batches = get_minibatches(corpus.valid_data, corpus.valid_label, BATCH_SIZE)
test_batches, test_label_batches = get_minibatches(corpus.test_data, corpus.test_label, BATCH_SIZE)


##### 设计attention 中的 positional encoding
import math

class PositionalEncoding(nn.Module):
    """Add sinusoidal position information (Vaswani et al., 2017).

    The encoding table is computed once in log space and stored as a
    registered buffer: it moves with the module across devices but is
    not a trainable parameter.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even feature dims
        pe[:, 1::2] = torch.cos(position * div_term)  # odd feature dims
        pe = pe.unsqueeze(0)  # [1, max_len, d_model] for broadcasting
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: [batch, seq_len, d_model]
        # BUG FIX: the original wrapped the slice in `Variable(...)`, a name
        # that was never imported (and deprecated since torch 0.4). A buffer
        # already has requires_grad=False, so the plain slice is correct.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

##### 设计attention score 的叉乘操作
class ScaledDotProductAttention(nn.Module):
    """Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V."""

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask=None):
        # Q, K, V: [batch_size, n_heads, seq_length, d_k]
        # BUG FIX: the scale used the *global* d_k instead of the actual head
        # dimension; derive it from Q itself so the module is self-contained.
        d_k = Q.size(-1)

        # scores: [batch_size, n_heads, seq_length, seq_length]
        scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(d_k)

        # BUG FIX: the mask argument was accepted but never applied (the
        # original comment promised "fills ... where mask is one"). Masked
        # positions are pushed to -inf so they get zero attention weight.
        if attn_mask is not None:
            scores = scores.masked_fill(attn_mask.bool(), float('-inf'))

        # attn: [batch_size, n_heads, seq_length, seq_length]
        attn = torch.softmax(scores, dim=-1)

        # Z (context): [batch_size, n_heads, seq_length, d_k]
        Z = torch.matmul(attn, V)
        return Z

##### 设计多头的attention
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention: project into n_heads subspaces of size d_k,
    attend in each head in parallel, then project the concatenated heads
    back to d_model.

    NOTE(review): the original extract lost the `__init__` and `forward`
    headers; they are reconstructed here from the body's references
    (n_heads, d_k, EMBEDDING_SIZE) — confirm against the original notebook.
    """

    def __init__(self, d_model=256, n_heads=8, d_k=4):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_k
        # Each projection weight: [n_heads * d_k, d_model]
        self.W_Q = nn.Linear(d_model, n_heads * d_k)
        self.W_K = nn.Linear(d_model, n_heads * d_k)
        self.W_V = nn.Linear(d_model, n_heads * d_k)
        # BUG FIX: the output projection used to be constructed *inside*
        # forward() (`nn.Linear(...).to(device)(Z)`), i.e. fresh untrained
        # random weights on every call, invisible to the optimizer. It must
        # be a submodule created once here.
        self.W_O = nn.Linear(n_heads * d_k, d_model)

    def forward(self, Q, attn_mask=None):
        # Q: [batch_size, seq_length, d_model]. Self-attention: the same
        # tensor serves as query, key and value (the original projected Q
        # through all three weight matrices).
        batch_size = Q.size(0)

        # (B, S, D) -proj-> (B, S, H*W) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        k_s = self.W_K(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        v_s = self.W_V(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

        # Z: [batch_size, n_heads, seq_length, d_k]
        Z = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)

        # Concatenate heads: [batch_size, seq_length, n_heads * d_k]
        Z = Z.transpose(1, 2).contiguous().view(batch_size, -1, self.d_k * self.n_heads)

        # output: [batch_size, seq_length, d_model]
        return self.W_O(Z)

##### 设置 attention的头数量 以及 q k v的维度
# Shared per-head dimension for Q, K and V across the attention modules.
d_k = 4  # dimension of K(=Q), V
class Encoder(nn.Module):
    # Container for the layers WordAVGModel delegates to: token embedding,
    # final linear classifier, dropout, and a positional-encoding module.
    def __init__(self, vocab_size, embedding_size, output_size, dropout_p=0.5):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        initrange = 0.1
        # Uniform init in [-0.1, 0.1] for the embedding table.
        self.embed.weight.data.uniform_(-initrange, initrange)
        # NOTE(review): embed_words is never referenced elsewhere in this
        # file — looks like a leftover; confirm before removing.
        self.embed_words = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, output_size)
        self.dropout = nn.Dropout(dropout_p)
        self.Pos = PositionalEncoding(embedding_size, dropout_p, max_len=5000)


##### 设计attention 加权平均模型
class WordAVGModel(nn.Module):
    """Binary sentence classifier: multi-head self-attention produces
    per-token scores that weight an average of the token embeddings, and a
    linear layer maps that pooled vector to a single logit.

    NOTE(review): the original extract lost the `forward` header and the
    line computing `a_ts`; both are reconstructed here. `a_ts` is assumed to
    be the multi-head attention output over the embeddings — confirm against
    the original notebook.
    """

    def __init__(self, vocab_size, embedding_size, output_size, dropout_p=0.5):
        super(WordAVGModel, self).__init__()
        self.embedding_size = embedding_size
        self.output_size = output_size
        self.encoder = Encoder(vocab_size, embedding_size, output_size, dropout_p)
        # Self-attention over the embedded tokens (reconstructed submodule).
        self.attention = MultiHeadAttention(d_model=embedding_size)

    def forward(self, text, mask=None):
        # text: [batch_size, max_seq_len]   mask: [batch_size, max_seq_len]

        # embedded: [batch_size, max_seq_len, embedding_size]
        embedded = self.encoder.embed(text)

        # Positional encoding was disabled in the original run:
        # embedded = self.encoder.Pos(embedded)

        embedded = self.encoder.dropout(embedded)

        # Multi-head self-attention (same tensor acts as Q, K and V).
        # a_ts: [batch_size, max_seq_len, embedding_size]
        a_ts = self.attention(embedded)

        # Collapse features to one score per token, then normalise over the
        # sequence so the scores form a distribution.
        # a_t: [batch_size, max_seq_len]
        a_t = torch.sum(a_ts, 2)
        a_t = torch.softmax(a_t, dim=1)

        # Attention-weighted average of the embeddings.
        # h_self: [batch_size, embedding_size]
        h_self = torch.bmm(a_t.unsqueeze(1), embedded).squeeze()

        # out: [batch_size, output_size]
        out = self.encoder.linear(h_self)
        return out


model = WordAVGModel(vocab_size=VOCAB_SIZE,
                     embedding_size=EMBEDDING_SIZE,
                     output_size=OUTPUT_SIZE,
                     dropout_p=0.5)

# BCEWithLogitsLoss fuses the sigmoid with the loss; the model emits raw logits.
crit = nn.BCEWithLogitsLoss()
model = model.to(device)
# BUG FIX: `optimizer` is passed to train() below but was never defined in
# this file; the checkpoint name "wordavg-model-Adam.pth" indicates Adam.
optimizer = optim.Adam(model.parameters())

def binary_accuracy(preds, y):
    """Fraction of predictions matching y.

    A logit counts as class 1 when sigmoid(logit) rounds to 1.
    Returns a 0-dim float tensor in [0, 1].
    """
    hard_preds = torch.round(torch.sigmoid(preds))
    hits = (hard_preds == y).float()
    return hits.sum() / len(hits)

def train(model, text_idxs, labels, optimizer, crit):
    """Run one training epoch over pre-built batches.

    Args:
        model: the classifier; called as model(padded_text, mask).
        text_idxs / labels: parallel lists of batches from get_minibatches.
        optimizer / crit: optimizer and loss (BCEWithLogitsLoss).

    Returns:
        (mean loss, mean accuracy) weighted by batch size.

    NOTE(review): the original indexed a Python list with a tensor
    (`text[perm_index]`), never padded the variable-length sentences, and
    referenced an undefined `mask`; reconstructed with pad_sequence and a
    length-based mask — confirm against the original notebook.
    """
    model.train()
    epoch_loss, epoch_acc = 0., 0.
    total_len = 0.
    for text, label in zip(text_idxs, labels):
        seqs = [torch.tensor(x).long() for x in text]
        lengths = torch.tensor([len(s) for s in seqs]).long()

        # Sort the batch by length, permuting the labels in lockstep.
        lengths, perm_index = lengths.sort(descending=True)
        seqs = [seqs[i] for i in perm_index.tolist()]
        label = torch.tensor(label).long()[perm_index].to(device)

        # Pad to the longest sentence; mask is 1 on real tokens, 0 on pad.
        padded = nn.utils.rnn.pad_sequence(seqs, batch_first=True).to(device)
        mask = (torch.arange(padded.size(1)).unsqueeze(0)
                < lengths.unsqueeze(1)).float().to(device)

        # BUG FIX: gradients were never reset, so they accumulated across
        # every batch of the epoch.
        optimizer.zero_grad()
        preds = model(padded, mask).squeeze()  # [batch_size]
        loss = crit(preds, label.float())
        acc = binary_accuracy(preds, label)

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * len(label)
        epoch_acc += acc.item() * len(label)
        total_len += len(label)

    return epoch_loss / total_len, epoch_acc / total_len

def evaluate(model, text_idxs, labels, crit):
    """Score the model on held-out batches; return (mean loss, mean acc).

    BUG FIX: the original called model.train() here, leaving dropout active
    during evaluation; use eval mode and disable autograd.
    NOTE(review): batch preparation is reconstructed — the original indexed a
    Python list with a tensor and referenced an undefined `mask`; here the
    sequences are padded and masked by length. Confirm against the notebook.
    """
    model.eval()
    epoch_loss, epoch_acc = 0., 0.
    total_len = 0.
    with torch.no_grad():
        for text, label in zip(text_idxs, labels):
            seqs = [torch.tensor(x).long() for x in text]
            lengths = torch.tensor([len(s) for s in seqs]).long()

            lengths, perm_index = lengths.sort(descending=True)
            seqs = [seqs[i] for i in perm_index.tolist()]
            label = torch.tensor(label).long()[perm_index].to(device)

            padded = nn.utils.rnn.pad_sequence(seqs, batch_first=True).to(device)
            mask = (torch.arange(padded.size(1)).unsqueeze(0)
                    < lengths.unsqueeze(1)).float().to(device)

            preds = model(padded, mask).squeeze()  # [batch_size]
            loss = crit(preds, label.float())
            acc = binary_accuracy(preds, label)
            epoch_loss += loss.item() * len(label)
            epoch_acc += acc.item() * len(label)
            total_len += len(label)

    return epoch_loss / total_len, epoch_acc / total_len

##### 训练模型
N_EPOCHS = 30
best_valid_acc = 0.
record = 0  # epochs since the last validation improvement
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_batches, train_label_batches, optimizer, crit)
    valid_loss, valid_acc = evaluate(model, dev_batches, dev_label_batches, crit)

    if valid_acc > best_valid_acc:
        record = 0
        best_valid_acc = valid_acc
        # BUG FIX: the best checkpoint is loaded for testing below but was
        # never saved anywhere; save it whenever validation improves.
        torch.save(model.state_dict(), "wordavg-model-Adam.pth")
    else:
        record += 1
        # NOTE(review): with N_EPOCHS == 30 this patience can never fire;
        # kept as-is to preserve the original schedule.
        if record > 30:
            print("early stopping at epoch", epoch)
            break
    if epoch % 5 == 0:
        print("Epoch", epoch, "Train Loss", train_loss, "Train Acc", train_acc)
        print("Epoch", epoch, "Valid Loss", valid_loss, "Valid Acc", valid_acc)

Epoch 0 Train Loss 0.6855089499452708 Train Acc 0.5557145779421909
Epoch 0 Valid Loss 0.6951910366705798 Valid Acc 0.5091743135671003
Epoch 5 Train Loss 0.5996944122435344 Train Acc 0.7028215090279393
Epoch 5 Valid Loss 0.5999153551705386 Valid Acc 0.7075688084331128
Epoch 10 Train Loss 0.4333473568592937 Train Acc 0.8600490696955205
Epoch 10 Valid Loss 0.4965302976993246 Valid Acc 0.7694954188591844
Epoch 15 Train Loss 0.32046079018673657 Train Acc 0.914127990246989
Epoch 15 Valid Loss 0.46561843430230376 Valid Acc 0.7970183519048428
Epoch 20 Train Loss 0.24862328401142725 Train Acc 0.9407074219169999
Epoch 20 Valid Loss 0.4701578086669292 Valid Acc 0.7935779838387026
Epoch 25 Train Loss 0.20012277661322134 Train Acc 0.9533837661619349
Epoch 25 Valid Loss 0.4839585598455657 Valid Acc 0.7970183535453377

##### 预测test
# Reload the best validation checkpoint, then score it once on the test split.
model.load_state_dict(torch.load("wordavg-model-Adam.pth"))
test_loss, test_acc = evaluate(model, test_batches, test_label_batches, crit)
print("Test Loss", test_loss, "Test Acc", test_acc)

Test Loss 0.4726336712957672 Test Acc 0.7935200437027828



• 6
点赞
• 32
收藏
觉得还不错? 一键收藏
• 1
评论
06-17 647
09-18 2006

### “相关推荐”对你有帮助么？

• 非常没帮助
• 没帮助
• 一般
• 有帮助
• 非常有帮助

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。