看图介绍
先是我们的input将其进行embedding,然后加上我们的位置编码,(rnn中是矩阵相加,lstm中用的是矩阵拼接,而我们的位置编码加的时候还有门道)然后传入多头注意力机制(里面乘以q、k、v三个矩阵,然后进行softmax,得到得分矩阵),从三条线中我们可以看出来它使用的是三头注意力,旁边的那条线是残差,所谓残差就是假设我们有一个A矩阵(embedding)我们将其传入到模型之中,然后得到一个B矩阵,但是这个模型的效果很不好,我们就认为B是A的一个损失版本,为减少损失,我们将A和B进行相加,得到C,那么C矩阵就是A的一个增强版本,然后就到了Feed Forward,就是两个线性层中间加一个ReLU,线性变换的功能就是空间变换,中间的激活函数是为了引入非线性。
位置编码的作用:之前我们在做古诗生成的时候,为了学习字与字之间的信息,都是将每个字的信息一层层地往下传递,用的是rnn;加入了位置编码之后我们就不用一层一层向下传递了。类似于word embedding,位置编码也可以用随机矩阵,比如我们产生一个-1到1的随机矩阵。还有一个问题:我们直接用它的index做位置编码行不行?这里强烈不建议:首先index是有大小关系的,其次相邻的位置难以区分。我们自然是想找一个有规则的矩阵来表示位置编码,transformer中用的就是sin和cos构成的一组编码,满足相近的位置编码后不会很相近,而且还能无限长地编码下去。在embedding后面加上位置信息,要保证最后在embedding维度上加上的信息是一致的,不能改变它的embedding信息。
这里我们用的是随机数位置编码。没啥用!!!
from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch
import torch.nn as nn
import os
import time
from tqdm import tqdm
def get_data(path, num=None):
    """Load space-separated (text, label) pairs from *path*.

    Each non-empty line is expected to look like ``"<text> <label>"``;
    malformed lines (wrong field count, non-integer label) are skipped.
    *num*, when given, truncates both returned lists.

    Returns:
        (all_text, all_label): parallel lists of str texts and int labels.
    """
    all_text = []
    all_label = []
    with open(path, "r", encoding="utf8") as f:
        for line in f.read().split("\n"):
            if not line:
                continue
            parts = line.split(" ")
            if len(parts) != 2:
                continue
            text, label = parts
            # Only the int() conversion can actually fail here, so keep
            # the try-span minimal and the exception type narrow.
            try:
                label = int(label)
            except ValueError as e:
                print(e)  # best-effort: report the bad line and skip it
                continue
            all_text.append(text)
            all_label.append(label)
    if num is None:
        return all_text, all_label
    return all_text[:num], all_label[:num]
def build_word2index(train_text):
    """Build a character-level vocabulary from the training texts.

    Index 0 is reserved for PAD and index 1 for UNK; every previously
    unseen character gets the next free index, in order of appearance.
    """
    word_2_index = {"PAD": 0, "UNK": 1}
    for sentence in train_text:
        for ch in sentence:
            # setdefault only inserts when the key is missing, so the
            # first occurrence fixes the index.
            word_2_index.setdefault(ch, len(word_2_index))
    return word_2_index
class TextDataset(Dataset):
    """Dataset of raw texts plus integer labels.

    NOTE(review): relies on the module-level globals ``word_2_index`` and
    ``max_len`` being defined before use (they are set in the __main__
    block of this script).
    """

    def __init__(self, all_text, all_lable):
        self.all_text = all_text
        self.all_lable = all_lable

    def __getitem__(self, index):
        global word_2_index
        text = self.all_text[index]
        # Map each character to its vocab index; unknown chars -> UNK (1).
        text_index = [word_2_index.get(ch, 1) for ch in text]
        label = self.all_lable[index]
        return text_index, label, len(text)

    def process_batch_batch(self, data):
        """collate_fn: truncate/pad every sample to ``max_len`` (PAD=0).

        Removed from the original: an unused ``min_len`` computation, a
        dead ``index_2_embeding`` global and a commented-out manual
        embedding-lookup loop.
        """
        global max_len
        batch_text = [d[0] for d in data]
        batch_label = [d[1] for d in data]
        batch_text = [t[:max_len] for t in batch_text]
        batch_text = [t + [0] * (max_len - len(t)) for t in batch_text]
        return torch.tensor(batch_text), torch.tensor(batch_label)

    def __len__(self):
        return len(self.all_text)
class Positional(nn.Module):
    """Random (non-learned) positional encoding added to the embeddings.

    A single N(0, 1) column of length ``max_len`` is drawn and repeated
    across the embedding dimension, so every feature of a given position
    receives the same random offset.  (The notes above call this out as
    a baseline — "没啥用" — compared to sinusoidal encodings.)
    """

    def __init__(self, embedding_num, max_len=3000):
        super().__init__()
        # (max_len, 1) standard-normal draw, tiled to (max_len, embedding_num).
        self.position = torch.normal(0, 1, size=(max_len, 1)).repeat(1, embedding_num)

    def forward(self, batch_x):  # batch_x: (batch, seq_len, embedding_num)
        seq_len = batch_x.shape[1]
        # (1, seq_len, emb) broadcasts over the batch dimension.
        offsets = self.position[:seq_len, :].unsqueeze(dim=0).to(batch_x.device)
        return batch_x + offsets
class Model(nn.Module):
    """Text classifier: embedding + positional encoding + linear + mean-pool.

    Word vectors are randomly initialised here; alternatives noted by the
    original author:
      1. random vectors (this file)
      2. pretrained vectors (BERT / sogou)
      3. vectors trained on train_text
    """

    def __init__(self, word_size, embeding_dim, class_num):
        super().__init__()
        self.embedding = torch.nn.Embedding(word_size, embeding_dim)
        self.positional = Positional(embeding_dim)
        self.linear1 = nn.Linear(embeding_dim, class_num)
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self, x, label=None):
        emb = self.positional(self.embedding(x))
        # Project each position to class scores, then average over the
        # sequence dimension -> (batch, class_num) logits.
        logits = torch.mean(self.linear1(emb), dim=1)
        if label is None:
            return torch.argmax(logits, dim=-1)
        return self.loss_fun(logits, label)
def same_seeds(seed):
    """Seed every RNG in use (CPU, CUDA, numpy) for reproducibility."""
    torch.manual_seed(seed)  # CPU RNG
    np.random.seed(seed)     # numpy RNG
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)      # current GPU
        torch.cuda.manual_seed_all(seed)  # every GPU
    # Trade cudnn autotuning for deterministic kernel selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# word2vec 复现
if __name__ == "__main__":
    same_seeds(1007)

    # ---- data ------------------------------------------------------------
    train_text, train_lable = get_data(os.path.join("..", "data", "文本分类", "train.txt"), 70000)
    dev_text, dev_lable = get_data(os.path.join("..", "data", "文本分类", "dev.txt"), 10000)
    assert len(train_lable) == len(train_text), "训练数据长度都不一样,你玩冒险呢?"
    assert len(dev_text) == len(dev_lable), "验证数据长度都不一样,你玩冒险呢?"

    # ---- hyper-parameters ------------------------------------------------
    embedding_num = 200
    word_2_index = build_word2index(train_text)
    train_batch_size = 50
    max_len = 30
    epoch = 10
    lr = 0.001
    class_num = len(set(train_lable))
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # ---- loaders / model / optimiser -------------------------------------
    train_dataset = TextDataset(train_text, train_lable)
    train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True,
                                  collate_fn=train_dataset.process_batch_batch)
    dev_dataset = TextDataset(dev_text, dev_lable)
    dev_dataloader = DataLoader(dev_dataset, batch_size=10, shuffle=False,
                                collate_fn=dev_dataset.process_batch_batch)
    model = Model(len(word_2_index), embedding_num, class_num).to(device)
    opt = torch.optim.Adam(model.parameters(), lr)

    s_time = time.time()
    for e in range(epoch):
        print("*" * 100)

        # training pass
        for bi, (batch_text, batch_label) in enumerate(train_dataloader, start=1):
            batch_text = batch_text.to(device)
            batch_label = batch_label.to(device)
            loss = model.forward(batch_text, batch_label)
            loss.backward()
            opt.step()
            opt.zero_grad()
        print(f"loss:{loss:.2f}")
        s_time = time.time()

        # evaluation pass
        right_num = 0
        for bi, (batch_text, batch_label) in enumerate(dev_dataloader):
            batch_text = batch_text.to(device)
            batch_label = batch_label.to(device)
            pre = model.forward(batch_text)
            right_num += int(torch.sum(pre == batch_label))
        print(f"acc:{right_num/len(dev_dataset) * 100:.2f}%")
用正弦、余弦来构造位置编码。
from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch
import torch.nn as nn
import os
import time
from tqdm import tqdm
def get_data(path, num=None):
    """Load space-separated (text, label) pairs from *path*.

    Each non-empty line is expected to look like ``"<text> <label>"``;
    malformed lines (wrong field count, non-integer label) are skipped.
    *num*, when given, truncates both returned lists.

    Returns:
        (all_text, all_label): parallel lists of str texts and int labels.
    """
    all_text = []
    all_label = []
    with open(path, "r", encoding="utf8") as f:
        for line in f.read().split("\n"):
            if not line:
                continue
            parts = line.split(" ")
            if len(parts) != 2:
                continue
            text, label = parts
            # Only the int() conversion can actually fail here, so keep
            # the try-span minimal and the exception type narrow.
            try:
                label = int(label)
            except ValueError as e:
                print(e)  # best-effort: report the bad line and skip it
                continue
            all_text.append(text)
            all_label.append(label)
    if num is None:
        return all_text, all_label
    return all_text[:num], all_label[:num]
def build_word2index(train_text):
    """Build a character-level vocabulary from the training texts.

    Index 0 is reserved for PAD and index 1 for UNK; every previously
    unseen character gets the next free index, in order of appearance.
    """
    word_2_index = {"PAD": 0, "UNK": 1}
    for sentence in train_text:
        for ch in sentence:
            # setdefault only inserts when the key is missing, so the
            # first occurrence fixes the index.
            word_2_index.setdefault(ch, len(word_2_index))
    return word_2_index
class TextDataset(Dataset):
    """Dataset of raw texts plus integer labels.

    NOTE(review): relies on the module-level globals ``word_2_index`` and
    ``max_len`` being defined before use (they are set in the __main__
    block of this script).
    """

    def __init__(self, all_text, all_lable):
        self.all_text = all_text
        self.all_lable = all_lable

    def __getitem__(self, index):
        global word_2_index
        text = self.all_text[index]
        # Map each character to its vocab index; unknown chars -> UNK (1).
        text_index = [word_2_index.get(ch, 1) for ch in text]
        label = self.all_lable[index]
        return text_index, label, len(text)

    def process_batch_batch(self, data):
        """collate_fn: truncate/pad every sample to ``max_len`` (PAD=0).

        Removed from the original: an unused ``min_len`` computation, a
        dead ``index_2_embeding`` global and a commented-out manual
        embedding-lookup loop.
        """
        global max_len
        batch_text = [d[0] for d in data]
        batch_label = [d[1] for d in data]
        batch_text = [t[:max_len] for t in batch_text]
        batch_text = [t + [0] * (max_len - len(t)) for t in batch_text]
        return torch.tensor(batch_text), torch.tensor(batch_label)

    def __len__(self):
        return len(self.all_text)
class Positional(nn.Module):
    """Fixed sinusoidal positional encoding (Transformer-style).

    position[t, 2i]   = sin(w_i * (t+1))
    position[t, 2i+1] = cos(w_i * (t+1))   with  w_i = 1 / 10000**(2i/d)

    NOTE(review): positions start at 1 here (``torch.arange(1, max_len+1)``)
    rather than 0 as in "Attention Is All You Need"; kept as-is to
    preserve the existing behavior.
    """

    def __init__(self, embedding_num, max_len=3000):
        super().__init__()
        # (max_len, embedding_num) lookup table; not a parameter, no grad.
        self.position = torch.zeros(size=(max_len, embedding_num), requires_grad=False)
        t = torch.arange(1, max_len + 1, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        w_i = 1 / (10000 ** (torch.arange(0, embedding_num, 2) / embedding_num))
        w_i_t = w_i * t  # (max_len, ceil(embedding_num / 2))
        # Even columns get sin, odd columns get cos.
        self.position[:, ::2] = torch.sin(w_i_t)
        # Fix: slice to floor(d/2) columns so an odd embedding_num no
        # longer crashes (the odd-column slice has one fewer column than
        # w_i_t in that case).
        self.position[:, 1::2] = torch.cos(w_i_t[:, : embedding_num // 2])

    def forward(self, batch_x):  # batch_x: (batch, seq_len, embedding_num)
        pos = self.position[: batch_x.shape[1], :]
        pos = pos.unsqueeze(dim=0)  # (1, seq_len, emb) broadcasts over batch
        pos = pos.to(batch_x.device)
        return batch_x + pos
class Model(nn.Module):
    """Text classifier: embedding + positional encoding + linear + mean-pool.

    Word vectors are randomly initialised here; alternatives noted by the
    original author:
      1. random vectors (this file)
      2. pretrained vectors (BERT / sogou)
      3. vectors trained on train_text
    """

    def __init__(self, word_size, embeding_dim, class_num):
        super().__init__()
        self.embedding = torch.nn.Embedding(word_size, embeding_dim)
        self.positional = Positional(embeding_dim)
        self.linear1 = nn.Linear(embeding_dim, class_num)
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self, x, label=None):
        emb = self.positional(self.embedding(x))
        # Project each position to class scores, then average over the
        # sequence dimension -> (batch, class_num) logits.
        logits = torch.mean(self.linear1(emb), dim=1)
        if label is None:
            return torch.argmax(logits, dim=-1)
        return self.loss_fun(logits, label)
def same_seeds(seed):
    """Seed every RNG in use (CPU, CUDA, numpy) for reproducibility."""
    torch.manual_seed(seed)  # CPU RNG
    np.random.seed(seed)     # numpy RNG
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)      # current GPU
        torch.cuda.manual_seed_all(seed)  # every GPU
    # Trade cudnn autotuning for deterministic kernel selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# word2vec 复现
if __name__ == "__main__":
    same_seeds(1007)

    # ---- data ------------------------------------------------------------
    train_text, train_lable = get_data(os.path.join("..", "data", "文本分类", "train.txt"), 70000)
    dev_text, dev_lable = get_data(os.path.join("..", "data", "文本分类", "dev.txt"), 10000)
    assert len(train_lable) == len(train_text), "训练数据长度都不一样,你玩毛线呢?"
    assert len(dev_text) == len(dev_lable), "验证数据长度都不一样,你玩毛线呢?"

    # ---- hyper-parameters ------------------------------------------------
    embedding_num = 200
    word_2_index = build_word2index(train_text)
    train_batch_size = 50
    max_len = 30
    epoch = 10
    lr = 0.001
    class_num = len(set(train_lable))
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # ---- loaders / model / optimiser -------------------------------------
    train_dataset = TextDataset(train_text, train_lable)
    train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True,
                                  collate_fn=train_dataset.process_batch_batch)
    dev_dataset = TextDataset(dev_text, dev_lable)
    dev_dataloader = DataLoader(dev_dataset, batch_size=10, shuffle=False,
                                collate_fn=dev_dataset.process_batch_batch)
    model = Model(len(word_2_index), embedding_num, class_num).to(device)
    opt = torch.optim.Adam(model.parameters(), lr)

    s_time = time.time()
    for e in range(epoch):
        print("*" * 100)

        # training pass
        for bi, (batch_text, batch_label) in enumerate(train_dataloader, start=1):
            batch_text = batch_text.to(device)
            batch_label = batch_label.to(device)
            loss = model.forward(batch_text, batch_label)
            loss.backward()
            opt.step()
            opt.zero_grad()
        print(f"loss:{loss:.2f}")
        s_time = time.time()

        # evaluation pass
        right_num = 0
        for bi, (batch_text, batch_label) in enumerate(dev_dataloader):
            batch_text = batch_text.to(device)
            batch_label = batch_label.to(device)
            pre = model.forward(batch_text)
            right_num += int(torch.sum(pre == batch_label))
        print(f"acc:{right_num/len(dev_dataset) * 100:.2f}%")