Self-Attention
from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch
import torch.nn as nn
import os
import time
import math
from tqdm import tqdm
def get_data(path,num=None):
all_text = []
all_label = []
with open(path,"r",encoding="utf8") as f:
all_data = f.read().split("\n")
for data in all_data:
try:
if len(data) == 0:
continue
data_s = data.split(" ")
if len(data_s) != 2:
continue
text,label = data_s
label = int(label)
except Exception as e:
print(e)
else:
all_text.append(text)
all_label.append(int(label))
if num is None:
return all_text,all_label
else:
return all_text[:num], all_label[:num]
def build_word2index(train_text):
word_2_index = {"PAD":0,"UNK":1}
for text in train_text:
for word in text:
if word not in word_2_index:
word_2_index[word] = len(word_2_index)
return word_2_index
class TextDataset(Dataset):
def __init__(self,all_text,all_lable):
self.all_text = all_text
self.all_lable = all_lable
def __getitem__(self, index):
global word_2_index
text = self.all_text[index]
text_index = [word_2_index.get(i,1) for i in text]
label = self.all_lable[index]
text_len = len(text)
return text_index,label,text_len
def process_batch_batch(self, data):
global max_len,word_2_index,index_2_embeding
batch_text = []
batch_label = []
batch_len = []
for d in data:
batch_text.append(d[0])
batch_label.append(d[1])
batch_len.append(d[2])
batch_text = [i[:max_len] for i in batch_text]
batch_text = [i + [0]*(max_len-len(i)) for i in batch_text]
# batch_emebdding = []
# for text_idx in batch_text:
# text_embdding = []
# for idx in text_idx:
# word_emb = index_2_embeding[idx]
# text_embdding.append(word_emb)
# batch_emebdding.append(text_embdding)
return torch.tensor(batch_text),torch.tensor(batch_label)
def __len__(self):
return len(self.all_text)
class Positional(nn.Module):
def __init__(self,embedding_num,max_len = 3000):
super().__init__()
self.position = torch.zeros(size=(max_len,embedding_num),requires_grad=False) # 3000 * embedding
t = torch.arange(1,max_len+1,dtype=torch.float).unsqueeze(1)
w_i = 1/(10000**((torch.arange(0,embedding_num,2))/embedding_num))
w_i_t = w_i*t
self.position[:,::2] = torch.sin(w_i_t)
self.position[:,1::2] = torch.cos(w_i_t)
def forward(self,batch_x): # batch * len * 200
pos = self.position[:batch_x.shape[1],:]
pos = pos.unsqueeze(dim=0)
pos = pos.to(batch_x.device)
result = batch_x + pos
return result
class Self_Attention(nn.Module):
def __init__(self,embedding_num):
super(Self_Attention, self).__init__()
self.W_Q = nn.Linear(embedding_num,embedding_num,bias=False) # drop the bias term
self.W_K = nn.Linear(embedding_num,embedding_num,bias=False)
# self.W_L = nn.Linear(embedding_num,max_len,bias=False)
self.W_V = nn.Linear(embedding_num,embedding_num,bias=False)
self.softmax = nn.Softmax(dim=-1)
def forward(self,x):
Q = self.W_Q(x) # query
K = self.W_K(x) # key
# L = self.W_L(x)
V = self.W_V(x) # value
# s = (Q@(K.transpose(-1,-2)) + L) / (math.sqrt(x.shape[-1]/1.0))
s = (Q@(K.transpose(-1,-2))) / math.sqrt(x.shape[-1]) # scale by sqrt(d): without it the logits grow with the dimension and the softmax collapses onto a single position (see the note after this class)
score = self.softmax(s)
r = score @ V
return r
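# For reference, the operation above is scaled dot-product attention:
#   Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
# where d_k is the query/key dimension; the sqrt(d_k) scaling keeps the logits in a range
# where the softmax still spreads attention over several positions.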
class Model(nn.Module):
def __init__(self,word_size,embeding_dim,class_num):
super().__init__()
"""
1. 随机数表示字向量
2. 预训练字向量 : 使用bert 字向量替换, 使用sougou字向量
3. 自己基于train_text 训练字向量
"""
self.embedding = torch.nn.Embedding(word_size,embeding_dim)
self.positional = Positional(embeding_dim)
# 50k~180k short-text samples
self.att_layer = Self_Attention(embeding_dim)
self.linear1 = nn.Linear(embeding_dim,class_num)
self.loss_fun = nn.CrossEntropyLoss()
def forward(self,x,label=None):
x_embedding = self.embedding(x)
x_embedding = self.positional(x_embedding)
x_embedding = self.att_layer(x_embedding)
pre = self.linear1.forward(x_embedding)
pre = torch.mean(pre,dim=1)
if label is not None:
loss = self.loss_fun(pre,label)
return loss
else:
return torch.argmax(pre,dim=-1)
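# A minimal sketch of option 2 above (the vector source is hypothetical; only
# nn.Embedding.from_pretrained is the real API):
#   pretrained = torch.randn(len(word_2_index), 200)  # placeholder for vectors loaded from a BERT/sougou file, row i = index i
#   embedding = nn.Embedding.from_pretrained(pretrained, freeze=False, padding_idx=0)
# The resulting layer could replace self.embedding in the Model above.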
def same_seeds(seed):
torch.manual_seed(seed) # fix the random seed (CPU)
if torch.cuda.is_available(): # fix the random seed (GPU)
torch.cuda.manual_seed(seed) # for the current GPU
torch.cuda.manual_seed_all(seed) # for all GPUs
np.random.seed(seed) # make later numpy random calls reproducible
torch.backends.cudnn.benchmark = False # may be set to True when the GPU and network structure are fixed
torch.backends.cudnn.deterministic = True # use deterministic cuDNN kernels
# word2vec reproduction
if __name__ == "__main__":
same_seeds(1007)
train_text,train_lable = get_data(os.path.join("..","data","文本分类","train.txt"),70000)
dev_text,dev_lable = get_data(os.path.join("..","data","文本分类","dev.txt"),10000)
assert len(train_lable) == len(train_text),"train texts and labels have different lengths"
assert len(dev_text) == len(dev_lable),"dev texts and labels have different lengths"
embedding_num = 200
word_2_index = build_word2index(train_text)
train_batch_size = 50
max_len = 30
epoch = 10
lr = 0.001
class_num = len(set(train_lable))
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"
train_dataset = TextDataset(train_text,train_lable)
train_dataloader = DataLoader(train_dataset,batch_size=train_batch_size,shuffle=True,collate_fn=train_dataset.process_batch_batch)
dev_dataset = TextDataset(dev_text, dev_lable)
dev_dataloader = DataLoader(dev_dataset, batch_size=10, shuffle=False,collate_fn=dev_dataset.process_batch_batch)
model = Model(len(word_2_index),embedding_num,class_num).to(device)
opt = torch.optim.Adam(model.parameters(),lr)
s_time = time.time()
for e in range(epoch):
print("*" * 100)
for bi,(batch_text,batch_label) in (enumerate(train_dataloader,start=1)):
batch_text = batch_text.to(device)
batch_label = batch_label.to(device)
loss = model.forward(batch_text,batch_label)
loss.backward()
opt.step()
opt.zero_grad()
print(f"loss:{loss:.2f}")
e_time = time.time()
# print(f"cost time :{e_time - s_time:.2f}s")
s_time = time.time()
right_num = 0
for bi,(batch_text,batch_label) in (enumerate(dev_dataloader)):
batch_text = batch_text.to(device)
batch_label = batch_label.to(device)
pre = model.forward(batch_text)
right_num += int(torch.sum(pre == batch_label))
print(f"acc:{right_num/len(dev_dataset) * 100:.2f}%")
Multi-Head Attention
First recall the attention mechanism, and then self-attention.
The idea behind multi-head attention is that a sentence (or an image) contains several different things worth attending to, so we split the input into several parts and run attention on each part separately; that is multi-head attention. A minimal sketch of the standard formulation follows.
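In the standard formulation the embedding dimension (not the sequence) is split into heads; the sketch below uses illustrative names and shapes, not the code from this page.
import math
import torch
import torch.nn as nn

def multi_head_attention(x, W_Q, W_K, W_V, n_heads):
    # x: (batch, seq_len, d_model); each head works on d_model // n_heads features
    b, l, d = x.shape
    d_head = d // n_heads
    Q = W_Q(x).reshape(b, l, n_heads, d_head).transpose(1, 2)  # (b, heads, l, d_head)
    K = W_K(x).reshape(b, l, n_heads, d_head).transpose(1, 2)
    V = W_V(x).reshape(b, l, n_heads, d_head).transpose(1, 2)
    s = Q @ K.transpose(-1, -2) / math.sqrt(d_head)            # (b, heads, l, l)
    att = torch.softmax(s, dim=-1)
    return (att @ V).transpose(1, 2).reshape(b, l, d)          # concatenate the heads back

d_model, n_heads = 200, 2
W_Q = nn.Linear(d_model, d_model, bias=False)
W_K = nn.Linear(d_model, d_model, bias=False)
W_V = nn.Linear(d_model, d_model, bias=False)
x = torch.randn(4, 30, d_model)
print(multi_head_attention(x, W_Q, W_K, W_V, n_heads).shape)   # torch.Size([4, 30, 200])

Note that the M_Self_Attention class in the code below instead reshapes along the sequence length, so each head attends over a different chunk of the sentence; both versions run, they just split the input differently.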
Mask
With RNNs we used pack_padded_sequence to avoid computing on the padded positions.
Here we instead build an all-ones tensor with the same shape as x, set the entries that correspond to PAD positions to 0, and multiply it with x to get the masked input, so the padding no longer contributes.
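A minimal sketch of this length-based padding mask (tensor names are illustrative; the model code below does the same thing with a Python loop).
import torch

def mask_by_length(x, lengths):
    # x: (batch, seq_len, emb); lengths: true sequence length of each sample
    b, l, _ = x.shape
    idx = torch.arange(l, device=x.device).unsqueeze(0)                   # (1, seq_len)
    keep = idx < torch.tensor(lengths, device=x.device).unsqueeze(1)      # (batch, seq_len), False on padding
    return x * keep.unsqueeze(-1)                                         # zero out the padded positions

x = torch.randn(2, 5, 4)
print(mask_by_length(x, [3, 5])[0, 3:])   # rows 3 and 4 of sample 0 (its padding) are all zeros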
from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch
import torch.nn as nn
import os
import time
import math
from tqdm import tqdm
def get_data(path,num=None):
all_text = []
all_label = []
with open(path,"r",encoding="utf8") as f:
all_data = f.read().split("\n")
for data in all_data:
try:
if len(data) == 0:
continue
data_s = data.split(" ")
if len(data_s) != 2:
continue
text,label = data_s
label = int(label)
except Exception as e:
print(e)
else:
all_text.append(text)
all_label.append(int(label))
if num is None:
return all_text,all_label
else:
return all_text[:num], all_label[:num]
def build_word2index(train_text):
word_2_index = {"PAD":0,"UNK":1}
for text in train_text:
for word in text:
if word not in word_2_index:
word_2_index[word] = len(word_2_index)
return word_2_index
class TextDataset(Dataset):
def __init__(self,all_text,all_lable):
self.all_text = all_text
self.all_lable = all_lable
def __getitem__(self, index):
global word_2_index
text = self.all_text[index]
text_index = [word_2_index.get(i,1) for i in text]
label = self.all_lable[index]
text_len = len(text)
return text_index,label,text_len
def process_batch_batch(self, data):
global max_len,word_2_index,index_2_embeding
batch_text = []
batch_label = []
batch_len = []
for d in data:
batch_text.append(d[0])
batch_label.append(d[1])
batch_len.append(d[2])
batch_text = [i[:max_len] for i in batch_text]
batch_text = [i + [0]*(max_len-len(i)) for i in batch_text]
# batch_emebdding = []
# for text_idx in batch_text:
# text_embdding = []
# for idx in text_idx:
# word_emb = index_2_embeding[idx]
# text_embdding.append(word_emb)
# batch_emebdding.append(text_embdding)
return torch.tensor(batch_text),torch.tensor(batch_label),batch_len
def __len__(self):
return len(self.all_text)
class Positional(nn.Module):
def __init__(self,embedding_num,max_len = 3000):
super().__init__()
self.position = torch.zeros(size=(max_len,embedding_num),requires_grad=False) # 3000 * embedding
t = torch.arange(1,max_len+1,dtype=torch.float).unsqueeze(1)
w_i = 1/(10000**((torch.arange(0,embedding_num,2))/embedding_num))
w_i_t = w_i*t
self.position[:,::2] = torch.sin(w_i_t)
self.position[:,1::2] = torch.cos(w_i_t)
def forward(self,batch_x): # batch * len * 200
pos = self.position[:batch_x.shape[1],:]
pos = pos.unsqueeze(dim=0)
pos = pos.to(batch_x.device)
result = batch_x + pos
return result
# A slightly odd point: the positional encoding above is a single max_len * embedding matrix, so
# every sentence in the batch gets exactly the same positional encoding added to it. I also tried giving
# every character in the batch a different encoding (flatten the batch, add, then reshape back, see the
# commented-out forward below) and the result improved slightly, so what is the positional encoding really doing?
# This needs another look, together with how it interacts with the parameter updates.
# def forward(self,batch_x): # batch * len * 200
# pos = self.position[:batch_x.shape[0]*batch_x.shape[1],:]
# # a= batch_x[-1]
# # pos = pos.unsqueeze(dim=0)
# pos = pos.to(batch_x.device)
# batch_xx = batch_x.reshape(-1,batch_x.shape[-1])
# result = batch_xx+ pos
# result = result.reshape(batch_x.shape)
# return result
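# Quick check of the point above (illustrative toy shapes): the encoding depends only on the position index,
# so every sequence in a batch receives the same additive pattern:
#   pe = Positional(embedding_num=8, max_len=10)
#   x = torch.zeros(2, 5, 8)                      # batch of 2, length 5
#   print(torch.allclose(pe(x)[0], pe(x)[1]))     # True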
class M_Self_Attention(nn.Module):
def __init__(self,embedding_num,n_heads):
super(M_Self_Attention, self).__init__()
self.W_Q = nn.Linear(embedding_num,embedding_num,bias=False)
self.W_K = nn.Linear(embedding_num,embedding_num,bias=False)
# self.W_L = nn.Linear(embedding_num,max_len,bias=False)
self.W_V = nn.Linear(embedding_num,embedding_num,bias=False)
self.softmax = nn.Softmax(dim=-1)
self.n_heads = n_heads
def forward(self,x):
b,l,n = x.shape
x_ = x.reshape(b, self.n_heads, -1, n) # add a head dimension; note this splits the sequence length into n_heads chunks, not the embedding dimension as in the standard multi-head formulation
Q = self.W_Q(x_) # query
K = self.W_K(x_) # key
V = self.W_V(x_) # value
# s = (Q@(K.transpose(-1,-2)) + L) / (math.sqrt(x.shape[-1]/1.0))
s = (Q@(K.transpose(-1,-2))) / math.sqrt(x.shape[-1]) # batched over the (batch, head) dimensions
score = self.softmax(s)
r = score @ V # same as plain self-attention: softmax(Q K^T / sqrt(d)) @ V, just with the extra head dimension
r = r.reshape(b,l,n)
return r
class Add_Norm(nn.Module):
def __init__(self,embedding_num):
super().__init__()
self.Add = nn.Linear(embedding_num,embedding_num) # this "Add" is just a linear layer; strictly it is not needed, we could add the attention output directly to the original input x
# followed by a LayerNorm
self.Norm = nn.LayerNorm(embedding_num)
# self.Norm = nn.BatchNorm1d
# LayerNorm and BatchNorm are two normalization schemes. During backprop gradients can vanish (small numbers
# multiplied by small numbers shrink towards zero), so we normalize each layer's output; in NLP we normalize over
# the feature dimension of each token, which is why LayerNorm is the usual choice (see the note after this class)
def forward(self,x): # B * Layer * emb
add_x = self.Add(x)
norm_x = self.Norm(add_x)
return norm_x
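# Illustrative contrast (toy shapes, not from the model above):
#   ln = nn.LayerNorm(4); x = torch.randn(2, 3, 4)
#   ln(x)                                          # statistics over the last dim, separately for each of the 2*3 tokens
#   bn = nn.BatchNorm1d(4); bn(x.reshape(-1, 4))   # statistics per feature, over the whole batch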
class Feed_Forward(nn.Module): # this layer is even simpler
def __init__(self,embedding_num,feed_num):
super(Feed_Forward, self).__init__()
self.l1 = nn.Linear(embedding_num,feed_num) # no longer a square matrix: the original paper also projects up to a larger dimension here (d_ff = 4 * d_model, i.e. 2048 for d_model = 512); see the note after this class
self.relu = nn.ReLU()
self.l2 = nn.Linear(feed_num,embedding_num)
def forward(self,x):
l1_x = self.l1(x)
r_x = self.relu(l1_x)
l2_x = self.l2(r_x)
return l2_x
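# For reference, this is the position-wise feed-forward block from the original Transformer:
#   FFN(x) = max(0, x W1 + b1) W2 + b2
# applied to every position independently.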
class Block(nn.Module):
def __init__(self,embeding_dim,n_heads,feed_num):
super(Block, self).__init__()
self.att_layer = M_Self_Attention(embeding_dim, n_heads)
self.add_norm1 = Add_Norm(embeding_dim)
self.feed_forward = Feed_Forward(embeding_dim, feed_num)
self.add_norm2 = Add_Norm(embeding_dim)
self.n = 100
def forward(self,x):
att_x = self.att_layer(x)
adn_x1 = self.add_norm1(att_x)
adn_x1 = x + adn_x1 # residual connection
ff_x = self.feed_forward(adn_x1)
adn_x2 = self.add_norm2(ff_x)
adn_x2 = adn_x1 + adn_x2 # residual connection
return adn_x2
class TransformerEncoder(nn.Module):
def __init__(self,word_size,embeding_dim,class_num,n_heads,feed_num,N):
super().__init__()
self.embedding = torch.nn.Embedding(word_size,embeding_dim)
self.positional = Positional(embeding_dim)
# 50k~180k short-text samples
# self.blocks = nn.ModuleList([Block(embedding_num,n_heads,feed_num)]*N)
self.blocks = nn.Sequential(*[Block(embeding_dim,n_heads,feed_num) for _ in range(N)]) # use the constructor argument, not the global embedding_num
self.linear1 = nn.Linear(embeding_dim,class_num)
self.loss_fun = nn.CrossEntropyLoss()
def forward(self,x,batch_len,label=None):
x = self.embedding(x)
x = self.positional(x)
# after the embedding and positional encoding there are no zeros left, so the pack-style trick is not applicable here
# we still know the length of each sequence, so we zero out everything after it again
# mask_x = torch.ones(size=(*x.shape[:2],1),device=x.device)
mask_x = torch.ones_like(x,device=x.device)
for i in range(len(batch_len)):
mask_x[i][batch_len[i]:] = 0
## mask_x[i].shape -> torch.Size([30, 200])
x = mask_x * x
x = self.blocks(x)
pre = self.linear1.forward(x)
pre = torch.mean(pre,dim=1)
if label is not None:
loss = self.loss_fun(pre,label)
return loss
else:
return torch.argmax(pre,dim=-1)
def same_seeds(seed):
torch.manual_seed(seed) # fix the random seed (CPU)
if torch.cuda.is_available(): # fix the random seed (GPU)
torch.cuda.manual_seed(seed) # for the current GPU
torch.cuda.manual_seed_all(seed) # for all GPUs
np.random.seed(seed) # make later numpy random calls reproducible
torch.backends.cudnn.benchmark = False # may be set to True when the GPU and network structure are fixed
torch.backends.cudnn.deterministic = True # use deterministic cuDNN kernels
# word2vec reproduction
if __name__ == "__main__":
same_seeds(1007)
train_text,train_lable = get_data(os.path.join("..","data","文本分类","train.txt"),70000)
dev_text,dev_lable = get_data(os.path.join("..","data","文本分类","dev.txt"),10000)
assert len(train_lable) == len(train_text),"train texts and labels have different lengths"
assert len(dev_text) == len(dev_lable),"dev texts and labels have different lengths"
embedding_num = 200
word_2_index = build_word2index(train_text)
train_batch_size = 50
max_len = 30
epoch = 10
lr = 0.001
n_heads = 2
N = 2
feed_num = int(embedding_num*1.2)
class_num = len(set(train_lable))
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"
train_dataset = TextDataset(train_text,train_lable)
train_dataloader = DataLoader(train_dataset,batch_size=train_batch_size,shuffle=True,collate_fn=train_dataset.process_batch_batch)
dev_dataset = TextDataset(dev_text, dev_lable)
dev_dataloader = DataLoader(dev_dataset, batch_size=10, shuffle=False,collate_fn=dev_dataset.process_batch_batch)
model = TransformerEncoder(len(word_2_index),embedding_num,class_num,n_heads,feed_num,N).to(device)
opt = torch.optim.Adam(model.parameters(),lr)
s_time = time.time()
for e in range(epoch):
print("*" * 100)
for bi,(batch_text,batch_label,batch_len) in (enumerate(train_dataloader,start=1)):
batch_text = batch_text.to(device)
batch_label = batch_label.to(device)
loss = model.forward(batch_text,batch_len,batch_label)
loss.backward()
opt.step()
opt.zero_grad()
print(f"loss:{loss:.2f}")
e_time = time.time()
# print(f"cost time :{e_time - s_time:.2f}s")
s_time = time.time()
right_num = 0
for bi,(batch_text,batch_label,batch_len) in (enumerate(dev_dataloader)):
batch_text = batch_text.to(device)
batch_label = batch_label.to(device)
pre = model.forward(batch_text,batch_len)
right_num += int(torch.sum(pre == batch_label))
print(f"acc:{right_num/len(dev_dataset) * 100:.2f}%")
Basic unpacking
d1 = {"name":"buding","score":10}
d2 = {"name":"liu","score":10.2}
d3 = {"name":"lin","score":9.2}
d4 = {"name":"yuan","score":11.2}
l = [d1,d2,d3,d4]
names,scores = zip(*[i.values() for i in l])
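# names -> ("buding", "liu", "lin", "yuan"), scores -> (10, 10.2, 9.2, 11.2)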
Unencapsulated code and notes
Below is the version I wrote myself, without wrapping the encoder layers in a Block class.
import os
import torch.nn as nn
import torch.cuda
from torch.utils.data import Dataset,DataLoader
import math
import time
import numpy as np
def get_data(path,num = None):
all_text = []
all_label = []
with open (path,"r",encoding="utf-8") as f:
# all_data = f.read().split("\n")
al_data = f.readlines()
alll_data = [line.strip("\n") for line in al_data]
for data in alll_data:
try:
if len(data)==0:
continue
data_s =data.split("\t")
if len(data_s) != 2:
continue
text,label = data_s
label = int(label)
# key point: the label must be converted to int
except Exception as e:
print(e)
else:
all_text.append(text)
all_label.append(label)
if num is None:
return all_text,all_label
else:
return all_text[:num],all_label[:num]
def build_word2index(train_text):
word_2_index = {"PAD":0,"UNK":1}
for text in train_text:
for word in text:
if word not in word_2_index:
word_2_index[word] = len(word_2_index)
return word_2_index
# When writing this function, pay attention to how word_2_index is built:
# if word not in word_2_index:
#     word_2_index[word] = len(word_2_index)
# these two lines are the crucial part.
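# e.g. build_word2index(["ab", "ba"]) -> {"PAD": 0, "UNK": 1, "a": 2, "b": 3}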
class TextDataset(Dataset):
def __init__(self,all_text,all_label):
self.all_text = all_text
self.all_label = all_label
def __getitem__(self, index):
global word_2_index
text = self.all_text[index]
text_index = [word_2_index.get(i,1) for i in text]
label = self.all_label[index]
text_len = len(text)
return text_index ,label ,text_len
def process_batch(self,data):
global max_len,word_2_index
batch_text = []
batch_label = []
batch_len = []
for i in data:
batch_text.append(i[0])
batch_label.append(i[1])
batch_len.append(i[2])
# Remember the strategy: truncate first, then pad, then map words to indices (the mapping already happened in __getitem__).
# Truncation is usually not done to the batch minimum; padding to the per-batch maximum works, but for that to help
# we should first sort the data by length so that every batch contains sequences of similar length,
# and in that case the DataLoader outside must not shuffle.
# The code below simply truncates/pads everything to the externally supplied max_len (see the sketch after this class).
batch_text = [i[:max_len] for i in batch_text]
batch_text = [i +[0]*(max_len-len(i)) for i in batch_text]
return torch.tensor(batch_text),torch.tensor(batch_label),batch_len
def __len__(self):
return len(self.all_text)
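# A minimal sketch of the per-batch-maximum padding mentioned above (illustrative names; it assumes the same
# (text_index, label, text_len) items that __getitem__ returns):
def process_batch_dynamic(data):
    texts, labels, lens = zip(*data)
    batch_max = max(lens)                                          # pad to the longest sample in this batch
    padded = [list(t) + [0] * (batch_max - len(t)) for t in texts]
    return torch.tensor(padded), torch.tensor(labels), list(lens)
# To keep batches length-homogeneous, sort the samples by length beforehand and pass shuffle=False:
#   order = sorted(range(len(train_text)), key=lambda i: len(train_text[i]))
#   train_text = [train_text[i] for i in order]; train_label = [train_label[i] for i in order]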
class Positional(nn.Module):
def __init__(self,embedding_num,max_len = 3000):
super().__init__()
# attribute assignments have to come after the super().__init__() call
self.position = torch.zeros(size = (max_len,embedding_num),requires_grad = False)
t = torch.arange(1,max_len+1,dtype=torch.float).unsqueeze(1)
w_i = 1/(10000**((torch.arange(0,embedding_num,2))/embedding_num))
w_i_t = w_i*t
self.position[:,::2] = torch.sin(w_i_t)
self.position[:,1::2] = torch.cos(w_i_t)
def forward(self,batch_x): # batch_x has shape batch * len * embedding_num
pos = self.position[:batch_x.shape[1],:]
pos = pos.unsqueeze(dim=0)
pos = pos.to(batch_x.device)
result = batch_x + pos
return result
class Self_Attention(nn.Module):
def __init__(self,embedding_num,n_head):
super().__init__()
self.W_Q = nn.Linear(embedding_num,embedding_num,bias=False)
self.W_K = nn.Linear(embedding_num,embedding_num,bias=False)
# self.W_L = nn.Linear(embedding_num,max_len,bias=False)
self.W_V = nn.Linear(embedding_num,embedding_num,bias=False)
self.softmax = nn.Softmax(dim = -1)
self.n_head = n_head
def forward(self,x):
b,l,n = x.shape
x_ = x.reshape(b,self.n_head,-1,n)
Q = self.W_Q(x_) # query
K = self.W_K(x_) # key
V = self.W_V(x_) # value
s = (Q@(K.transpose(-1,-2))) / math.sqrt(x.shape[-1])
score = self.softmax(s)
r = score @ V
r = r.reshape(b,l,n)
return r
class NORM(nn.Module):
def __init__(self,embedding_num ):
super().__init__()
self.Norm = nn.LayerNorm(embedding_num)
def forward(self,x):
norm_x = self.Norm(x)
return norm_x
class Feed_Forward(nn.Module):
def __init__(self,embedding_num,feed_num):
super().__init__()
self.l1 = nn.Linear(embedding_num, feed_num) # no longer a square matrix: the original paper projects up to a larger dimension here
self.relu = nn.ReLU()
self.l2 = nn.Linear(feed_num, embedding_num)
def forward(self,x):
l1_x = self.l1(x)
r_x = self.relu(l1_x)
l2_x = self.l2(r_x)
return l2_x
class Transformer_encoder(nn.Module):
def __init__(self,word_size,embedding_number,class_num,n_head,feed_num):
super().__init__()
self.embedding= torch.nn.Embedding(word_size,embedding_number)
self.positional = Positional(embedding_number)
self.att =Self_Attention(embedding_number,n_head)
self.norm = NORM(embedding_number)
self.feed_forward = Feed_Forward(embedding_number,feed_num)
self.linear1 = nn.Linear(embedding_number,class_num)
self.loss_fun = nn.CrossEntropyLoss()
def forward(self,x,batch_len,label = None):
x = self.embedding(x)
x = self.positional(x)
mask_x = torch.ones_like(x,device=x.device)
for i in range(len(batch_len)):
mask_x[i][batch_len[i]:] = 0
## mask_x[i].shape -> torch.Size([30, 200])
x = mask_x * x
att_x =self.att(x)
x = att_x+x
x = self.norm(x)
ff_x = self.feed_forward(x)
x = x+ff_x
x = self.norm(x)
pre = self.linear1.forward(x)
pre = torch.mean(pre,dim=1)
if label is not None:
loss = self.loss_fun(pre,label)
return loss
else:
return torch.argmax(pre,dim=-1)
def same_seeds(seed):
torch.manual_seed(seed) # fix the random seed (CPU)
if torch.cuda.is_available(): # fix the random seed (GPU)
torch.cuda.manual_seed(seed) # for the current GPU
torch.cuda.manual_seed_all(seed) # for all GPUs
np.random.seed(seed) # make later numpy random calls reproducible
torch.backends.cudnn.benchmark = False # may be set to True when the GPU and network structure are fixed
torch.backends.cudnn.deterministic = True # use deterministic cuDNN kernels
if __name__ =="__main__":
same_seeds(1007)
train_text,train_label= get_data(os.path.join("..","data","文本分类","train.txt"),70000)
dev_text,dev_label = get_data(os.path.join("..","data","文本分类","dev.txt"),10000)
assert len(train_text)==len(train_label),"train texts and labels have different lengths, cannot train"
assert len(dev_text) == len(dev_label),"dev texts and labels have different lengths, cannot train"
word_2_index = build_word2index(train_text)
index_2_word = list(word_2_index)
word_size = len(index_2_word)
epoch= 100
lr = 0.001
max_len = 30
class_num = len(set(train_label))
n_head = 2
embedding_number = 100
feed_num = int(embedding_number * 1.2)
# hidden_num = 50
train_batch_size = 50
train_dataset = TextDataset(train_text,train_label)
train_dataloader = DataLoader(train_dataset,batch_size=train_batch_size,shuffle=False,collate_fn = train_dataset.process_batch )
dev_dataset = TextDataset(dev_text, dev_label)
dev_dataloader = DataLoader(dev_dataset, batch_size=10, shuffle=False, collate_fn=dev_dataset.process_batch)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = Transformer_encoder(word_size,embedding_number,class_num,n_head,feed_num).to(device)
opt = torch.optim.Adam(model.parameters(),lr)
s_time = time.time()
for e in range(epoch):
print("*" * 100)
for bi, (batch_text, batch_label, batch_len) in (enumerate(train_dataloader, start=1)):
batch_text = batch_text.to(device)
batch_label = batch_label.to(device)
loss = model.forward(batch_text, batch_len, batch_label)
loss.backward()
opt.step()
opt.zero_grad()
print(f"loss:{loss:.2f}")
e_time = time.time()
# print(f"cost time :{e_time - s_time:.2f}s")
s_time = time.time()
right_num = 0
for bi, (batch_text, batch_label, batch_len) in (enumerate(dev_dataloader)):
batch_text = batch_text.to(device)
batch_label = batch_label.to(device)
pre = model.forward(batch_text, batch_len)
right_num += int(torch.sum(pre == batch_label))
print(f"acc:{right_num / len(dev_dataset) * 100:.2f}%")