Until now I had written code mostly by copying it along. This time I only looked at the reference code's overall approach and wrote everything myself, which showed just how many implementation details I had been glossing over:
- The Tensor fed into Embedding has shape batch_size * seq_len and dtype=torch.long, because it is used as an index; the pred used to compute the cross-entropy-style loss must be torch.float (see the sketch after this list).
- The computation graph is created on Tensor objects. Different objects passed as function arguments may behave like values or like object references.
- Different gensim versions come with different usage and bugs that may need working around.
- And of course the mistakes I am prone to myself: typos, forgetting to update a renamed variable everywhere, for epoch in epochs instead of for epoch in range(epochs), and for word in enumerate(o) or for i,word in o instead of for i, word in enumerate(o). Only more practice will raise my coding ability!
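A minimal sketch of those dtype requirements (toy sizes, not the real model):

import torch
from torch import nn

emb = nn.Embedding(num_embeddings=100, embedding_dim=8)
x = torch.randint(0, 100, (4, 20))     # batch_size * seq_len, dtype=torch.long
out = emb(x)                           # (4, 20, 8), dtype=torch.float32

loss_fn = nn.BCELoss()
pred = torch.rand(4)                   # predictions must be float
y = torch.randint(0, 2, (4,)).float()  # labels are cast to float as well
print(loss_fn(pred, y))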
In-place modification of Python objects
for i, sen in enumerate(train_data):   # does NOT modify train_data
    if len(sen) > 20:
        print("True")
        sen = sen[:20]                 # rebinds the local name to a new list
        if len(train_data[i]) > 20:
            print("False")             # still prints: train_data[i] is unchanged
    elif len(sen) < 20:
        print("False")
for i, sen in enumerate(train_data):   # DOES modify train_data
    if len(sen) > 20:
        print("True")
        train_data[i] = sen[:20]       # assigns through the list, replacing the element
        if len(train_data[i]) > 20:
            print("False")             # never prints: the element was truncated
    elif len(sen) < 20:
        print("False")
def sen2index(self):   # called from outside: encapsulated while keeping some external control
    # convert each sentence to indices into the vocabulary
    sens = self.sens
    # sens = [sen.split(" ") for sen in sens]
    sentence_list = []
    for sen in sens:   # sen refers to the original list in memory
        for i, word in enumerate(sen):
            if word in self.word2index:
                # sen is a local name, but item assignment still mutates sens in place
                # (sen[i] held a string, which is itself immutable)
                sen[i] = self.word2index[word]
            else:
                sen[i] = self.word2index["<UNK>"]
        if len(sen) > self.sen_len:
            # sen = sen[:self.sen_len]   # would copy and leave the original unmodified
            sen[:] = sen[:self.sen_len]  # slice assignment truncates in place
        elif len(sen) < self.sen_len:
            sen.extend([self.word2index['<PAD>']] * (self.sen_len - len(sen)))
            # sen[:] = sen + [self.word2index['<PAD>']] * (self.sen_len - len(sen))
            # print("length after padding", len(sen))
        sentence_list.append(sen)
    sens = torch.LongTensor(sens)
    return sens
The Preprocess class
- The Preprocess class converts sentences into their vocabulary indices, converts labels into an int Tensor, and extracts the embeddings from word2vec, laying the groundwork for wrapping the data and building the model.
- Keeping an appropriate degree of coupling between functions is delicate work; studying design patterns would give a deeper feel for it. For example, make_embedding controls the order in which the other functions execute.
- One more thing to note about argument passing: integers, strings, tuples, and other immutable objects do not change the caller's values, whereas lists and dicts do (a small demonstration follows this list).
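A small demonstration of that last point (hypothetical names):

def mutate(a_list, an_int):
    a_list.append(0)   # mutates the object the caller also holds
    an_int += 1        # rebinds a local name; the caller's int is untouched

xs, n = [1], 1
mutate(xs, n)
print(xs, n)           # [1, 0] 1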
# Per the reference code, the word2vec model must already be built at this point;
# building it inside the class, or calling before it exists, raises an error.
# From an encapsulation point of view: build the embedding matrix, convert the
# data to indices, and hand both over to the model.
class Preprocess:
    def __init__(self, sens, sen_len, embedding_dim, model_path=None, y=None):
        self.model_path = model_path
        self.embedding_dim = embedding_dim
        self.embedding_matrix = []
        self.word2index = {}
        self.words = []
        self.sens = sens
        self.y = y
        self.sen_len = sen_len

    def load_word_embedding(self, file):
        # load the word2vec model from the given path; it supplies a vocabulary for
        # the input sentences and an embedding for every word
        self.embedding_matrix = []
        self.embedding = Word2Vec.load(file)
        model = self.embedding
        # key_to_index / index_to_key are references into gensim's own dict/list,
        # so add_embedding below mutates them in place as well
        self.word2index = model.wv.key_to_index
        self.words = model.wv.index_to_key
        for i in range(len(model.wv)):
            self.embedding_matrix.append(model.wv[i])
        self.embedding_matrix = torch.FloatTensor(self.embedding_matrix)
        print("loading word2vec model……")
        print("dictionary vocab :{}×{}".format(self.embedding_matrix.size(0), self.embedding_matrix.size(1)))
        # model.wv is array-like, yet
        # self.embedding_matrix = torch.FloatTensor(model.wv)  # fails: last idx not present

    def add_embedding(self, word):
        # add <PAD>/<UNK> to the embedding matrix, after load_word_embedding
        # start from a randomly initialised vector
        vector = torch.empty(1, self.embedding_dim)
        # should the range match the trained embeddings, e.g. -1..1?
        torch.nn.init.uniform_(vector, a=0, b=1)
        words_size = len(self.words)
        self.words.append(word)   # careful: list.append() returns None, don't rebind words to it
        self.word2index[word] = words_size
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], dim=0)

    def make_embedding(self, file=None):
        """
        The external entry point: wraps add_embedding and fixes the load-then-add order.
        """
        if file is not None:
            self.load_word_embedding(file)
        elif len(self.embedding_matrix) > 0:
            pass   # already loaded
        elif self.model_path is not None:
            self.load_word_embedding(self.model_path)
        else:
            raise NotImplementedError   # "ImplementError" in the original does not exist
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        return self.embedding_matrix
    def sen2index(self):   # called from outside: encapsulated while keeping some external control
        # convert each sentence to indices into the vocabulary
        sens = self.sens
        # sens = [sen.split(" ") for sen in sens]
        sentence_list = []
        for sen in sens:   # sen refers to the original list in memory
            for i, word in enumerate(sen):
                if word in self.word2index:
                    # sen is a local name, but item assignment still mutates sens in place
                    # (sen[i] held a string, which is itself immutable)
                    sen[i] = self.word2index[word]
                else:
                    sen[i] = self.word2index["<UNK>"]
            if len(sen) > self.sen_len:
                # sen = sen[:self.sen_len]   # would copy and leave the original unmodified
                sen[:] = sen[:self.sen_len]  # slice assignment truncates in place
            elif len(sen) < self.sen_len:
                sen.extend([self.word2index['<PAD>']] * (self.sen_len - len(sen)))
                # sen[:] = sen + [self.word2index['<PAD>']] * (self.sen_len - len(sen))
                # print("length after padding", len(sen))
            sentence_list.append(sen)
        sens = torch.LongTensor(sens)
        return sens
    def label2tensor(self):
        # str labels to an int tensor
        if self.y is not None:
            self.y = [int(item) for item in self.y]
            return torch.LongTensor(self.y)
        else:
            raise NotImplementedError
The gensim Word2Vec model
from gensim.models import Word2Vec

def word2vec(x):
    # note: Word2Vec expects a list of tokenised sentences (lists of words)
    model = Word2Vec(x, vector_size=250, sg=1, epochs=10, min_count=5, window=5)
    return model

# train word2vec on all sentences
total = train_senss + test_senss
model = word2vec(total)
model.save("words2dilemma.model")
model = Word2Vec.load("words2dilemma.model")
# pickle problems: reinstalling gensim fixed them
I installed the latest release of gensim. While using it, I found that model.wv cannot be put into a FloatTensor directly; it raises: Key '23428' not present.
So Preprocess loads the vectors into embedding_matrix with a loop:
for i in range(len(model.wv)):
    self.embedding_matrix.append(model.wv[i])
self.embedding_matrix = torch.FloatTensor(self.embedding_matrix)
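An alternative that I believe works in gensim 4.x (not tested here): wv.vectors exposes the whole embedding matrix as a numpy array, so the loop above could become a direct conversion:

# assuming gensim 4.x, where KeyedVectors keeps the matrix in wv.vectors
self.embedding_matrix = torch.FloatTensor(model.wv.vectors)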
As for the _pickle.UnpicklingError: could not find MARK, Jupyter Notebook suggested reinstalling gensim, which solved it in the end; PyCharm, however, offers no such friendly hint.
A disconnected computation graph
This should be the last heavyweight problem: RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn.
Following a blog post, I added loss_train.requires_grad_(True).
Looking back, that only treated the symptom. I did notice that I had never needed to set this on a loss before, but without enough background knowledge I could not see the real problem.
In the end the problem turned out to be in the model.
My understanding after reading the official docs: the forward function defines a computation graph built on tensors, where "tensors" here means the weights w and b with requires_grad=True; they act as the variables when the gradient is computed. Copying their values into a fresh FloatTensor severs the functional connection between the loss and w, b.
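A minimal reproduction of the error along those lines (toy tensors, not the model):

import torch

w = torch.randn(3, requires_grad=True)
x = torch.randn(3)

good = (w * x).sum()                    # good.grad_fn is set; backward() works
bad = torch.FloatTensor([good.item()])  # a fresh leaf tensor: the graph is cut
bad.sum().backward()                    # RuntimeError: element 0 of tensors does not
                                        # require grad and does not have a grad_fn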
Complete code
Preprocessing, data wrapping, and the final testing details are all in the full code below. The code follows 李宏毅2020机器学习作业4-RNN:句子情感分类.
Data loading
import gensim
import copy
import re
import torch   # used by Preprocess below, so import it before the class is exercised

def load_training_data(file):
    if 'training_nolabel' in file:
        with open(file, encoding='utf-8') as f:
            sens = f.readlines()
        sens = [re.sub(r"[^0-9a-zA-Z]+", r" ", sen) for sen in sens]
        sens = [sen.strip() for sen in sens]
        return sens
    else:
        with open(file, encoding='utf-8') as f:
            sens = f.readlines()
        sens = [re.sub(r"[^0-9a-zA-Z]+", r" ", sen) for sen in sens]
        sens = [sen.strip().split(" ") for sen in sens]
        label = [sen[0] for sen in sens]
        # re.sub collapses the " +++$+++ " separator into a single space,
        # so the sentence starts at index 1, not 2
        senss = [sen[1:] for sen in sens]
        return label, senss

train_labels, train_senss = load_training_data("hw4//training_label.txt")
def load_testing_data(file):
    with open(file, encoding='utf-8') as f:
        next(f)   # skip the header line
        sens = f.readlines()
    sens = [sen.strip().split(',') for sen in sens]
    label = [sen[0].strip() for sen in sens]
    senss = [",".join(sen[1:]) for sen in sens]   # the text itself may contain commas
    senss = [re.sub(r"[^0-9a-zA-Z]+", r" ", sen) for sen in senss]
    senss = [sen.strip().split(" ") for sen in senss]
    return label, senss

test_labels, test_senss = load_testing_data("hw4//testing_data.txt")
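A quick sanity check of the parsed data (the exact tokens depend on the dataset):

print(len(train_senss), len(test_senss))    # number of sentences in each split
print(train_labels[0], train_senss[0][:5])  # first label and its first few tokens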
Data preprocessing
from gensim.models import Word2Vec

def word2vec(x):
    # note: Word2Vec expects a list of tokenised sentences (lists of words)
    model = Word2Vec(x, vector_size=250, sg=1, epochs=10, min_count=5, window=5)
    return model

# train word2vec on all sentences
total = train_senss + test_senss
model = word2vec(total)
model.save("words2dilemma.model")
model = Word2Vec.load("words2dilemma.model")
# Per the reference code, the word2vec model must already be built at this point;
# building it inside the class, or calling before it exists, raises an error.
# From an encapsulation point of view: build the embedding matrix, convert the
# data to indices, and hand both over to the model.
class Preprocess:
    def __init__(self, sens, sen_len, embedding_dim, model_path=None, y=None):
        # passing immutables (int, str, tuple) does not change the caller's values; lists and dicts do
        self.model_path = model_path
        self.embedding_dim = embedding_dim
        self.embedding_matrix = []
        self.word2index = {}
        self.words = []
        self.sens = sens
        self.y = y
        self.sen_len = sen_len

    def load_word_embedding(self, file):
        # load the word2vec model from the given path; it supplies a vocabulary for
        # the input sentences and an embedding for every word
        self.embedding_matrix = []
        self.embedding = Word2Vec.load(file)
        model = self.embedding
        # key_to_index / index_to_key are references into gensim's own dict/list,
        # so add_embedding below mutates them in place as well
        self.word2index = model.wv.key_to_index
        self.words = model.wv.index_to_key
        for i in range(len(model.wv)):   # "Key '23428' not present" when converting wv directly
            self.embedding_matrix.append(model.wv[i])
        self.embedding_matrix = torch.FloatTensor(self.embedding_matrix)
        print("loading word2vec model……")
        print("dictionary vocab :{}×{}".format(self.embedding_matrix.size(0), self.embedding_matrix.size(1)))
        # model.wv is array-like, yet
        # self.embedding_matrix = torch.FloatTensor(model.wv)  # fails: last idx not present

    def add_embedding(self, word):
        # add <PAD>/<UNK> to the embedding matrix, after load_word_embedding
        # start from a randomly initialised vector
        vector = torch.empty(1, self.embedding_dim)
        # should the range match the trained embeddings, e.g. -1..1?
        torch.nn.init.uniform_(vector, a=0, b=1)
        words_size = len(self.words)
        self.words.append(word)   # careful: list.append() returns None, don't rebind words to it
        self.word2index[word] = words_size
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], dim=0)

    def make_embedding(self, file=None):
        """
        The external entry point: wraps add_embedding and fixes the load-then-add order.
        """
        if file is not None:
            self.load_word_embedding(file)
        elif len(self.embedding_matrix) > 0:
            pass   # already loaded
        elif self.model_path is not None:
            self.load_word_embedding(self.model_path)
        else:
            raise NotImplementedError   # "ImplementError" in the original does not exist
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        return self.embedding_matrix
    def sen2index(self):   # called from outside: encapsulated while keeping some external control
        # convert each sentence to indices into the vocabulary
        sens = self.sens
        # sens = [sen.split(" ") for sen in sens]
        sentence_list = []
        for sen in sens:   # sen refers to the original list in memory
            for i, word in enumerate(sen):
                if word in self.word2index:
                    # sen is a local name, but item assignment still mutates sens in place
                    # (sen[i] held a string, which is itself immutable)
                    sen[i] = self.word2index[word]
                else:
                    sen[i] = self.word2index["<UNK>"]
            if len(sen) > self.sen_len:
                # sen = sen[:self.sen_len]   # would copy and leave the original unmodified
                sen[:] = sen[:self.sen_len]  # slice assignment truncates in place
            elif len(sen) < self.sen_len:
                sen.extend([self.word2index['<PAD>']] * (self.sen_len - len(sen)))
                # sen[:] = sen + [self.word2index['<PAD>']] * (self.sen_len - len(sen))
            sentence_list.append(sen)
        sens = torch.LongTensor(sens)
        return sens
    def label2tensor(self):
        # str labels to an int tensor
        if self.y is not None:
            self.y = [int(item) for item in self.y]
            return torch.LongTensor(self.y)
        else:
            raise NotImplementedError
a = copy.deepcopy(train_senss)   # deepcopy first: sen2index mutates the sentences in place
p = Preprocess(sens=a, y=train_labels, embedding_dim=250, sen_len=20)
p.make_embedding("words2dilemma.model")
train_data = p.sen2index()
train_labels = p.label2tensor()
Data wrapping
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

class Twitter(Dataset):
    def __init__(self, x, y=None):
        # don't keep a separate test flag: whether y is given already decides it
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        if self.y is not None:   # y is a Tensor, so compare with "is", not "!="
            return self.x[idx], self.y[idx]
        else:
            # don't constrain the data structure, e.g.
            # return x[idx][0], x[idx][1]   # label first, sentence second
            return self.x[idx]
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(train_data, train_labels, test_size=0.1, random_state=1, stratify=train_labels)
train_set = Twitter(x_train, y_train)
valid_set = Twitter(x_val, y_val)
train_loader = DataLoader(train_set, shuffle=True, batch_size=128)
valid_loader = DataLoader(valid_set, shuffle=True, batch_size=128)
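A quick shape check of one batch, which also confirms the dtype points from the start of this post (expected output assumes sen_len = 20 and batch_size = 128):

x_batch, y_batch = next(iter(train_loader))
print(x_batch.shape, x_batch.dtype)   # torch.Size([128, 20]) torch.int64
print(y_batch.shape, y_batch.dtype)   # torch.Size([128]) torch.int64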
Building the model
from torch import nn

class Lstm_net(nn.Module):
    def __init__(self, embedding, num_layers, bidirectional, dropout, hidden_dim, fix_embedding=True):
        super(Lstm_net, self).__init__()
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        # self.embedding = embedding   # the parameter matrix is just embedding_matrix;
        # <PAD> and <UNK> get rows (and thus parameters) like every other word
        self.embedding = torch.nn.Embedding(embedding.size(0), embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        embedding_dim = embedding.size(1)
        # freeze the pretrained embedding unless we want to fine-tune it
        self.embedding.weight.requires_grad = not fix_embedding
        self.dropout = dropout
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=self.num_layers, bidirectional=bidirectional, batch_first=True)
        self.classifier = nn.Sequential(nn.Dropout(self.dropout),
                                        nn.Linear(hidden_dim, 1),
                                        nn.Sigmoid())

    def forward(self, x):
        embedded = self.embedding(x)
        # output: batch_size * sen_len * (hidden_dim * num_directions)
        # hidden: (num_layers * num_directions) * batch_size * hidden_dim
        output, (hidden, _) = self.lstm(embedded)
        if self.bidirectional:
            # sum the two directions so the classifier input stays hidden_dim wide
            output = output[:, :, :self.hidden_dim] + output[:, :, self.hidden_dim:]
        labels = self.classifier(output[:, -1, :])
        # 2-D (batch, 1) down to 1-D (batch,)
        # return torch.FloatTensor([label[0] for label in labels])   # this was the line that cut the graph!
        return labels.squeeze()   # note: a batch of size 1 would squeeze to 0-dim; squeeze(-1) is safer
model = Lstm_net(p.embedding_matrix, num_layers=1, bidirectional=False, dropout=0.8, hidden_dim=150)
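A smoke test before training (fake batch; the vocabulary size is taken from the embedding matrix):

dummy = torch.randint(0, p.embedding_matrix.size(0), (4, 20))
print(model(dummy).shape)   # torch.Size([4]) -- one probability per sentence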
Training the model
Inspect the trainable parameters
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

import torch.cuda as cuda
device = "cuda" if cuda.is_available() else "cpu"   # "gpu" is not a valid device string
model.to(device)
def train(model, epochs, train_loader, valid_loader, lr=0.002):
    # print("vocabulary size {}", model.embedding.weight.shape)
    print("total params", sum(p.numel() for p in model.parameters()))
    print("trainable params", sum(p.numel() for p in model.parameters() if p.requires_grad))
    loss = nn.BCELoss()   # loss function
    optimizer = torch.optim.Adam(list(filter(lambda p: p.requires_grad, model.parameters())), lr)   # optimizer
    total_train_loss = []
    total_valid_loss = []
    total_acc = []
    step = 0
    for epoch in range(epochs):
        current_loss = 0
        model.train()
        # validate once per epoch, but the parameters update once per batch
        with torch.set_grad_enabled(True):
            for x, y in train_loader:
                x = x.to(device, dtype=torch.long)    # the model input must be long
                y = y.to(device, dtype=torch.float)   # the BCE loss needs float
                optimizer.zero_grad()                 # zero the gradients, otherwise they accumulate
                pred = model(x)
                loss_train = loss(pred, y)
                # loss_train.requires_grad_(True)     # the red herring from the section above
                loss_train.backward()                 # compute the gradients
                optimizer.step()                      # update the parameters
                current_loss += loss_train.detach().item()   # detach/item: keep the scalar, not the graph
                losss = loss_train.detach().item() / 128     # per-sample loss for the (commented) step logging
                step += 1
                # if step % 100 == 0:
                #     print("Train step {} train loss {:.5f}".format(step, losss))
            los = current_loss / len(train_loader)
            print("|| epoch {} || train loss {:.5f}".format(epoch, los))
            total_train_loss.append(los)
        current_loss = 0
        current_acc = 0
        model.eval()
        with torch.no_grad():   # select the model on the validation set
            # looping over batches here is purely for computational convenience,
            # not for better training; it could also be done in a single pass
            for x, y in valid_loader:
                x = x.to(device, dtype=torch.long)
                y = y.to(device, dtype=torch.float)
                pred = model(x)
                loss_valid = loss(pred, y)
                current_loss += loss_valid.item()   # no detach needed inside no_grad
                current_acc += evaluate(pred, y)
            los, acc = current_loss / len(valid_loader), current_acc / len(valid_loader)
            print("|| epoch {} || valid loss {:.5f} valid acc {:.5f}".format(epoch, los, acc))
            total_acc.append(acc)
            total_valid_loss.append(los)
    return total_train_loss, total_valid_loss, total_acc
def evaluate(pred, y):
    # threshold the predictions in place, then compare with the labels
    pred[pred >= 0.5] = 1
    pred[pred < 0.5] = 0
    return (torch.sum(torch.eq(pred, y)) / len(y)).item()   # .item() so accuracy accumulates as a float
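evaluate thresholds pred in place; a non-mutating variant (a sketch) avoids surprises if pred is reused afterwards:

def evaluate_nonmutating(pred, y):
    hard = (pred >= 0.5).float()   # a new tensor; pred stays untouched
    return torch.eq(hard, y).float().mean().item()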
total_train_loss, total_valid_loss, total_acc = train(model, 10, train_loader, valid_loader)
Testing
def testing(test_loader):
    model.eval()
    output = []
    with torch.no_grad():
        for x in test_loader:   # the test set has no labels, so each batch is x only
            x = x.to(device, dtype=torch.long)   # .to() returns a new tensor; it must be assigned
            pred = model(x)
            pred[pred >= 0.5] = 1
            pred[pred < 0.5] = 0
            output += pred.int().tolist()   # tolist(), not to_list()
    return output
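Usage would look roughly like this, where test_data is a hypothetical tensor produced by running the test sentences through another Preprocess instance (not shown in the original code):

test_set = Twitter(test_data)   # no labels
test_loader = DataLoader(test_set, shuffle=False, batch_size=128)
predictions = testing(test_loader)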
Once the model is upgraded to a bidirectional two-layer LSTM, my CPU can no longer keep up, so I can only show the baseline results.