TextRNN in PyTorch: A Hands-On Implementation
Preface
First, a small complaint: while studying machine learning and deep learning, I found that most tutorials are written in a purely procedural style, so even very simple things read as if through a fog. Object-oriented code makes most things understandable at a glance, so I hope this post can serve as a modest starting point for better work.
One more note: I prefer camelCase naming, and the code below uses it throughout.
Embedding
Before you can understand TextRNN, you first need to understand Embedding; once that is clear, everything that remains is just an RNN.
Processing documents generally follows these steps (a small end-to-end sketch of steps 6 and 7 follows the list):
1. Turn the corpus into a list structure [ [doc1], [doc2], ... ], where every document is one list.
2. Split each document into words, e.g. [ ["我","爱","小喵咪"], ["我","吃","西瓜"], ... ]; jieba is the usual tokenizer.
3. Remove stop words: punctuation such as "!" and "," and filler words such as "啊" carry no real meaning.
4. Count every word's frequency, e.g. { "我": 1800, "爱": 1500 }, then sort by frequency to assign each word an index: { "我": 1, "爱": 2, "吃": 3, "西瓜": 4, "小喵咪": 5 }.
5. Convert the words of every document into those indices: ["我","爱","小喵咪"] ==> [1, 2, 5], ["我","吃","西瓜"] ==> [1, 3, 4].
6. Prepare the word-vector matrix. Assume this matrix A has size [100, 300] (100 words, 300 dimensions each). It can be obtained either from pretrained vectors published by others (e.g. the Sogou or Tencent releases), or by training your own with gensim's word2vec; this post uses the latter, because the dataset is small and that route is simpler.
7. Embedding simply uses the index vector B from step 5 to pull rows out of the real word-vector matrix A from step 6: passing the indices [1, 2, 5] through an nn.Embedding layer built from A returns rows 1, 2 and 5 of A.
8. The embedded word vectors then take part in the RNN (or any other network) computation; from this point on, the pipeline looks just like image processing. If RNNs are still unclear, see the earlier RNN-in-practice post.
So as you can see, the fiddly part of deep learning on documents is data preparation: the data has to be massaged into this specific format. Once the steps above are clear, the code follows naturally.
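To make steps 6 and 7 concrete, here is a minimal self-contained sketch; the toy sentences and the hyperparameters (vector_size=8, window=2) are invented for illustration, and the gensim call uses the 4.x API:
import numpy as np
import torch
from torch import nn
from gensim.models import Word2Vec

# toy corpus: already tokenized, stop words removed
sentences = [["我", "爱", "小喵咪"], ["我", "吃", "西瓜"]]
# train word vectors (gensim >= 4.0; older versions call the argument size)
w2v = Word2Vec(sentences, vector_size=8, window=2, min_count=1)
# vocabulary: word -> row index, in gensim's frequency order
vocab = {word: idx for idx, word in enumerate(w2v.wv.index_to_key)}
# matrix A: row i is the vector of the word whose index is i
A = torch.tensor(np.stack([w2v.wv[word] for word in vocab]))
embedding = nn.Embedding.from_pretrained(A, freeze=False)
# B: the index sequence of one document; the lookup returns rows B of A
B = torch.LongTensor([vocab["我"], vocab["爱"], vocab["小喵咪"]])
print(embedding(B).shape)  # torch.Size([3, 8])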
Implementing TextRNN in an Object-Oriented Style
Preparing the Data
I found a ch_auto.csv dataset online and split it into three parts:
- train.tsv, the training set
- test.tsv, the test set
- dev.tsv, the validation set
Data format: each row of the .tsv files carries three tab-separated columns: index, label, text (the reading code below expects exactly three fields per line).
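The original post does not show the splitting code; here is a minimal sketch of how such a split could be produced, assuming the column layout (index, label, text) that the reading code below relies on. The 80/10/10 ratio is an arbitrary choice:
import pandas as pd

df = pd.read_csv("./data/ch_auto.csv", usecols=[1, 2], encoding="utf-8")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle
nTrain = int(len(df) * 0.8)
nDev = int(len(df) * 0.1)
parts = {"train": df[:nTrain], "dev": df[nTrain:nTrain + nDev], "test": df[nTrain + nDev:]}
for name, part in parts.items():
    # the index column written by to_csv provides the first of the three
    # tab-separated fields; the header line is skipped by ReadTSV below
    part.to_csv("./data/{}.tsv".format(name), sep="\t", encoding="utf-8")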
Import the libraries:
import pandas as pd
import jieba
import gensim
import numpy as np
import os
import torch
Implement the dataset conversion:
'''
Container for the data format used in training and testing later on
'''
class Batch(object):
def __init__(self,label,text):
self.text =torch.LongTensor(text)
self.label=torch.LongTensor([int(x) for x in label])
class DataSet(object):
def __init__(self):
        self.UNK, self.PAD = '<UNK>', '<PAD>'  # tokens for unknown words and for padding
    '''
    Turn the csv file into [ [doc1], [doc2], ... ]
    '''
def ReadCSV(self,path):
csvFile= pd.read_csv(path,usecols=[1,2],encoding="utf-8")
labelList = csvFile.values[:,0].tolist()
valueList = csvFile.values[:,1].tolist()
print( labelList[0],valueList[0])
print(len(valueList))
return labelList,valueList
    '''
    Turn a tsv file into [ ["我","爱","小喵咪"], ["我","吃","西瓜"], ... ]
    '''
    def ReadTSV(self,path):
        label=[]
        text=[]
        with open(path,"r",encoding="utf-8") as fhandle:
            fhandle.readline()  # skip the header line
            for line in fhandle:
                lines = line.split("\t")
                if len(lines) != 3:  # expect index, label, text
                    continue
                label.append(lines[1])
                text.append([word for word in jieba.cut(lines[2])])
        return label,text
    '''
    Split the data [ [...], [...] ] into Batch objects [ batch1, batch2, ... ]
    '''
    def SpliteData(self,text,label,batchSize):
        spliteText=[]
        # advance in strides of batchSize; the original indexing
        # label[i:i+batchSize] with i = 0,1,2,... produced heavily
        # overlapping batches instead of disjoint ones
        for i in range(0, len(text) - batchSize + 1, batchSize):
            spliteText.append(Batch(label[i:i+batchSize], text[i:i+batchSize]))
        return spliteText
    '''
    Parse the tsv files, turn every document into an equal-length index
    list [ [1, 2, 5], [1, 2, 4], ... ], and split the result into
    batches [ batch1, batch2, ... ]
    '''
def BuidBatch(self,batchSize,vocabDict,sentenSize=32,
testPath="./data/test.tsv",valPath="./data/dev.tsv",trainPath="./data/train.tsv"):
testLabel,testText = self.ReadTSV(testPath)
valLabel,valText = self.ReadTSV(valPath)
trainLabel,trainText = self.ReadTSV(trainPath)
testText = self.Doc2Embedding(testText,vocabDict,sentenSize)
valText = self.Doc2Embedding(valText,vocabDict,sentenSize)
trainText = self.Doc2Embedding(trainText, vocabDict, sentenSize)
testIter = self.SpliteData(label=testLabel,text= testText,batchSize=batchSize)
valIter= self.SpliteData(label=valLabel,text= valText,batchSize=batchSize)
trainIter = self.SpliteData(label=trainLabel, text=trainText, batchSize=batchSize)
return testIter,valIter,trainIter
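    # NOTE: Doc2Embedding is called above but was missing from the original
    # listing. This is a minimal sketch of what it has to do: map every word
    # to its index in vocabDict (unknown words fall back to <UNK>), then pad
    # or truncate each sentence to sentenSize with the <PAD> index so that
    # all rows end up the same length.
    def Doc2Embedding(self, texts, vocabDict, sentenSize):
        result = []
        for words in texts:
            ids = [vocabDict.get(word, vocabDict[self.UNK]) for word in words]
            # truncate, then pad on the right up to sentenSize
            ids = ids[:sentenSize] + [vocabDict[self.PAD]] * (sentenSize - len(ids))
            result.append(ids)
        return result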
    '''
    splitTexts = [ ["xx","xxx"], ["xx","xxx"] ]
    returns vocabDict = { '空间': index1, '很': index2, ... }
    '''
    def BuildVocabDict(self,splitTexts,minFreq=1):
        vocabDict={}
        for sentence in splitTexts:
            for word in sentence:
                vocabDict[word] = vocabDict.get(word,0)+1
        # keep words above the frequency threshold, most frequent first
        vocabList = sorted([item for item in vocabDict.items() if item[1] >= minFreq], key=lambda x: x[1], reverse=True)
        vocabDict = {wordCount[0]: idx for idx, wordCount in enumerate(vocabList)}
        vocabDict.update({self.UNK: len(vocabDict), self.PAD: len(vocabDict) + 1})
        print("vocabDict size",len(vocabDict))
        return vocabDict
    # Train the word vectors with gensim and save them to disk
    def Word2Vect(self, documents,vocabDict,embSize=300,prePath="./data/"):
        # gensim < 4.0 API; on gensim >= 4.0 the size parameter is called vector_size
        model = gensim.models.Word2Vec(documents, sg=1, size=embSize, window=5, min_count=1,
                                       negative=3, sample=0.001, hs=1, workers=4)
        wordEmbding=[]
        # stack the vectors in vocabulary order so that row i belongs to index i
        for k,v in vocabDict.items():
            if k in model.wv:
                wordEmbding.append(model.wv.get_vector(k))
            else:
                # words missing from the model (e.g. <UNK>, <PAD>) get random vectors
                wordEmbding.append(np.random.uniform(0, 1, embSize))
                print("not contain key==",k)
        fileName = "{}word{}.npz".format(prePath,embSize)
        # np.savez creates (or overwrites) the file itself
        np.savez(fileName,vocabDict=vocabDict,wordEmbding=wordEmbding)
        return model
    # Load the word vectors from disk
def LoadWordEmbding(self,path="./data/word300.npz"):
savNpz= np.load(path,allow_pickle=True)
embedding_pretrained = savNpz["wordEmbding"].astype('float32')
vocabDict = savNpz["vocabDict"].item()
print("embding===",len(embedding_pretrained))
print("embding dict=",vocabDict)
return vocabDict, embedding_pretrained
    # Build the word vectors if they have not been built yet
    def BuildWordVect(self,embSize):
        # the file name must match what Word2Vect writes for this embSize
        if not os.path.exists("./data/word{}.npz".format(embSize)):
            label, text = self.ReadCSV("./data/ch_auto.csv")
            splitTexts = [[word for word in jieba.cut(sentence)] for sentence in text]
            vocabDict = self.BuildVocabDict(splitTexts)
            self.Word2Vect(splitTexts, vocabDict, embSize=embSize)
Configure the parameters
class RNNConfig():
def __init__(self,vocabSize,outputSize=2,batchSize=50,embedDimention=300,
hiddenSize=64,hiddenLayer=3,dropKeep=0.1,bidirectional=True,
lr=0.001,cuda=False,saveDir="./data/snap/",
logInteval=5,epochs= 3 ,evalInteval=-1,preTrain=True,embdingVect=None
):
        self.vocabSize= vocabSize           # total number of words in the vocabulary
        self.batchSize = batchSize          # how many samples are fed in at a time
        self.embedDimention= embedDimention # word-vector dimensionality
        self.hiddenSize = hiddenSize        # LSTM hidden-layer size
        self.hiddenLayer= hiddenLayer       # number of stacked LSTM layers
        self.dropKeep= dropKeep             # dropout probability
        self.bidirectional= bidirectional   # whether the LSTM is bidirectional
        self.outputSize=outputSize          # number of output classes
        self.lr= lr                         # learning rate
        self.cuda= cuda                     # whether to use the GPU
        self.saveDir = saveDir              # where to save snapshots
        self.logInteval =logInteval         # log every this many steps
        self.epochs= epochs                 # number of training epochs
        self.evalInteval = evalInteval      # evaluate (and snapshot) every this many steps; -1 disables
        self.preTrain= preTrain             # whether to use pretrained embeddings
        self.embdingVect = embdingVect      # the pretrained embedding matrix
TextRNN Implementation
The model code needs two more imports on top of the ones above:
from torch import nn
import torch.nn.functional as F
class TextRNN(nn.Module):
def __init__(self, config):
super(TextRNN, self).__init__()
self.config = config
        if config.preTrain:
            # initialize the embedding layer from the pretrained matrix;
            # freeze=False lets the vectors keep training with the model
            self.embeddings = nn.Embedding.from_pretrained(config.embdingVect,freeze=False)
            print("using pretrained embeddings")
        else:
            # embedding layer with random initialization
            self.embeddings = nn.Embedding(self.config.vocabSize, self.config.embedDimention)
        # LSTM layer
        '''
        input_size:  number of input features per time step
        hidden_size: number of features in the hidden state
        num_layers:  how many LSTM layers are stacked on top of each other (default 1)
        batch_first: if True the input size is [batch_size, time_step, input_size]
                     rather than [time_step, batch_size, input_size]
        '''
self.lstm = nn.LSTM(input_size=self.config.embedDimention,
hidden_size=self.config.hiddenSize,
num_layers=self.config.hiddenLayer,
dropout=self.config.dropKeep,
bidirectional=self.config.bidirectional,
batch_first=True
)
        # dropout
        self.dropout = nn.Dropout(self.config.dropKeep)
        # a bidirectional LSTM concatenates forward and backward states,
        # doubling the feature size seen by the classifier
        outSize= self.config.hiddenSize * ( 2 if self.config.bidirectional else 1 )
        print("outSize=",outSize)
        # fully connected layer: maps the final hidden state to the classes
        self.fc = nn.Linear(
            outSize,
            self.config.outputSize
        )
        # softmax layer (unused in training: CrossEntropyLoss expects raw logits)
        self.softmax = nn.Softmax(dim=1)
self.optimizer= torch.optim.Adam(self.parameters(),config.lr)
self.lossFunc = nn.CrossEntropyLoss()
    def RunModel(self,x):
        # x.shape = (batch_size, max_sen_len) because batch_first=True
        print("x:",x.size(),x[0])
        embedded_sent = self.embeddings(x)  # (batch_size, max_sen_len, embed_size)
        embedded_sent = self.dropout(embedded_sent)
        print("embedded_sent==",embedded_sent.size())
        # LSTM; h_n is (num_layers * num_directions, batch_size, hidden_size)
        lstm_out, (h_n, c_n) = self.lstm(embedded_sent,None)
        # dropout
        final_feature_map = self.dropout(h_n)
        print("final_feature_map:",final_feature_map.size())
        # concatenate the forward ([-2]) and backward ([-1]) states of the
        # top layer; this assumes bidirectional=True
        final_feature_map = torch.cat((final_feature_map[-1, :, :] ,final_feature_map[-2, :, :]), dim=1)
        print("final_feature_map22:", final_feature_map.size())
        # fully connected layer
        final_out = self.fc(final_feature_map)
        # no softmax here: nn.CrossEntropyLoss applies log-softmax internally
        return final_out  # raw logits
def forward(self, x):
return self.RunModel(x)
    def Refrush(self,predictY,targetY):
        # one optimization step: zero the gradients, compute the loss,
        # backpropagate, then update the weights
        self.optimizer.zero_grad()
        loss= self.lossFunc(predictY,targetY)
        loss.backward()
        self.optimizer.step()
        print("loss:",loss.data.item())
        return loss
def ShowRate(self,prdictY,targetY):
result = torch.argmax(prdictY,dim=1)
print("rate==",prdictY[0:5],result[0:5],targetY[0:5])
corrects = (result == targetY).sum().item()
accuracy = corrects / self.config.batchSize
print ("correct:",corrects,"acc:",accuracy)
def SaveMode(self,saveDir,step):
if not os.path.exists(saveDir):
os.mkdir(saveDir)
savePath = "{}Steps_{}.pt".format(saveDir,step)
torch.save(self.state_dict(),savePath)
def RunTrain(self,trainIter,evalIter):
step =0
bestAcc = 0
self.train()
for epoch in range (1,self.config.epochs+1):
for batch in trainIter:
feature, target = batch.text, batch.label
if self.config.cuda:
feature,target = feature.cuda(),target.cuda()
predictY = self.RunModel(feature)
print("predict Y:",predictY.size(),target.size())
loss =self.Refrush(predictY,target)
                if loss.data.item() < 0.0001:
                    break  # loss has converged; stop this epoch early
step += 1
if step % self.config.logInteval ==0:
self.ShowRate(predictY, target)
if self.config.evalInteval >0 and step % self.config.evalInteval ==0 :
devAcc= self.Eval(evalIter)
if devAcc > bestAcc:
bestAcc = devAcc
#self.SaveMode(self.config.saveDir,step)
self.train()
def Eval(self,dataIter):
self.eval()
avgLoss =0.0
corrects=0.0
accuracy=0.0
for batch in dataIter:
feature, target = batch.text, batch.label
if self.config.cuda:
feature, target = feature.cuda(), target.cuda()
predictY = self.RunModel(feature)
loss = F.cross_entropy(predictY,target)
avgLoss += loss.item()
result = torch.argmax(predictY, dim=1)
print("rate==", predictY[0:5], result[0:5], target[0:5])
correct = (result == target).sum().item()
acc= correct / self.config.batchSize
accuracy += acc
print("correct:", correct, "acc:", acc)
size =len(dataIter)
avgLoss /= size
accuracy = accuracy /size
print("eval loss:{} acc:{}".format(avgLoss,accuracy))
return accuracy
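SaveMode above stores only the state_dict, so restoring a snapshot later means rebuilding the model from the same config and loading the weights back in. A minimal sketch (the file name Steps_100.pt is only an example):
model = TextRNN(config)
model.load_state_dict(torch.load("./data/snap/Steps_100.pt"))
model.eval()  # switch dropout off before running inference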
Training and Testing with TextRNN
# number of documents fed in at a time
BATCH_SIZE = 50
# unified length of every sentence
SENTENCE_LENGTH = 32
# word-vector dimensionality
EMBEDDING_DIM = 300
if __name__ == '__main__':
    dataSet = DataSet()
    ## train the word vectors
    dataSet.BuildWordVect(EMBEDDING_DIM)
    ## load the word vectors
    vocabDict,wordVect= dataSet.LoadWordEmbding()
    ## prepare the data; note that BuidBatch returns the iterators in
    ## (test, val, train) order, which the original code unpacked backwards
    testIter, valIter, trainIter =dataSet.BuidBatch(BATCH_SIZE,vocabDict,sentenSize=SENTENCE_LENGTH)
    ## configuration
    config = RNNConfig(len(vocabDict),embedDimention=EMBEDDING_DIM,batchSize=BATCH_SIZE,
                       preTrain=True,embdingVect= torch.tensor(wordVect)
                       )
    ## initialize the RNN
    myRNN= TextRNN(config)
    ## start training
    myRNN.RunTrain(trainIter,valIter)
    ## run the test set
    myRNN.Eval(testIter)
Dataset download:
Download the data, create a data folder, and put the files in it.
Link: https://pan.baidu.com/s/1u6xhcLqI6NWZU1Qh8h7imA
Extraction code: 6w30