模型介绍
CNN(Convolutional Neural Network)即卷积神经网络,本质上,CNN就是一个多层感知机,只不过采用了局部连接和共享权值的方式减少了参数的数量,使得模型更易于训练并减轻过拟合。在文本分类中,参考论文Convolutional Neural Networks for Sentence Classification https://arxiv.org/abs/1408.5882中的模型,其结构如下
CNN文本分类模型结构图
首先对每个嵌入的单词 $x_i$,使用第 $j$ 个大小为 $h$ 的卷积核(滤波器)进行特征提取
其中 $f$ 为激活函数,由此可得到第 $j$ 个特征向量的第 $i$ 维数据 $c_i^{(j)} = f(w_j \cdot x_{i:i+h-1} + b_j)$,随后对每一个特征向量取最大值(max-over-time 池化)$\hat{c}^{(j)} = \max_i c_i^{(j)}$
最后将这些值通过全连接层和Softmax层映射到输出空间
对于单词的嵌入向量,有四种处理方法
1. 使用随机嵌入并在训练时进行更新;
2. 使用已有的嵌入向量,在训练时不作为参数更新;
3. 使用已有的嵌入向量,在训练时作为参数更新;
4. 结合2和3,将单词嵌入到两个通道的嵌入向量中,其中一个嵌入向量为固有属性,另一个嵌入向量作为参数进行更新。
在实验中将对每一种方法进行比较。
数据集介绍
我们使用Classify the sentiment of sentences from the Rotten Tomatoes dataset数据集,该数据集为烂番茄影评数据,主要包括用户评论和情感指标,它将一个句子拆分为多个短语,对每个句子本身和多个短语打上了情感分析标签,其数值意义如下
0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive
我们使用该数据集来进行实验
以下是加载该数据集的代码
# Load the dataset and build the vocabulary dictionaries.
def getWordDict(filename="./data/train.tsv"):
    """Load the Rotten Tomatoes TSV file and build word/index mappings.

    :param filename: path to a tab-separated file with 'Phrase' and
        'Sentiment' columns.
    :return: (word2num, num2word, dataX, dataY) where dataX is a
        (num_samples, max_length) matrix of word indices (0 = padding)
        and dataY is a (num_samples, 1) array of sentiment labels.
    """
    data = pd.read_csv(filename, sep='\t')
    word2num = dict()  # word -> index (real words start at 1; 0 is padding)
    num2word = dict()  # index -> word
    maxLength = 0
    for i in range(len(data)):
        # split once per phrase instead of three times
        tokens = data['Phrase'][i].lower().split()
        # track the longest phrase so every sample can be padded to it
        if len(tokens) > maxLength:
            maxLength = len(tokens)
        for tok in tokens:
            if word2num.get(tok) is None:
                word2num[tok] = len(word2num) + 1
                num2word[len(word2num)] = tok
    # Words unseen at training time (mostly names/places) are mapped to <UNK>.
    word2num['<UNK>'] = len(word2num) + 1
    num2word[len(word2num)] = '<UNK>'
    # The model needs fixed-length input: short sentences are padded, and
    # the padding token <SPACE> always has index 0.
    word2num['<SPACE>'] = 0
    num2word[0] = '<SPACE>'
    # Sentence matrix: one row per sample, entries are word indices.
    dataX = np.zeros((len(data), maxLength), dtype='float')
    for i in range(len(data)):
        phrase = data['Phrase'][i].lower().split()
        for j in range(len(phrase)):
            # unknown words fall back to <UNK>
            if word2num.get(phrase[j]) is None:
                phrase[j] = '<UNK>'
            dataX[i, j] = word2num[phrase[j]]
    # Labels as a (num_samples, 1) column vector.
    dataY = np.array(data['Sentiment'], dtype='int')
    dataY.resize((dataY.size, 1))
    return word2num, num2word, dataX, dataY
代码介绍
首先我们实现随机嵌入的方式,模型初始化参数有以下六个:
vocab_size:词汇数量
embedding_dim:词嵌入维度
hidden_dim:隐藏层维度,也就是CNN网络层卷积核的个数
kernel_size:卷积核大小
output_size:输出维度
maxlength:句子的最大长度
class cnnmodel_rand(nn.Module):
    """CNN-rand text classifier: word embeddings are randomly initialised
    and learned during training.

    :param vocab_size: vocabulary size
    :param embedding_dim: word-embedding dimension
    :param hidden_dim: number of convolution filters
    :param kernel_size: convolution window size (words per window)
    :param output_size: number of output classes
    :param maxlength: maximum sentence length (in words)
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, kernel_size, output_size, maxlength):
        super().__init__()
        # index 0 is the padding token and stays a zero vector
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.cnn = nn.Conv1d(embedding_dim, hidden_dim, kernel_size)
        # max-over-time pooling: keep one value per filter
        self.maxpool = nn.MaxPool1d(maxlength - kernel_size + 1)
        # logits are returned directly; CrossEntropyLoss applies the softmax
        self.dense = nn.Sequential(nn.Dropout(0.3), nn.Linear(hidden_dim, output_size))

    def forward(self, x):
        # x: (batch, maxlength) word indices
        embed_x = self.embed(x)                    # (batch, len, emb)
        cnn_x = self.cnn(embed_x.transpose(1, 2))  # (batch, hidden, len-k+1)
        pool_x = self.maxpool(cnn_x)               # (batch, hidden, 1)
        out = self.dense(pool_x.squeeze(-1))       # (batch, output_size)
        return out
第二种嵌入方式是使用已有的嵌入向量,这里我使用glove预训练的embedding向量来处理https://nlp.stanford.edu/data/glove.twitter.27B.zip
下载好数据后需要加载这一个嵌入向量,因为glove预训练的单词有很多,大部分是我们用不上的,所以我们需要根据前面在数据集中加载的word2num来对这些文件进行处理
def getGloveEmb(word2num, filename='./word2vec/glove25d.txt', dimSize=25):
    """Build an embedding matrix from a GloVe text file, keeping only the
    words present in our vocabulary.

    :param word2num: word -> index mapping produced by getWordDict
    :param filename: path to the GloVe vectors (one word + floats per line)
    :param dimSize: dimensionality of the GloVe vectors
    :return: (len(word2num) + 2, dimSize) tensor; rows for words missing
        from GloVe (and the padding row 0) remain zero.
    """
    # use a context manager so the file is always closed
    with open(filename, 'r', encoding='utf-8') as f:
        wordList = f.readlines()
    wordDict = {line.split()[0]: torch.tensor([float(v) for v in line.split()[1:]])
                for line in wordList}
    emb = torch.zeros((len(word2num) + 2, dimSize))
    for word, vec in wordDict.items():
        # copy only the vectors for words in our vocabulary; the rest of
        # the (much larger) GloVe vocabulary is skipped
        if word in word2num:
            emb[word2num[word], :] = vec
    return emb
得到embedding矩阵后,将其传入模型中,设其为embedding层的参数,并禁止其更新即可
# CNN-static: pretrained embedding matrix `emb`, frozen during training.
class cnnmodel_static(nn.Module):
    """Text-CNN whose embedding layer is initialised from a pretrained
    matrix and kept fixed (never updated by the optimizer).

    :param vocab_size: vocabulary size
    :param embedding_dim: word-embedding dimension
    :param hidden_dim: number of convolution filters
    :param kernel_size: convolution window size
    :param output_size: number of output classes
    :param maxlength: maximum sentence length
    :param emb: pretrained embedding matrix (rows indexed by word id)
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, kernel_size, output_size, maxlength, emb):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # install the pretrained vectors ...
        self.embed.weight = torch.nn.Parameter(emb)
        # ... and freeze them so gradients never update the embeddings
        self.embed.weight.requires_grad = False
        self.cnn = nn.Conv1d(embedding_dim, hidden_dim, kernel_size)
        # max-over-time pooling: keep one value per filter
        self.maxpool = nn.MaxPool1d(maxlength - kernel_size + 1)
        self.dense = nn.Sequential(nn.Dropout(0.3), nn.Linear(hidden_dim, output_size))

    def forward(self, x):
        # x: (batch, maxlength) word indices
        embed_x = self.embed(x)
        cnn_x = self.cnn(embed_x.transpose(1, 2))
        pool_x = self.maxpool(cnn_x)
        out = self.dense(pool_x.squeeze(-1))
        return out
对于第三种传入预训练向量并允许更新就只需要在如上模型基础上允许embedding层更新即可
# CNN-non-static: pretrained embedding matrix `emb`, fine-tuned during training.
class cnnmodel_nonstatic(nn.Module):
    """Text-CNN whose embedding layer is initialised from a pretrained
    matrix and then fine-tuned along with the rest of the model.

    :param vocab_size: vocabulary size
    :param embedding_dim: word-embedding dimension
    :param hidden_dim: number of convolution filters
    :param kernel_size: convolution window size
    :param output_size: number of output classes
    :param maxlength: maximum sentence length
    :param emb: pretrained embedding matrix (rows indexed by word id)
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, kernel_size, output_size, maxlength, emb):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # a fresh Parameter is trainable by default (requires_grad=True),
        # so the pretrained vectors will be updated during training
        self.embed.weight = torch.nn.Parameter(emb)
        self.cnn = nn.Conv1d(embedding_dim, hidden_dim, kernel_size)
        # max-over-time pooling: keep one value per filter
        self.maxpool = nn.MaxPool1d(maxlength - kernel_size + 1)
        self.dense = nn.Sequential(nn.Dropout(0.5), nn.Linear(hidden_dim, output_size))

    def forward(self, x):
        # x: (batch, maxlength) word indices
        embed_x = self.embed(x)
        cnn_x = self.cnn(embed_x.transpose(1, 2))
        pool_x = self.maxpool(cnn_x)
        out = self.dense(pool_x.squeeze(-1))
        return out
第四种也比较简单,设置两个不同的embeding层,并将他们的输出进行拼接即可,代码如下
# CNN-multichannel: two embedding channels — one frozen (pretrained `emb`),
# one trainable (randomly initialised).
class cnnmodel_multichannel(nn.Module):
    """Text-CNN with a dual-channel embedding: a frozen pretrained channel
    plus a trainable channel, both processed by the same convolution.

    :param vocab_size: vocabulary size
    :param embedding_dim: word-embedding dimension
    :param hidden_dim: number of convolution filters
    :param kernel_size: convolution window size
    :param output_size: number of output classes
    :param maxlength: maximum sentence length
    :param emb: pretrained embedding matrix (rows indexed by word id)
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, kernel_size, output_size, maxlength, emb):
        super().__init__()
        # frozen pretrained channel
        self.embedStatic = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedStatic.weight = torch.nn.Parameter(emb)
        self.embedStatic.weight.requires_grad = False
        # trainable channel, randomly initialised
        self.embedNonStatic = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.cnn = nn.Conv1d(embedding_dim, hidden_dim, kernel_size)
        # pooling window covers BOTH channels' feature maps: taking the max
        # over the concatenated (2x length) map equals the max across channels
        self.maxpool = nn.MaxPool1d(2 * (maxlength - kernel_size + 1))
        self.dense = nn.Sequential(nn.Dropout(0.3), nn.Linear(hidden_dim, output_size))

    def forward(self, x):
        embed_x_static = self.embedStatic(x)
        embed_x_nonStatic = self.embedNonStatic(x)
        # the same convolution filters are applied to both channels
        cnn_x_static = self.cnn(embed_x_static.transpose(1, 2))
        cnn_x_nonstatic = self.cnn(embed_x_nonStatic.transpose(1, 2))
        # concatenate along the time axis, then max-pool over everything
        cnn_x = torch.cat((cnn_x_static, cnn_x_nonstatic), 2)
        pool_x = self.maxpool(cnn_x)
        out = self.dense(pool_x.squeeze(-1))
        return out
在训练时,我们使用Adam优化器,学习率为0.001,损失函数使用交叉熵函数,以下是训练过程的代码:
def split_data(X, Y, split_ratio=None, randseed=None):
    """Randomly split a dataset into train/validation/test partitions.

    :param X: (num, sizeX) sample matrix
    :param Y: (num, 1) label matrix
    :param split_ratio: list of three ratios; defaults to [0.6, 0.2, 0.2]
    :param randseed: optional seed for a reproducible shuffle
    :return: trainX, trainY, valX, valY, testX, testY
    """
    if randseed is not None:
        random.seed(randseed)
    if split_ratio is None:
        split_ratio = [0.6, 0.2, 0.2]
    num = X.shape[0]
    train_num = int(num * split_ratio[0])
    val_num = int(num * split_ratio[1])
    # random permutation of the row indices
    randList = random.sample(range(num), num)
    # NOTE: fancy indexing copies the rows and preserves each array's own
    # dtype — the original preallocation used X's dtype for the Y splits,
    # silently coercing the labels.
    train_idx = randList[:train_num]
    val_idx = randList[train_num:train_num + val_num]
    test_idx = randList[train_num + val_num:]
    trainX = X[train_idx, :]
    trainY = Y[train_idx, :]
    valX = X[val_idx, :]
    valY = Y[val_idx, :]
    testX = X[test_idx, :]
    testY = Y[test_idx, :]
    return trainX, trainY, valX, valY, testX, testY
if __name__ == '__main__':
    # --- data preparation -------------------------------------------------
    word2num, num2word, dataX, dataY = getWordDict()
    EPOCH = 30  # maximum number of training epochs
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # split into train/val/test (seeded for reproducibility)
    trainX, trainY, valX, valY, testX, testY = split_data(dataX, dataY, randseed=10)
    trainX = torch.tensor(trainX)
    trainY = torch.tensor(trainY)
    valX = torch.tensor(valX)
    valY = torch.tensor(valY)
    testX = torch.tensor(testX)
    testY = torch.tensor(testY)
    # wrap the training set in a DataLoader for mini-batch training
    dataset = data.TensorDataset(trainX, trainY)
    loader = data.DataLoader(dataset=dataset, batch_size=64, shuffle=True)
    # pretrained GloVe embeddings restricted to our vocabulary
    emb = getGloveEmb(word2num, filename='./word2vec/{}{}d.txt'.format('glove', 25), dimSize=25)
    # pick one of the four embedding strategies (embedding_dim = 25)
    # model = cnnmodel_rand(len(word2num), 25, 64, 5, 5, dataX.shape[1]).to(device)
    # model = cnnmodel_static(len(word2num), 25, 64, 5, 5, dataX.shape[1], emb).to(device)
    model = cnnmodel_nonstatic(len(word2num), 25, 64, 5, 5, dataX.shape[1], emb).to(device)
    # model = cnnmodel_multichannel(len(word2num), 25, 64, 5, 5, dataX.shape[1], emb).to(device)
    # optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_func = nn.CrossEntropyLoss()
    best_loss = float('inf')
    best_model = None
    k = 0  # epochs since the last validation improvement
    for epoch in range(EPOCH):
        sumLoss = []
        s = time.time()
        model.train()
        for step, (batch_x, batch_y) in enumerate(loader):
            out = model(batch_x.long().to(device))
            loss = loss_func(out, batch_y.long().to(device).squeeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sumLoss.append(loss.detach())
        averLoss = sum(sumLoss) / len(sumLoss)
        # validation pass: no gradients needed, so skip graph construction
        model.eval()
        with torch.no_grad():
            out = model(valX.long().to(device))
            loss = loss_func(out, valY.long().to(device).squeeze(1))
        valLoss = loss.detach()
        e = time.time()
        print(">>epoch{}: trainLoss:{:.3f} valLoss:{:.3f} time:{:.3f}s".format(epoch + 1, averLoss, valLoss, e - s))
        # keep a copy of the model that does best on the validation set
        if valLoss < best_loss:
            best_model = copy.deepcopy(model)
            best_loss = valLoss
            k = 0
        else:
            k += 1
        # early stopping to limit overfitting
        if k >= 5:
            print("提前终止")
            break
    model = copy.deepcopy(best_model)
    # --- evaluation on the held-out test set ------------------------------
    model.eval()
    device = torch.device('cpu')
    model.to(device)
    with torch.no_grad():
        out = model(testX.long().to(device))
    # predicted class = argmax over the logits
    pred = torch.argmax(out, 1).to(device)
    # fraction of correct predictions (accuracy)
    precision = torch.sum(pred == testY.long().squeeze(1).to(device)) / pred.shape[0]
    print("测试集准确率:{}".format(precision))
运行效果如下:
这个项目的代码可以在下方链接下载