0.前言
参考:这位博主 。自己再写博客是为了方便再回顾
1.电影评论数据集
数据集下载:链接:https://pan.baidu.com/s/1zultY2ODRFaW3XiQFS-36w
提取码:mgh2 压缩包里有四个文件,将解压好的文件夹放在项目目录里即可
训练数据集过大所以我使用的是test数据集进行训练
2.加载数据
import pandas as pd
# 加载数据
train_data = pd.read_csv('./Dataset/test.txt', names=['label', 'review'], sep='\t')
train_labels = train_data['label']
train_reviews = train_data['review']
训练数据共有369条
comments_len = train_data.iloc[:, 1].apply(lambda x: len(x.split(' ')))
print(comments_len)
train_data['comments_len'] = comments_len
print(train_data['comments_len'].describe(percentiles=[.5, .95]))
train_data.iloc[:, 1].apply(lambda x: len(x.split(’ ')))的意思是对train_data第2列(编号为1即review)的数据返回每一条数据词的数量
可以看出95%的评论词的个数都在63以内,那么我们取每条评论词的数量最多为max_sent=63,超过了就切掉后面的词,否则就补充
3.数据预处理
from collections import Counter
def text_process(review):
"""
数据预处理
:param review: 评论数据train_reviews
:return: 词汇表words、词-id的字典word2id、id-词的字典id2word、pad_sentencesid
"""
words = []
for i in range(len(review)):
words += review[i].split(' ')
# 选出频率较高的词,存放到word_freq.txt中
with open('./Dataset/word_freq.txt', 'w', encoding='utf-8') as f:
# Counter(words).most_common() 找出出现频率最高的词(不加参数则返回所有词及其频率)
for word, freq in Counter(words).most_common():
if freq > 1:
f.write(word+'\n')
# 取出出数据
with open('./Dataset/word_freq.txt', encoding='utf-8') as f:
words = [i.strip() for i in f]
# 去重(词汇表)
words = list(set(words))
# 词-id的字典word2id
word2id = {j: i for i, j in enumerate(words)}
# id-词的字典id2word
id2word = {i: j for i, j in enumerate(words)}
pad_id = word2id['把'] # 中性词的id 用于填充
sentences = [i.split(' ') for i in review]
# 填充后的所有句子 每个词用id表示
pad_sentencesid = []
for i in sentences:
# 如果词汇表中没有这个词 用pad_id替代 如果有这个词返回这个词对应的id
temp = [word2id.get(j, pad_id) for j in i]
# 如果句子词的数量大于max_sent,则截断后面的
if len(i) > max_sent:
temp = temp[:max_sent]
else: # 如果句子词的数量小于max_sent,则用pad_id进行填充
for j in range(max_sent - len(i)):
temp.append(pad_id)
pad_sentencesid.append(temp)
return words, word2id, id2word, pad_sentencesid
首先将所有评论的词放到words中,选出出现频率freq>1的词,存放到word_freq.txt再取出放到words中,freq可以自己控制大于多少,当然也可以不要这一步,这步操作在数据量大的时候会比较高效,筛选后基本保留了情感分类的关键词
我们把那些出现频率freq<=1的词都变成"把“字,变成什么无所谓,只要不会影响我们的分类结果就行,这个字是个中性词,所以不会对情感分类有什么影响。
在对每个句子进行处理,如果句子词的数量大于max_sent,则截断后面的词,否则用中性词的pad_id进行填充
4.得到数据
import torch
import torch.utils.data as Data
import numpy as np
from gensim.models import keyedvectors
# hyper parameter
Batch_Size = 32
Embedding_Size = 50 # 词向量维度
Filter_Num = 10 # 卷积核个数
Dropout = 0.5
Epochs = 60
LR = 0.01
# 加载词向量模型Word2vec
w2v = keyedvectors.load_word2vec_format('./Dataset/wiki_word2vec_50.bin', binary=True)
def get_data(labels, reviews):
words, word2id, id2word, pad_sentencesid = text_process(reviews)
x = torch.from_numpy(np.array(pad_sentencesid)) # [369, 63]
y = torch.from_numpy(np.array(labels)) # [369]
dataset = Data.TensorDataset(x, y)
data_loader = Data.DataLoader(dataset=dataset, batch_size=Batch_Size)
# 遍历词汇表中所有的词 如果w2v中有该词的向量表示则不作操作,否则随机生成向量放到w2v中
for i in range(len(words)):
try:
w2v[words[i]] = w2v[words[i]]
except Exception:
w2v[words[i]] = np.random.randn(50, )
return data_loader, id2word
重点:评论中的词在w2v中可能没有,得进行随机分配向量,否则会报错
5.将词转为词向量
def word2vec(x): # [batch_size, 63]
"""
将句子中的所有词转为词向量
:param x: batch_size个句子
:return: batch_size个句子的词向量
"""
batch_size = x.shape[0]
x_embedding = np.ones((batch_size, x.shape[1], Embedding_Size)) # [batch_size, 63, 50]
for i in range(len(x)):
# item() 将tensor类型转为number类型
x_embedding[i] = w2v[[id2word[j.item()] for j in x[i]]]
return torch.tensor(x_embedding).to(torch.float32)
这里将所有句子中的词都转为词向量
6.模型
import torch.nn as nn
class TextCNN(nn.Module):
def __init__(self):
super(TextCNN, self).__init__()
self.conv = nn.Sequential(
nn.Conv1d(1, Filter_Num, (2, Embedding_Size)),
nn.ReLU(),
nn.MaxPool2d((max_sent-1, 1))
)
self.dropout = nn.Dropout(Dropout)
self.fc = nn.Linear(Filter_Num, 2)
self.softmax = nn.Softmax(dim=1) # 行
def forward(self, X): # [batch_size, 63]
batch_size = X.shape[0]
X = word2vec(X) # [batch_size, 63, 50]
X = X.unsqueeze(1) # [batch_size, 1, 63, 50]
X = self.conv(X) # [batch_size, 10, 1, 1]
X = X.view(batch_size, -1) # [batch_size, 10]
X = self.fc(X) # [batch_size, 2]
X = self.softmax(X) # [batch_size, 2]
return X
TextCNN网络模型
7.训练
if __name__ == '__main__':
data_loader, id2word = get_data(train_labels, train_reviews)
text_cnn = TextCNN()
optimizer = torch.optim.Adam(text_cnn.parameters(), lr=LR)
loss_fuc = nn.CrossEntropyLoss()
print("+++++++++++start train+++++++++++")
for epoch in range(Epochs):
for step, (batch_x, batch_y) in enumerate(data_loader):
# 前向传播
predicted = text_cnn.forward(batch_x)
loss = loss_fuc(predicted, batch_y)
# 反向传播
optimizer.zero_grad()
loss.backward()
optimizer.step()
# 计算accuracy
# dim=0表示取每列的最大值,dim=1表示取每行的最大值
# torch.max()[0]表示返回最大值 torch.max()[1]表示返回最大值的索引
predicted = torch.max(predicted, dim=1)[1].numpy()
label = batch_y.numpy()
accuracy = sum(predicted == label) / label.size
if step % 30 == 0:
print('epoch:', epoch, ' | train loss:%.4f' % loss.item(), ' | test accuracy:', accuracy)
8.所有代码
#encoding:utf-8
#作者:codewen
import pandas as pd
from collections import Counter
import torch
import torch.nn as nn
import torch.utils.data as Data
import numpy as np
from gensim.models import keyedvectors
# 加载数据
train_data = pd.read_csv('./Dataset/test.txt', names=['label', 'review'], sep='\t')
train_labels = train_data['label']
train_reviews = train_data['review']
# 95%的评论 词的数量在62以内
comments_len = train_data.iloc[:, 1].apply(lambda x: len(x.split(' ')))
print(comments_len)
train_data['comments_len'] = comments_len
print(train_data['comments_len'].describe(percentiles=[.5, .95]))
# 对数据进行处理 如果句子长度大于max_sent则截断,否则用'空'填充
max_sent = 63
def text_process(reviews):
"""
数据预处理
:param review: 评论数据train_reviews
:return: 词汇表words、词-id的字典word2id、id-词的字典id2word、pad_sentencesid
"""
words = []
for i in range(len(reviews)):
words += reviews[i].split(' ')
# 选出频率较高的词,存放到word_freq.txt中
with open('./Dataset/word_freq.txt', 'w', encoding='utf-8') as f:
# Counter(words).most_common() 找出出现频率最高的词(不加参数则返回所有词及其频率)
for word, freq in Counter(words).most_common():
if freq > 1:
f.write(word+'\n')
# 取出出数据
with open('./Dataset/word_freq.txt', encoding='utf-8') as f:
words = [i.strip() for i in f]
# 去重(词汇表)
words = list(set(words))
# 词-id的字典word2id
word2id = {j: i for i, j in enumerate(words)}
# id-词的字典id2word
id2word = {i: j for i, j in enumerate(words)}
pad_id = word2id['把'] # 中性词的id 用于填充
sentences = [i.split(' ') for i in reviews]
# 填充后的所有句子 每个词用id表示
pad_sentencesid = []
for i in sentences:
# 如果词汇表中没有这个词 用pad_id替代 如果有这个词返回这个词对应的id
temp = [word2id.get(j, pad_id) for j in i]
# 如果句子词的数量大于max_sent,则截断后面的
if len(i) > max_sent:
temp = temp[:max_sent]
else: # 如果句子词的数量小于max_sent,则用pad_id进行填充
for j in range(max_sent - len(i)):
temp.append(pad_id)
pad_sentencesid.append(temp)
return words, word2id, id2word, pad_sentencesid
# hyper parameter
Batch_Size = 32
Embedding_Size = 50 # 词向量维度
Filter_Num = 10 # 卷积核个数
Dropout = 0.5
Epochs = 60
LR = 0.01
# 加载词向量模型Word2vec
w2v = keyedvectors.load_word2vec_format('./Dataset/wiki_word2vec_50.bin', binary=True)
def get_data(labels, reviews):
words, word2id, id2word, pad_sentencesid = text_process(reviews)
x = torch.from_numpy(np.array(pad_sentencesid)) # [369, 63]
y = torch.from_numpy(np.array(labels)) # [369]
dataset = Data.TensorDataset(x, y)
data_loader = Data.DataLoader(dataset=dataset, batch_size=Batch_Size)
# 遍历词汇表中所有的词 如果w2v中有该词的向量表示则不作操作,否则随机生成向量放到w2v中
for i in range(len(words)):
try:
w2v[words[i]] = w2v[words[i]]
except Exception:
w2v[words[i]] = np.random.randn(50, )
return data_loader, id2word
def word2vec(x): # [batch_size, 63]
"""
将句子中的所有词转为词向量
:param x: batch_size个句子
:return: batch_size个句子的词向量
"""
batch_size = x.shape[0]
x_embedding = np.ones((batch_size, x.shape[1], Embedding_Size)) # [batch_size, 63, 50]
for i in range(len(x)):
# item() 将tensor类型转为number类型
x_embedding[i] = w2v[[id2word[j.item()] for j in x[i]]]
return torch.tensor(x_embedding).to(torch.float32)
class TextCNN(nn.Module):
def __init__(self):
super(TextCNN, self).__init__()
self.conv = nn.Sequential(
nn.Conv1d(1, Filter_Num, (2, Embedding_Size)),
nn.ReLU(),
nn.MaxPool2d((max_sent-1, 1))
)
self.dropout = nn.Dropout(Dropout)
self.fc = nn.Linear(Filter_Num, 2)
self.softmax = nn.Softmax(dim=1) # 行
def forward(self, X): # [batch_size, 63]
batch_size = X.shape[0]
X = word2vec(X) # [batch_size, 63, 50]
X = X.unsqueeze(1) # [batch_size, 1, 63, 50]
X = self.conv(X) # [batch_size, 10, 1, 1]
X = X.view(batch_size, -1) # [batch_size, 10]
X = self.fc(X) # [batch_size, 2]
X = self.softmax(X) # [batch_size, 2]
return X
if __name__ == '__main__':
data_loader, id2word = get_data(train_labels, train_reviews)
text_cnn = TextCNN()
optimizer = torch.optim.Adam(text_cnn.parameters(), lr=LR)
loss_fuc = nn.CrossEntropyLoss()
print("+++++++++++start train+++++++++++")
for epoch in range(Epochs):
for step, (batch_x, batch_y) in enumerate(data_loader):
# 前向传播
predicted = text_cnn.forward(batch_x)
loss = loss_fuc(predicted, batch_y)
# 反向传播
optimizer.zero_grad()
loss.backward()
optimizer.step()
# 计算accuracy
# dim=0表示取每列的最大值,dim=1表示取每行的最大值
# torch.max()[0]表示返回最大值 torch.max()[1]表示返回最大值的索引
predicted = torch.max(predicted, dim=1)[1].numpy()
label = batch_y.numpy()
accuracy = sum(predicted == label) / label.size
if step % 30 == 0:
print('epoch:', epoch, ' | train loss:%.4f' % loss.item(), ' | test accuracy:', accuracy)
大概一分钟后,训练结束