Sentiment classification based on a CNN (accuracy around 86%-88%)
1. Clean the data
2. Build the vocabulary
3. Build the model
4. Train the model
5. Evaluate on the test set
6. Tune the hyperparameters
The three scripts below cover these steps; a minimal sketch of the overall run order follows this list.
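Assuming the three scripts below sit in the working directory together with a pre_stop.py helper (run.py imports it; a sketch of it appears after that script), the pipeline can be driven end to end like this. The driver is hypothetical and only illustrates the order of the steps:

# run_all.py -- hypothetical driver, for illustration only
from clear_data import pre_data, build_vocab_dict, \
    train_path, test_path, train_out_path, test_out_path

pre_data(train_path, train_out_path)             # step 1: clean the raw training reviews
pre_data(test_path, test_out_path)               # step 1: clean the raw test reviews
build_vocab_dict(train_out_path, test_out_path)  # step 2: build the word-to-index table

import run   # imported only now, because run.py reads vocab.pkl at import time
run.train()  # steps 3-5: build the model, train, and evaluate each epoch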
1. Data processing and vocabulary building: clear_data.py
import pandas as pd
import numpy as np
import re
import os
import pickle as pkl
train_path = './aclImdb/train/'
test_path = './aclImdb/test/'
train_out_path = "./dataset/imdb_tr.csv"
test_out_path = "./dataset/imdb_te.csv"
stopwords_en_path = "./dataset/stopwords.en.txt"
# Take a sentence and a stopword list; return the sentence with all stopwords removed
def remove_stopwords(sentence, stopwords):
    sentencewords = sentence.split()
    resultwords = [word for word in sentencewords if word.lower() not in stopwords]
    result = ' '.join(resultwords)
    return result
def pre_data(file_path, out_path):
    num = 0       # running record id
    indices = []  # id column
    text = []     # text column
    rating = []   # label column
    with open(stopwords_en_path, 'r', encoding="ISO-8859-1") as f:
        stopwords = f.read().split("\n")
    # Process the positive reviews
    for filename in os.listdir(file_path + 'pos'):
        with open(file_path + 'pos/' + filename, 'r', encoding="ISO-8859-1") as f:
            review = f.read()
        review = re.sub('[^a-zA-Z]', ' ', review)  # keep letters only
        review = review.lower()                    # lowercase everything
        review = remove_stopwords(review, stopwords)
        indices.append(num)
        text.append(review)
        rating.append(1)
        num += 1
        print("Record {} done".format(num))
    # Process the negative reviews
    for filename in os.listdir(file_path + 'neg'):
        with open(file_path + 'neg/' + filename, 'r', encoding="ISO-8859-1") as f:
            review = f.read()
        review = re.sub('[^a-zA-Z]', ' ', review)
        review = review.lower()
        review = remove_stopwords(review, stopwords)
        indices.append(num)
        text.append(review)
        rating.append(0)
        num += 1
        print("Record {} done".format(num))
    Dataset = list(zip(indices, text, rating))  # combine the columns into records
    np.random.shuffle(Dataset)                  # shuffle the reviews
    df = pd.DataFrame(data=Dataset, columns=['num', 'text', 'rate'])
    df.to_csv(out_path, index=False, header=True)  # drop the index, keep the header row
# Build the vocabulary
def build_vocab_dict(file_path1, file_path2):
    file_path = [file_path1, file_path2]
    vocab = []
    for i in file_path:
        with open(i, 'rb') as f:
            sentences = pd.read_csv(f)['text']
        word_list = " ".join(sentences).split()
        vocab += word_list
    vocab = list(set(vocab))
    word2idx = {w: i for i, w in enumerate(vocab)}
    vocab_size = len(vocab)
    print("vocab:\n", word2idx)
    print("vocab_length:", vocab_size)
    with open('./dataset/vocab.pkl', 'wb') as f:  # keep this path in sync with vocab_out_path in run.py
        pkl.dump(word2idx, f)
    print("success!")
if __name__ == "__main__":
    # Prepare the data
    # pre_data(train_path, train_out_path)  # process the training data
    # pre_data(test_path, test_out_path)    # process the test data
    build_vocab_dict(train_out_path, test_out_path)  # build the vocabulary
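A quick sanity check (a standalone snippet, not part of the original script) loads the pickle back and inspects it:

import pickle as pkl

# Standalone check of the vocabulary written by build_vocab_dict
with open('./dataset/vocab.pkl', 'rb') as f:
    word2idx = pkl.load(f)
print("vocabulary size:", len(word2idx))           # should match the vocab_length printed above
print("index of 'movie':", word2idx.get('movie'))  # None if the word never appeared in the data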
2. Model and training: run.py
import torch.nn as nn
import pickle as pkl
import torch.utils.data as Data
import pandas as pd
import torch
from pre_stop import pre_stop  # early-stopping helper (a sketch of it follows this script)
train_out_path = "./dataset/imdb_tr.csv"       # training data path
test_out_path = "./dataset/imdb_te.csv"        # test data path
vocab_out_path = './dataset/vocab.pkl'         # vocabulary path
model_path = './models/CNN-2-L-1-epcho-5.bin'  # where the trained model is saved
vocab = pkl.load(open(vocab_out_path, 'rb'))   # load the vocabulary
train_data = pd.read_csv(train_out_path)['text']    # training texts
train_label = pd.read_csv(train_out_path)['rate']   # training labels
target_data = pd.read_csv(test_out_path)['text']    # test texts
target_label = pd.read_csv(test_out_path)['rate']   # test labels
vocab_size = len(vocab)       # vocabulary size
total_data = len(train_data)  # total number of training records
max_length = 1024             # target sequence length
learning_rate = 0.0001        # learning rate
epoches = 5                   # number of epochs
train_size = 25000            # number of training samples used
test_size = 14000             # number of test samples used
# Model parameters
output_channel = [256, 8]     # channels of the two conv layers
dropout_prob = 0.1            # dropout probability
embedding_size = 512          # embedding dimension
num_classes = 2               # 0 = bad review, 1 = good review
batch_size = 32               # samples per batch
if torch.cuda.is_available():
    print("Using the GPU")
    device = torch.device('cuda:0')  # train on the GPU
else:
    print("Using the CPU")
    device = torch.device('cpu')     # train on the CPU
# --------------------- Model ------------------------
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.W = nn.Embedding(vocab_size, embedding_size)  # lookup: [batch, 1024] -> [batch, 1024, 512]
        self.dropout = nn.Dropout(dropout_prob)            # dropout layer
        self.conv = nn.Sequential(
            nn.Conv2d(1, output_channel[0], (3, embedding_size)),     # [32, 256, 1022, 1]
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),                                     # [32, 256, 511, 1]
            nn.Conv2d(output_channel[0], output_channel[1], (4, 1)),  # [32, 8, 508, 1]
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),                                     # [32, 8, 254, 1]
        )
        self.fc = nn.Linear(output_channel[1] * 254, num_classes)  # 254 rows survive the conv stack above

    def forward(self, X):
        batch_size = X.shape[0]
        embedding_X = self.W(X)                 # [batch_size, sequence_length, embedding_size]
        embedding_X = embedding_X.unsqueeze(1)  # add channel(=1): [batch, 1, sequence_length, embedding_size]
        conved = self.conv(embedding_X)         # [batch_size, 8, 254, 1]
        flatten = conved.view(batch_size, -1)   # [batch_size, 8*254]
        flatten = self.dropout(flatten)         # apply dropout before the fully connected layer
        output = self.fc(flatten)               # [batch_size, num_classes]
        return output
# ------------------- Pad or truncate each sequence ----------------------------
def unify_length(lists):
    # print("before pad/truncate:", len(lists[0]))
    result = []
    for lst in lists:
        if len(lst) < max_length:
            lst.extend([0] * (max_length - len(lst)))
        elif len(lst) > max_length:
            lst = lst[:max_length]
        result.append(lst)
    # print("after pad/truncate:", len(result[0]))
    return result
# -------------------- Represent words by their indices ---------------------------------------
def make_data(sentences, labels):
    inputs = []  # each sentence becomes a list of word indices
    for sen in sentences:
        inputs.append([vocab[n] for n in sen.split()])
    inputs = unify_length(inputs)  # pad or truncate every sentence to max_length
    targets = []  # class indices, as expected by torch's CrossEntropyLoss
    for out in labels:
        targets.append(out)
    return inputs, targets
def train():
    torch.manual_seed(1)
    print("Vocabulary size:", vocab_size)
    print("Total training records:", total_data, "\n")
    input_batch, target_batch = make_data(train_data[0:train_size], train_label[0:train_size])
    input_batch, target_batch = torch.LongTensor(input_batch), torch.LongTensor(target_batch)
    print("input_batch:", input_batch.shape)
    print("target_batch:", target_batch.shape)
    # TensorDataset pairs the inputs (input_batch) with the targets (target_batch)
    dataset = Data.TensorDataset(input_batch, target_batch)
    loader = Data.DataLoader(dataset, batch_size, shuffle=True)
    # Build the model
    print("Loading the model...")
    model = Model().to(device)  # move to the CPU or GPU
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    print("Start training...")
    # Training
    # flag marks whether the model improved; max_train_acc / max_test_acc track the best accuracies so far
    flag, max_train_acc, max_test_acc = (0, 0.0, 0.0)
    count = 0
    for epoch in range(epoches):
        losses = 0
        total_correct = 0
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            losses += loss.item()  # accumulate the scalar loss, not the autograd graph
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            _, predicted = torch.max(pred, dim=1)
            total_correct += (predicted == batch_y).sum().item()
        train_accuracy = total_correct / len(dataset)  # divide by the true sample count, not batch_size*len(loader)
        print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(losses / len(loader)))
        print("Train Accuracy: {:.4f}%".format(train_accuracy * 100))
        test_accuracy = test(model)
        # Early stopping
        count += 1
        flag, max_train_acc, max_test_acc = pre_stop(train_accuracy, test_accuracy, max_train_acc, max_test_acc)
        if flag == 1:
            print("Current model is the best so far; saving it")
            count = 0
            torch.save(model, model_path)
        elif count > 5:  # stop after 5 consecutive epochs without improvement
            break
def test(model_):
    label = target_label[0:test_size]  # evaluate on the test set
    input_batch, target_batch = make_data(target_data[0:test_size], target_label[0:test_size])
    # label = train_label[0:train_size]  # or evaluate on the training set instead
    # input_batch, target_batch = make_data(train_data[0:train_size], train_label[0:train_size])
    model = model_.eval().to(device)
    correct = 0.0
    for num, test_batch in enumerate(input_batch):
        test_batch = torch.LongTensor(test_batch).unsqueeze(0).to(device)  # to a tensor, add a batch dim, move to the device
        predict = model(test_batch).data.max(1, keepdim=True)[1]
        if predict[0][0] == label[num]:
            correct += 1
    correct_rate = correct / (num + 1)
    print("Test Accuracy:{:.6f}".format(correct_rate))
    return correct_rate
if __name__ == '__main__':
    # train()
    model_ = torch.load(model_path).to(device)
    test(model_)
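The 254 in self.fc falls out of the shape arithmetic: 1024 - 3 + 1 = 1022 rows after the first convolution, 511 after the first pool, 508 after the (4, 1) convolution, and 254 after the second pool. A standalone snippet (not part of the post's code) confirms the trace with a dummy batch:

import torch
import torch.nn as nn

# Same conv stack as in Model, checked with a dummy input (illustration only)
conv = nn.Sequential(
    nn.Conv2d(1, 256, (3, 512)),  # 1024 - 3 + 1 = 1022 rows
    nn.ReLU(),
    nn.MaxPool2d((2, 1)),         # 1022 // 2 = 511 rows
    nn.Conv2d(256, 8, (4, 1)),    # 511 - 4 + 1 = 508 rows
    nn.ReLU(),
    nn.MaxPool2d((2, 1)),         # 508 // 2 = 254 rows
)
dummy = torch.zeros(1, 1, 1024, 512)  # [batch, channel, seq_len, embedding]
print(conv(dummy).shape)              # torch.Size([1, 8, 254, 1]) -> fc input is 8 * 254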
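run.py imports pre_stop from a pre_stop.py module that the post does not include. Judging only from the call site (it receives the current and best accuracies and returns a flag plus the updated bests), a minimal reconstruction might look like the following; the author's actual logic may differ:

# pre_stop.py -- hypothetical reconstruction, the original module is not shown in the post
def pre_stop(train_acc, test_acc, max_train_acc, max_test_acc):
    # Return (flag, best_train_acc, best_test_acc); flag == 1 means the model improved
    if test_acc > max_test_acc:
        return 1, max(train_acc, max_train_acc), test_acc
    return 0, max_train_acc, max_test_acc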
3. Testing with hand-typed reviews: practise.py
import torch.nn as nn
import pickle as pkl
import torch
vocab_out_path = './dataset/vocab.pkl'         # vocabulary path
model_path = './models/CNN-2-L-1-epcho-5.bin'  # where the trained model was saved
vocab = pkl.load(open(vocab_out_path, 'rb'))   # load the vocabulary
vocab_size = len(vocab)       # vocabulary size
max_length = 1024             # target sequence length
learning_rate = 0.0001        # learning rate (unused here, kept to mirror run.py)
epoches = 5                   # number of epochs (unused here)
train_size = 25000            # number of training samples (unused here)
test_size = 14000             # number of test samples (unused here)
# Model parameters
output_channel = [256, 8]     # channels of the two conv layers
dropout_prob = 0.1            # dropout probability
embedding_size = 512          # embedding dimension
num_classes = 2               # 0 = bad review, 1 = good review
batch_size = 32               # samples per batch (unused here)
if torch.cuda.is_available():
    print("Using the GPU\n")
    device = torch.device('cuda:0')  # run on the GPU
else:
    print('Using the CPU\n')
    device = torch.device('cpu')     # run on the CPU
# ------------- Model (must match the class in run.py so torch.load can unpickle it) -------------
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.W = nn.Embedding(vocab_size, embedding_size)  # lookup: [batch, 1024] -> [batch, 1024, 512]
        self.dropout = nn.Dropout(dropout_prob)            # dropout layer
        self.conv = nn.Sequential(
            nn.Conv2d(1, output_channel[0], (3, embedding_size)),     # [32, 256, 1022, 1]
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),                                     # [32, 256, 511, 1]
            nn.Conv2d(output_channel[0], output_channel[1], (4, 1)),  # [32, 8, 508, 1]
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),                                     # [32, 8, 254, 1]
        )
        self.fc = nn.Linear(output_channel[1] * 254, num_classes)

    def forward(self, X):
        batch_size = X.shape[0]
        embedding_X = self.W(X)                 # [batch_size, sequence_length, embedding_size]
        embedding_X = embedding_X.unsqueeze(1)  # add channel(=1): [batch, 1, sequence_length, embedding_size]
        conved = self.conv(embedding_X)         # [batch_size, 8, 254, 1]
        flatten = conved.view(batch_size, -1)   # [batch_size, 8*254]
        flatten = self.dropout(flatten)         # apply dropout before the fully connected layer
        output = self.fc(flatten)               # [batch_size, num_classes]
        return output
def unify_length(lists):
    # Pad or truncate every sequence to max_length
    print("before pad/truncate:", len(lists[0]))
    result = []
    for lst in lists:
        if len(lst) < max_length:
            lst.extend([0] * (max_length - len(lst)))
        elif len(lst) > max_length:
            lst = lst[:max_length]
        result.append(lst)
    print("after pad/truncate:", len(result[0]))
    return result
def make_data(sentences):
    inputs = []
    for sen in sentences:
        indices = []
        for n in sen.split():
            try:
                indices.append(vocab[n])
            except KeyError:
                pass  # silently drop words that are not in the vocabulary (see the note after this script)
        inputs.append(indices)
    inputs = unify_length(inputs)
    return inputs
def APP():
    model_ = torch.load(model_path).to(device)
    model = model_.eval().to(device)
    # test_text = 'lucky enough see test screening el padrino couple months ago'
    input_str = input('please input your review:\nreview:')
    num = 1
    while input_str != 'quit':
        list_str = [input_str]
        input_batch = make_data(list_str)
        input_batch = torch.LongTensor(input_batch)
        for test_batch in input_batch:
            test_batch = test_batch.unsqueeze(0).to(device)  # add a batch dim and move to the device
            predict = model(test_batch).data.max(1, keepdim=True)[1]
            if predict[0][0] == 0:
                print(" {} ".format(num), " Bad ")
            else:
                print(" {} ".format(num), " Good ")
            num += 1
        input_str = input("review:")

if __name__ == '__main__':
    APP()
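make_data above drops any word that the vocabulary has never seen, so a review made up entirely of unseen words is reduced to pure padding. An alternative sketch (an assumption, not the post's approach, and one that would require retraining with one extra embedding row) reserves a dedicated unknown-word index:

import pickle as pkl

vocab = pkl.load(open('./dataset/vocab.pkl', 'rb'))
UNK_IDX = len(vocab)  # hypothetical: one extra index reserved for unknown words
# (nn.Embedding would then need vocab_size + 1 rows, hence the retraining)

def encode(sentence):
    # Fall back to UNK_IDX instead of silently dropping unseen words
    return [vocab.get(word, UNK_IDX) for word in sentence.split()]

print(encode("this movie was zxqvwmadeup"))  # unseen tokens map to UNK_IDX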
4. Resources: https://pan.baidu.com/s/1ZXXS0oJW9vtynndLcJ4kYA?pwd=3eiq (access code: 3eiq)