Based on a CNN convolutional network (accuracy around 86%–88%)
1. Clean the data
2. Build the vocabulary
3. Build the model
4. Train the model
5. Test the results
6. Tune the hyperparameters
1. Preprocess the data and build the vocabulary: clear_data.py
import pandas as pd
import numpy as np
import re
import os
import pickle as pkl

train_path = './aclImdb/train/'
test_path = './aclImdb/test/'
train_out_path = "./dataset/imdb_tr.csv"
test_out_path = "./dataset/imdb_te.csv"
stopwords_en_path = "./dataset/stopwords.en.txt"
word_dict = {}

# Take a sentence and a stopword list, and return the sentence with all stopwords removed
def remove_stopwords(sentence, stopwords):
    sentencewords = sentence.split()
    resultwords = [word for word in sentencewords if word.lower() not in stopwords]
    result = ' '.join(resultwords)
    return result

def pre_data(file_path, out_path):
    num = 0        # sample index
    indices = []   # index column
    text = []      # text column
    rating = []    # label column
    stopwords = open(stopwords_en_path, 'r', encoding="ISO-8859-1").read()
    stopwords = stopwords.split("\n")
    # Process the positive reviews
    for filename in os.listdir(file_path + 'pos'):
        review = open(file_path + 'pos/' + filename, 'r', encoding="ISO-8859-1").read()
        review = re.sub('[^a-zA-Z]', ' ', review)  # keep letters only
        review = review.lower()
        review = remove_stopwords(review, stopwords)
        indices.append(num)
        text.append(review)
        rating.append(1)
        num += 1
        print("Sample {} done".format(num))
    # Process the negative reviews
    for filename in os.listdir(file_path + 'neg'):
        review = open(file_path + 'neg/' + filename, 'r', encoding="ISO-8859-1").read()
        review = re.sub('[^a-zA-Z]', ' ', review)
        review = review.lower()  # convert uppercase letters to lowercase
        review = remove_stopwords(review, stopwords)
        indices.append(num)
        text.append(review)
        rating.append(0)
        num += 1
        print("Sample {} done".format(num))
    Dataset = list(zip(indices, text, rating))  # combine the records
    np.random.shuffle(Dataset)                  # shuffle the reviews
    df = pd.DataFrame(data=Dataset, columns=['num', 'text', 'rate'])
    df.to_csv(out_path, index=False, header=True)  # drop the index, keep column names in the first row

# Build the vocabulary
def build_vocab_dict(file_path1, file_path2):
    file_path = [file_path1, file_path2]
    vocab = []
    for i in file_path:
        with open(i, 'rb') as f:
            sentences = pd.read_csv(f)['text']
            word_list = " ".join(sentences).split()
            vocab += word_list
    vocab = list(set(vocab))
    word2idx = {w: i for i, w in enumerate(vocab)}
    vocab_size = len(vocab)
    print("vocab:\n", word2idx)
    print("vocab_length:", vocab_size)
    with open('./dataset/vocab.pkl', 'wb') as f:  # save where run.py expects it
        pkl.dump(word2idx, f)
    print("success!")

if __name__ == "__main__":
    # Preprocess the data
    # pre_data(train_path, train_out_path)  # preprocess the training data
    # pre_data(test_path, test_out_path)    # preprocess the test data
    build_vocab_dict(train_out_path, test_out_path)  # build the vocabulary
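One caveat worth noting: build_vocab_dict assigns index 0 to an ordinary word, while the padding step in run.py fills short reviews with 0, so padded positions alias a real token. A minimal sketch of a vocabulary builder that reserves index 0 for a dedicated pad token instead (the '<pad>' name and this variant function are my own additions, not part of the original code):

import pandas as pd
import pickle as pkl

def build_vocab_dict_with_pad(csv_paths, vocab_out_path='./dataset/vocab.pkl'):
    # Collect every distinct word from the 'text' column of the given CSVs
    vocab = set()
    for path in csv_paths:
        sentences = pd.read_csv(path)['text']
        vocab.update(" ".join(sentences).split())
    # Reserve index 0 for padding so unify_length's zero-fill never collides with a real word
    word2idx = {'<pad>': 0}
    for i, w in enumerate(sorted(vocab), start=1):
        word2idx[w] = i
    with open(vocab_out_path, 'wb') as f:
        pkl.dump(word2idx, f)
    return word2idx

With this layout, nn.Embedding(len(word2idx), embedding_size) still covers every index, and all padded positions map to a single dedicated embedding row.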
2. Build and train the model: run.py
import torch.nn as nn
import pickle as pkl
import torch.utils.data as Data
import pandas as pd
import torch
from pre_stop import pre_stop

train_out_path = "./dataset/imdb_tr.csv"       # training data path
test_out_path = "./dataset/imdb_te.csv"        # test data path
vocab_out_path = './dataset/vocab.pkl'         # vocabulary
model_path = './models/CNN-2-L-1-epcho-5.bin'  # where the trained model is saved

vocab = pkl.load(open(vocab_out_path, 'rb'))             # load the vocabulary
train_data = pd.read_csv(open(train_out_path))['text']   # read the training texts
train_label = pd.read_csv(open(train_out_path))['rate']  # read the training labels
target_data = pd.read_csv(open(test_out_path))['text']   # read the test texts
target_label = pd.read_csv(open(test_out_path))['rate']  # read the test labels

vocab_size = len(vocab)       # vocabulary size
total_data = len(train_data)  # total number of training samples
max_length = 1024             # target sequence length
learning_rate = 0.0001        # learning rate
epoches = 5                   # number of epochs
train_size = 25000            # number of training samples
test_size = 14000             # number of test samples

# Model parameters
output_channel = [256, 8]  # channels of the two convolutional layers
dropout_prob = 0.1         # dropout probability
embedding_size = 512       # word embedding dimension
num_classes = 2            # 0 = bad review, 1 = good review
batch_size = 32            # samples per batch

if torch.cuda.is_available():  # True if a GPU is available
    print("Using GPU")
    device = torch.device('cuda:0')  # train on the GPU
else:
    print('Using CPU')
    device = torch.device('cpu')     # train on the CPU

# --------------------- Model ------------------------
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.W = nn.Embedding(vocab_size, embedding_size)  # [32, 1024, 512]
        self.dropout = nn.Dropout(dropout_prob)  # dropout layer
        self.conv = nn.Sequential(
            nn.Conv2d(1, output_channel[0], (3, embedding_size)),     # small kernel      [32, 256, 1022, 1]
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),                                     # pooling layer     [32, 256, 511, 1]
            nn.Conv2d(output_channel[0], output_channel[1], (4, 1)),  # second conv layer [32, 8, 508, 1]
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),                                     #                   [32, 8, 254, 1]
        )
        self.fc = nn.Linear(output_channel[1] * 254, num_classes)

    def forward(self, X):
        batch_size = X.shape[0]
        embedding_X = self.W(X)                 # [batch_size, sequence_length, embedding_size]
        embedding_X = embedding_X.unsqueeze(1)  # add channel(=1): [batch, 1, sequence_length, embedding_size]
        conved = self.conv(embedding_X)         # [batch_size, output_channel[1], 254, 1]
        flatten = conved.view(batch_size, -1)   # [batch_size, output_channel[1]*254]
        flatten = self.dropout(flatten)         # apply dropout before the fully connected layer
        output = self.fc(flatten)               # [batch_size, num_classes]
        return output

# ------------------- Pad or truncate each sequence ----------------------------
def unify_length(lists):
    # print("before padding/truncation:", len(lists[0]))
    result = []
    for lst in lists:
        if len(lst) < max_length:
            lst.extend([0] * (max_length - len(lst)))
        elif len(lst) > max_length:
            lst = lst[:max_length]
        result.append(lst)
    # print("after padding/truncation:", len(result[0]))
    return result

# --------------------- Map words to vocabulary indices ---------------------------
def make_data(sentences, labels):
    inputs = []
    for sen in sentences:
        inputs.append([vocab[n] for n in sen.split()])
    inputs = unify_length(inputs)  # pad or truncate each sentence
    targets = []
    for out in labels:
        targets.append(out)  # for torch's softmax loss function
    return inputs, targets

def train():
    torch.manual_seed(1)
    print("Vocabulary size:", vocab_size)
    print("Training samples:", total_data, "\n")
    input_batch, target_batch = make_data(train_data[0:train_size], train_label[0:train_size])
    input_batch, target_batch = torch.LongTensor(input_batch), torch.LongTensor(target_batch)
    print("input_batch:", input_batch.shape)
    print("target_batch:", target_batch.shape)
    # TensorDataset combines the inputs (input_batch) and targets (target_batch) into one dataset object
    dataset = Data.TensorDataset(input_batch, target_batch)
    loader = Data.DataLoader(dataset, batch_size, True)
    # Build the model
    print("Building the model...")
    model = Model().to(device)  # move to CPU or GPU
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    print("Starting training...")
    # Training
    # flag marks whether the model improved; max_train_acc / max_test_acc track the best accuracies so far
    flag, max_train_acc, max_test_acc = (0, 0.0, 0.0)
    count = 0
    for epoch in range(epoches):
        losses = 0
        total_correct = 0
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            losses += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            _, predicted = torch.max(pred, dim=1)
            total_correct += (predicted == batch_y).sum().item()
        train_accuracy = total_correct / train_size
        print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(losses / len(loader)))
        print("Train Accuracy: {:.4f}%".format(train_accuracy * 100))
        test_accuracy = test(model)
        # Early stopping
        count += 1
        flag, max_train_acc, max_test_acc = pre_stop(train_accuracy, test_accuracy, max_train_acc, max_test_acc)
        if flag == 1:
            print("Current model is more accurate; saving it")
            count = 0
            torch.save(model, model_path)
        elif count > 5:  # stop after 5 consecutive epochs without improvement
            break

def test(model_):
    label = target_label[0:test_size]  # evaluate on the test set
    input_batch, target_batch = make_data(target_data[0:test_size], target_label[0:test_size])
    # label = train_label[0:train_size]  # evaluate on the training set instead
    # input_batch, target_batch = make_data(train_data[0:train_size], train_label[0:train_size])
    model = model_.eval().to(device)
    correct = 0.0
    for num, test_batch in enumerate(input_batch):
        test_batch = torch.LongTensor(test_batch).unsqueeze(0).to(device)  # convert to a tensor and move to the device
        predict = model(test_batch).data.max(1, keepdim=True)[1]
        if predict[0][0] == label[num]:
            correct += 1
    correct_rate = correct / (num + 1)
    print("Test Accuracy: {:.6f}".format(correct_rate))
    # print("Test samples:", num + 1)
    return correct_rate

if __name__ == '__main__':
    # train()
    model_ = torch.load(model_path).to(device)
    test(model_)
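run.py imports pre_stop from a local module that this post does not show. Based purely on how it is called (it returns an "improved" flag plus the updated best accuracies, and flag == 1 triggers a model save), here is a minimal sketch of what pre_stop.py might look like; the improvement criterion below is my assumption, not the original logic:

# pre_stop.py -- hypothetical reconstruction; the original module is not included in this post
def pre_stop(train_acc, test_acc, max_train_acc, max_test_acc):
    """Return (flag, max_train_acc, max_test_acc).

    flag == 1 means the model improved and should be saved; flag == 0 means
    no improvement. Assumed criterion: higher test accuracy, with train
    accuracy as a tie-breaker.
    """
    if test_acc > max_test_acc or (test_acc == max_test_acc and train_acc > max_train_acc):
        return 1, max(train_acc, max_train_acc), test_acc
    return 0, max_train_acc, max_test_acc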
3. Test the model on manually entered reviews: practise.py
import torch.nn as nn
import pickle as pkl
import torch

vocab_out_path = './dataset/vocab.pkl'         # vocabulary
model_path = './models/CNN-2-L-1-epcho-5.bin'  # where the trained model is saved
vocab = pkl.load(open(vocab_out_path, 'rb'))   # load the vocabulary
vocab_size = len(vocab)  # vocabulary size
max_length = 1024        # target sequence length
learning_rate = 0.0001   # learning rate
epoches = 5              # number of epochs
train_size = 25000       # number of training samples
test_size = 14000        # number of test samples

# Model parameters (must match run.py so the saved model can be loaded)
output_channel = [256, 8]  # channels of the two convolutional layers
dropout_prob = 0.1         # dropout probability
embedding_size = 512       # word embedding dimension
num_classes = 2            # 0 = bad review, 1 = good review
batch_size = 32            # samples per batch

if torch.cuda.is_available():  # True if a GPU is available
    print("Using GPU\n")
    device = torch.device('cuda:0')  # run on the GPU
else:
    print('Using CPU\n')
    device = torch.device('cpu')     # run on the CPU

# --------------------- Model (same architecture as run.py) ------------------------
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.W = nn.Embedding(vocab_size, embedding_size)
        self.dropout = nn.Dropout(dropout_prob)  # dropout layer
        self.conv = nn.Sequential(
            nn.Conv2d(1, output_channel[0], (3, embedding_size)),     # [32, 256, 1022, 1]
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),                                     # [32, 256, 511, 1]
            nn.Conv2d(output_channel[0], output_channel[1], (4, 1)),  # [32, 8, 508, 1]
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),                                     # [32, 8, 254, 1]
        )
        self.fc = nn.Linear(output_channel[1] * 254, num_classes)

    def forward(self, X):
        batch_size = X.shape[0]
        embedding_X = self.W(X)                 # [batch_size, sequence_length, embedding_size]
        embedding_X = embedding_X.unsqueeze(1)  # add channel(=1)
        conved = self.conv(embedding_X)
        flatten = conved.view(batch_size, -1)
        flatten = self.dropout(flatten)         # apply dropout before the fully connected layer
        output = self.fc(flatten)               # [batch_size, num_classes]
        return output

def unify_length(lists):  # pad or truncate each sequence
    print("before padding/truncation:", len(lists[0]))
    result = []
    for lst in lists:
        if len(lst) < max_length:
            lst.extend([0] * (max_length - len(lst)))
        elif len(lst) > max_length:
            lst = lst[:max_length]
        result.append(lst)
    print("after padding/truncation:", len(result[0]))
    return result

def make_data(sentences):
    inputs = []
    for sen in sentences:
        word_ids = []
        for n in sen.split():
            try:
                word_ids.append(vocab[n])
            except KeyError:  # skip words that are not in the vocabulary
                pass
        inputs.append(word_ids)
    inputs = unify_length(inputs)
    return inputs

def APP():
    model_ = torch.load(model_path).to(device)
    model = model_.eval().to(device)
    # test_text = 'lucky enough see test screening el padrino couple months ago'
    input_str = input('please input your review:\nreview:')
    num = 1
    while input_str != 'quit':
        list_str = [input_str]
        input_batch = make_data(list_str)
        input_batch = torch.LongTensor(input_batch)
        for test_batch in input_batch:
            test_batch = test_batch.unsqueeze(0).to(device)  # add a batch dimension and move to the device
            predict = model(test_batch).data.max(1, keepdim=True)[1]
            if predict[0][0] == 0:
                print(" {} ".format(num), " Bad ")
            else:
                print(" {} ".format(num), " Good ")
            num += 1
        input_str = input("review:")

if __name__ == '__main__':
    APP()
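For a quick non-interactive check, the same model and make_data pipeline can classify a hard-coded string. This is just a usage sketch reusing the definitions above; the helper name classify is my own, and the sample sentence is the one left commented out in APP():

# Smoke test reusing Model, make_data, model_path and device defined above
def classify(review):
    model = torch.load(model_path).to(device).eval()
    batch = torch.LongTensor(make_data([review])).to(device)  # shape [1, max_length]
    pred = model(batch).argmax(dim=1).item()
    return "Good" if pred == 1 else "Bad"

print(classify("lucky enough see test screening el padrino couple months ago"))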
4. Resources: https://pan.baidu.com/s/1ZXXS0oJW9vtynndLcJ4kYA?pwd=3eiq (extraction code: 3eiq)