Sentiment Analysis of Movie Reviews on the IMDB Dataset | Naive Bayes and Neural Network Models in Python

Import packages

import torch # torch==1.7.1
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os
import re
import numpy as np
from tqdm import tqdm
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

MAX_WORD = 10000  # keep only the 10,000 most frequent words
MAX_LEN = 300     # pad/truncate every sentence to 300 tokens
word_count = {}   # dictionary mapping word -> occurrence count

Data processing


# Load the dataset

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv('./data/labeledTrainData.tsv',
                    header=0, delimiter="\t", quoting=3)
print('dataset shape is', data.shape)

# Data cleaning

# Strip HTML markup
from bs4 import BeautifulSoup
example = BeautifulSoup(data['review'][0], 'html.parser')
print(example.get_text())

# Remove non-letter characters
import re
letters_only = re.sub('[^A-Za-z]', ' ', example.get_text())
print(letters_only)

# Lowercase the text and split it into tokens
lower_case = letters_only.lower()
words = lower_case.split()
print(words)

# Load stop words

# import nltk
# nltk.download('stopwords')

def get_custom_stopwords(stop_words_file):
    with open(stop_words_file,encoding='utf-8') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    return custom_stopwords_list
stop_words_file = 'english.txt'
stopwords = get_custom_stopwords(stop_words_file)
words = [word for word in words if word not in stopwords]
' '.join(words)
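
If you prefer not to keep a local english.txt, NLTK's built-in English stop-word list works as well; this is a minimal sketch and assumes the stopwords corpus has already been downloaded with the nltk.download('stopwords') call commented out above.

# Alternative: NLTK's built-in English stop-word list (assumes nltk.download('stopwords') was run)
from nltk.corpus import stopwords as nltk_stopwords
nltk_stop_set = set(nltk_stopwords.words('english'))
words = [word for word in words if word not in nltk_stop_set]
' '.join(words)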

# Wrap the cleaning steps into a single function

from bs4 import BeautifulSoup
# regular-expression toolkit (imported above)
# import re
# from nltk.corpus import stopwords
# review_to_text performs the three preprocessing steps on a raw review
def review_to_text(review):
    # Step 1: strip HTML markup
    raw_text = BeautifulSoup(review, 'html.parser').get_text()
    # Step 2: remove non-letter characters; re.sub(pattern, replacement, string) replaces matches with a space
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    # Step 3: lowercase the text and split it on whitespace into a word list
    words = letters.lower().split()
    return words
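
A quick check of the cleaning function on the first review (the exact output depends on your copy of the dataset):

print(' '.join(review_to_text(data['review'][0]))[:100])  # first 100 characters of the cleaned review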

# Apply the cleaning to every review and collect the labels

X_data = []
y_data=[]
for review in data['review']:
    X_data.append(' '.join(review_to_text(review)))

for sentiment in data['sentiment']:
    y_data.append(sentiment)
#
# y_data = data['sentiment']
# print(X_data,y_data)

# Split into training and test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=45)
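
It is worth confirming that both splits stay roughly balanced between positive and negative labels; a small sanity check:

from collections import Counter
print(len(X_train), len(X_test))          # 20000 5000 for the 25,000-review labeledTrainData.tsv
print(Counter(y_train), Counter(y_test))  # roughly 50/50 positive vs. negative in each split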

A traditional machine-learning baseline: Naive Bayes

# Bag-of-words vectorization, then fit a Naive Bayes classifier

from sklearn.feature_extraction.text import CountVectorizer
# max_features=5000: keep only the 5,000 most frequent words as the feature vocabulary
vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                             stop_words=None, max_features=5000)
train_data_features = vectorizer.fit_transform(X_train)
t_data_features = vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_data_features,y_train)
print(nb.score(train_data_features,  y_train))
print(nb.score(t_data_features,  y_test))
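
Beyond raw accuracy, sklearn's classification_report shows per-class precision and recall; a small optional check:

from sklearn.metrics import classification_report
print(classification_report(y_test, nb.predict(t_data_features)))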

# # Prediction on a new review (optional)
# pre_str="Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty"
# pre_str_list=[(' '.join(review_to_text(pre_str)))]
# pre_data = vectorizer.transform(pd.Series(pre_str_list))
# result = nb.predict(pre_data)
# print(result)

Training accuracy is about 0.86 and test accuracy is about 0.85:
#output
0.86145
0.8498
[1]

Deep learning: training, classification, and prediction with an LSTM/GRU model

# Write the processed training and test data to train.txt and test.txt so they can be read by a Dataset

with open("train.txt","w",encoding="utf-8") as f:
    for i in range(len(X_train)):
        # print(type(y_train[i]))
        # print(y_train[i])
        # print(type(X_train[i]))
        # print(type(X_train[i]))
        f.write(str(y_train[i])+" "+X_train[i]+"\n")
f.close()

with open("test.txt","w",encoding="utf-8") as f1:
    for i in range(len(X_test)):
        # print(type(y_train[i]))
        # print(y_train[i])
        # print(type(X_train[i]))
        # print(type(X_train[i]))
        f1.write(str(y_test[i])+" "+X_test[i]+"\n")
f1.close()

# Split sentences into words, count word frequencies, and build the vocabulary

def tokenizer(sentence):
    return sentence.split()

def data_process(text):
    for line in text:
        tokens = tokenizer(line)  # tokenize and update the word counts
        for token in tokens:
            if token in word_count:
                word_count[token] += 1
            else:
                word_count[token] = 1

    print("build vocabulary")

    vocab = {"<UNK>": 0, "<PAD>": 1}

    # sort words by frequency and keep only the MAX_WORD most frequent ones
    word_count_sort = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    word_number = 1
    for word in word_count_sort:
        if word[0] not in vocab:
            vocab[word[0]] = len(vocab)
            word_number += 1
        if word_number > MAX_WORD:
            break
    return vocab

# Build the vocabulary

vocab=data_process(X_train)
# print(vocab)
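
A quick look at the vocabulary confirms its size (the MAX_WORD high-frequency words plus the two special tokens) and that <UNK>/<PAD> keep indices 0 and 1:

print(len(vocab))                      # expected: MAX_WORD + 2 = 10002
print(vocab['<UNK>'], vocab['<PAD>'])  # 0 1
print(list(vocab.items())[:5])         # the most frequent words get the smallest indices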

# Model definition. Note that although the class is named GRU, the encoder below is nn.LSTM; replace nn.LSTM with nn.GRU if you want an actual GRU.

class GRU(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(GRU, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer

        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        # inputs has shape (batch_size, seq_len); the LSTM expects seq_len as the first
        # dimension, so transpose before looking up the embeddings
        embeddings = self.embedding(inputs.permute(1, 0))  # permute(1, 0) swaps the two dimensions
        # only the embeddings are passed to the LSTM, so it returns the hidden states of the
        # last layer at every time step; outputs has shape (seq_len, batch_size, num_hiddens)
        outputs, _ = self.encoder(embeddings)
        # use the hidden state of the final time step as the input to the fully connected
        # layer; its shape is (batch_size, num_hiddens)
        encoding = outputs[-1]
        outs = self.softmax(self.decoder(encoding))  # output: two class probabilities [a, b]
        return outs
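
Before training, a dummy forward pass is a cheap way to confirm the output shape; this sketch assumes the vocab built above and feeds a batch of four all-<PAD> sequences:

_model = GRU(vocab=vocab, embed_size=300, num_hiddens=256, num_layers=3)
_dummy = torch.full((4, MAX_LEN), vocab['<PAD>'], dtype=torch.long)  # (batch_size, seq_len)
print(_model(_dummy).shape)  # expected: torch.Size([4, 2]), one probability pair per review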

# Convert text to index tensors

def text_transform(sentence_list, vocab):
    sentence_index_list = []
    for sentence in sentence_list:
        # tokenize the sentence and map each token to its id (unknown words map to <UNK>)
        sentence_idx = [vocab[token] if token in vocab else vocab['<UNK>'] for token in tokenizer(sentence)]

        if len(sentence_idx) < MAX_LEN:
            for i in range(MAX_LEN - len(sentence_idx)):  # pad short sentences with <PAD>
                sentence_idx.append(vocab['<PAD>'])

        sentence_idx = sentence_idx[:MAX_LEN]  # truncate to MAX_LEN
        sentence_index_list.append(sentence_idx)
    return torch.LongTensor(sentence_index_list)  # convert the id lists to a tensor
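
A minimal check of text_transform: known words map to their ids, made-up words fall back to <UNK>, and every row is padded or truncated to MAX_LEN:

toy = text_transform(["this movie was great", "qqqqzzzz xxyyzz"], vocab)
print(toy.shape)   # torch.Size([2, 300]) with MAX_LEN = 300
print(toy[1][:2])  # tensor([0, 0]): both made-up tokens should map to <UNK> (index 0)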

# Model training

def train(model, train_data, vocab, epoch=10):
    print('train model')
    model = model.to(device)
    # loss function and optimizer
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

    for _ in tqdm(range(epoch)):
        model.train()
        avg_loss = 0  # average loss
        avg_acc = 0   # average accuracy

        for idx, (text, label) in enumerate(tqdm(train_data)):
            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)
            optimizer.zero_grad()
            pred = model(train_x)
            loss = criterion(pred.log(), train_y)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            avg_acc += accuracy(pred, train_y)
        # after each epoch, compute the average loss and accuracy
        avg_loss = avg_loss / len(train_data)
        avg_acc = avg_acc / len(train_data)

        print("avg_loss:", avg_loss, " train_avg_acc:", avg_acc)

        # save the model parameters after each epoch
        torch.save(model.state_dict(), 'LSTM_IMDB_parameter.pkl')
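
Because forward() already applies Softmax, the loop takes log() of the probabilities and feeds them to NLLLoss, which amounts to cross-entropy. A hedged alternative sketch (not the original setup): drop the Softmax layer, return raw logits, and use CrossEntropyLoss, which fuses log-softmax and NLL and is numerically more stable.

# Assumption (not the original setup): forward() returns self.decoder(encoding), i.e. raw logits, no Softmax
criterion_alt = torch.nn.CrossEntropyLoss()
# inside the batch loop:
# logits = model(train_x)             # (batch_size, 2) raw scores
# loss = criterion_alt(logits, train_y)  # equivalent to log-softmax + NLLLoss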

# Custom Dataset

class MyDataset(Dataset):
    def __init__(self, text_path):
        file = open(text_path, 'r', encoding='utf-8')
        self.text_with_tag = file.readlines()  # label and text of each sample
        file.close()

    def __getitem__(self, index):  # override __getitem__
        line = self.text_with_tag[index]  # one sample: label followed by text
        label = int(line[0])  # the label is the first character
        text = line[2:-1]     # the rest of the line (minus the trailing newline) is the text
        return text, label

    def __len__(self):
        return len(self.text_with_tag)
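
A quick peek at one sample (assumes train.txt was written above); each line is "<label> <space-separated review>", so __getitem__ returns a (text, label) pair:

sample_set = MyDataset(text_path="./train.txt")
text, label = sample_set[0]
print(label, text[:60])  # e.g. 1 followed by the first 60 characters of the cleaned review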

# Model evaluation

def tst(model, test_data, vocab):
    print('test model')
    model = model.to(device)
    model.eval()
    avg_acc = 0
    for idx, (text, label) in enumerate(tqdm(test_data)):
        train_x = text_transform(text, vocab).to(device)
        train_y = label.to(device)
        pred = model(train_x)
        avg_acc += accuracy(pred, train_y)
    avg_acc = avg_acc / len(test_data)
    return avg_acc

# Compute prediction accuracy

def accuracy(y_pred, y_true):
    label_pred = y_pred.max(dim=1)[1]
    acc = len(y_pred) - torch.sum(torch.abs(label_pred - y_true))  # number of correct predictions (labels are 0/1)
    return acc.detach().cpu().numpy() / len(y_pred)
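
A toy check of accuracy(): with 0/1 labels, |argmax - label| is 0 for a correct prediction and 1 for a wrong one, so subtracting the sum from the batch size counts the correct predictions:

_pred = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.3, 0.7]])  # predicted labels: 0, 1, 1
_true = torch.tensor([0, 1, 0])
print(accuracy(_pred, _true))  # 2 correct out of 3 -> about 0.667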

# main function

def main():
    vocab = data_process(X_train)
    np.save('vocab.npy', vocab)  # save the vocabulary to disk
    vocab = np.load('vocab.npy', allow_pickle=True).item()  # reload the saved vocab

    # build the MyDataset instances
    train_data = MyDataset(text_path="./train.txt")
    test_data = MyDataset(text_path="./test.txt")
    # build the DataLoaders
    train_loader = DataLoader(dataset=train_data, batch_size=128, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

    # build the model
    model = GRU(vocab=vocab, embed_size=300, num_hiddens=256, num_layers=3)

    train(model=model, train_data=train_loader, vocab=vocab, epoch=30)

    # load the trained model
    model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl',
                                     map_location=torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')))

    # evaluate
    acc = tst(model=model, test_data=test_loader, vocab=vocab)
    print(acc)

# Run

if __name__ == '__main__':
    main()

Results

References:
Sentiment analysis on the IMDB dataset
Building an LSTM classifier for IMDB sentiment classification with PyTorch
Dataset:
The dataset and stop-word list used in this post

Link: https://pan.baidu.com/s/1OTgLDoE1P9_FPDQaLU1VKw

Extraction code: mz6p

Full source code

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import torch # torch==1.7.1
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os
import re
import numpy as np
from tqdm import tqdm
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

MAX_WORD = 10000  # keep only the 10,000 most frequent words
MAX_LEN = 300     # pad/truncate every sentence to 300 tokens
word_count = {}   # dictionary mapping word -> occurrence count


data = pd.read_csv('./data/labeledTrainData.tsv',
                    header=0, delimiter="\t", quoting=3)
print('dataset shape is', data.shape)

from bs4 import BeautifulSoup
example = BeautifulSoup(data['review'][0], 'html.parser')
print(example.get_text())

import re
letters_only = re.sub('[^A-Za-z]', ' ', example.get_text())
print(letters_only)

lower_case = letters_only.lower()
words = lower_case.split()
print(words)

# import nltk
# nltk.download('stopwords')

def get_custom_stopwords(stop_words_file):
    with open(stop_words_file,encoding='utf-8') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    return custom_stopwords_list
stop_words_file = 'english.txt'
stopwords = get_custom_stopwords(stop_words_file)
words = [word for word in words if word not in stopwords]
' '.join(words)

from bs4 import BeautifulSoup
# regular-expression toolkit (imported above)
# import re
# from nltk.corpus import stopwords
# review_to_text performs the three preprocessing steps on a raw review
def review_to_text(review):
    # Step 1: strip HTML markup
    raw_text = BeautifulSoup(review, 'html.parser').get_text()
    # Step 2: remove non-letter characters; re.sub(pattern, replacement, string) replaces matches with a space
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    # Step 3: lowercase the text and split it on whitespace into a word list
    words = letters.lower().split()
    # optionally filter out stop words
    # words = [w for w in words if w not in stopwords]
    # words = [w for w in words if w not in stopwords.words()]
    return words

# Apply the cleaning to every review and collect the labels
X_data = []
y_data=[]
for review in data['review']:
    X_data.append(' '.join(review_to_text(review)))

for sentiment in data['sentiment']:
    y_data.append(sentiment)
#
# y_data = data['sentiment']
# print(X_data,y_data)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=45)
print(type(y_train))

with open("train.txt","w",encoding="utf-8") as f:
    for i in range(len(X_train)):
        # print(type(y_train[i]))
        # print(y_train[i])
        # print(type(X_train[i]))
        # print(type(X_train[i]))
        f.write(str(y_train[i])+" "+X_train[i]+"\n")
f.close()

with open("test.txt","w",encoding="utf-8") as f1:
    for i in range(len(X_test)):
        # print(type(y_train[i]))
        # print(y_train[i])
        # print(type(X_train[i]))
        # print(type(X_train[i]))
        f1.write(str(y_test[i])+" "+X_test[i]+"\n")
f1.close()

# print(X_train)
# from sklearn.feature_extraction.text import CountVectorizer
# # max_features=5000: keep only the 5,000 most frequent words as the feature vocabulary
# vectorizer = CountVectorizer(analyzer = 'word', tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
# train_data_features = vectorizer.fit_transform(X_train)
# t_data_features = vectorizer.transform(X_test)
#
# from sklearn.naive_bayes import MultinomialNB
# nb = MultinomialNB()
# nb.fit(train_data_features,y_train)
# print(nb.score(train_data_features,  y_train))
# print(nb.score(t_data_features,  y_test))
#
# # Prediction on a new review (optional)
# pre_str="Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty"
# pre_str_list=[(' '.join(review_to_text(pre_str)))]
# pre_data = vectorizer.transform(pd.Series(pre_str_list))
# result = nb.predict(pre_data)
# print(result)

def tokenizer(sentence):
    return sentence.split()

def data_process(text):  # build the vocabulary from the training texts
    for line in text:
        tokens = tokenizer(line)  # tokenize and update the word counts
        for token in tokens:
            if token in word_count:
                word_count[token] += 1
            else:
                word_count[token] = 1

    print("build vocabulary")

    vocab = {"<UNK>": 0, "<PAD>": 1}

    # sort words by frequency and keep only the MAX_WORD most frequent ones
    word_count_sort = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    word_number = 1
    for word in word_count_sort:
        if word[0] not in vocab:
            vocab[word[0]] = len(vocab)
            word_number += 1
        if word_number > MAX_WORD:
            break
    return vocab

vocab=data_process(X_train)
# print(vocab)

class GRU(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(GRU, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer

        # note: despite the class name, the encoder is nn.LSTM; swap in nn.GRU for a true GRU
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        # inputs has shape (batch_size, seq_len); the LSTM expects seq_len first, so transpose
        embeddings = self.embedding(inputs.permute(1, 0))
        # outputs has shape (seq_len, batch_size, num_hiddens)
        outputs, _ = self.encoder(embeddings)
        # use the hidden state of the final time step; shape (batch_size, num_hiddens)
        encoding = outputs[-1]
        outs = self.softmax(self.decoder(encoding))  # two class probabilities [a, b]
        return outs

def text_transform(sentence_list, vocab):
    sentence_index_list = []
    for sentence in sentence_list:
        # tokenize the sentence and map each token to its id (unknown words map to <UNK>)
        sentence_idx = [vocab[token] if token in vocab else vocab['<UNK>'] for token in tokenizer(sentence)]

        if len(sentence_idx) < MAX_LEN:
            for i in range(MAX_LEN - len(sentence_idx)):  # pad short sentences with <PAD>
                sentence_idx.append(vocab['<PAD>'])

        sentence_idx = sentence_idx[:MAX_LEN]  # truncate to MAX_LEN
        sentence_index_list.append(sentence_idx)
    return torch.LongTensor(sentence_index_list)  # convert the id lists to a tensor




# Model training
def train(model, train_data, vocab, epoch=10):
    print('train model')
    model = model.to(device)
    # loss function and optimizer
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

    for _ in tqdm(range(epoch)):
        model.train()
        avg_loss = 0  # average loss
        avg_acc = 0   # average accuracy

        for idx, (text, label) in enumerate(tqdm(train_data)):
            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)
            optimizer.zero_grad()
            pred = model(train_x)
            loss = criterion(pred.log(), train_y)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            avg_acc += accuracy(pred, train_y)
        # after each epoch, compute the average loss and accuracy
        avg_loss = avg_loss / len(train_data)
        avg_acc = avg_acc / len(train_data)

        print("avg_loss:", avg_loss, " train_avg_acc:", avg_acc)

        # save the model parameters after each epoch
        torch.save(model.state_dict(), 'LSTM_IMDB_parameter.pkl')

class MyDataset(Dataset):
    def __init__(self, text_path):
        file = open(text_path, 'r', encoding='utf-8')
        self.text_with_tag = file.readlines()  # label and text of each sample
        file.close()

    def __getitem__(self, index):  # override __getitem__
        line = self.text_with_tag[index]  # one sample: label followed by text
        label = int(line[0])  # the label is the first character
        text = line[2:-1]     # the rest of the line (minus the trailing newline) is the text
        return text, label

    def __len__(self):
        return len(self.text_with_tag)

# Model evaluation
def tst(model, test_data, vocab):
    print('test model')
    model = model.to(device)
    model.eval()
    avg_acc = 0
    for idx, (text, label) in enumerate(tqdm(test_data)):
        train_x = text_transform(text, vocab).to(device)
        train_y = label.to(device)
        pred = model(train_x)
        avg_acc += accuracy(pred, train_y)
    avg_acc = avg_acc / len(test_data)
    return avg_acc

# Compute prediction accuracy
def accuracy(y_pred, y_true):
    label_pred = y_pred.max(dim=1)[1]
    acc = len(y_pred) - torch.sum(torch.abs(label_pred - y_true))  # number of correct predictions (labels are 0/1)
    return acc.detach().cpu().numpy() / len(y_pred)

def main():
    vocab = data_process(X_train)
    np.save('vocab.npy', vocab)  # save the vocabulary to disk
    vocab = np.load('vocab.npy', allow_pickle=True).item()  # reload the saved vocab

    # build the MyDataset instances
    train_data = MyDataset(text_path="./train.txt")
    test_data = MyDataset(text_path="./test.txt")
    # build the DataLoaders
    train_loader = DataLoader(dataset=train_data, batch_size=128, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

    # build the model
    model = GRU(vocab=vocab, embed_size=300, num_hiddens=256, num_layers=3)

    train(model=model, train_data=train_loader, vocab=vocab, epoch=30)

    # load the trained model
    model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl',
                                     map_location=torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')))

    # evaluate
    acc = tst(model=model, test_data=test_loader, vocab=vocab)
    print(acc)

if __name__ == '__main__':
    main()