深度学习实验四 LSTM情感分析

 情感分析作业说明:本次任务是有关自然语言处理领域中的情感识别问题(也叫观点分析问题,也就是将数据分类成正向、负向),包含六万多条购物评价,分别来自书籍、平板、手机、水果、洗发水、热水器、蒙牛、衣服、计算机、酒店,共十个类别。

一、数据读入及预处理

import pandas as pd
import jieba
import re
import os

from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

from torch.utils.tensorboard import SummaryWriter 
from tqdm.notebook import tqdm#tqdm是进度条库
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence,pad_packed_sequence

(1)数据读入

# Load the 10-category online-shopping review dataset from the working directory.
path="online_shopping_10_cats.csv"
df=pd.read_csv(path)
df.head()

(2)数据筛选及处理

# Keep only the review text and its 0/1 sentiment label; the category column is dropped.
df=df[["review","label"]]

df.head()

(3)去重

print(df.shape)
# BUG FIX: drop_duplicates() returns a NEW frame; the original discarded the
# result, so duplicates were never removed. Rebind df to keep the deduplicated data.
df = df.drop_duplicates()

(4)数据清洗

# Strip digits, Latin letters, and tokens that would leak the product category
# (e.g. platform or category names) from every review.
info = re.compile("[0-9a-zA-Z]|作者|当当网|京东|洗发水|蒙牛|衣服|酒店|房间")
# print(df["review"].dtype)
df["review"] = df["review"].map(lambda text: info.sub("", str(text)))
df["review"].head()

运行如图:

(5)分词

# Tokenise each cleaned review into a list of words with jieba (Chinese segmenter).
df["words"]=df["review"].apply(jieba.lcut)
df.head()

(6)建立词表

# Build a sorted, de-duplicated vocabulary and the word<->index maps.
# A single set comprehension replaces the original append-loop + list(set(...)),
# avoiding the throwaway flat list of every token occurrence.
words = sorted({word for sentence in df["words"].values for word in sentence})
# Index 0 is reserved for <unk>; real words start at 1.
word2idx = {w: i + 1 for i, w in enumerate(words)}
idx2word = {i + 1: w for i, w in enumerate(words)}
word2idx['<unk>'] = 0
idx2word[0] = '<unk>'

(7)将中文词数字化表示

# Convert each tokenised review into a LongTensor of vocabulary indices.
# Unknown tokens map to 0 (<unk>) via .get, so this cannot raise KeyError —
# the original indexed word2idx directly and the <unk> entry was never used.
data = [torch.tensor([word2idx.get(word, 0) for word in sentence])
        for sentence in df['words']]
label = torch.from_numpy(df['label'].values)

len(data)

len(label)

(8)划分训练集和验证集

# Hold out 20% of the indexed reviews as a validation split.
x_train,x_val,y_train,y_val=train_test_split(data,label,test_size=0.2)

(9)设置DataSet和DataLoader
提供现成的数据变长处理的方法,可以直接在DataLoader的参数中设置collate_fn=mycollate_fn来使用这个方法

def mycollate_fn(data):
    """Collate a batch of (sequence_tensor, label) pairs of varying length.

    Sorts the batch longest-first (required by pack_padded_sequence with
    enforce_sorted=True), zero-pads the sequences to a common length, and
    returns (padded_inputs, label_tensor, original_lengths).
    """
    data.sort(key=lambda pair: len(pair[0]), reverse=True)
    sequences = [pair[0] for pair in data]
    labels = [pair[1] for pair in data]
    lengths = [len(seq) for seq in sequences]
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(labels), lengths
class mDataSet(Dataset):
    """Map-style dataset over parallel sequences of samples and labels.

    Padding/batching is deferred to the collate function; items are
    returned as raw (sample, label) pairs.
    """

    def __init__(self, data, label):
        self.samples = data
        self.targets = label

    def __getitem__(self, idx: int):
        return self.samples[idx], self.targets[idx]

    def __len__(self):
        return len(self.samples)
# Wrap the splits in Datasets/DataLoaders; mycollate_fn pads each batch and
# reports the true sequence lengths needed by pack_padded_sequence.
train_dataset=mDataSet(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,num_workers=0,collate_fn=mycollate_fn)
val_dataset = mDataSet(x_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True,num_workers=0, collate_fn=mycollate_fn)

 二、建立模型
(1)定义模型

class Model(nn.Module):
    """LSTM binary sentiment classifier over padded index sequences.

    Embedding -> multi-layer LSTM (packed) -> tanh(fc1) -> dropout ->
    tanh(fc2) -> fc3, taking the output at each sequence's last real
    timestep as the 2-class logits.
    """

    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers):
        super(Model, self).__init__()
        self.hidden_dim = hidden_dim
        # BUG FIX: remember the depth — forward() previously hard-coded 3
        # layers for h_0/c_0 and crashed for any other num_layers.
        self.num_layers = num_layers
        # Randomly initialised embedding table: vocab size x embedding dim.
        self.embeddings = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(self.hidden_dim, 256)
        self.fc2 = nn.Linear(256, 32)
        self.fc3 = nn.Linear(32, 2)

    def forward(self, input, batch_seq_len):
        """Classify a padded batch.

        input: (batch, max_len) LongTensor of word indices, sorted by
        descending true length; batch_seq_len: the true lengths.
        Returns (logits of shape (batch, 2), final LSTM hidden state).
        """
        embeds = self.embeddings(input)
        # Pack so the LSTM skips padding positions.
        embeds = pack_padded_sequence(embeds, batch_seq_len, batch_first=True)
        batch_size, seq_len = input.size()

        # Zero initial states sized by the configured depth (was fixed at 3).
        h_0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=input.device)
        c_0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=input.device)

        output, hidden = self.lstm(embeds, (h_0, c_0))
        output, _ = pad_packed_sequence(output, batch_first=True)

        output = self.dropout(torch.tanh(self.fc1(output)))
        output = torch.tanh(self.fc2(output))
        output = self.fc3(output)
        last_outputs = self.get_last_output(output, batch_seq_len)
        return last_outputs, hidden

    def get_last_output(self, output, batch_seq_len):
        """Pick each sequence's logits at its last real timestep (length - 1)."""
        last_outputs = torch.zeros((output.shape[0], output.shape[2]))
        for i in range(len(batch_seq_len)):
            last_outputs[i] = output[i][batch_seq_len[i] - 1]
        last_outputs = last_outputs.to(output.device)
        return last_outputs

(2)初始化模型

# Instantiate the model (+1 embedding row for the <unk>/padding index 0).
model = Model(num_embeddings=len(words) + 1,embedding_dim=50,hidden_dim=100,num_layers=3)
# Optimizer
optimizer = torch.optim.SGD(model.parameters(),lr=0.008)
# Learning-rate scheduling (optional)

# Loss function
criterion = nn.CrossEntropyLoss()

(3)准确率指标

class AvgrageMeter(object):
    """Running average of a metric: keeps the total, the count, and the mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.avg, self.sum, self.cnt = 0, 0, 0

    def update(self, val, n=1):
        """Fold in `val` observed `n` times and refresh the running mean."""
        self.sum = self.sum + n * val
        self.cnt = self.cnt + n
        self.avg = self.sum / self.cnt
def accuracy(output, label, topk=(1,)):
    """Top-k accuracy, in percent, of `output` logits against `label`.

    Returns one scalar tensor per k in `topk`: the share of samples whose
    true label appears among the k highest-scoring predictions.
    """
    maxk = max(topk)
    batch_size = label.size(0)

    # Indices of the maxk highest scores per sample, transposed to (maxk, batch).
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    # Broadcast labels across the maxk rows and mark the hits (True/False matrix).
    correct = pred.eq(label.view(1, -1).expand_as(pred))

    # For each requested k: count hits in the first k rows, convert to percent.
    return [correct[:k].reshape(-1).float().sum(0).mul_(100.0 / batch_size)
            for k in topk]

 (4)训练

def train(epoch,epochs, train_loader, device, model, criterion, optimizer,tensorboard_path,k):
    """Run one training epoch over `train_loader`.

    `k` is a global TensorBoard step counter; the updated value is returned
    so consecutive epochs continue the same curves.
    """
    model.train()
    top1 = AvgrageMeter()
    model = model.to(device)
    train_loss = 0.0
    # One writer per epoch instead of one per logged batch: the original
    # constructed a new SummaryWriter every 3rd iteration, leaking file
    # handles and littering the log dir with event files.
    writer = SummaryWriter(tensorboard_path)
    for i, data in enumerate(train_loader, 0):
        inputs, labels, batch_seq_len = data[0].to(device), data[1].to(device), data[2]
        # Lengths are sorted descending; a non-positive last length means the
        # batch contains an empty sequence, which pack_padded_sequence rejects.
        if batch_seq_len[-1] <= 0:
            continue
        # Clear gradients accumulated from the previous batch.
        optimizer.zero_grad()
        outputs, hidden = model(inputs, batch_seq_len)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        prec1, _ = accuracy(outputs, labels, topk=(1, 2))
        n = inputs.size(0)
        top1.update(prec1.item(), n)
        train_loss += loss.item()
        postfix = {'train_loss': '%.6f' % (train_loss / (i + 1)), 'train_acc': '%.6f' % top1.avg}
        train_loader.set_postfix(log=postfix)

        # TensorBoard curves, logged every third batch.
        if i % 3 == 0:
            k = k + 1
            writer.add_scalar('Train/Loss', loss.item(), k)
            writer.add_scalar('Train/Accuracy', top1.avg, k)
            writer.flush()
    writer.close()
    return k

# Prefer GPU when available; CPU works but is far slower.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 12
k = 0
for epoch in range(epochs):
    # BUG FIX: wrap in a fresh tqdm each epoch WITHOUT rebinding train_loader —
    # the original re-wrapped the previous tqdm, nesting progress bars deeper
    # every epoch.
    epoch_iter = tqdm(train_loader)
    k = train(epoch, epochs, epoch_iter, device, model, criterion, optimizer, "runs/train", k)
# 'train_loss': '0.381668', 'train_acc': '84.044568'


def validate(epoch,validate_loader, device, model, criterion, tensorboard_path,k):
    """Evaluate `model` on the validation split; returns top-1 accuracy (%)."""
    val_acc = 0.0
    model = model.to(device)
    model.eval()
    # Single writer for the whole call — the original re-created a
    # SummaryWriter every 5th batch, leaking file handles.
    writer = SummaryWriter(tensorboard_path)
    with torch.no_grad():  # no gradient tracking during evaluation
        val_top1 = AvgrageMeter()
        validate_loader = tqdm(validate_loader)
        validate_loss = 0.0
        for i, data in enumerate(validate_loader, 0):
            inputs, labels, batch_seq_len = data[0].to(device), data[1].to(device), data[2]
            # Skip batches containing an empty sequence (lengths are sorted
            # descending, so checking the last entry suffices).
            if batch_seq_len[-1] <= 0:
                continue
            outputs, _ = model(inputs, batch_seq_len)
            loss = criterion(outputs, labels)

            prec1, _ = accuracy(outputs, labels, topk=(1, 2))
            n = inputs.size(0)
            val_top1.update(prec1.item(), n)
            validate_loss += loss.item()
            postfix = {'validate_loss': '%.6f' % (validate_loss / (i + 1)), 'validate_acc': '%.6f' % val_top1.avg}
            validate_loader.set_postfix(log=postfix)

            # TensorBoard curves, logged every fifth batch.
            if i % 5 == 0:
                k = k + 1
                writer.add_scalar('Validate/Loss', loss.item(), k)
                writer.add_scalar('Validate/Accuracy', val_top1.avg, k)
                writer.flush()
        val_acc = val_top1.avg
    writer.close()
    print("val_acc: " + str(val_acc) + "%")
    return val_acc

k = 0
validate(epoch, val_loader, device, model, criterion, "runs/val", k)
# val_acc: 84.74806356060708%

# Smoke-test the trained model on a single hand-written review.
string = '商品外观有问题转来转去 解决不了'
# BUG FIX: do not rebind `words` — that clobbered the vocabulary list built
# earlier. Also map out-of-vocabulary tokens to 0 (<unk>) via .get; direct
# indexing raised KeyError for any word not seen in training.
tokens = jieba.lcut(string)
wtoid = [word2idx.get(tok, 0) for tok in tokens]
model = model.to(device)
model.eval()
batch_seq_len = [len(wtoid)]
inputs = torch.tensor(wtoid).reshape(1, -1).to(device)
with torch.no_grad():
    outputs, _ = model(inputs, batch_seq_len)
# Index 1 holds the positive-class logit.
if outputs[0][0] < outputs[0][1]:
    print("positive")
else:
    print("negative")  # fixed misspelling "nagetive"

  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值