Sentiment classification with PyTorch (wordavg, LSTM & CNN)

import torch
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import spacy
from spacy.lang.en import English
import random
import torch.nn as nn
import torch.nn.functional as F

SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

1. Data preparation

nlp = English() # spaCy English object, used here only for tokenization
TEXT = data.Field(tokenize=nlp)
# TEXT = data.Field(lower=True)
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL) # load the IMDB dataset

train_data, valid_data = train_data.split(split_ratio=0.7, random_state=random.seed(SEED)) # split a validation set off the training data
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000
print(vars(train_data[0]),'\n',len(vars(train_data[0])['text']))
print(vars(train_data[0])['text'][0],'\n',type((vars(train_data[0])['text'][0])))
{'text': This movie has got to be one of the worst I have ever seen make it to DVD!!! The story line might have clicked if the film had more funding and writers that would have cut the nonsense and sickly scenes that I highly caution parents on.... But the story line is like a loose cannon. If there was such a thing as a drive thru movie maker-this one would have sprung from that.It reminded me a lot of the quickie films that were put out in the 1960's, poor script writing and filming. <br /><br />The only sensible characters in the whole movie was the bartender and beaver. The rest of the film, could have easily been made by middle school children. I give this film a rating of 1 as it is truly awful and left my entire family with a sense of being cheated. My advice-Don't Watch It!!!, 'label': 'neg'} 
 173
This 
 <class 'spacy.tokens.token.Token'>
# As the output above shows, tokenizing with English() does not yield plain strings:
# each element is a spacy Token, so convert every token to str.
for dataset in [train_data, valid_data, test_data]:
    for i in range(len(dataset)):
        example = dataset[i]
        example.text = [str(j) for j in example.text]
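
An alternative worth noting (a sketch, not what this post does): the legacy torchtext Field can call spaCy itself, which yields string tokens directly and makes the conversion loop above unnecessary. This assumes a spaCy English model such as en_core_web_sm is installed.

# Alternative Field setup (sketch); it would replace the Field definitions above.
# TEXT = data.Field(tokenize='spacy', lower=True)
# LABEL = data.LabelField(dtype=torch.float)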

# Build the vocabulary
TEXT.build_vocab(train_data, max_size=25000,vectors='glove.6B.100d',unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
print(len(TEXT.vocab),len(LABEL.vocab))
print(TEXT.vocab.freqs.most_common(20))
25002 2
[('the', 203566), (',', 192495), ('.', 165539), ('and', 109443), ('a', 109116), ('of', 100702), ('to', 93766), ('is', 76328), ('in', 61255), ('I', 54004), ('it', 53508), ('that', 49187), ('"', 44285), ("'s", 43329), ('this', 42445), ('-', 37165), ('/><br', 35752), ('was', 35034), ('as', 30384), ('with', 29774)]
# Build the iterators (essentially dataloaders); batch_size and device are needed here,
# so they are set now (they also appear with the other settings in section 2.2)
batch_size = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
                                                (train_data, valid_data, test_data), 
                                                batch_size=batch_size,
                                                device=device)
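
A quick optional sanity check (not in the original): look at one batch to confirm the tensor layout that the models below assume, i.e. text of shape (seq_length, batch_size).

# Optional: inspect one batch's shapes.
batch = next(iter(train_iterator))
print(batch.text.shape)   # (seq_length, batch_size)
print(batch.label.shape)  # (batch_size,)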

2. Word averaging model (wordavg)

2.1 Define the model

class WordAvgModel(nn.Module):
    def __init__(self, vocab_size, embed_size, output_size, pad_idx):
        super(WordAvgModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.linear = nn.Linear(embed_size, output_size)
        
    def forward(self, text):
        # text has size (seq_length, batch_size): each column is one sentence
        # (sentences shorter than seq_length are filled with <pad>)
        embedded = self.embed(text)          # (seq_length, batch_size, embed_size)
        embedded = embedded.permute(1, 0, 2) # swap the first two dims -> (batch_size, seq_length, embed_size)
        # avg_pool2d with kernel (seq_length, 1) averages over the word dimension:
        # (batch_size, 1, embed_size) -> squeeze -> (batch_size, embed_size)
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        return self.linear(pooled)
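
A tiny shape check (a sketch with made-up sizes, not the real vocabulary), just to confirm the forward pass returns (batch_size, output_size):

# Sketch: feed random word indices through a toy-sized model.
_toy = WordAvgModel(vocab_size=100, embed_size=8, output_size=1, pad_idx=1)
_x = torch.randint(0, 100, (20, 4))  # (seq_length=20, batch_size=4)
print(_toy(_x).shape)                # torch.Size([4, 1])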

2.2 Set the parameters

batch_size = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = len(TEXT.vocab)
embed_size = 100
output_size = 1
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]

2.3 Initialize the model

avg_model = WordAvgModel(vocab_size=vocab_size,embed_size=embed_size,
                         output_size=output_size,pad_idx=pad_idx)
avg_model.to(device)
WordAvgModel(
  (embed): Embedding(25002, 100, padding_idx=1)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)
num_parameters = sum(p.numel() for p in avg_model.parameters() if p.requires_grad)
print(num_parameters) # number of trainable parameters
2500301

2.4 Initialize the embedding layer with GloVe

pretrained_embed = TEXT.vocab.vectors
# pretrained_embed has size (25002, 100): 25002 words in the vocabulary and 100 dimensions,
# because vectors='glove.6B.100d' (100-dimensional GloVe) was used when building the vocab
avg_model.embed.weight.data.copy_(pretrained_embed)
tensor([[-0.6946,  0.0269,  0.0063,  ...,  1.2692, -1.3969, -0.4796],
        [-2.2822,  0.1412, -1.3277,  ..., -0.0465, -1.0185, -0.1024],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.3617,  0.6201,  0.1105,  ...,  0.2994, -0.5920,  1.0949],
        [-0.3312,  0.9364, -0.1638,  ...,  0.9859, -1.0950, -1.1516],
        [-0.1954,  0.5692, -0.0671,  ...,  0.2170,  0.7001, -0.1479]],
       device='cuda:0')
avg_model.embed.weight.data.size() 
# embed has size (25002, 100); each row is the vector of one word in the vocab.
# The first row is <unk> and the second is <pad>;
# these two rows are usually initialized to zero.
torch.Size([25002, 100])
avg_model.embed.weight.data[pad_idx] = torch.zeros(embed_size)
avg_model.embed.weight.data[unk_idx] = torch.zeros(embed_size)
# avg_model.embed.weight.data[pad_idx] is the row corresponding to <pad>
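
Optional check (not in the original) that the two special rows really are zero after the assignments:

# Sketch: confirm the <pad> and <unk> rows were zeroed out.
assert avg_model.embed.weight.data[pad_idx].abs().sum().item() == 0.0
assert avg_model.embed.weight.data[unk_idx].abs().sum().item() == 0.0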

2.5 Define the training and evaluation functions

def train(model, dataset, optimizer, loss_fn):
    epoch_loss, epoch_count, epoch_acc_count = 0., 0., 0.
    model.train()
    for batch in dataset:
        preds = model(batch.text).squeeze(1) # model output is (batch_size, 1); squeeze -> (batch_size,)
        loss = loss_fn(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()*len(batch.label)      # accumulate the loss over the whole epoch
        epoch_count += len(batch.label)                 # number of examples seen this epoch
        epoch_acc_count += acc.item()*len(batch.label)  # number of correct predictions this epoch
    
    return epoch_loss/epoch_count, epoch_acc_count/epoch_count

 
def evaluate(model, dataset, loss_fn):
    epoch_loss, epoch_count, epoch_acc_count = 0., 0., 0.
    model.eval()
    with torch.no_grad():  # no gradients are needed during evaluation
        for batch in dataset:
            preds = model(batch.text).squeeze(1) # model output is (batch_size, 1); squeeze -> (batch_size,)
            loss = loss_fn(preds, batch.label)
            acc = binary_accuracy(preds, batch.label)

            epoch_loss += loss.item()*len(batch.label)      # accumulate the loss over the whole epoch
            epoch_count += len(batch.label)                 # number of examples seen this epoch
            epoch_acc_count += acc.item()*len(batch.label)  # number of correct predictions this epoch
    
    return epoch_loss/epoch_count, epoch_acc_count/epoch_count

def binary_accuracy(preds,y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    num_correct = (rounded_preds==y).float()
    acc = num_correct.sum()/len(y)
    return acc
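
A quick toy check of binary_accuracy (the inputs are raw logits, so they go through sigmoid first):

# logits 2.0 and -3.0 are classified correctly, 0.5 (sigmoid ~ 0.62 -> 1) is not,
# so accuracy on this toy batch is 2/3.
print(binary_accuracy(torch.tensor([2.0, -3.0, 0.5]), torch.tensor([1., 0., 0.])))  # tensor(0.6667)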
    

2.6 Train the model

optimizer = torch.optim.Adam(avg_model.parameters(),lr=0.005)
loss_fn = nn.BCEWithLogitsLoss()

epochs = 10
best_valid_acc = 0.
for epoch in range(epochs):
    train_loss,train_acc = train(avg_model,train_iterator,optimizer,loss_fn)
    valid_loss,valid_acc = evaluate(avg_model,valid_iterator,loss_fn)
    
    if valid_acc>best_valid_acc:
        best_valid_acc = valid_acc
        best_epoch = epoch
        torch.save(avg_model.state_dict(),'./wordavg_model.txt')
#         print('Validation accuracy improved to {}; model saved.'.format(valid_acc))
    

    print("Epoch:", epoch, "Train_Loss:", train_loss, "Train_Acc:", train_acc,
          "Valid_Loss", valid_loss, "Valid_Acc", valid_acc)
    
print("training has finished,the best epoch is {},the best valid_acc is {}".format(best_epoch,best_valid_acc))
       
Epoch: 0 Train_Loss: 0.5956396281242371 Train_Acc: 0.694228571496691 Valid_Loss 0.4051449816385905 Valid_Acc 0.8417333333969116
Epoch: 1 Train_Loss: 0.3593949766363416 Train_Acc: 0.8733142857960292 Valid_Loss 0.46664917748769125 Valid_Acc 0.8840000000317891
Epoch: 2 Train_Loss: 0.2551696341242109 Train_Acc: 0.913428571510315 Valid_Loss 0.5249438627560934 Valid_Acc 0.8950666667302449
Epoch: 3 Train_Loss: 0.196742424092974 Train_Acc: 0.9325142858232771 Valid_Loss 0.6135396106402079 Valid_Acc 0.8957333333969116
Epoch: 4 Train_Loss: 0.15810192627225603 Train_Acc: 0.9501142857687814 Valid_Loss 0.6637696914672852 Valid_Acc 0.9009333333969116
Epoch: 5 Train_Loss: 0.1267459169966834 Train_Acc: 0.9622285714830671 Valid_Loss 0.7350258693695069 Valid_Acc 0.9008000000635783
Epoch: 6 Train_Loss: 0.10385001053469521 Train_Acc: 0.9716 Valid_Loss 0.835720943514506 Valid_Acc 0.8982666667302449
Epoch: 7 Train_Loss: 0.08529832897612026 Train_Acc: 0.9776 Valid_Loss 0.8945791959762573 Valid_Acc 0.8969333333969116
Epoch: 8 Train_Loss: 0.0711212798680578 Train_Acc: 0.9828571428843907 Valid_Loss 0.9895696968078613 Valid_Acc 0.8968000000635783
Epoch: 9 Train_Loss: 0.05655052126603467 Train_Acc: 0.9883428571428572 Valid_Loss 1.065309889539083 Valid_Acc 0.8962666667302449
Training finished. The best epoch is 4, with valid_acc 0.9009333333969116
best_model = WordAvgModel(vocab_size=vocab_size,embed_size=embed_size,
                         output_size=output_size,pad_idx=pad_idx)
best_model.load_state_dict(torch.load('./wordavg_model.txt'))
best_model.to(device)
WordAvgModel(
  (embed): Embedding(25002, 100, padding_idx=1)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)
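
A natural next step (not shown in the original post) is to score the reloaded model on the held-out test set with the evaluate() helper defined above:

# Sketch: evaluate the best word-averaging checkpoint on the test set.
test_loss, test_acc = evaluate(best_model, test_iterator, loss_fn)
print("Test_Loss:", test_loss, "Test_Acc:", test_acc)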

2.7 Check the classifier on new sentences

def predict_sentiment(sentence):
    tokenized = [str(tok) for tok in TEXT.tokenize(sentence)]
    print(tokenized)
    indexed = torch.LongTensor([TEXT.vocab.stoi[t] for t in tokenized]).to(device).unsqueeze(1) # (seq_length, 1)
    pred = torch.sigmoid(best_model(indexed))
    return pred.item()
sentence = input('please input the sentence you want to predict (in English):')
print('Probability that the sentence expresses positive sentiment: {}'.format(predict_sentiment(sentence)))
please input the sentence you want to predict (in English): this is a good movie
['this', 'is', 'a', 'good', 'movie']
Probability that the sentence expresses positive sentiment: 1.0
sentence = input('please input the sentence you want to predict (in English):')
print('Probability that the sentence expresses positive sentiment: {}'.format(predict_sentiment(sentence)))
please input the sentence you want to predict (in English): the film is great while the stars are awful
['the', 'film', 'is', 'great', 'while', 'the', 'stars', 'are', 'awful']
Probability that the sentence expresses positive sentiment: 3.232804579589299e-10
sentence = input('please input the sentence you want to predict (in English):')
print('Probability that the sentence expresses positive sentiment: {}'.format(predict_sentiment(sentence)))
please input the sentence you want to predict (in English):  the film is great and the stars are good
[' ', 'the', 'film', 'is', 'great', 'and', 'the', 'stars', 'are', 'good']
Probability that the sentence expresses positive sentiment: 1.0

3. LSTM model

class LstmModel(nn.Module):
    def __init__(self, vocab_size, embed_size, output_size, pad_idx, hidden_size, dropout_ratio):
        super(LstmModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, num_layers=1)
        self.linear = nn.Linear(hidden_size*2, output_size)
        self.dropout = nn.Dropout(dropout_ratio)
        
    def forward(self, text):
        embedded = self.dropout(self.embed(text))
        output, (hidden, cell) = self.lstm(embedded)
        # output size: (seq_length, batch_size, num_directions*hidden_size)
        # hidden and cell size: (num_layers*num_directions, batch_size, hidden_size)
        
        # hidden[-1] and hidden[-2] are the final states of the two directions, each of size
        # (batch_size, hidden_size); concatenating them gives (batch_size, hidden_size*2)
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)
        hidden = self.dropout(hidden)
        return self.linear(hidden)
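
One possible refinement, sketched here only (the original does not do this, and it assumes the Field is built with include_lengths=True so that each batch also carries the true sentence lengths): pack the padded batch so the LSTM skips the <pad> positions.

# Sketch under the include_lengths=True assumption: each batch yields (text, text_lengths).
class PackedLstmModel(LstmModel):
    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embed(text))
        # pack so the LSTM stops at each sentence's real length
        packed = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed)
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)
        return self.linear(self.dropout(hidden))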
        
vocab_size = len(TEXT.vocab)
embed_size = 100
output_size = 1
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
hidden_size = 100
dropout_ratio= 0.5
lstm_model = LstmModel(vocab_size,embed_size,output_size,pad_idx,hidden_size,dropout_ratio).to(device)
num_parameters = sum(p.numel() for p in lstm_model.parameters() if p.requires_grad)
print(num_parameters)
2662001
# lstm_model.to(device)
pretrained_embed = TEXT.vocab.vectors
lstm_model.embed.weight.data.copy_(pretrained_embed)

unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
lstm_model.embed.weight.data[pad_idx] = torch.zeros(embed_size)
lstm_model.embed.weight.data[unk_idx] = torch.zeros(embed_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(lstm_model.parameters(),lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()
# lstm_model.to(device)
# loss_fn.to(device)
epochs = 2
best_valid_acc = 0.
for epoch in range(epochs):
    train_loss,train_acc = train(lstm_model,train_iterator,optimizer,loss_fn)
    valid_loss,valid_acc = evaluate(lstm_model,valid_iterator,loss_fn)
    
    if valid_acc>best_valid_acc:
        best_valid_acc = valid_acc
        best_epoch = epoch
        torch.save(lstm_model.state_dict(),'./lstm_model.txt')
#         print('Validation accuracy improved to {}; model saved.'.format(valid_acc))
    

    print("Epoch:", epoch, "Train_Loss:", train_loss, "Train_Acc:", train_acc,
          "Valid_Loss", valid_loss, "Valid_Acc", valid_acc)
    
print("training has finished,the best epoch is {},the best valid_acc is {}".format(best_epoch,best_valid_acc))

4. CNN model

class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, pad_idx, num_filters, filter_size, dropout_ratio):
        super(CNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.conv = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, embedding_size))
        self.dropout = nn.Dropout(dropout_ratio)
        self.linear = nn.Linear(num_filters, output_size)
        
    def forward(self, text):
        text = text.permute(1, 0)        # move batch_size to the first dimension
        embedded = self.embed(text)      # (batch_size, seq_length, embed_size)
        embedded = embedded.unsqueeze(1) # (batch_size, 1, seq_length, embed_size): Conv2d expects
                                         # (batch_size, c_in, h_in, w_in), where c_in is the number of input
                                         # channels (1 for a grayscale image, 3 for RGB)
        conved = F.relu(self.conv(embedded)) # (batch_size, num_filters, seq_length-filter_size+1, 1)
        conved = conved.squeeze(3)           # drop the trailing 1 -> (batch_size, num_filters, seq_length-filter_size+1)
        # max_pool1d with kernel size seq_length-filter_size+1 keeps each filter's maximum
        # activation for each example, giving (batch_size, num_filters, 1)
        pooled = F.max_pool1d(conved, conved.shape[2])
        pooled = pooled.squeeze(2)           # -> (batch_size, num_filters)
        pooled = self.dropout(pooled)
        
        return self.linear(pooled)
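
The post ends with the class definition; here is a minimal sketch of how this CNN could be trained with the same helpers as above (num_filters and filter_size are illustrative choices, not values from the original):

# Sketch: instantiate and train the CNN model, reusing train()/evaluate() from section 2.5.
num_filters = 100
filter_size = 3
cnn_model = CNNModel(vocab_size, embed_size, output_size, pad_idx,
                     num_filters, filter_size, dropout_ratio).to(device)
cnn_model.embed.weight.data.copy_(pretrained_embed)
cnn_model.embed.weight.data[pad_idx] = torch.zeros(embed_size)
cnn_model.embed.weight.data[unk_idx] = torch.zeros(embed_size)

optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()
for epoch in range(2):
    train_loss, train_acc = train(cnn_model, train_iterator, optimizer, loss_fn)
    valid_loss, valid_acc = evaluate(cnn_model, valid_iterator, loss_fn)
    print("Epoch:", epoch, "Train_Acc:", train_acc, "Valid_Acc:", valid_acc)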