Hands-On Project (5) - Text Classification
1. Task Overview
Implement sentiment analysis with a Word Averaging model, an RNN, and a CNN.
Data: the IMDB movie-review dataset bundled with torchtext.
2. Algorithm Steps
① Data preprocessing (tokenization, vocabulary building, word-to-index encoding)
② Define the models
- Word Averaging model: average all word vectors in a sentence (collapsing seq_len to 1), then add a linear layer for classification
- RNN: use the last hidden state h_t to represent the whole sentence, then pass it through a linear transformation to predict the sentiment
- CNN: convolutional feature extraction followed by pooling
③ Training, evaluation, and sentiment testing
3. Code Implementation and Walkthrough
- Import the required packages (the spaCy model name below is an assumption)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data, datasets  # on torchtext >= 0.9, import from torchtext.legacy instead
import spacy
nlp = spacy.load("en_core_web_sm")  # assumed spaCy English model; used by Field and predict_sentiment
- Data preprocessing (tokenization, vocabulary building, word-to-index encoding)
TEXT = data.Field(tokenize='spacy')  # tokenize with spaCy
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
from torchtext.vocab import Vectors
vectors = Vectors(name="glove.6B.100d.txt")  # pre-trained 100-d GloVe vectors
TEXT.build_vocab(train_data, max_size=500, vectors=vectors, unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
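The training loop below iterates over batches, so we also need data iterators; a minimal sketch using torchtext's BucketIterator (BATCH_SIZE and the device choice are assumptions, not from the source):
BATCH_SIZE = 64  # assumed batch size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)  # BucketIterator groups sentences of similar length to reduce padding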
3.1 Word Averaging Model
Average all word vectors in a sentence (collapsing seq_len to 1), then add a linear layer for classification.
class WordAVGModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, pad_idx):
        super(WordAVGModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.linear = nn.Linear(embedding_size, output_size)

    def forward(self, text):
        embedded = self.embed(text)  # [seq_len, batch_size, embedding_size]
        embedded = embedded.permute(1, 0, 2)  # [batch_size, seq_len, embedding_size]
        # average over the seq_len dimension
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)  # [batch_size, embedding_size]
        return self.linear(pooled)
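A minimal instantiation sketch; the hyperparameter names and values below are assumptions (EMBEDDING_SIZE must match the 100-dimensional GloVe vectors loaded above):
VOCAB_SIZE = len(TEXT.vocab)
EMBEDDING_SIZE = 100  # must match glove.6B.100d
OUTPUT_SIZE = 1       # one logit for binary sentiment
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model = WordAVGModel(VOCAB_SIZE, EMBEDDING_SIZE, OUTPUT_SIZE, PAD_IDX).to(device)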
- Initialize the model embedding with GloVe
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)  # overwrite the randomly initialized embedding weights with the GloVe vectors
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)  # <pad> should carry no signal
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)  # <unk> starts from zero
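Before training, it can help to sanity-check the model size; a small hedged helper (count_parameters is our own name, not from the source):
def count_parameters(model):
    # sum the element counts of all trainable parameter tensors
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")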
- Train the model
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss()  # sigmoid + binary cross-entropy in one op; for binary classification only
model.train()
for batch in iterator:  # e.g. train_iterator
    preds = model(batch.text).squeeze(1)  # [batch_size]
    acc = compute_accuracy(preds, batch.label)
    loss = loss_fn(preds, batch.label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
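The loop above calls compute_accuracy, which the excerpt never defines; a minimal sketch, together with an evaluation pass for picking the best checkpoint (both function names are assumptions):
def compute_accuracy(preds, labels):
    # round the sigmoid outputs to 0/1 and compare with the gold labels
    rounded = torch.round(torch.sigmoid(preds))
    return (rounded == labels).float().mean()

def evaluate(model, iterator, loss_fn):
    model.eval()
    epoch_loss, epoch_acc = 0.0, 0.0
    with torch.no_grad():  # no gradients needed during evaluation
        for batch in iterator:
            preds = model(batch.text).squeeze(1)
            epoch_loss += loss_fn(preds, batch.label).item()
            epoch_acc += compute_accuracy(preds, batch.label).item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)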
- Save the model that scores best on evaluation, then reload it
torch.save(model.state_dict(), "wordavg-model.pth")  # save the best checkpoint
model.load_state_dict(torch.load("wordavg-model.pth"))
- Sentiment prediction
def predict_sentiment(sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  # spaCy tokenization
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]  # word -> index
    sentence2num = torch.LongTensor(indexed).to(device)  # [seq_len]
    tensor = sentence2num.unsqueeze(1)  # [seq_len, batch_size(1)]
    pred = torch.sigmoid(model(tensor))
    return pred.item()
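A quick usage sketch (the example sentences are our own); scores near 1 are expected to indicate positive sentiment, scores near 0 negative:
predict_sentiment("This film is terrific")  # expected near 1 (positive)
predict_sentiment("This film is terrible")  # expected near 0 (negative)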
3.2 RNN Model
Use the last hidden state h_t to represent the whole sentence, then pass it through a linear transformation to predict the sentiment.
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, pad_idx, hidden_size, dropout):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size, bidirectional=True, num_layers=2)
        self.linear = nn.Linear(hidden_size * 2, output_size)  # *2 for the two directions
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embed(text)  # [seq_len, batch_size, embedding_size]
        embedded = self.dropout(embedded)
        output, (hidden, cell) = self.lstm(embedded)
        # hidden: [num_layers * num_directions, batch_size, hidden_size];
        # concatenate the top layer's final forward (hidden[-2]) and backward (hidden[-1]) states
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)  # [batch_size, hidden_size * 2]
        hidden = self.dropout(hidden)
        return self.linear(hidden)
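A minimal instantiation sketch; HIDDEN_SIZE and DROPOUT are assumed values, while the remaining constants come from 3.1:
HIDDEN_SIZE = 100  # assumed LSTM hidden size
DROPOUT = 0.5      # assumed dropout probability
model = RNNModel(VOCAB_SIZE, EMBEDDING_SIZE, OUTPUT_SIZE, PAD_IDX,
                 HIDDEN_SIZE, DROPOUT).to(device)
# the GloVe initialization and training loop from 3.1 apply unchanged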
3.3 CNN Model
Extract n-gram features with a convolution, then apply max-over-time pooling and a linear classifier.
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, pad_idx, num_filters, filter_size, dropout):
        super(CNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.conv = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, embedding_size))
        self.linear = nn.Linear(num_filters, output_size)  # the pooled feature vector has num_filters entries
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        text = text.permute(1, 0)  # [batch_size, seq_len]
        embedded = self.embed(text)  # [batch_size, seq_len, embedding_size]
        embedded = embedded.unsqueeze(1)  # [batch_size, 1, seq_len, embedding_size]
        conved = F.relu(self.conv(embedded))  # [batch_size, num_filters, seq_len-filter_size+1, 1]
        conved = conved.squeeze(3)  # [batch_size, num_filters, seq_len-filter_size+1]
        # max-over-time pooling: keep the strongest response of each filter
        pooled = F.max_pool1d(conved, conved.shape[2])  # [batch_size, num_filters, 1]
        pooled = pooled.squeeze(2)  # [batch_size, num_filters]
        pooled = self.dropout(pooled)
        return self.linear(pooled)
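A minimal instantiation sketch; NUM_FILTERS, FILTER_SIZE, and DROPOUT are assumed values. Note that every input sentence must contain at least FILTER_SIZE tokens, otherwise the convolution has nothing to slide over:
NUM_FILTERS = 100  # assumed number of convolution filters
FILTER_SIZE = 3    # assumed filter width, i.e. the n-gram size
DROPOUT = 0.5      # assumed dropout probability
model = CNN(VOCAB_SIZE, EMBEDDING_SIZE, OUTPUT_SIZE, PAD_IDX,
            NUM_FILTERS, FILTER_SIZE, DROPOUT).to(device)
# the GloVe initialization and training loop from 3.1 apply unchanged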