说明
本博客代码来自开源项目:《动手学深度学习》(PyTorch版)
并且在博主学习的理解上对代码进行了大量注释,方便理解各个函数的原理和用途
配置环境
使用环境:python3.8
平台:Windows10
IDE:PyCharm
此节说明
此节对应书本上10.8节
此节功能为:文本情感分类:使用卷积神经网络(textCNN)
由于此节相对复杂,代码注释量较多
代码
# 本书链接https://tangshusen.me/Dive-into-DL-PyTorch/#/
# 10.8 文本情感分类:使用卷积神经网络(textCNN)
# 注释:黄文俊
# E-mail:hurri_cane@qq.com
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torch.nn.functional as F
import torchtext.vocab as Vocab
import torch.utils.data as Data
import sys
sys.path.append("..")
import d2lzh_pytorch as d2l
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = "D:/Program/Pytorch/Datasets"
# 一维卷积层
def corr1d(X, K):
w = K.shape[0]
Y = torch.zeros((X.shape[0] - w + 1))
for i in range(Y.shape[0]):
Y[i] = (X[i: i + w] * K).sum()
return Y
# X, K = torch.tensor([0, 1, 2, 3, 4, 5, 6]), torch.tensor([1, 2])
# print(corr1d(X, K))
# 一维多通道卷积层
def corr1d_multi_in(X, K):
# 首先沿着X和K的第0维(通道维)遍历并计算一维互相关结果。然后将所有结果堆叠起来沿第0维累加
return torch.stack([corr1d(x, k) for x, k in zip(X, K)]).sum(dim=0)
# X = torch.tensor([[0, 1, 2, 3, 4, 5, 6],
# [1, 2, 3, 4, 5, 6, 7],
# [2, 3, 4, 5, 6, 7, 8]])
# K = torch.tensor([[1, 2], [3, 4], [-1, -3]])
# print(corr1d_multi_in(X, K))
# 时序最大池化层
class GlobalMaxPool1d(nn.Module):
def __init__(self):
super(GlobalMaxPool1d, self).__init__()
def forward(self, x):
# x shape: (batch_size, channel, seq_len)
# return shape: (batch_size, channel, 1)
return F.max_pool1d(x, kernel_size=x.shape[2])
batch_size = 128
train_data = d2l.read_imdb('train', data_root=os.path.join(DATA_ROOT, "aclImdb"))
test_data = d2l.read_imdb('test', data_root=os.path.join(DATA_ROOT, "aclImdb"))
vocab = d2l.get_vocab_imdb(train_data)
train_set = Data.TensorDataset(*d2l.preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*d2l.preprocess_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)
class TextCNN(nn.Module):
def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
super(TextCNN, self).__init__()
self.embedding = nn.Embedding(len(vocab), embed_size)
# 不参与训练的嵌入层
self.constant_embedding = nn.Embedding(len(vocab), embed_size)
self.dropout = nn.Dropout(0.5)
self.decoder = nn.Linear(sum(num_channels), 2)
# 时序最大池化层没有权重,所以可以共用一个实例
self.pool = GlobalMaxPool1d()
self.convs = nn.ModuleList() # 创建多个一维卷积层
for c, k in zip(num_channels, kernel_sizes):
self.convs.append(nn.Conv1d(in_channels = 2*embed_size,
out_channels = c,
kernel_size = k))
def forward(self, inputs):
# 将两个形状是(批量大小, 词数, 词向量维度)的嵌入层的输出按词向量连结
embeddings = torch.cat((
self.embedding(inputs),
self.constant_embedding(inputs)), dim=2) # (batch, seq_len, 2*embed_size)
'''
embedding运算其实就是根据输入的inputs来索引嵌入层的词向量
'''
# 根据Conv1D要求的输入格式,将词向量维,即一维卷积层的通道维(即词向量那一维),变换到前一维
embeddings = embeddings.permute(0, 2, 1)
# 对于每个一维卷积层,在时序最大池化后会得到一个形状为(批量大小, 通道大小, 1)的
# Tensor。使用flatten函数去掉最后一维,然后在通道维上连结
encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
# .squeeze(-1)即:如果最后一维维度为1则把该维度去掉
# 应用丢弃法后使用全连接层得到输出
outputs = self.decoder(self.dropout(encoding))
return outputs
# 创建一个TextCNN实例。它有3个卷积层,它们的核宽分别为3、4和5,输出通道数均为100。
embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)
# 加载预训练的词向量
glove_vocab = Vocab.GloVe(name='6B', dim=100,
cache=os.path.join(DATA_ROOT, "glove"))
net.embedding.weight.data.copy_(
d2l.load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(
d2l.load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False
# 训练模型
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
predict1 = d2l.predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great']) # positive
print(predict1)
predict2 = d2l.predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad']) # negative
print(predict2)
predict3 = d2l.predict_sentiment(net, vocab, ['As','far','as','I','am','consider','this', 'movie', 'is', 'not', 'bad']) # negative
print(predict3)
'''
通过debug可以发现,通过这个网络计算出来的标签估计值,其实是没有经过归一化的
并且会出现某个概率为负数的情况
'''
print("*" * 50)