1.1 How the TextCNN Model Works
- In 2014, Yoon Kim applied convolutional neural networks (CNNs) to text classification, using convolution kernels of several different sizes to extract key information from a sentence, which allows the model to capture local correlations more effectively.
1.2 The TextCNN Pipeline in Detail
- Embedding: the word-embedding layer, which maps each word to its vector representation
- Convolution: the convolutional layer, which extracts features from windows of words
- MaxPooling: the pooling layer, which turns sentences of different lengths into fixed-length representations
- FullConnection: the fully connected layer, which outputs the probability of each class (a shape walkthrough follows this list)
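A minimal sketch of how tensor shapes evolve through these four stages; all sizes below are toy values chosen for illustration, not taken from the tutorial:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

batch, seq_len, embed_dim, n_filters, n_classes = 4, 7, 50, 3, 2
x = torch.randint(0, 100, (batch, seq_len))                  # token ids

emb = nn.Embedding(100, embed_dim)(x).unsqueeze(1)           # Embedding:      [4, 1, 7, 50]
h = F.relu(nn.Conv2d(1, n_filters, (3, embed_dim))(emb))     # Convolution:    [4, 3, 5, 1]
p = F.max_pool2d(h, (seq_len - 3 + 1, 1))                    # MaxPooling:     [4, 3, 1, 1]
logits = nn.Linear(n_filters, n_classes)(p.view(batch, -1))  # FullConnection: [4, 2]
print(logits.shape)                                          # torch.Size([4, 2])
```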
1.3 Channels
- In images, the (R, G, B) planes can serve as different channels.
- For text input, the channels are usually embeddings produced in different ways (e.g., word2vec or Skip-Gram embeddings).
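As a sketch of this idea, the two embedding tables below are hypothetical stand-ins for embeddings trained in different ways; none of these names come from the tutorial:
```python
import torch
import torch.nn as nn

vocab, embed_dim, seq_len = 100, 50, 7
x = torch.randint(0, vocab, (1, seq_len))

emb_a = nn.Embedding(vocab, embed_dim)  # stand-in for one embedding method
emb_b = nn.Embedding(vocab, embed_dim)  # stand-in for another embedding method

# stack the two views as input channels, analogous to R/G/B planes in an image
channels = torch.stack([emb_a(x), emb_b(x)], dim=1)          # [1, 2, 7, 50]
conv = nn.Conv2d(in_channels=2, out_channels=3, kernel_size=(3, embed_dim))
print(conv(channels).shape)                                  # torch.Size([1, 3, 5, 1])
```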
1.4 One-Dimensional Convolution
- Text is one-dimensional data, so the convolution used in TextCNN is one-dimensional.
- One-dimensional convolution needs filters with different kernel_size values to obtain receptive fields of different widths (see the sketch below).
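A small sketch with toy dimensions showing how different kernel_size values yield different window widths; the code in section 1.5 achieves the same effect with nn.Conv2d whose kernel width equals the embedding size:
```python
import torch
import torch.nn as nn

embed_dim, seq_len = 50, 7
x = torch.randn(1, embed_dim, seq_len)  # Conv1d expects [batch, channels, length]

# filters with different kernel_size values see word windows of different widths
for k in (2, 3, 4):
    conv = nn.Conv1d(in_channels=embed_dim, out_channels=3, kernel_size=k)
    print(k, conv(x).shape)  # output length = seq_len - k + 1, i.e. 6, 5, 4
```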
1.5 Code
- Step 1: import the libraries
```python
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
```
- Step 2: model initialization
```python
class TextCNN(nn.Module):
    def __init__(self):
        super(TextCNN, self).__init__()
        self.num_filters_total = num_filters * len(filter_sizes)
        self.W = nn.Embedding(vocab_size, embedding_size)  # word-embedding lookup table
        self.Weight = nn.Linear(self.num_filters_total, num_classes, bias=False)
        self.Bias = nn.Parameter(torch.ones([num_classes]))
        # one Conv2d per kernel size; a kernel width of embedding_size makes each
        # filter slide over whole words, i.e. a 1-D convolution over the sentence
        self.filter_list = nn.ModuleList([nn.Conv2d(1, num_filters,
            (size, embedding_size)) for size in filter_sizes])
```
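As a quick sanity check of the convolution shapes, a toy snippet using the same sizes as the main script below (illustrative only, not part of the tutorial):
```python
import torch
import torch.nn as nn

conv = nn.Conv2d(1, 3, (2, 2))  # num_filters=3, (size=2, embedding_size=2)
x = torch.randn(6, 1, 3, 2)     # [batch=6, channel=1, sequence_length=3, embedding_size=2]
print(conv(x).shape)            # torch.Size([6, 3, 2, 1]): height becomes seq_len - size + 1
```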
- Step 3: forward pass
```python
    def forward(self, X):
        embedded_chars = self.W(X)  # [batch_size, sequence_length, embedding_size]
        embedded_chars = embedded_chars.unsqueeze(1)  # add channel(=1): [batch_size, 1, sequence_length, embedding_size]
        pooled_outputs = []
        for i, conv in enumerate(self.filter_list):
            # conv(embedded_chars): [batch_size, num_filters(=3), sequence_length - filter_size + 1, 1]
            h = F.relu(conv(embedded_chars))
            # max-pool over the entire remaining length, so every sentence yields a fixed-size vector
            mp = nn.MaxPool2d((sequence_length - filter_sizes[i] + 1, 1))
            # pooled : [batch_size(=6), output_height(=1), output_width(=1), output_channel(=3)]
            pooled = mp(h).permute(0, 3, 2, 1)
            pooled_outputs.append(pooled)
        # concatenate along the last (channel) dim: [batch_size(=6), 1, 1, output_channel(=3) * 3]
        h_pool = torch.cat(pooled_outputs, dim=3)
        # flatten: [batch_size(=6), num_filters_total(=9)]
        h_pool_flat = torch.reshape(h_pool, [-1, self.num_filters_total])
        output = self.Weight(h_pool_flat) + self.Bias  # [batch_size, num_classes]
        return output
```
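One design note: the pooling window above is computed from the global sequence_length, so the model is tied to one sentence length. A minimal alternative sketch (a suggestion, not part of the original code) derives the window from the tensor itself:
```python
import torch
import torch.nn.functional as F

h = torch.randn(6, 3, 2, 1)               # [batch, num_filters, seq_len - size + 1, 1]
pooled = F.max_pool2d(h, (h.size(2), 1))  # window read off the tensor, no global needed
print(pooled.shape)                       # torch.Size([6, 3, 1, 1])
```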
- Step 4: main routine
```python
if __name__ == '__main__':
    embedding_size = 2        # dimensionality of the word vectors
    sequence_length = 3       # length of each sentence (in words)
    num_classes = 2           # number of target classes
    filter_sizes = [2, 2, 2]  # convolution kernel sizes
    num_filters = 3           # number of filters per kernel size
    # 3-word sentences (sequence_length = 3)
    sentences = ["i love you", "he loves me", "she likes baseball", "i hate you", "sorry for that", "this is awful"]
    labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.
    # 1. build the vocabulary
    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    vocab_size = len(word_dict)
    # 2. build the model
    model = TextCNN()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    inputs = torch.LongTensor([np.asarray([word_dict[n] for n in sen.split()]) for sen in sentences])
    targets = torch.LongTensor(labels)  # class indices, as expected by CrossEntropyLoss
    # 3. train
    for epoch in range(5000):
        optimizer.zero_grad()
        output = model(inputs)
        # output : [batch_size, num_classes], targets : [batch_size] (LongTensor, not one-hot)
        loss = criterion(output, targets)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))
        loss.backward()
        optimizer.step()
    # 4. test
    test_text = 'sorry hate you'
    tests = [np.asarray([word_dict[n] for n in test_text.split()])]
    test_batch = torch.LongTensor(tests)
    # predict
    predict = model(test_batch).data.max(1, keepdim=True)[1]
    if predict[0][0] == 0:
        print(test_text, "means something bad...")
    else:
        print(test_text, "means something good!!")
```