文本分类步骤
一、文本预处理
- 读取文本
with open(os.path.join(folder_name, file), 'rb') as f:
review = f.read().decode('utf-8').replace('\n', '').lower()
data.append([review, 1 if label == 'pos' else 0])
- 对文本进行tokenize:根据文本的格式进行单词的切分
- 创建vocab词典:利用
torchtext.vocab.Vocab
创建词典。 - stoi:将数据集的文本从字符串的形式转换为单词下标序列的形式
- clip or pad:通过截断或填充,得到定长文本
- 对train data和test data创建数据迭代器:Data.TensorDataset,Data.DataLoader。
二、文本分类器
LSTM
class BiRNN(nn.Module):
def __init__(self, vocab, embed_size, num_hiddens, num_layers):
'''
@params:
vocab: 在数据集上创建的词典,用于获取词典大小
embed_size: 嵌入维度大小
num_hiddens: 隐藏状态维度大小
num_layers: 隐藏层个数
'''
super(BiRNN, self).__init__()
self.embedding = nn.Embedding(len(vocab), embed_size)
self.encoder = nn.LSTM(input_size=embed_size,
hidden_size=num_hiddens,
num_layers=num_layers,
bidirectional=True)
self.decoder = nn.Linear(4*num_hiddens, 2)
def forward(self, inputs):
'''
@params:
inputs: 词语下标序列,形状为 (batch_size, seq_len) 的整数张量
@return:
outs: 对文本情感的预测,形状为 (batch_size, 2) 的张量
'''
embeddings = self.embedding(inputs.permute(1, 0))
outputs, _ = self.encoder(embeddings)
encoding = torch.cat((outputs[0], outputs[-1]), -1)
outs = self.decoder(encoding)
return outs
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
CNN
class TextCNN(nn.Module):
def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
'''
@params:
vocab: 在数据集上创建的词典,用于获取词典大小
embed_size: 嵌入维度大小
kernel_sizes: 卷积核大小列表
num_channels: 卷积通道数列表
'''
super(TextCNN, self).__init__()
self.embedding = nn.Embedding(len(vocab), embed_size)
self.constant_embedding = nn.Embedding(len(vocab), embed_size)
self.pool = GlobalMaxPool1d()
self.convs = nn.ModuleList()
for c, k in zip(num_channels, kernel_sizes):
self.convs.append(nn.Conv1d(in_channels = 2*embed_size,
out_channels = c,
kernel_size = k))
self.decoder = nn.Linear(sum(num_channels), 2)
self.dropout = nn.Dropout(0.5)
def forward(self, inputs):
'''
@params:
inputs: 词语下标序列,形状为 (batch_size, seq_len) 的整数张量
@return:
outputs: 对文本情感的预测,形状为 (batch_size, 2) 的张量
'''
embeddings = torch.cat((
self.embedding(inputs),
self.constant_embedding(inputs)), dim=2)
embeddings = embeddings.permute(0, 2, 1)
encoding = torch.cat([
self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
outputs = self.decoder(self.dropout(encoding))
return outputs
embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)
预训练词向量加载
cache_dir = "/home/kesci/input/GloVe6B5429"
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)
def load_pretrained_embedding(words, pretrained_vocab):
'''
@params:
words: 需要加载词向量的词语列表,以 itos (index to string) 的词典形式给出
pretrained_vocab: 预训练词向量
@return:
embed: 加载到的词向量
'''
embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
oov_count = 0
for i, word in enumerate(words):
try:
idx = pretrained_vocab.stoi[word]
embed[i, :] = pretrained_vocab.vectors[idx]
except KeyError:
oov_count += 1
if oov_count > 0:
print("There are %d oov words." % oov_count)
return embed
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False