接下来我们使用一些深度学习模型。深度学习模型会把机器学习模型的两个阶段联合起来进行端到端(end2end)学习,即把特征表示和分类一起训练:最后一层进行分类,其余层对输入文本进行特征表示。
我们主要使用了一些经典的深度学习模型,包括FastText、TextCNN、BiLSTM、TextGRUCNN等模型。为了增加模型的多样性,每个模型提供了word-level和character-level两个版本(一般来说,word-level效果会更好)。由于数据集做了脱敏处理,没有明文,文本用数字字符串进行了编码,所以暂时不能使用预训练语言模型(如 BERT、RoBERTa、ALBERT、XLNet等,之后我还会整理一个专栏,专门介绍预训练语言模型以及它们在文本分类上的应用)。
目录
1. 数据预处理
- word-level
if __name__ == '__main__':
    # Word-level preprocessing: build the vocabulary, turn every document into
    # a fixed-length sequence of word indices and assemble the initial
    # embedding matrix from the self-trained word vectors.

    # Load the raw training and test sets.
    train_df = pd.read_csv('../../input/train_set.csv')
    test_df = pd.read_csv('../../input/test_set.csv')

    # 'word_seg' holds each document as space-separated word tokens.
    train_word = train_df['word_seg'].values.tolist()
    test_word = test_df['word_seg'].values.tolist()

    # Shift the class labels by one so that they start at 0.
    train_label = train_df['class'].values - 1
    np.save('../../data/label', train_label)

    # Statistics recorded by the author, obtained with
    # np.percentile(train_word_len, [0, 50, 80, 90, 95, 98, 100])
    # (presumably one row per granularity; the source does not label them):
    # of labels: 19
    # Training set
    # [6. 514. 990. 1428. 1949. 2858.48 39759.]
    # [50. 842. 1618. 2346. 3201. 4720.96 55804.]
    # Test set
    # [6. 516. 992. 1429. 1949. 2826. 19755.]
    # [50. 842. 1621. 2349. 3207. 4672. 31694.]

    MAX_LEN = 2000  # every sequence is padded/truncated to this length
    EMBED_DIM = 300  # dimensionality of the word vectors
    # The corpus is anonymised, so public pretrained vectors cannot be used;
    # the vectors (word2vec and/or GloVe) have to be trained on this data.
    EMBED_PATH = '../../data/word_vector_300d.vec'  # pretrained word vectors
    NUM_WORDS = 359279  # vocabulary size

    # The Keras tokenizer splits on whitespace by default, which matches the
    # pre-segmented input.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    # The vocabulary is built on train + test (usually only train is used).
    tokenizer.fit_on_texts(train_word + test_word)

    # Map words to vocabulary indices and pad everything to MAX_LEN.
    train_sequence = pad_sequences(tokenizer.texts_to_sequences(train_word), MAX_LEN)
    test_sequence = pad_sequences(tokenizer.texts_to_sequences(test_word), MAX_LEN)

    # word -> pretrained vector
    word2vec_mapping = {}
    with open(EMBED_PATH, 'r', encoding='utf-8') as f:
        # The first line is discarded, as in the original code (presumably
        # the "<count> <dim>" header of the word2vec text format).
        next(f, None)
        # Stream the file instead of materialising every line at once.
        for line in f:
            vals = line.split()
            if not vals:  # tolerate blank lines
                continue
            word2vec_mapping[vals[0]] = np.fromiter(vals[1:], dtype=np.float32)
    print(f'# of words: {len(word2vec_mapping)}')

    # Random initialisation; one extra row because index 0 is the padding
    # value produced by pad_sequences.
    embed_mat = np.random.uniform(-0.1, 0.1, (NUM_WORDS + 1, EMBED_DIM))
    oov = 0  # vocabulary words without a pretrained vector
    for word, i in tokenizer.word_index.items():
        # Stop at the first index beyond the vocabulary size, as the original
        # loop does; this relies on word_index being ordered by index.
        if i > NUM_WORDS:
            break
        if word in word2vec_mapping:
            # Overwrite the random row with the pretrained vector.
            embed_mat[i, :] = word2vec_mapping[word]
        else:
            oov += 1
    print(f'# of OOV words: {oov}')

    # Persist the padded sequences and the initial embedding matrix.
    np.save('../../data/train_input', train_sequence)
    np.save('../../data/test_input', test_sequence)
    np.save('../../data/word_embed_mat', embed_mat)
- character-level
if __name__ == '__main__':
    # Character-level preprocessing: same pipeline as the word-level script,
    # but on the 'article' column (space-separated characters) and with a
    # vocabulary that drops rare characters.

    # Load the raw training and test sets.
    train_df = pd.read_csv('../../input/train_set.csv')
    test_df = pd.read_csv('../../input/test_set.csv')

    # 'article' holds each document as space-separated character tokens.
    train_char = train_df['article'].values.tolist()
    test_char = test_df['article'].values.tolist()

    # Shift the class labels by one so that they start at 0.
    train_label = train_df['class'].values - 1
    np.save('../../data/label', train_label)

    # Statistics recorded by the author, obtained with
    # np.percentile(train_word_len, [0, 50, 80, 90, 95, 98, 100])
    # (presumably one row per granularity; the source does not label them):
    # of labels: 19
    # Training set
    # [6. 514. 990. 1428. 1949. 2858.48 39759.]
    # [50. 842. 1618. 2346. 3201. 4720.96 55804.]
    # Test set
    # [6. 516. 992. 1429. 1949. 2826. 19755.]
    # [50. 842. 1621. 2349. 3207. 4672. 31694.]

    MAX_LEN = 3200  # character sequences are longer than word sequences
    EMBED_DIM = 300  # dimensionality of the character vectors
    # The corpus is anonymised, so public pretrained vectors cannot be used;
    # the vectors (word2vec and/or GloVe) have to be trained on this data.
    EMBED_PATH = '../../data/char_vector_300d_new.vec'  # pretrained char vectors

    # First pass: count every character (train + test; usually only train is
    # used) to find out how many occur at least 5 times.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_char + test_char)
    NUM_WORDS = len([w for w, c in tokenizer.word_counts.items() if c >= 5])
    print(NUM_WORDS)  # number of characters kept, used as the vocabulary size

    # Second pass: rebuild the tokenizer with that vocabulary size.  (The size
    # could also be fixed up front, as in the word-level script.)
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(train_char + test_char)

    # Map characters to vocabulary indices and pad everything to MAX_LEN.
    train_sequence = pad_sequences(tokenizer.texts_to_sequences(train_char), MAX_LEN)
    test_sequence = pad_sequences(tokenizer.texts_to_sequences(test_char), MAX_LEN)

    # character -> pretrained vector
    char2vec_mapping = {}
    with open(EMBED_PATH, 'r', encoding='utf-8') as f:
        # The first line is discarded, as in the original code (presumably
        # the "<count> <dim>" header of the word2vec text format).
        next(f, None)
        # Stream the file instead of materialising every line at once.
        for line in f:
            vals = line.split()
            if not vals:  # tolerate blank lines
                continue
            char2vec_mapping[vals[0]] = np.fromiter(vals[1:], dtype=np.float32)
    print(f'# of chars: {len(char2vec_mapping)}')

    # The matrix is indexed by tokenizer indices (0..NUM_WORDS, 0 being the
    # padding value), so its row count must come from NUM_WORDS.  Sizing it by
    # the number of pretrained vectors, as before, overflows as soon as the
    # vector file contains fewer entries than the vocabulary.
    embed_mat = np.random.uniform(-0.1, 0.1, (NUM_WORDS + 1, EMBED_DIM)).astype(np.float32)
    oov = 0  # vocabulary characters without a pretrained vector
    for char, i in tokenizer.word_index.items():
        # Stop at the first index beyond the vocabulary size, as the original
        # loop does; this relies on word_index being ordered by index.
        if i > NUM_WORDS:
            break
        if char in char2vec_mapping:
            # Overwrite the random row with the pretrained vector.
            embed_mat[i, :] = char2vec_mapping[char]
        else:
            oov += 1
    print(f'# of OOV words: {oov}')

    # Persist the padded character sequences and the initial embedding matrix.
    np.save('../../data/char_train_input', train_sequence)
    np.save('../../data/char_test_input', test_sequence)
    np.save('../../data/char_embed_mat', embed_mat)
2. FastText
- FastText word-level
class FastText(nn.Module):
    """FastText-style classifier: mean of token embeddings -> two hidden layers -> 19 logits.

    Args:
        fc_dim1: width of the first hidden layer.
        fc_dim2: width of the second hidden layer.
        granularity: prefix of the embedding file to load,
            ``../../data/{granularity}_embed_mat.npy``.
    """

    def __init__(self, fc_dim1, fc_dim2, granularity='word'):
        super().__init__()
        # Load the initial embedding matrix produced by the preprocessing step.
        embed_mat = torch.from_numpy(
            np.load(f'../../data/{granularity}_embed_mat.npy').astype(np.float32))
        _, embed_dim = embed_mat.size()
        # freeze=False: the embeddings are fine-tuned during training.
        self.embed = nn.Embedding.from_pretrained(embed_mat, freeze=False)
        # Two hidden layers.
        self.fc1 = nn.Linear(embed_dim, fc_dim1)
        self.fc2 = nn.Linear(fc_dim1, fc_dim2)
        # Output layer (19 classes).
        self.out = nn.Linear(fc_dim2, 19)
        # Activation shared by both hidden layers.
        self.act = nn.RReLU()
        # Batch normalisation after each hidden layer.
        self.bn1 = nn.BatchNorm1d(fc_dim1)
        self.bn2 = nn.BatchNorm1d(fc_dim2)

    def forward(self, input):
        """Return class logits of shape (batch, 19) for index input of shape (batch, max_len)."""
        out = self.embed(input)  # (batch, max_len, embed_dim)
        out = torch.mean(out, dim=1)  # average over the sequence -> (batch, embed_dim)
        # Hidden layer -> activation -> dropout (p=0.5, functional form) -> BN.
        out = self.bn1(F.dropout(self.act(self.fc1(out)), p=0.5, training=self.training, inplace=True))
        out = self.bn2(F.dropout(self.act(self.fc2(out)), p=0.5, training=self.training, inplace=True))
        return self.out(out)  # (batch, 19)

    def _initialize_weights(self):
        """Apply the custom initialisation; not invoked by ``__init__`` of this class."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Convolutions: Kaiming normal weights, zero bias.
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                # The model uses BatchNorm1d layers; the previous BatchNorm2d
                # check never matched any of them.
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                # Fully connected layers: Xavier uniform weights, zero bias.
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
- FastAttentionText word-level
原始的FastText是将序列中,每个词的词向量直接取平均;FastAttentionText通过Attention机制计算序列中每个词对应的权重,对每个词的词向量进行加权求和。再接几个全连接层做分类。
class Fast_Attention_Text(nn.Module):
    """FastText variant that replaces the plain average by attention pooling.

    A learned context vector (the query) is scored against a projected copy of
    every token embedding (the keys); the softmax of those scores weights the
    original embeddings (the values).  The pooled vector then goes through two
    hidden layers and a 19-way output layer.

    Args:
        fc_dim1: width of the first hidden layer.
        fc_dim2: width of the second hidden layer.
    """

    def __init__(self, fc_dim1, fc_dim2):
        super().__init__()
        # Kept for backward compatibility; the module itself no longer places
        # anything on this device (callers move the model with ``.to(device)``).
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        # Load the initial word embedding matrix.
        embed_mat = torch.from_numpy(np.load(f'../../data/word_embed_mat.npy').astype(np.float32))
        _, embed_dim = embed_mat.size()
        # freeze=False: the embeddings are fine-tuned.
        self.embed = nn.Embedding.from_pretrained(embed_mat, freeze=False)
        # Learned query vector.  Its size follows the embedding width instead
        # of a hard-coded 300, and it is created on the same device as the
        # other parameters so the module is usable before any ``.to()`` call.
        self.ctx_vec_size = (embed_dim,)
        self.ctx_vec = nn.Parameter(torch.randn(self.ctx_vec_size))
        # Projection applied to the embeddings to obtain the keys.
        self.proj = nn.Linear(in_features=embed_dim, out_features=embed_dim)
        # Two hidden layers.
        self.fc1 = nn.Linear(embed_dim, fc_dim1)
        self.fc2 = nn.Linear(fc_dim1, fc_dim2)
        # Output layer (19 classes).
        self.out = nn.Linear(fc_dim2, 19)
        # Activation shared by both hidden layers.
        self.act = nn.RReLU()
        # Batch normalisation after each hidden layer.
        self.bn1 = nn.BatchNorm1d(fc_dim1)
        self.bn2 = nn.BatchNorm1d(fc_dim2)

    def forward(self, input):
        """Return class logits of shape (batch, 19) for index input of shape (batch, sentence_len)."""
        embedded = self.embed(input)  # (batch, sentence_len, embed_dim)
        keys = torch.tanh(self.proj(embedded))  # (batch, sentence_len, embed_dim)
        # Dot product of every key with the query, normalised over the sentence.
        weights = F.softmax(torch.einsum('bse,e->bs', keys, self.ctx_vec), dim=1)
        # Attention-weighted sum of the embeddings -> (batch, embed_dim).
        pooled = torch.einsum('bse,bs->be', embedded, weights)
        # Hidden layer -> activation -> dropout -> BN, twice.
        out = self.bn1(F.dropout(self.act(self.fc1(pooled)), p=0.5, training=self.training, inplace=True))
        out = self.bn2(F.dropout(self.act(self.fc2(out)), p=0.5, training=self.training, inplace=True))
        return self.out(out)  # (batch, 19)

    def _initialize_weights(self):
        """Apply the custom initialisation; not invoked by ``__init__`` of this class."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
- FastText character-level
class FastText(nn.Module):
    """Character-level FastText: averaged character embeddings -> two hidden layers -> 19 logits.

    Args:
        fc_dim1: width of the first hidden layer.
        fc_dim2: width of the second hidden layer.
    """

    def __init__(self, fc_dim1, fc_dim2):
        super().__init__()
        # Initial character embedding matrix written by the preprocessing step.
        pretrained = torch.from_numpy(np.load(f'../../data/char_embed_mat.npy'))
        vocab_size, embed_dim = pretrained.size()
        # No ``freeze`` argument is passed, so the library default applies.
        self.embed = nn.Embedding.from_pretrained(pretrained)
        # Hidden layers without bias terms.
        self.fc1 = nn.Linear(embed_dim, fc_dim1, bias=False)
        self.fc2 = nn.Linear(fc_dim1, fc_dim2, bias=False)
        # Output layer (19 classes).
        self.out = nn.Linear(fc_dim2, 19)
        # Activation shared by both hidden layers.
        self.act = nn.PReLU()
        # Batch normalisation after each hidden layer.
        self.bn1 = nn.BatchNorm1d(fc_dim1)
        self.bn2 = nn.BatchNorm1d(fc_dim2)

    def forward(self, input):
        """Map index input of shape (batch, max_len) to logits of shape (batch, 19)."""
        # Embed and average over the sequence dimension -> (batch, embed_dim).
        averaged = torch.mean(self.embed(input), dim=1)

        # First hidden layer: linear -> PReLU -> dropout(0.5) -> BN.
        hidden = self.act(self.fc1(averaged))
        hidden = F.dropout(hidden, p=0.5, training=self.training, inplace=True)
        hidden = self.bn1(hidden)

        # Second hidden layer: linear -> PReLU -> dropout(0.2) -> BN.
        hidden = self.act(self.fc2(hidden))
        hidden = F.dropout(hidden, p=0.2, training=self.training, inplace=True)
        hidden = self.bn2(hidden)

        return self.out(hidden)

    def _initialize_weights(self):
        """Custom initialisation; not invoked by ``__init__`` of this class."""
        for module in self.modules():
            if isinstance(module, nn.BatchNorm1d):
                # Batch norm: unit scale, zero shift.
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out')
            elif isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
            else:
                continue
            # Conv / linear layers: zero the bias when there is one.
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
3. TextCNN
- word-level
class TextCNN(nn.Module):
    """Word-level TextCNN: parallel convolutions over the embedded sentence,
    global max pooling, one hidden layer and a 19-way output layer.

    Args:
        filter_sizes: kernel heights (number of consecutive tokens) to use,
            one convolution per entry.
        num_filter: output channels of every convolution.
        fc_dim1: width of the hidden layer.
    """

    def __init__(self, filter_sizes, num_filter, fc_dim1):
        super().__init__()
        # Initial word embedding matrix, fine-tuned during training.
        pretrained = torch.from_numpy(np.load('../../data/word_embed_mat.npy').astype(np.float32))
        vocab_size, embed_dim = pretrained.size()
        self.embed = nn.Embedding.from_pretrained(pretrained, freeze=False)

        # One bias-free 2-D convolution per kernel height; each kernel spans
        # the full embedding width.
        branches = []
        for height in filter_sizes:
            branches.append(nn.Conv2d(1, num_filter, (height, embed_dim), bias=False))
        self.conv = nn.ModuleList(branches)

        pooled_dim = len(filter_sizes) * num_filter
        self.act = nn.RReLU()
        self.fc = nn.Linear(pooled_dim, fc_dim1)  # hidden layer
        self.out = nn.Linear(fc_dim1, 19)  # output layer
        self.bn1 = nn.BatchNorm1d(pooled_dim)
        self.bn2 = nn.BatchNorm1d(fc_dim1)

        # Replace the default parameter initialisation with the custom one.
        self._initialize_weights()

    def forward(self, input):
        """Map index input of shape (batch, sentence_len) to logits of shape (batch, 19)."""
        # Embed, apply a light dropout and add a channel axis for Conv2d:
        # (batch, 1, sentence_len, embed_dim).
        embedded = F.dropout(self.embed(input), p=0.1, training=self.training, inplace=True)
        embedded = embedded.unsqueeze(1)

        pooled = []
        for conv in self.conv:
            # (batch, num_filter, positions) once the collapsed width axis is dropped.
            feature_map = self.act(conv(embedded)).squeeze(3)
            # Global max pooling over all positions -> (batch, num_filter).
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))
        features = torch.cat(pooled, dim=1)

        # Dropout + BN on the pooled features, then the hidden layer.
        features = self.bn1(F.dropout(features, p=0.5, training=self.training, inplace=True))
        hidden = self.act(self.fc(features))
        hidden = self.bn2(F.dropout(hidden, p=0.25, training=self.training, inplace=True))
        return self.out(hidden)

    def _initialize_weights(self):
        """Kaiming init for convolutions, Xavier for linear layers, identity for batch norm."""
        for module in self.modules():
            if isinstance(module, nn.BatchNorm1d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out')
            elif isinstance(module, nn.Linear):
                nn.init.xavier_normal_(module.weight)
            else:
                continue
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
- TextCNN character-level
代码和word-level相同,只是把词向量换为字向量,输入是基于字分隔的,不是基于词。
class TextCNN(nn.Module):
    """Character-level TextCNN: same layout as the word-level model but fed
    with character indices and the character embedding matrix.

    Args:
        filter_sizes: kernel heights (number of consecutive characters), one
            convolution per entry.
        num_filter: output channels of every convolution.
        fc_dim: width of the hidden layer.
    """

    def __init__(self, filter_sizes, num_filter, fc_dim):
        super().__init__()
        # Initial character embedding matrix; no ``freeze`` argument is
        # passed, so the library default applies.
        pretrained = torch.from_numpy(np.load('../../data/char_embed_mat.npy').astype(np.float32))
        vocab_size, embed_dim = pretrained.size()
        self.embed = nn.Embedding.from_pretrained(pretrained)

        # One 2-D convolution (with bias) per kernel height, spanning the
        # full embedding width.
        branches = []
        for height in filter_sizes:
            branches.append(nn.Conv2d(1, num_filter, (height, embed_dim)))
        self.conv = nn.ModuleList(branches)

        pooled_dim = len(filter_sizes) * num_filter
        self.act = nn.PReLU()
        self.fc = nn.Linear(pooled_dim, fc_dim, bias=False)  # hidden layer
        self.out = nn.Linear(fc_dim, 19)  # output layer
        self.bn1 = nn.BatchNorm1d(pooled_dim)
        self.bn2 = nn.BatchNorm1d(fc_dim)

        # Replace the default parameter initialisation with the custom one.
        self._initialize_weights()

    def forward(self, input):
        """Map index input of shape (batch, sentence_len) to logits of shape (batch, 19)."""
        # Embed, apply dropout and add a channel axis for Conv2d.
        embedded = F.dropout(self.embed(input), p=0.2, training=self.training, inplace=True)
        embedded = embedded.unsqueeze(1)

        pooled = []
        for conv in self.conv:
            feature_map = self.act(conv(embedded)).squeeze(3)
            # Global max pooling over all positions -> (batch, num_filter).
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))
        features = torch.cat(pooled, dim=1)

        # Dropout + BN on the pooled features, then the hidden layer.
        features = self.bn1(F.dropout(features, p=0.5, training=self.training, inplace=True))
        hidden = self.act(self.fc(features))
        hidden = self.bn2(F.dropout(hidden, p=0.2, training=self.training, inplace=True))
        return self.out(hidden)

    def _initialize_weights(self):
        """Kaiming init for convolutions, Xavier for linear layers, identity for batch norm."""
        for module in self.modules():
            if isinstance(module, nn.BatchNorm1d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out')
            elif isinstance(module, nn.Linear):
                nn.init.xavier_normal_(module.weight)
            else:
                continue
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
4. BiLSTM
- 单层BiLSTM word-level
对单层BiLSTM的输出结果,沿长度方向作最大池化和平均池化,把两个池化结果拼接,再接几个全连接层进行分类。
class Pooled_BiLSTM(nn.Module):
    """Single-layer bidirectional LSTM whose outputs are max- and mean-pooled
    over time, concatenated and classified through one hidden layer.

    Args:
        hidden_dim: hidden size of each LSTM direction.
        fc_dim: width of the hidden layer.
    """

    def __init__(self, hidden_dim, fc_dim):
        super().__init__()
        # Load the initial word embedding matrix.
        embed_mat = torch.from_numpy(np.load('../../data/word_embed_mat.npy').astype(np.float32))
        embed_dim = embed_mat.size(1)
        # Not used inside the class; callers move the model with ``.to(device)``.
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        # No ``freeze`` argument is passed, so the library default applies.
        self.embed = nn.Embedding.from_pretrained(embed_mat)
        # Single-layer bidirectional LSTM, batch dimension first.
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True)
        # Hidden and output layers; 4 * hidden_dim = two poolings x two directions.
        self.fc = nn.Linear(4 * hidden_dim, fc_dim)
        self.out = nn.Linear(fc_dim, 19)
        # Batch normalisation layers.
        self.bn1 = nn.BatchNorm1d(4 * hidden_dim)
        self.bn2 = nn.BatchNorm1d(fc_dim)
        self._initialize_weights()

    def forward(self, input):
        """Map index input of shape (batch, sentence_len) to logits of shape (batch, 19)."""
        # Embed the tokens and apply dropout (p=0.2).
        out = F.dropout(self.embed(input), p=0.2, training=self.training, inplace=True)
        out, _ = self.lstm(out)  # (batch, sentence_len, 2 * hidden_dim)
        max_pool, _ = torch.max(out, dim=1)  # (batch, 2 * hidden_dim)
        avg_pool = torch.mean(out, dim=1)  # (batch, 2 * hidden_dim)
        # Concatenate both poolings, then dropout (p=0.3) and BN -> (batch, 4 * hidden_dim).
        out = self.bn1(F.dropout(torch.cat((max_pool, avg_pool), dim=1), p=0.3, training=self.training, inplace=True))
        # Hidden layer (no activation in this model), dropout (p=0.5) and BN.
        out = self.bn2(F.dropout(self.fc(out), p=0.5, training=self.training, inplace=True))
        return self.out(out)  # (batch, 19)

    def _initialize_weights(self):
        """Xavier init for linear layers, orthogonal init for the LSTM, identity for batch norm."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                # The model uses BatchNorm1d layers; the previous BatchNorm2d
                # check never matched any of them.
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LSTM):
                # Recurrent layers: orthogonal weight matrices, zero biases.
                for name, param in m.named_parameters():
                    if 'weight' in name:
                        nn.init.orthogonal_(param)
                    elif 'bias' in name:
                        nn.init.constant_(param, 0.0)
- 2层BiLSTM word-level
代码与单层BiLSTM word-level相同,只不过lstm变成了两层。
class Pooled_BiLSTM(nn.Module):
    """Two-layer bidirectional LSTM whose outputs are max- and mean-pooled
    over time, concatenated and classified through one hidden layer.

    Args:
        hidden_dim: hidden size of each LSTM direction.
        fc_dim: width of the hidden layer.
    """

    def __init__(self, hidden_dim, fc_dim):
        super().__init__()
        # Load the initial word embedding matrix.
        embed_mat = torch.from_numpy(np.load('../../data/word_embed_mat.npy').astype(np.float32))
        embed_dim = embed_mat.size(1)
        # Not used inside the class; callers move the model with ``.to(device)``.
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        # No ``freeze`` argument is passed, so the library default applies.
        self.embed = nn.Embedding.from_pretrained(embed_mat)
        # Two stacked bidirectional LSTM layers, batch dimension first.
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True,
                            num_layers=2)
        # Hidden and output layers; 4 * hidden_dim = two poolings x two directions.
        self.fc = nn.Linear(4 * hidden_dim, fc_dim)
        self.out = nn.Linear(fc_dim, 19)
        self.bn1 = nn.BatchNorm1d(4 * hidden_dim)
        self.bn2 = nn.BatchNorm1d(fc_dim)
        self._initialize_weights()

    def forward(self, input):
        """Map index input of shape (batch, sentence_len) to logits of shape (batch, 19)."""
        # Embed the tokens and apply dropout (p=0.2).
        out = F.dropout(self.embed(input), p=0.2, training=self.training, inplace=True)
        out, _ = self.lstm(out)  # (batch, sentence_len, 2 * hidden_dim)
        max_pool, _ = torch.max(out, dim=1)
        avg_pool = torch.mean(out, dim=1)
        # Concatenate both poolings, then dropout (p=0.3) and BN.
        out = self.bn1(F.dropout(torch.cat((max_pool, avg_pool), dim=1), p=0.3, training=self.training, inplace=True))
        # Hidden layer (no activation in this model), dropout (p=0.5) and BN.
        out = self.bn2(F.dropout(self.fc(out), p=0.5, training=self.training, inplace=True))
        return self.out(out)

    def _initialize_weights(self):
        """Xavier init for linear layers, orthogonal init for the LSTM, identity for batch norm."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                # The model uses BatchNorm1d layers; the previous BatchNorm2d
                # check never matched any of them.
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LSTM):
                # Recurrent layers: orthogonal weight matrices, zero biases.
                for name, param in m.named_parameters():
                    if 'weight' in name:
                        nn.init.orthogonal_(param)
                    elif 'bias' in name:
                        nn.init.constant_(param, 0.0)
- 2层BiLSTM word-level(High_Dropout)
代码与单层BiLSTM word-level相同,只不过lstm变成了两层,将两种池化结果拼接后直接跟输出层进行分类,中间没有加隐层和BN层,速度更快。
class High_Dropout_Pooled_BiLSTM(nn.Module):
    """Two-layer bidirectional LSTM with heavy input dropout; the max- and
    mean-pooled outputs feed the 19-way output layer directly (no hidden
    layer, no batch norm).

    Args:
        hidden_dim: hidden size of each LSTM direction.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        # Initial word embedding matrix; no ``freeze`` argument is passed,
        # so the library default applies.
        pretrained = torch.from_numpy(np.load('../../data/word_embed_mat.npy').astype(np.float32))
        embed_dim = pretrained.size(1)
        # Not used inside the class.
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.embed = nn.Embedding.from_pretrained(pretrained)
        # Two stacked bidirectional LSTM layers, batch dimension first.
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
            num_layers=2,
        )
        # 4 * hidden_dim = two poolings x two directions.
        self.out = nn.Linear(4 * hidden_dim, 19)
        self._initialize_weights()

    def forward(self, input):
        """Map index input of shape (batch, sentence_len) to logits of shape (batch, 19)."""
        # Embed the tokens and drop half of the embedding values while training.
        embedded = F.dropout(self.embed(input), p=0.5, training=self.training, inplace=True)
        states, _ = self.lstm(embedded)  # (batch, sentence_len, 2 * hidden_dim)
        # Pool over the time axis in two ways and concatenate.
        strongest = torch.max(states, dim=1)[0]
        average = torch.mean(states, dim=1)
        pooled = torch.cat((strongest, average), dim=1)
        return self.out(pooled)

    def _initialize_weights(self):
        """Xavier init for linear layers, orthogonal init for the LSTM, identity for batch norm."""
        for module in self.modules():
            if isinstance(module, nn.LSTM):
                # Recurrent layers: orthogonal weight matrices, zero biases.
                for name, param in module.named_parameters():
                    if 'weight' in name:
                        nn.init.orthogonal_(param)
                    elif 'bias' in name:
                        nn.init.constant_(param, 0.0)
                continue
            if isinstance(module, nn.BatchNorm1d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out')
            elif isinstance(module, nn.Linear):
                nn.init.xavier_normal_(module.weight)
            else:
                continue
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
5. TextGRU
word-level,将2层双向GRU的输出,沿长度方向作最大池化、平均池化,取最后时刻的隐藏状态,再通过Attention机制计算得到一个隐状态(各时刻隐状态的加权和),将四者进行拼接,再通过隐层、输出层进行分类。
class TextGRU_Ultimate(nn.Module):
    """Two-layer bidirectional GRU summarised by four poolings.

    The GRU outputs are reduced to (1) the output at the last position,
    (2) a max pool, (3) a mean pool and (4) an attention-weighted sum; the
    four vectors are concatenated and classified through one hidden layer.

    Args:
        hidden_dim: hidden size of each GRU direction.
        fc_dim: width of the hidden layer.
    """

    def __init__(self, hidden_dim, fc_dim):
        super().__init__()
        # Load the initial word embedding matrix.
        embed_mat = torch.from_numpy(np.load('../../data/word_embed_mat.npy').astype(np.float32))
        embed_dim = embed_mat.size(1)
        self.hidden_dim = hidden_dim
        # Kept for backward compatibility; the module itself no longer places
        # anything on this device (callers move the model with ``.to(device)``).
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        # No ``freeze`` argument is passed, so the library default applies.
        self.embed = nn.Embedding.from_pretrained(embed_mat)
        # Two stacked bidirectional GRU layers, batch dimension first.  The
        # attribute keeps its historical name ``lstm``.
        self.lstm = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True,
                           num_layers=2)
        # Learned attention query.  It is created on the same device as the
        # other parameters so the module is usable before any ``.to()`` call.
        self.ctx_vec_size = (2 * hidden_dim,)
        self.ctx_vec = nn.Parameter(torch.randn(self.ctx_vec_size))
        # Projection applied to the GRU outputs to obtain the attention keys.
        self.proj = nn.Linear(in_features=2 * hidden_dim, out_features=2 * hidden_dim)
        # 8 * hidden_dim = four poolings x two directions.
        self.fc = nn.Linear(8 * hidden_dim, fc_dim, bias=False)
        self.act = nn.RReLU()
        self.out = nn.Linear(fc_dim, 19)
        self.bn1 = nn.BatchNorm1d(8 * hidden_dim)
        self.bn2 = nn.BatchNorm1d(fc_dim)
        self._initialize_weights()

    def forward(self, input):
        """Map index input of shape (batch, sentence_len) to logits of shape (batch, 19)."""
        # Embed the tokens and apply dropout (p=0.2).
        out = F.dropout(self.embed(input), p=0.2, training=self.training, inplace=True)
        out, _ = self.lstm(out)  # (batch, sentence_len, 2 * hidden_dim); attention values

        # NOTE(review): this takes both directions at the last position. The
        # author had also tried combining the forward half at position -1 with
        # the backward half at position 0; the original choice is kept.
        last_pool = out[:, -1, :]
        max_pool, _ = torch.max(out, dim=1)  # (batch, 2 * hidden_dim)
        avg_pool = torch.mean(out, dim=1)  # (batch, 2 * hidden_dim)

        # Attention: keys = tanh(proj(outputs)), scores = keys . query,
        # softmax over the sentence, weighted sum of the outputs.
        keys = torch.tanh(self.proj(out))
        weights = F.softmax(torch.einsum('bsh,h->bs', keys, self.ctx_vec), dim=1)
        attention_pool = torch.einsum('bsh,bs->bh', out, weights)

        # Concatenate the four summaries -> (batch, 8 * hidden_dim).
        pool = torch.cat((last_pool, max_pool, avg_pool, attention_pool), dim=1)
        # Dropout + BN, then hidden layer -> activation -> dropout -> BN.
        out = self.bn1(F.dropout(pool, p=0.5, training=self.training, inplace=True))
        out = self.bn2(F.dropout(self.act(self.fc(out)), p=0.2, training=self.training, inplace=True))
        return self.out(out)  # (batch, 19)

    def _initialize_weights(self):
        """Xavier init for linear layers, orthogonal init for the GRU, identity for batch norm."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.GRU):
                # Recurrent layers: orthogonal weight matrices, zero biases.
                for name, param in m.named_parameters():
                    if 'weight' in name:
                        nn.init.orthogonal_(param)
                    elif 'bias' in name:
                        nn.init.constant_(param, 0.0)
6. TextGRUCNN
word-level,将单层双向GRU的输出结果,再通过TextCNN用不同大小的卷积核提取特征,再作全局最大池化,对池化结果进行拼接,再接隐层和全连接层进行分类。
class TextGRUCNN(nn.Module):
    """Bidirectional GRU followed by a TextCNN head.

    The GRU outputs are convolved with five kernel heights (1 to 5), each
    feature map is globally max-pooled, and the concatenated features go
    through one hidden layer and a 19-way output layer.

    Args:
        hidden_dim: hidden size of each GRU direction.
        num_filter: output channels of every convolution.
        fc_dim: width of the hidden layer.
    """

    def __init__(self, hidden_dim, num_filter, fc_dim):
        super().__init__()
        # Initial word embedding matrix; no ``freeze`` argument is passed,
        # so the library default applies.
        pretrained = torch.from_numpy(np.load('../../data/word_embed_mat.npy').astype(np.float32))
        embed_dim = pretrained.size(1)
        # Not used inside the class.
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.embed = nn.Embedding.from_pretrained(pretrained)
        # Single-layer bidirectional GRU, batch dimension first.
        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True)

        # Five bias-free 2-D convolutions with kernel heights 1..5, each
        # spanning the full GRU output width.
        gru_width = 2 * hidden_dim
        branches = []
        for height in range(1, 6):
            branches.append(nn.Conv2d(1, num_filter, (height, gru_width), bias=False))
        self.conv = nn.ModuleList(branches)

        pooled_dim = 5 * num_filter
        self.act = nn.PReLU()
        self.fc = nn.Linear(pooled_dim, fc_dim, bias=False)
        self.out = nn.Linear(fc_dim, 19)
        self.bn1 = nn.BatchNorm1d(pooled_dim)
        self.bn2 = nn.BatchNorm1d(fc_dim)
        self._initialize_weights()

    def forward(self, input):
        """Map index input of shape (batch, sentence_len) to logits of shape (batch, 19)."""
        # Embed the tokens and apply dropout (p=0.2).
        embedded = F.dropout(self.embed(input), p=0.2, training=self.training, inplace=True)
        states, _ = self.gru(embedded)  # (batch, sentence_len, 2 * hidden_dim)
        # Add a channel axis so the states can be fed to Conv2d.
        states = states.unsqueeze(dim=1)

        pooled = []
        for conv in self.conv:
            feature_map = self.act(conv(states)).squeeze(3)
            # Global max pooling over all positions -> (batch, num_filter).
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))
        features = torch.cat(pooled, dim=1)  # (batch, 5 * num_filter)

        # Dropout + BN, then hidden layer -> activation -> dropout -> BN.
        features = self.bn1(F.dropout(features, p=0.5, training=self.training, inplace=True))
        hidden = self.act(self.fc(features))
        hidden = self.bn2(F.dropout(hidden, p=0.2, training=self.training, inplace=True))
        return self.out(hidden)

    def _initialize_weights(self):
        """Kaiming init for convolutions, Xavier for linear layers, orthogonal
        for the GRU, identity for batch norm."""
        for module in self.modules():
            if isinstance(module, nn.GRU):
                # Recurrent layers: orthogonal weight matrices, zero biases.
                for name, param in module.named_parameters():
                    if 'weight' in name:
                        nn.init.orthogonal_(param)
                    elif 'bias' in name:
                        nn.init.constant_(param, 0.0)
                continue
            if isinstance(module, nn.BatchNorm1d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out')
            elif isinstance(module, nn.Linear):
                nn.init.xavier_normal_(module.weight)
            else:
                continue
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
7. 模型训练
- hold_out
将训练集留出一小部分作验证集,其余全部进行训练:
def hold_out_test(model_fn, model_name, train_data, train_label, test_data, batch_size=128, lr=1e-3,
                  num_folds=10, seed=1):
    """Train one model on a fixed train/validation split with early stopping.

    The labelled data is cut into ``num_folds`` stratified folds.  Fold 2 is
    held out (not trained on), the fold before it serves as the validation
    set and the remaining folds are used for training.  After every epoch the
    macro F1 on the validation set drives both the learning-rate schedule and
    early stopping; progress is printed, nothing is returned.

    Args:
        model_fn: zero-argument callable returning a fresh ``nn.Module``.
        model_name: accepted for interface compatibility; not used here.
        train_data: NumPy array of index sequences, one row per sample.
        train_label: NumPy array of integer class labels.
        test_data: accepted for interface compatibility; not used here.
        batch_size: mini-batch size for both loaders.
        lr: initial learning rate for Adam.
        num_folds: number of stratified folds.
        seed: seed for the NumPy and PyTorch random generators.
    """
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu:0'

    # torch.manual_seed seeds the CPU generator and all CUDA devices.  The
    # previous code seeded only CUDA when a GPU was present, leaving
    # CPU-side randomness (parameter init, shuffling) unseeded.
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Concatenate the validation indices of each stratified fold so that
    # consecutive slices of ``skf_indices`` correspond to the folds.
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=2018)
    skf_indices = []
    for _, fold_idx in skf.split(np.zeros(len(train_label)), train_label):
        skf_indices.extend(fold_idx)

    fold_len = len(train_label) // num_folds  # nominal number of samples per fold
    fold = 2  # index of the held-out fold
    print(f'Processing fold {fold}...')
    fold_start = fold * fold_len
    fold_end = (fold + 1) * fold_len
    if fold == 0:
        # Held-out fold is the first one: validate on the last fold.
        train_indices = skf_indices[fold_len:(num_folds - 1) * fold_len]
        valid_indices = skf_indices[(num_folds - 1) * fold_len:]
    elif fold == num_folds - 1:
        # Held-out fold is the last one: validate on the second to last fold.
        train_indices = skf_indices[:(num_folds - 2) * fold_len]
        valid_indices = skf_indices[(num_folds - 2) * fold_len:(num_folds - 1) * fold_len]
    else:
        # Validate on the fold right before the held-out one, train on the rest.
        valid_indices = skf_indices[fold_start - fold_len:fold_start]
        train_indices = skf_indices[:fold_start - fold_len] + skf_indices[fold_end:]

    train_x, valid_x = train_data[train_indices], train_data[valid_indices]
    train_y, valid_y = train_label[train_indices], train_label[valid_indices]
    train_size = len(train_y)
    valid_size = len(valid_y)
    print(f'# of training samples: {train_size}')
    print(f'# of validation samples: {valid_size}')

    # Whole splits are moved to the device once, so batches need no transfer.
    # The held-out fold is not used below and is therefore no longer copied.
    train_dataset = TensorDataset(torch.from_numpy(train_x).long().to(device),
                                  torch.from_numpy(train_y).long().to(device))
    valid_dataset = TensorDataset(torch.from_numpy(valid_x).long().to(device),
                                  torch.from_numpy(valid_y).long().to(device))
    train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size)  # order preserved

    model = model_fn().to(device)
    # Summed (not averaged) cross entropy; divided by the split size below.
    criterion = nn.CrossEntropyLoss(reduction='sum')
    # Only optimise parameters that are not frozen.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = Adam(trainable_params, lr)
    # Halve the learning rate when validation F1 stops improving, down to min_lr.
    scheduler = ReduceLROnPlateau(optimizer, 'max', factor=0.5, patience=1, min_lr=5e-6, verbose=True)

    epochs = 1000  # upper bound; early stopping normally ends training
    patience = 5
    best_epoch = -1
    best_f1 = -1
    for epoch in range(epochs):
        # Early stopping.
        if epoch - best_epoch > patience:
            print(f'No improvement for {patience} epochs, stop training...')
            break

        start = time.time()

        # Training phase.
        model.train()
        train_loss = 0.0  # mean per-sample loss over the epoch
        y_true, y_pred = [], []
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            # Flatten to (batch, classes); unlike squeeze() this keeps the
            # batch axis when a batch holds a single sample.
            outputs = model(inputs).reshape(len(targets), -1)
            loss = criterion(outputs, targets)
            # Collect NumPy copies; device tensors cannot be concatenated by NumPy.
            y_true.append(targets.cpu().numpy())
            y_pred.append(outputs.detach().cpu().numpy().argmax(axis=1))
            loss.backward()
            clip_grad_norm_(trainable_params, 1.0)  # gradient clipping
            optimizer.step()
            train_loss += loss.item() / train_size
        train_f1 = f1_score(np.concatenate(y_true), np.concatenate(y_pred), average='macro')

        # Validation phase.
        model.eval()
        valid_loss = 0.0  # mean per-sample loss over the validation set
        pred = []
        with torch.no_grad():
            for inputs, targets in valid_loader:
                outputs = model(inputs).reshape(len(targets), -1)
                loss = criterion(outputs, targets)
                pred.append(outputs.cpu().numpy().argmax(axis=1))
                valid_loss += loss.item() / valid_size
        valid_f1 = f1_score(valid_y, np.concatenate(pred), average='macro')
        scheduler.step(valid_f1)

        improved = valid_f1 > best_f1
        if improved:
            best_epoch = epoch
            best_f1 = valid_f1
        elapsed = time.time() - start
        if improved:
            print(f'Epoch {epoch} in {elapsed:.1f}s: improved!')
            print(f' train loss: {train_loss:.4f} valid loss: {valid_loss:.4f} ')
        else:
            print(f'Epoch {epoch} in {elapsed:.1f}s:')
            print(f' train loss: {train_loss:.4f} valid loss: {valid_loss:.4f}')
        print(f' train f1_macro: {train_f1:.4f} valid f1_macro: {valid_f1:.4f}')
- cross validation
由于深度学习的数据量巨大,我们一般不采用交叉验证,但是为了尽可能提高比赛结果,可以尝试使用,不过需要很大的计算代价。
def _split_fold(skf_indices, fold, num_folds, fold_len):
    """Return the (train, valid, test) index lists for one fold.

    ``skf_indices`` is cut into ``num_folds`` consecutive chunks of
    ``fold_len`` items (the last chunk also takes the remainder).  Chunk
    ``fold`` is the held-out test chunk, the chunk just before it (wrapping
    round to the last chunk for fold 0) is used for validation, and all the
    remaining chunks form the training split.
    """
    fold_start = fold * fold_len
    fold_end = (fold + 1) * fold_len
    if fold == 0:
        train_indices = skf_indices[fold_len:(num_folds - 1) * fold_len]
        valid_indices = skf_indices[(num_folds - 1) * fold_len:]
    elif fold == num_folds - 1:
        train_indices = skf_indices[:(num_folds - 2) * fold_len]
        valid_indices = skf_indices[(num_folds - 2) * fold_len:(num_folds - 1) * fold_len]
        # The last chunk absorbs whatever is left after integer division.
        fold_end = len(skf_indices)
    else:
        valid_indices = skf_indices[fold_start - fold_len:fold_start]
        train_indices = skf_indices[:fold_start - fold_len] + skf_indices[fold_end:]
    test_indices = skf_indices[fold_start:fold_end]
    return train_indices, valid_indices, test_indices


def _forward_logits(model, inputs):
    """Run ``model`` on ``inputs`` and return a 2-D (batch, scores) tensor.

    A plain ``.squeeze()`` would also drop the batch axis when a batch holds a
    single sample; reshaping on the batch size keeps it.
    Assumes the model emits one score per class for every sample - TODO confirm.
    """
    return model(inputs).reshape(inputs.size(0), -1)


def _predict_logits(model, loader):
    """Return the model's raw scores for every batch of ``loader`` as one NumPy array."""
    model.eval()
    chunks = []
    with torch.no_grad():
        for inputs, _ in loader:
            # Move to host memory first: NumPy cannot read CUDA tensors.
            chunks.append(_forward_logits(model, inputs).cpu().numpy())
    return np.concatenate(chunks, axis=0)


def cross_validation_bagging(model_fn, model_name, train_data, train_label, test_data, batch_size=128, lr=1e-3,
                             num_folds=10, patience=10, seed=1):
    """Train ``num_folds`` models and save out-of-fold / averaged test predictions.

    For every fold a fresh model is trained with Adam, early stopping on the
    validation macro-F1 and a ReduceLROnPlateau schedule.  The best checkpoint
    of the fold is reloaded and used to predict (a) the fold's held-out chunk
    of the training set and (b) the whole test set.  The held-out predictions
    fill ``meta_train``; the test predictions are averaged over folds into
    ``meta_test``.  Both arrays are written with ``np.save`` under
    ``../../oof_pred/{model_name}/{seed}/`` and checkpoints go to
    ``../../ckpt/{model_name}/{seed}/``.

    Args:
        model_fn: zero-argument callable returning a new ``nn.Module``.
        model_name: name used in checkpoint and output paths.
        train_data: NumPy array of training inputs, indexed by sample on axis 0
            and converted to ``long`` tensors.
        train_label: class label of every training sample.  Assumed to be
            integers in ``0..num_classes - 1`` - TODO confirm against caller.
        test_data: NumPy array of test inputs, same layout as ``train_data``.
        batch_size: mini-batch size for every DataLoader.
        lr: initial learning rate for Adam.
        num_folds: number of folds; at least 3 because each fold needs
            separate train / validation / held-out chunks.
        patience: number of epochs without validation improvement tolerated
            before training of a fold stops.
        seed: seed for NumPy and PyTorch random generators.

    Raises:
        ValueError: if ``num_folds`` is smaller than 3.
        RuntimeError: if a fold finishes without saving any checkpoint.
    """
    if num_folds < 3:
        raise ValueError(f'num_folds must be at least 3, got {num_folds}')

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu:0'
    np.random.seed(seed)
    # Always seed the CPU generator: weights are created on the CPU before
    # `.to(device)` and the DataLoader shuffle draws from it as well.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    train_label = np.asarray(train_label)

    # Stratified shuffling: concatenate the held-out indices of every split.
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=2018)
    skf_indices = []
    for _, valid_idx in skf.split(np.zeros(len(train_label)), train_label):
        skf_indices.extend(valid_idx)

    fold_len = len(train_label) // num_folds  # samples per fold
    num_classes = len(np.unique(train_label))  # number of classes
    meta_train = np.zeros((len(train_label), num_classes))  # out-of-fold predictions on the training set
    meta_test = np.zeros((test_data.shape[0], num_classes))  # fold-averaged predictions on the test set

    # The test set is shared by all folds, so build its loader once.
    # The labels are dummies that only satisfy the (input, label) batch layout.
    test_data_tensor = torch.from_numpy(test_data).long().to(device)
    test_label_tensor = torch.zeros(len(test_data)).long().to(device)
    test_data_dataset = TensorDataset(test_data_tensor, test_label_tensor)
    test_data_loader = DataLoader(test_data_dataset, batch_size)
    print(f'test_data_dataset & test_data_loader: {torch.cuda.memory_allocated() / 1024 ** 3:.2f}GB')

    for fold in range(num_folds):
        print(f'Loop Begin: {torch.cuda.memory_allocated() / 1024 ** 3:.2f}GB')
        print(f'Processing fold {fold}...')

        # Split the training set into train / validation / held-out chunks.
        train_indices, valid_indices, test_indices = _split_fold(skf_indices, fold, num_folds, fold_len)
        train_x, valid_x, test_x = train_data[train_indices], train_data[valid_indices], train_data[test_indices]
        train_y, valid_y, test_y = train_label[train_indices], train_label[valid_indices], train_label[test_indices]
        train_size = len(train_y)
        valid_size = len(valid_y)
        print(f'# of training samples: {train_size}')
        print(f'# of validation samples: {valid_size}')

        train_x_tensor = torch.from_numpy(train_x).long().to(device)
        train_y_tensor = torch.from_numpy(train_y).long().to(device)
        valid_x_tensor = torch.from_numpy(valid_x).long().to(device)
        valid_y_tensor = torch.from_numpy(valid_y).long().to(device)
        test_x_tensor = torch.from_numpy(test_x).long().to(device)
        test_y_tensor = torch.from_numpy(test_y).long().to(device)

        train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
        valid_dataset = TensorDataset(valid_x_tensor, valid_y_tensor)
        test_dataset = TensorDataset(test_x_tensor, test_y_tensor)
        train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size)
        test_loader = DataLoader(test_dataset, batch_size)

        model = model_fn().to(device)
        # Summed loss; it is divided by the split size below to get a per-sample mean.
        criterion = nn.CrossEntropyLoss(reduction='sum')
        trainable_params = [p for p in model.parameters() if p.requires_grad]
        optimizer = Adam(trainable_params, lr)
        scheduler = ReduceLROnPlateau(optimizer, 'max', factor=0.5, patience=1, min_lr=5e-6, verbose=True)

        epochs = 1000
        best_epoch = -1
        best_f1 = -1
        ckpt_path = None
        for epoch in range(epochs):
            # Early stopping
            if epoch - best_epoch > patience:
                print(f'No improvement for {patience} epochs, stop training...')
                break
            start = time.time()

            # Training phase
            model.train()
            train_loss = 0.0
            y_true, y_pred = [], []
            for inputs, y in train_loader:
                optimizer.zero_grad()
                outputs = _forward_logits(model, inputs)
                loss = criterion(outputs, y)
                y_true.append(y.cpu().numpy())
                y_pred.append(outputs.detach().cpu().numpy().argmax(axis=1))
                loss.backward()
                clip_grad_norm_(trainable_params, 1.0)
                optimizer.step()
                train_loss += loss.item() / train_size
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
            train_f1 = f1_score(y_true, y_pred, average='macro')
            del y_pred, y_true

            # Validation phase, run after every epoch
            model.eval()
            valid_loss = 0.0
            pred = []
            with torch.no_grad():
                for inputs, y in valid_loader:
                    outputs = _forward_logits(model, inputs)
                    loss = criterion(outputs, y)
                    pred.append(outputs.cpu().numpy().argmax(axis=1))
                    valid_loss += loss.item() / valid_size
            valid_y_pred = np.concatenate(pred)
            # valid_loader is not shuffled, so predictions line up with valid_y.
            valid_f1 = f1_score(valid_y, valid_y_pred, average='macro')
            scheduler.step(valid_f1)
            del valid_y_pred

            if valid_f1 > best_f1:  # keep the parameters with the best validation macro-F1
                best_epoch = epoch
                best_f1 = valid_f1
                ckpt_path = f'../../ckpt/{model_name}/{seed}/fold{fold}.ckpt'
                os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
                # The path is fixed per fold, so saving overwrites the stale checkpoint.
                torch.save(model.state_dict(), ckpt_path)
                elapsed = time.time() - start
                print(f'Epoch {epoch} in {elapsed:.1f}s: saved in {ckpt_path}')
                print(f' train loss: {train_loss:.4f} valid loss: {valid_loss:.4f} ')
                print(f' train f1_macro: {train_f1:.4f} valid f1_macro: {valid_f1:.4f}')
            else:
                elapsed = time.time() - start
                print(f'Epoch {epoch} in {elapsed:.1f}s: not improved')
                print(f' train loss: {train_loss:.4f} valid loss: {valid_loss:.4f}')
                print(f' train f1_macro: {train_f1:.4f} valid f1_macro: {valid_f1:.4f}')

        if ckpt_path is None:
            raise RuntimeError(f'fold {fold}: no checkpoint was saved (patience={patience})')

        # Reload the best parameters of this fold before predicting.
        model.load_state_dict(torch.load(ckpt_path))

        # Held-out chunk: scores are turned into probabilities and written to
        # the matching rows, so after all folds every training row is filled.
        # NOTE(review): `softmax` comes from elsewhere in this file and is
        # assumed to normalise each row independently - confirm.
        pred = _predict_logits(model, test_loader)
        meta_train[test_indices] = softmax(pred)
        del pred

        # Test set: average the probabilities of the per-fold models.
        pred = _predict_logits(model, test_data_loader)
        meta_test += softmax(pred) / num_folds
        del pred

        # Drop every per-fold object so its memory can be reclaimed before the next fold.
        del train_dataset, valid_dataset, test_dataset
        del train_loader, valid_loader, test_loader
        del train_x_tensor, train_y_tensor, valid_x_tensor, valid_y_tensor, test_x_tensor, test_y_tensor
        del model
        del criterion, optimizer, scheduler
        torch.cuda.empty_cache()
        gc.collect()

    oof_pred = np.argmax(meta_train, axis=1)  # predicted label for every training sample
    oof_f1_macro = f1_score(train_label, oof_pred, average='macro')  # out-of-fold macro-F1

    # Save the out-of-fold predictions on the training set.
    meta_train_path = f'../../oof_pred/{model_name}/{seed}/{model_name}_train_{oof_f1_macro:.4f}'
    os.makedirs(os.path.dirname(meta_train_path), exist_ok=True)
    np.save(meta_train_path, meta_train)

    # Save the averaged predictions on the test set.
    meta_test_path = f'../../oof_pred/{model_name}/{seed}/{model_name}_test_{oof_f1_macro:.4f}'
    os.makedirs(os.path.dirname(meta_test_path), exist_ok=True)
    np.save(meta_test_path, meta_test)