Using the official transformers interface
Following the official transformers documentation, it is easy to fine-tune BERT for classification and similar problems. The link is here.
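As a rough illustration of that route (not from the original post), a minimal sketch using the standard sequence-classification head in transformers might look like the following; num_labels=2 and the sample headline are placeholder assumptions:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hypothetical sketch: RoBERTa with the built-in sequence-classification head.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

inputs = tokenizer(["an example news headline"], return_tensors='pt', padding=True)
labels = torch.tensor([1])
outputs = model(**inputs, labels=labels)
outputs.loss.backward()   # fine-tune by backpropagating the classification loss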
Building the model yourself
Alternatively, build the model yourself and make roberta part of the model's forward pass; that is all it takes to implement fine-tuning. It is worth noting that fine-tuning has been shown to improve task performance in many settings and on many tasks.
One extra caveat: the roberta component must be given a very small learning rate. It cannot simply share a model-wide learning rate as large as 1e-4 or 1e-3; something around 1e-6 or 1e-7 is more appropriate. It can be set up as follows:
import torch

net = HomorNetv3()  # the model defined below; constructor arguments omitted here
lr = 0.001
# collect the ids of the RoBERTa parameters so they can get their own learning rate
robertamodel_params = list(map(id, net.Robertamodel.parameters()))
base_params = filter(lambda p: id(p) not in robertamodel_params,
                     net.parameters())
optimizer = torch.optim.SGD([
    {'params': base_params},                                 # uses the default lr below
    {'params': net.Robertamodel.parameters(), 'lr': 1e-7},   # tiny lr for RoBERTa
], lr=lr, momentum=0.9)
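With this setup, the parameters in base_params are updated with the default learning rate of 0.001, while the per-group 'lr' of 1e-7 overrides it for the RoBERTa parameters.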
As shown in the code below, roberta is embedded as part of the forward pass.
Parameter description:
- input_dim: dimensionality of the input vectors, 768 for roberta
- hidden_size: hidden dimensionality of the GRU
- out_size: dimensionality of the output layer
- train1: list of texts for the first edited news headlines
- train2: list of texts for the second edited news headlines
- n_layers: number of GRU layers
- batch_size: batch size of the samples
- window_size: size of the CNN window
- out_channels: output dimensionality of the CNN
- bidirectional: boolean, whether to use a bidirectional GRU
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaTokenizer, RobertaModel

class HomorNetv3(torch.nn.Module):
    def __init__(self, input_dim, hidden_size, out_size, train1, train2, n_layers=1, batch_size=1, window_size=3, out_channels=200, bidirectional=True):
        super(HomorNetv3, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.out_size = out_size
        self.out_channels = out_channels
        self.bidirectional = bidirectional
        self.train1 = train1
        self.train2 = train2

        # convolve the word vectors first
        self.conv = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(window_size, input_dim))
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(20, input_dim))

        # then feed the pooled feature maps into GRU layers (note: batch_first=True)
        self.gru = torch.nn.GRU(out_channels, hidden_size, n_layers, batch_first=True, bidirectional=self.bidirectional)

        self.Ro_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.Robertamodel = RobertaModel.from_pretrained('roberta-base')  # RoBERTa as part of the model
        if torch.cuda.is_available():
            self.Robertamodel.cuda()
        self.Robertamodel.train()

        # a fully connected layer after the GRU
        if self.bidirectional:
            self.fc1 = torch.nn.Linear(hidden_size*2, 200)
        else:
            self.fc1 = torch.nn.Linear(hidden_size, 200)

        # output layer
        self.fc2 = torch.nn.Linear(200, out_size)
    def forward(self, word_inputs, hidden):
        # hidden is the recurrent context, output is the RNN output
        # print("word_inputs", word_inputs.shape)
        count = 0
        for index in word_inputs.cpu():
            index = int(index)
            encoded_input1 = self.Ro_tokenizer(self.train1[index], return_tensors='pt')
            encoded_input2 = self.Ro_tokenizer(self.train2[index], return_tensors='pt')
            for key in encoded_input1.keys():
                encoded_input1[key] = encoded_input1[key].cuda()
            for key in encoded_input2.keys():
                encoded_input2[key] = encoded_input2[key].cuda()
            word_vectors1 = self.Robertamodel(**encoded_input1)[0]
            word_vectors2 = self.Robertamodel(**encoded_input2)[0]
            word_vectors = torch.cat((word_vectors1, word_vectors2), 1).cuda()
            # max_len is assumed to be a module-level constant (maximum sequence length)
            if len(word_vectors[0]) >= max_len:
                # if the headline is longer than max_len, truncate it
                word_vectors = word_vectors[:, :max_len, :]
            else:
                # otherwise pad with zeros
                padding_len = max_len - len(word_vectors[0])
                padding_vector = torch.zeros(1, padding_len, 768).cuda()
                word_vectors = torch.cat((word_vectors, padding_vector), 1)  # padding done
            if count == 0:
                count += 1
                batch_tensor_for_training = word_vectors  # start the batch tensor
            else:
                batch_tensor_for_training = torch.cat((batch_tensor_for_training, word_vectors), 0)

        embedded = batch_tensor_for_training.unsqueeze(1)
        feature_maps1 = self.conv(embedded)
        feature_maps2 = self.conv2(embedded)
        feature_maps = torch.cat((feature_maps1, feature_maps2), 2)
        pooled = self.pool_normalize_function(feature_maps)
        # print("pooled", pooled)
        output, hidden = self.gru(pooled, hidden)
        # print("gruoutput", output.shape)
        output = self.fc1(output)
        output = self.fc2(output)
        # keep only the last vector along the time-step dimension
        output = output[:, -1, :]
        # print("beforesoftmax", output.shape)
        output = F.softmax(output, dim=1)
        # print("output", output)
        return output, hidden
    def init_hidden(self):
        # This is not really part of the model: the very first step has no previous
        # context, so this simply provides an all-zero hidden state to start from.
        if self.bidirectional:
            hidden = torch.zeros(2*self.n_layers, self.batch_size, self.hidden_size, device='cuda')
        else:
            hidden = torch.zeros(self.n_layers, self.batch_size, self.hidden_size, device='cuda')
        return hidden
    def pool_normalize_function(self, feature_maps):
        feature_maps = feature_maps.squeeze(3)
        # apply ReLU
        feature_maps = F.relu(feature_maps)
        # apply 1-D max pooling (kernel size 2) along the last dimension
        pooled = F.max_pool1d(feature_maps, 2)
        pooled = pooled.permute(0, 2, 1)  # transpose so the sequence dimension comes second
        normalized = F.normalize(pooled, p=2, dim=2)
        # normalized = normalized.unsqueeze(2)
        return normalized
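To make the pieces above concrete, here is a rough usage sketch (not part of the original post). The training lists, max_len, and all hyperparameter values below are placeholder assumptions, and it assumes CUDA is available since the forward pass moves tensors to the GPU:

max_len = 40                        # assumed global maximum sequence length used in forward()
train1 = ["first edited headline"]  # placeholder data
train2 = ["second edited headline"]

net = HomorNetv3(input_dim=768, hidden_size=128, out_size=2,
                 train1=train1, train2=train2, batch_size=1).cuda()

hidden = net.init_hidden()                 # all-zero initial hidden state
word_inputs = torch.tensor([0])            # indices into train1/train2 for this batch
output, hidden = net(word_inputs, hidden)  # output has shape (batch_size, out_size)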