Why do we need pretrained models?
- Models have a huge number of parameters
- We want good training results within limited time
- They help when the dataset is small
ELMo: one word, multiple meanings
ELMo, short for Embeddings from Language Models, aims to find what a word means in a particular sentence, i.e. it interprets a word from the context before and after it,
instead of simply inferring one fixed vector from a few neighboring words the way Skip-Gram and CBOW do. This makes the word representations much closer to how people actually understand words.
Core ideas of ELMo
- information flowing from front to back (the words before)
- information flowing from back to front (the words after)
- information from the current word itself
Example:
Suppose the sentence is "今天天气真好!" ("The weather is really nice today!") and we want to predict what "天气" (weather) means here. We need its meaning in this particular sentence: "今天" (today) comes before it and "真好" (really nice) comes after it, so we combine those two context words with the word vector of "天气" itself to predict its meaning.
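To make that concrete, here is a minimal sketch of combining the three sources of information for "天气". The tensor names and sizes are made up for illustration only; this is not the ELMo implementation shown later.

import torch

hidden_size, emb_dim = 4, 4
# hypothetical states for "天气" in "今天 天气 真好"
h_forward = torch.randn(1, hidden_size)   # what a forward LSTM has read so far ("今天")
h_backward = torch.randn(1, hidden_size)  # what a backward LSTM has read so far ("真好")
w_embed = torch.randn(1, emb_dim)         # the word vector of "天气" itself
# one simple way to combine them: concatenate the three vectors
contextual = torch.cat([h_forward, w_embed, h_backward], dim=1)
print(contextual.shape)  # torch.Size([1, 12])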
Training
- Ideally a pretrained model can be trained in an unsupervised way, i.e. the raw text itself provides the labels.
Because ELMo runs two LSTMs (one reading forward, one reading backward), training will probably not be fast.
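A rough sketch of that unsupervised objective, using a single forward and a single backward LSTM with made-up sizes rather than the stacked model below: the text supplies its own labels, the forward LSTM learns to predict the next word and the backward LSTM the previous word.

import torch
from torch import nn
from torch.nn.functional import cross_entropy

vocab, emb_dim, hidden = 100, 16, 16
embed = nn.Embedding(vocab, emb_dim)
f_lstm = nn.LSTM(emb_dim, hidden, batch_first=True)  # reads left to right
b_lstm = nn.LSTM(emb_dim, hidden, batch_first=True)  # reads right to left (on flipped input)
f_head = nn.Linear(hidden, vocab)
b_head = nn.Linear(hidden, vocab)

seqs = torch.randint(1, vocab, (2, 8))               # [batch_size, seq_len], fake word ids
x = embed(seqs)
fo, _ = f_lstm(x[:, :-1])                            # hidden state after each prefix
bo, _ = b_lstm(torch.flip(x[:, 1:], dims=(1,)))      # hidden state after each suffix
bo = torch.flip(bo, dims=(1,))                       # flip back so positions line up

# forward predicts the next word, backward predicts the previous word
loss = (cross_entropy(f_head(fo).reshape(-1, vocab), seqs[:, 1:].reshape(-1)) +
        cross_entropy(b_head(bo).reshape(-1, vocab), seqs[:, :-1].reshape(-1))) / 2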
A few functions worth knowing
torch.flip()
Purpose: reverses the input along the given dimensions. The first argument is the input tensor; the second is a tuple or list of the dimensions to flip.
x = torch.randint(0,11,(3,2,2))
print('x:\n',x)
x0 = torch.flip(x,(0,))
print('x0:\n',x0)
x1 = torch.flip(x,(1,))
print('x1:\n',x1)
x2 = torch.flip(x,(2,))
print('x2:\n',x2)
Output:
x:
 tensor([[[ 8,  3],
          [ 8, 10]],
         [[ 1,  8],
          [ 9,  9]],
         [[ 8,  1],
          [ 8,  6]]])
x0:
 tensor([[[ 8,  1],
          [ 8,  6]],
         [[ 1,  8],
          [ 9,  9]],
         [[ 8,  3],
          [ 8, 10]]])
x1:
 tensor([[[ 8, 10],
          [ 8,  3]],
         [[ 9,  9],
          [ 1,  8]],
         [[ 8,  6],
          [ 8,  1]]])
x2:
 tensor([[[ 3,  8],
          [10,  8]],
         [[ 8,  1],
          [ 9,  9]],
         [[ 1,  8],
          [ 6,  8]]])
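In the ELMo code below, torch.flip is applied along dims=(1,), the sequence dimension, to reverse each sentence before it goes into the backward LSTM, and again afterwards to restore the original order. A quick check of that pattern (the word ids here are made up):

import torch

seqs = torch.tensor([[1, 2, 3, 4],
                     [5, 6, 7, 8]])          # [batch_size, seq_len]
rev = torch.flip(seqs, dims=(1,))            # each sentence read back to front
print(rev)
# tensor([[4, 3, 2, 1],
#         [8, 7, 6, 5]])
print(torch.flip(rev, dims=(1,)))            # flipping twice restores the original order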
Code
from torch import nn, optim
import torch
from torch.nn.functional import cross_entropy, softmax
import utils
from torch.utils.data import DataLoader
import os
# hyperparameters
HIDDEN_SIZE = 256
N_LAYERS = 2
BATCH_SIZE = 16
LEARNING_RATE = 2e-3
# seq_len      number of words in a sentence
# emb_dim      word-embedding dimension
# hidden_size  number of hidden units
# n_layers     number of LSTM layers
# lr           learning rate
class ELMo(nn.Module):
    def __init__(self, word_num, emb_dim, hidden_size, n_layers, lr):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.word_num = word_num
        # encoder
        # padding_idx=0 means the word at index 0 (padding) keeps an all-zero embedding
        self.word_embed = nn.Embedding(word_num, emb_dim, padding_idx=0)
        self.word_embed.weight.data.normal_(0, 0.1)
        # forward LSTM stack
        self.fs = nn.ModuleList(
            [nn.LSTM(input_size=emb_dim, hidden_size=hidden_size, batch_first=True) if i == 0 else nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True) for i in range(n_layers)])
        self.f_logits = nn.Linear(
            in_features=hidden_size, out_features=word_num)
        # backward LSTM stack
        self.bs = nn.ModuleList(
            [nn.LSTM(input_size=emb_dim, hidden_size=hidden_size, batch_first=True) if i == 0 else nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True) for i in range(n_layers)])
        self.b_logits = nn.Linear(
            in_features=hidden_size, out_features=word_num)
        # optimizer
        self.opt = optim.Adam(self.parameters(), lr=lr)
    def forward(self, seqs):
        # seqs.shape = [batch_size, seq_len]
        # get the device the model lives on
        device = next(self.parameters()).device
        # look up word vectors, embedded.shape = [batch_size, seq_len, emb_dim]
        embedded = self.word_embed(seqs)
        # the forward pass never needs the last word as input; each entry of fxs has shape [batch_size, seq_len-1, emb_dim]
        fxs = [embedded[:, :-1, :]]
        # the backward pass never needs the first word as input; each entry of bxs has shape [batch_size, seq_len-1, emb_dim]
        bxs = [embedded[:, 1:, :]]
        # initialize hidden and cell states: (h_f, c_f) for the forward LSTMs, (h_b, c_b) for the backward LSTMs
        # h_f.shape = c_f.shape = h_b.shape = c_b.shape = [num_directions*num_layers, batch_size, hidden_size] = [1, batch_size, hidden_size]
        (h_f, c_f) = (torch.zeros(1, seqs.shape[0], self.hidden_size).to(
            device), torch.zeros(1, seqs.shape[0], self.hidden_size).to(device))
        (h_b, c_b) = (torch.zeros(1, seqs.shape[0], self.hidden_size).to(
            device), torch.zeros(1, seqs.shape[0], self.hidden_size).to(device))
        # run the forward and backward LSTM stacks layer by layer
        for fl, bl in zip(self.fs, self.bs):
            # [n, step-1, hidden_size], [1, n, hidden_size]
            output_f, (h_f, c_f) = fl(fxs[-1], (h_f, c_f))
            # store the output so it becomes the next layer's input
            fxs.append(output_f)
            # bxs[-1].shape = [batch_size, seq_len-1, emb_dim]
            # torch.flip(bxs[-1], dims=[1,]) reverses each sentence along dim 1: indices [1,2,3...37] become [37,36,35...1]
            # [n, step-1, hidden_size], [1, n, hidden_size]
            output_b, (h_b, c_b) = bl(torch.flip(
                bxs[-1], dims=[1, ]), (h_b, c_b))
            # flip the output back before storing it as the next layer's input; the next layer flips its input again,
            # so the two flips cancel out, whereas skipping this flip would feed the next layer in the original (wrong) order
            bxs.append(torch.flip(output_b, dims=(1,)))
        return fxs, bxs
    # one optimization step
    def step(self, seqs):
        # seqs.shape = [batch_size, seq_len]
        # clear gradients
        self.opt.zero_grad()
        # forward() returns the two lists of layer outputs
        fo, bo = self(seqs)
        # project the last layer's outputs from hidden_size to word_num
        # fo.shape = bo.shape = [batch_size, seq_len-1, word_num]
        fo = self.f_logits(fo[-1])
        bo = self.b_logits(bo[-1])
        # average the forward and backward losses
        # seqs[:, 1:].shape = [batch_size, seq_len-1]
        # print(seqs[:,1:].reshape(-1).shape) = torch.Size([592])
        # print(seqs[:,1:].shape) = torch.Size([16, 37])
        # fo predicts words [1 -> n], bo predicts words [0 <- n-1]
        loss = (
            cross_entropy(fo.reshape(-1, self.word_num), seqs[:, 1:].reshape(-1)) +
            cross_entropy(bo.reshape(-1, self.word_num), seqs[:, :-1].reshape(-1)))/2
        # backpropagate and update
        loss.backward()
        self.opt.step()
        return loss.cpu().detach().numpy(), (fo, bo)
    # get each layer's representation of every word
    def get_emb(self, seqs):
        fxs, bxs = self(seqs)
        # the list is built in two steps because the slicing differs: fxs[0] covers words [0 -> n-1] while fxs[1:] cover [1 -> n],
        # and bxs[0] covers words [1 <- n] while bxs[1:] cover [0 <- n-1]
        xs = [
            torch.cat((fxs[0][:, 1:, :], bxs[0][:, :-1, :]),
                      dim=2).cpu().data.numpy()
        ] + [
            torch.cat((f[:, :-1, :], b[:, 1:, :]), dim=2).cpu().data.numpy() for f, b in zip(fxs[1:], bxs[1:])
        ]
        for x in xs:
            print("layers shape=", x.shape)
            # print(x)
        return xs
# load the dataset
dataset = utils.MRPCSingle("./MRPC", rows=2000)
def train():
    # print('num word: ', dataset.num_word)
    model = ELMo(
        word_num=dataset.num_word,
        emb_dim=HIDDEN_SIZE,
        hidden_size=HIDDEN_SIZE,
        n_layers=N_LAYERS,
        lr=LEARNING_RATE
    )
    # use the GPU if one is available
    if torch.cuda.is_available():
        print("GPU train available")
        device = torch.device("cuda")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        model = model.cpu()
    # data loader
    loader = DataLoader(
        dataset=dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2
    )
    # training loop
    for i in range(10):
        for batch_idx, batch in enumerate(loader):
            batch = batch.type(torch.LongTensor).to(device)
            # fo.shape = bo.shape = [batch_size, seq_len-1, word_num]
            loss, (fo, bo) = model.step(batch)
            # print(fo[0].shape) = torch.Size([37, 12880])
            # inspect predictions every 20 batches
            if batch_idx % 20 == 0:
                # take the most likely word index at each position
                fp = fo[0].cpu().data.numpy().argmax(axis=1)
                bp = bo[0].cpu().data.numpy().argmax(axis=1)
                print("\n\nEpoch: ", i,
                      "| batch: ", batch_idx,
                      "| loss: %.3f" % loss,
                      "\n| tgt: ", " ".join(
                          [dataset.i2v[i] for i in batch[0].cpu().data.numpy() if i != dataset.pad_id]),
                      "\n| f_prd: ", " ".join(
                          [dataset.i2v[i] for i in fp if i != dataset.pad_id]),
                      "\n| b_prd: ", " ".join(
                          [dataset.i2v[i] for i in bp if i != dataset.pad_id]),
                      )
    # create the output directory
    os.makedirs("./visual/models/elmo", exist_ok=True)
    # save the model parameters
    torch.save(model.state_dict(), "./visual/models/elmo/model.pth")
    export_w2v(model, batch[:4], device)

def export_w2v(model, data, device):
    model.load_state_dict(torch.load("./visual/models/elmo/model.pth", map_location=device))
    emb = model.get_emb(data)
    print(emb)

if __name__ == "__main__":
    train()
Training results
Using the pretrained model
Once the state_dict has been saved, the next time the pretrained model is needed you only have to change a few lines of code to reuse the weights trained earlier.
def train():
    model = ELMo(
        word_num=dataset.num_word,
        emb_dim=HIDDEN_SIZE,
        hidden_size=HIDDEN_SIZE,
        n_layers=N_LAYERS,
        lr=LEARNING_RATE
    )
    # use the GPU if one is available
    if torch.cuda.is_available():
        print("GPU train available")
        device = torch.device("cuda")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        model = model.cpu()
    # Step 1: load the previously saved weights here
    model.load_state_dict(torch.load("./visual/models/elmo/model.pth", map_location=device))
    ....
    # Step 2: no need to create the directory again
    # os.makedirs("./visual/models/elmo", exist_ok=True)
    torch.save(model.state_dict(), "./visual/models/elmo/model.pth")
    export_w2v(model, batch[:4], device)
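If the goal is only to extract word vectors rather than to keep training, the training loop can be skipped entirely. A minimal sketch, assuming the same dataset, ELMo class, hyperparameters and saved model.pth as above (extract_embeddings is a name introduced here for illustration, not part of the original code):

def extract_embeddings():
    model = ELMo(
        word_num=dataset.num_word,
        emb_dim=HIDDEN_SIZE,
        hidden_size=HIDDEN_SIZE,
        n_layers=N_LAYERS,
        lr=LEARNING_RATE
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # load the weights saved by train()
    model.load_state_dict(torch.load("./visual/models/elmo/model.pth", map_location=device))
    model.eval()
    # grab a small batch of sentences and get every layer's representation
    loader = DataLoader(dataset=dataset, batch_size=4, shuffle=False)
    batch = next(iter(loader)).type(torch.LongTensor).to(device)
    with torch.no_grad():
        return model.get_emb(batch)  # one numpy array per layer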