模型怎么构建?
翻译是文本生成任务,本质上可以看作逐步的分类:输入是英文,每一步输出的类别是汉字表中的某一个字。encoder 和 decoder 之间怎么建立联系?方式有很多种,这里用最简单的一种:将 "apple" 经过 encoder 的句子级输出作为历史信息,与 "苹果" 的 embedding 融合(作为 decoder 的初始隐状态),然后用 decoder 在每个字符位置上的输出去计算 loss。
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from tqdm import tqdm
import pickle
def get_data():
    """Return the toy parallel corpus as (english_words, chinese_words)."""
    src_corpus = ["apple", "banana", "orange", "pear", "black", "red", "white", "pink", "green", "blue"]
    tgt_corpus = ["苹果", "香蕉", "橙子", "梨", "黑色", "红色", "白色", "粉红色", "绿色", "蓝色"]
    return src_corpus, tgt_corpus
def get_word_dict(english, chinese):
    """Build character-level vocabularies for both sides of the corpus.

    Special tokens: PAD=0 and UNK=1 on both sides; the Chinese side also
    gets STA=2 (decoder start) and END=3 (decoder stop).

    Returns (eng_to_index, index_to_eng, chn_to_index, index_to_chn),
    where the index_to_* lists are the dict keys in insertion order.
    """
    eng_to_index = {"PAD": 0, "UNK": 1}
    chn_to_index = {"PAD": 0, "UNK": 1, "STA": 2, "END": 3}
    for word in english:
        for ch in word:
            if ch not in eng_to_index:
                eng_to_index[ch] = len(eng_to_index)
    for word in chinese:
        for ch in word:
            if ch not in chn_to_index:
                chn_to_index[ch] = len(chn_to_index)
    return eng_to_index, list(eng_to_index), chn_to_index, list(chn_to_index)
class TDataset(Dataset):
    """Encodes a parallel word list into fixed-length index tensors.

    Source side: char indices padded with 0 to ``eng_max_len``.
    Target side: STA + char indices + END, then padded with 0, so the
    target tensor always has length ``chn_max_len + 2``.
    Unknown characters map to index 1 (UNK).
    """

    def __init__(self, english, chinese, param):
        self.param = param
        self.english = english
        self.chinese = chinese
        assert len(self.english) == len(chinese), "双语长度不一致,翻译个毛线呢!"

    def __getitem__(self, index):
        eng_max = self.param["eng_max_len"]
        chn_max = self.param["chn_max_len"]
        eng_vocab = self.param["eng_to_index"]
        chn_vocab = self.param["chn_to_index"]
        # Truncate first so the padding arithmetic below never goes negative.
        src = self.english[index][:eng_max]
        tgt = self.chinese[index][:chn_max]
        src_ids = [eng_vocab.get(ch, 1) for ch in src] + [0] * (eng_max - len(src))
        tgt_ids = [chn_vocab["STA"]]
        tgt_ids += [chn_vocab.get(ch, 1) for ch in tgt]
        tgt_ids.append(chn_vocab["END"])
        tgt_ids += [0] * (chn_max - len(tgt))
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

    def __len__(self):
        return len(self.english)
class TModel(nn.Module):
    """GRU encoder-decoder for character-level translation.

    The encoder's final hidden state initializes the decoder (the simplest
    way to connect the two). Training uses teacher forcing: the decoder
    sees the gold target shifted right by one (STA prepended), and the
    loss compares its outputs to the target shifted left by one.
    """

    def __init__(self, param):
        super().__init__()
        self.eng_embedding = nn.Embedding(len(param["eng_to_index"]), param["embedding_num"])
        self.chn_embedding = nn.Embedding(len(param["chn_to_index"]), param["embedding_num"])
        self.encoder = nn.GRU(param["embedding_num"], param["hidden_num"], batch_first=True, bidirectional=param["bi"])
        self.decoder = nn.GRU(param["embedding_num"], param["hidden_num"], batch_first=True, bidirectional=param["bi"])
        self.classifier = nn.Linear(param["hidden_num"], len(param["chn_to_index"]))
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self, eng_index, chn_index):
        """Teacher-forced training step; returns the scalar cross-entropy loss.

        eng_index: (batch, eng_len) source char indices.
        chn_index: (batch, chn_len) target indices including STA ... END.
        """
        eng_e = self.eng_embedding(eng_index)
        chn_e = self.chn_embedding(chn_index[:, :-1])  # decoder input: drop the last token
        _, encoder_hidden = self.encoder(eng_e)
        decoder_out, _ = self.decoder(chn_e, encoder_hidden)
        pre = self.classifier(decoder_out)
        # BUG FIX: the original used chn_index.reshape(-1)[1:], which only
        # lines targets up with predictions when batch_size == 1; for larger
        # batches it misaligns (and mis-sizes) the flattened target. Shift
        # per sequence, then flatten.
        target = chn_index[:, 1:].reshape(-1)
        loss = self.loss_fun(pre.reshape(-1, pre.shape[-1]), target)
        return loss

    def translate(self, eng_index, chn_to_index, index_to_chn):
        """Greedily decode one source sentence (batch must be 1).

        Feeds the previous prediction back in one step at a time, stopping
        at END or after 21 generated characters.
        """
        assert len(eng_index) == 1
        result = []
        with torch.no_grad():  # inference only — no need to build a graph
            eng_e = self.eng_embedding(eng_index)
            _, decoder_hid = self.encoder(eng_e)
            # Create the start token on the same device as the input so the
            # model also works when moved to GPU.
            chn_index = torch.tensor([[chn_to_index["STA"]]], device=eng_index.device)
            while True:
                chn_e = self.chn_embedding(chn_index)
                decoder_out, decoder_hid = self.decoder(chn_e, decoder_hid)
                pre = self.classifier(decoder_out)
                chn_index = torch.argmax(pre, dim=-1)
                if int(chn_index) == chn_to_index["END"] or len(result) > 20:
                    break
                result.append(index_to_chn[int(chn_index)])
        return "".join(result)
if __name__ == "__main__":
    english, chinese = get_data()
    eng_to_index, index_to_eng, chn_to_index, index_to_chn = get_word_dict(english, chinese)
    # Bundle hyper-parameters and vocabularies in one dict so they travel
    # together (and can be pickled for the inference script).
    param = {
        "eng_to_index": eng_to_index,
        "index_to_eng": index_to_eng,
        "chn_to_index": chn_to_index,
        "index_to_chn": index_to_chn,
        "hidden_num": 200,
        "embedding_num": 100,
        "chn_max_len": 3,
        "eng_max_len": 6,
        "batch_size": 1,
        "epoch": 40,
        "lr": 1e-3,
        "bi": False,
    }
    dataset = TDataset(english, chinese, param)
    dataloader = DataLoader(dataset, batch_size=param["batch_size"], shuffle=False)
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = TModel(param).to(device)
    opt = torch.optim.AdamW(model.parameters(), param["lr"])
    best_loss = float("inf")  # robust initial best regardless of loss scale
    for e in range(param["epoch"]):
        epoch_total_loss = 0.0
        for eng_index, chn_index in tqdm(dataloader, desc="train:"):  # desc labels the progress bar
            eng_index = eng_index.to(device)
            chn_index = chn_index.to(device)
            loss = model.forward(eng_index, chn_index)
            loss.backward()
            opt.step()
            opt.zero_grad()
            # BUG FIX: use .item() — accumulating the loss tensor keeps every
            # step's autograd graph alive and leaks memory across the epoch.
            epoch_total_loss += loss.item()
        if epoch_total_loss < best_loss:
            print("save best model! loss:", epoch_total_loss)
            # Saving state_dict (parameters only) rather than the whole model
            # object; the inference script rebuilds TModel from `param`.
            torch.save(model.state_dict(), "best_model.pt")
            best_loss = epoch_total_loss
    # Persist the config/vocab dict so inference can reconstruct the model.
    with open("param.pkl", "wb") as f:
        pickle.dump(param, f)
在训练的时候值得注意的还是任务的构建:我们一开始构建 chn_to_index 的时候加入了 STA 和 END;后面把目标序列送入 decoder 的时候取的是 [:, :-1](去掉最后一个 token 作为输入),而计算 loss 的时候目标取的是从第 1 位开始(去掉开头的 STA),两者正好错开一位,这样预测和标签才能一一对应上。
测试:
测试的时候也要注意:我们把输入的英文正常编码成 eng_index 放入 encoder,然后将其句子级别的输出作为 decoder 的初始隐状态;decoder 的第一个输入是 STA,之后用一个 while True 循环逐字生成文本,直到输出 END 为止。
import torch
import torch.nn as nn
import pickle
class TModel(nn.Module):
    """GRU encoder-decoder for character-level translation (inference copy).

    Must match the training-time definition exactly so the saved
    state_dict loads. The encoder's final hidden state initializes the
    decoder; training uses teacher forcing with a one-token shift between
    decoder input and loss target.
    """

    def __init__(self, param):
        super().__init__()
        self.eng_embedding = nn.Embedding(len(param["eng_to_index"]), param["embedding_num"])
        self.chn_embedding = nn.Embedding(len(param["chn_to_index"]), param["embedding_num"])
        self.encoder = nn.GRU(param["embedding_num"], param["hidden_num"], batch_first=True, bidirectional=param["bi"])
        self.decoder = nn.GRU(param["embedding_num"], param["hidden_num"], batch_first=True, bidirectional=param["bi"])
        self.classifier = nn.Linear(param["hidden_num"], len(param["chn_to_index"]))
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self, eng_index, chn_index):
        """Teacher-forced training step; returns the scalar cross-entropy loss."""
        eng_e = self.eng_embedding(eng_index)
        chn_e = self.chn_embedding(chn_index[:, :-1])  # decoder input: drop the last token
        _, encoder_hidden = self.encoder(eng_e)
        decoder_out, _ = self.decoder(chn_e, encoder_hidden)
        pre = self.classifier(decoder_out)
        # BUG FIX: the original chn_index.reshape(-1)[1:] only aligns targets
        # with predictions when batch_size == 1; shift per sequence instead.
        target = chn_index[:, 1:].reshape(-1)
        loss = self.loss_fun(pre.reshape(-1, pre.shape[-1]), target)
        return loss

    def translate(self, eng_index, chn_to_index, index_to_chn):
        """Greedily decode one source sentence (batch must be 1), stopping
        at END or after 21 generated characters."""
        assert len(eng_index) == 1
        result = []
        with torch.no_grad():  # inference only — no autograd graph needed
            eng_e = self.eng_embedding(eng_index)
            _, decoder_hid = self.encoder(eng_e)
            # Start token on the input's device so GPU inference also works.
            chn_index = torch.tensor([[chn_to_index["STA"]]], device=eng_index.device)
            while True:
                chn_e = self.chn_embedding(chn_index)
                decoder_out, decoder_hid = self.decoder(chn_e, decoder_hid)
                pre = self.classifier(decoder_out)
                chn_index = torch.argmax(pre, dim=-1)
                if int(chn_index) == chn_to_index["END"] or len(result) > 20:
                    break
                result.append(index_to_chn[int(chn_index)])
        return "".join(result)
if __name__ == "__main__":
    # Reload the config/vocab dict saved by the training script.
    with open("param.pkl", "rb") as f:
        param = pickle.load(f)
    model = TModel(param)
    model_param = torch.load("best_model.pt", map_location="cpu")
    model.load_state_dict(model_param)
    model.eval()  # switch to inference mode (no dropout here, but good hygiene)
    while True:
        input_text = input("请输入:")
        # Truncate to eng_max_len, mirroring training-time preprocessing —
        # otherwise longer inputs would produce sequence lengths the model
        # was never trained on.
        input_text = input_text[:param["eng_max_len"]]
        input_idx = [param["eng_to_index"].get(ch, 1) for ch in input_text]
        input_idx = torch.tensor([input_idx + [0] * (param["eng_max_len"] - len(input_text))])
        result = model.translate(input_idx, param["chn_to_index"], param["index_to_chn"])
        print(result)