Import required packages
Firstly, let’s make sure we have the below packages installed in our system, if you found that some packages are missing, make sure to install them.
import math
import torchtext
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from collections import Counter
from torchtext.vocab import Vocab
from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer
import io
import time
import pandas as pd
import numpy as np
import pickle
import tqdm
import sentencepiece as spm
torch.manual_seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(torch.cuda.get_device_name(0)) ## 如果你有GPU,请在你自己的电脑上尝试运行这一套代码
Get the parallel dataset
In this tutorial, we will use the Japanese-English parallel dataset downloaded from JParaCrawl![JParaCrawl] which is described as the “largest publicly available English-Japanese parallel corpus created by NTT. It was created by largely crawling the web and automatically aligning parallel sentences.” You can also see the paper here.
df = pd.read_csv('./zh-ja/zh-ja.bicleaner05.txt', sep='\\t', engine='python', header=None)# 获取第三列的数据,并转换为列表形式存储在trainen中
trainen = df[2].values.tolist()#[:10000]
trainja = df[3].values.tolist()#[:10000]
# trainen.pop(5972)
# trainja.pop(5972)
After importing all the Japanese and their English counterparts, I deleted the last data in the dataset because it has a missing value. In total, the number of sentences in both trainen and trainja is 5,973,071, however, for learning purposes, it is often recommended to sample the data and make sure everything is working as intended, before using all the data at once, to save time.
Here is an example of sentence contained in the dataset.
print(trainen[500])
print(trainja[500])
Prepare the tokenizers
Unlike English or other alphabetical languages, a Japanese sentence does not contain whitespaces to separate the words. We can use the tokenizers provided by JParaCrawl which was created using SentencePiece for both Japanese and English, you can visit the JParaCrawl website to download them, or click here.
en_tokenizer = spm.SentencePieceProcessor(model_file='enja_spm_models/spm.en.nopretok.model')# 加载英文 SentencePiece 模型,模型文件路径为 'enja_spm_models/spm.en.nopretok.model'
ja_tokenizer = spm.SentencePieceProcessor(model_file='enja_spm_models/spm.ja.nopretok.model')# 加载日文 SentencePiece 模型,模型文件路径为 'enja_spm_models/spm.ja.nopretok.model'
en_tokenizer.encode("All residents aged 20 to 59 years who live in Japan must enroll in public pension system.", out_type='str')# 调用 en_tokenizer 对象的 encode 方法,将给定的英文句子进行编码
ja_tokenizer.encode("年金 日本に住んでいる20歳~60歳の全ての人は、公的年金制度に加入しなければなりません。", out_type='str')# 调用 ja_tokenizer 对象的 encode 方法,将给定的日文句子进行编码
Build the TorchText Vocab objects and convert the sentences into Torch tensors
Using the tokenizers and raw sentences, we then build the Vocab object imported from TorchText. This process can take a few seconds or minutes depending on the size of our dataset and computing power. Different tokenizer can also affect the time needed to build the vocab, I tried several other tokenizers for Japanese but SentencePiece seems to be working well and fast enough for me.
def build_vocab(sentences, tokenizer): # 创建一个空的计数器 Counter 对象
counter = Counter() # 遍历每个句子并更新计数器
for sentence in sentences:# 使用给定的 tokenizer 对句子进行编码,并以字符串形式输出
counter.update(tokenizer.encode(sentence, out_type=str))
return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>']) # 更新计数器,统计每个子词的出现次数
ja_vocab = build_vocab(trainja, ja_tokenizer)
en_vocab = build_vocab(trainen, en_tokenizer)# 使用 build_vocab 函数分别构建日文和英文的词汇表
def data_process(ja, en):
data = []
for (raw_ja, raw_en) in zip(ja, en):
ja_tensor_ = torch.tensor([ja_vocab[token] for token in ja_tokenizer.encode(raw_ja.rstrip("\n"), out_type=str)],
dtype=torch.long)
en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer.encode(raw_en.rstrip("\n"), out_type=str)],
dtype=torch.long) # 使用 en_tokenizer 对英语句子进行编码,并将其转换为 torch.Tensor
data.append((ja_tensor_, en_tensor_))
return data
train_data = data_process(trainja, trainen)
# 使用 data_process 函数处理训练数据集 trainja 和 trainen,并将结果赋给 train_data 变量
Create the DataLoader object to be iterated during training¶
Here, I set the BATCH_SIZE to 16 to prevent “cuda out of memory”, but this depends on various things such as your machine memory capacity, size of data, etc., so feel free to change the batch size according to your needs (note: the tutorial from PyTorch sets the batch size as 128 using the Multi30k German-English dataset.)
BATCH_SIZE = 8
PAD_IDX = ja_vocab['<pad>']
BOS_IDX = ja_vocab['<bos>']
EOS_IDX = ja_vocab['<eos>']
def generate_batch(data_batch):
ja_batch, en_batch = [], []
for (ja_item, en_item) in data_batch:
ja_batch.append(torch.cat([torch.tensor([BOS_IDX]), ja_item, torch.tensor([EOS_IDX])], dim=0))
en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))# 在日语句子前后添加开始和结束标记,并转换为张量
ja_batch = pad_sequence(ja_batch, padding_value=PAD_IDX)
en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
return ja_batch, en_batch
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
shuffle=True, collate_fn=generate_batch)# 使用 DataLoader 创建一个训练数据迭代器 train_iter,用于按批次加载训练数据
Sequence-to-sequence Transformer
The next couple of codes and text explanations (written in italic) are taken from the original PyTorch tutorial [Language Translation with nn.Transformer and torchtext — PyTorch Tutorials 2.3.0+cu121 documentation]. I did not make any change except for the BATCH_SIZE and the word de_vocabwhich is changed to ja_vocab.
Transformer is a Seq2Seq model introduced in “Attention is all you need” paper for solving machine translation task. Transformer model consists of an encoder and decoder block each containing fixed number of layers.
Encoder processes the input sequence by propagating it, through a series of Multi-head Attention and Feed forward network layers. The output from the Encoder referred to as memory, is fed to the decoder along with target tensors. Encoder and decoder are trained in an end-to-end fashion using teacher forcing technique.
from torch.nn import (TransformerEncoder, TransformerDecoder,
TransformerEncoderLayer, TransformerDecoderLayer)
class Seq2SeqTransformer(nn.Module):
def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
dim_feedforward:int = 512, dropout:float = 0.1):
super(Seq2SeqTransformer, self).__init__()
encoder_layer = TransformerEncoderLayer(d_model=emb_size, nhead=NHEAD,
dim_feedforward=dim_feedforward)
self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
decoder_layer = TransformerDecoderLayer(d_model=emb_size, nhead=NHEAD,
dim_feedforward=dim_feedforward)
self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
self.generator = nn.Linear(emb_size, tgt_vocab_size)# 输出层,将解码器输出的嵌入表示映射到目标词汇表大小
self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)# 源语言和目标语言的词嵌入层
self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout) # 位置编码层,为输入添加位置信息
def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
tgt_mask: Tensor, src_padding_mask: Tensor,
tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
src_emb = self.positional_encoding(self.src_tok_emb(src)) # 对源序列嵌入应用位置编码
tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
# 对目标序列嵌入应用位置编码
memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)# 使用Transformer编码器对源序列进行编码
outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
tgt_padding_mask, memory_key_padding_mask)
# 使用Transformer解码器对目标序列进行解码
return self.generator(outs)
# 生成目标序列的输出logits
def encode(self, src: Tensor, src_mask: Tensor):
return self.transformer_encoder(self.positional_encoding(
self.src_tok_emb(src)), src_mask)
def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
return self.transformer_decoder(self.positional_encoding(
self.tgt_tok_emb(tgt)), memory,
tgt_mask)class PositionalEncoding(nn.Module):
def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
super(PositionalEncoding, self).__init__()
den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
pos = torch.arange(0, maxlen).reshape(maxlen, 1)
pos_embedding = torch.zeros((maxlen, emb_size))
pos_embedding[:, 0::2] = torch.sin(pos * den)
pos_embedding[:, 1::2] = torch.cos(pos * den)
pos_embedding = pos_embedding.unsqueeze(-2)
self.dropout = nn.Dropout(dropout)
self.register_buffer('pos_embedding', pos_embedding)
def forward(self, token_embedding: Tensor):
return self.dropout(token_embedding +
self.pos_embedding[:token_embedding.size(0),:]) # 将 token_embedding 与位置编码相加,并应用 dropout
class TokenEmbedding(nn.Module):
def __init__(self, vocab_size: int, emb_size):
super(TokenEmbedding, self).__init__()
self.embedding = nn.Embedding(vocab_size, emb_size) # 创建词嵌入层,将 vocab_size 个词映射到 emb_size 维的词向量空间
self.emb_size = emb_size
def forward(self, tokens: Tensor):
return self.embedding(tokens.long()) * math.sqrt(self.emb_size)# 根据 tokens 的索引,查找并返回对应的词向量,并乘以 math.sqrt(self.emb_size) 进行缩放
def generate_square_subsequent_mask(sz): # 创建一个上三角矩阵,对角线及其以下为True,其余为False
mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1) # 将布尔型矩阵转换为浮点型,True变为1.0,False变为0.0
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def create_mask(src, tgt):
src_seq_len = src.shape[0]
tgt_seq_len = tgt.shape[0]
tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
# 生成目标序列的上三角矩阵掩码
src_padding_mask = (src == PAD_IDX).transpose(0, 1)
tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask # 生成源序列和目标序列的填充掩码
SRC_VOCAB_SIZE = len(ja_vocab)
TGT_VOCAB_SIZE = len(en_vocab)
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 16
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
NUM_EPOCHS = 16
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
FFN_HID_DIM)
for p in transformer.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)# 对模型参数进行Xavier初始化
transformer = transformer.to(device)# 将模型移动到指定的设备(如GPU)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)# 定义损失函数:交叉熵损失函数,忽略填充部分的计算
optimizer = torch.optim.Adam(
transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9
)
def train_epoch(model, train_iter, optimizer):
model.train()
losses = 0
for idx, (src, tgt) in enumerate(train_iter):
src = src.to(device)
tgt = tgt.to(device)
tgt_input = tgt[:-1, :] # 去除tgt末尾的一行,作为输入
src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
logits = model(src, tgt_input, src_mask, tgt_mask,
src_padding_mask, tgt_padding_mask, src_padding_mask) # 前向传播得到预测值logits
optimizer.zero_grad()
tgt_out = tgt[1:,:]
loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
loss.backward()# 反向传播计算梯度
optimizer.step()
losses += loss.item()
return losses / len(train_iter) # 返回平均损失
def evaluate(model, val_iter):
model.eval()
losses = 0
for idx, (src, tgt) in (enumerate(valid_iter)):
src = src.to(device)
tgt = tgt.to(device)
tgt_input = tgt[:-1, :]
src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input) # 前向传播得到预测值logits
logits = model(src, tgt_input, src_mask, tgt_mask,
src_padding_mask, tgt_padding_mask, src_padding_mask)
tgt_out = tgt[1:,:]
loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
losses += loss.item()
return losses / len(val_iter)
Start training
Finally, after preparing the necessary classes and functions, we are ready to train our model. This goes without saying but the time needed to finish training could vary greatly depending on a lot of things such as computing power, parameters, and size of datasets.
When I trained the model using the complete list of sentences from JParaCrawl which has around 5.9 million sentences for each language, it took around 5 hours per epoch using a single NVIDIA GeForce RTX 3070 GPU.
Here is the code:
for epoch in tqdm.tqdm(range(1, NUM_EPOCHS+1)):# 循环训练每个epoch
start_time = time.time()
train_loss = train_epoch(transformer, train_iter, optimizer)
end_time = time.time()
print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, "
f"Epoch time = {(end_time - start_time):.3f}s")) # 输出当前epoch的训练结果:epoch数、训练损失和该epoch的训练时间
Try translating a Japanese sentence using the trained model
First, we create the functions to translate a new sentence, including steps such as to get the Japanese sentence, tokenize, convert to tensors, inference, and then decode the result back into a sentence, but this time in English.
def greedy_decode(model, src, src_mask, max_len, start_symbol):
src = src.to(device)
src_mask = src_mask.to(device)
memory = model.encode(src, src_mask)
ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
for i in range(max_len-1):
memory = memory.to(device)
memory_mask = torch.zeros(ys.shape[0], memory.shape[0]).to(device).type(torch.bool)
tgt_mask = (generate_square_subsequent_mask(ys.size(0))
.type(torch.bool)).to(device)
out = model.decode(ys, memory, tgt_mask)
out = out.transpose(0, 1)
prob = model.generator(out[:, -1])
_, next_word = torch.max(prob, dim = 1)
next_word = next_word.item()
ys = torch.cat([ys,
torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
if next_word == EOS_IDX:
break
return ys
def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
model.eval()
tokens = [BOS_IDX] + [src_vocab.stoi[tok] for tok in src_tokenizer.encode(src, out_type=str)]+ [EOS_IDX]
num_tokens = len(tokens)
src = (torch.LongTensor(tokens).reshape(num_tokens, 1) )
src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool) # 将源文本分词并转换为索引序列
tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten() # 使用贪婪解码生成目标序列的索引
return " ".join([tgt_vocab.itos[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")# 将目标序列的索引转换为文本
translate(transformer, "HSコード 8515 はんだ付け用、ろう付け用又は溶接用の機器(電気式(電気加熱ガス式を含む。)", ja_vocab, en_vocab, ja_tokenizer)
Save the Vocab objects and trained model
Finally, after the training has finished, we will save the Vocab objects (en_vocab and ja_vocab) first, using Pickle.
import pickle
# open a file, where you want to store the data
file = open('en_vocab.pkl', 'wb') # 以二进制写入模式打开名为'en_vocab.pkl'的文件
# dump information to that file
pickle.dump(en_vocab, file)# 将en_vocab对象序列化并写入到文件中
file.close()
file = open('ja_vocab.pkl', 'wb')
pickle.dump(ja_vocab, file)# 将ja_vocab对象序列化并写入到文件中
file.close()
save model for inference
torch.save(transformer.state_dict(), 'inference_model')
# save model + checkpoint to resume training later
# 使用torch.save()函数保存以下内容到'model_checkpoint.tar'文件中:torch.save({
'epoch': NUM_EPOCHS, # 当前训练的轮次数
'model_state_dict': transformer.state_dict(), # 模型的状态字典,包含模型的权重和参数
'optimizer_state_dict': optimizer.state_dict(),# 优化器的状态字典,包含优化器的状态信息(如动量、学习率等
'loss': train_loss,# 当前训练的损失值
}, 'model_checkpoint.tar')