Studying baseline3 and Putting It into Practice
Importing Libraries
Next, import the required libraries:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from torchtext.data.metrics import bleu_score
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import List, Tuple
import jieba
import random
from torch.nn.utils.rnn import pad_sequence
import sacrebleu
import time
import math
```
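The functions below reference three global constants that never appear in this excerpt: MAX_LENGTH, BATCH_SIZE, and N (the size of the training subset taken in load_data). A minimal sketch with placeholder values; baseline3's actual settings may differ:

```python
# Assumed global hyperparameters (placeholder values, not baseline3's actual settings)
MAX_LENGTH = 100   # maximum number of tokens kept per sentence
BATCH_SIZE = 32    # mini-batch size for the DataLoaders
N = 10000          # number of training pairs kept by the Subset in load_data
```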
Data Preprocessing
- Define tokenizers (a quick sanity check follows the code block):
  - English: spaCy's tokenizer (the `en_core_web_sm` model).
  - Chinese: jieba word segmentation.
```python
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
zh_tokenizer = lambda x: list(jieba.cut(x))
```
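A quick sanity check with made-up sentences, to confirm both tokenizers behave as expected before building the vocabulary:

```python
print(en_tokenizer("Hello, world!"))  # ['Hello', ',', 'world', '!'] (lower-casing happens in preprocess_data)
print(zh_tokenizer("今天天气很好"))     # e.g. ['今天', '天气', '很', '好'], depending on jieba's dictionary
```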
- Read data:
  - Read the lines of a file and return them as a list of strings.
```python
def read_data(file_path: str) -> List[str]:
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
```
- Preprocess data:
  - Tokenize the English and Chinese sentences and cap each side at MAX_LENGTH tokens.
```python
def preprocess_data(en_data: List[str], zh_data: List[str]) -> List[Tuple[List[str], List[str]]]:
    processed_data = []
    for en, zh in zip(en_data, zh_data):
        # Lower-case the English side, tokenize both sides, truncate to MAX_LENGTH tokens
        en_tokens = en_tokenizer(en.lower())[:MAX_LENGTH]
        zh_tokens = zh_tokenizer(zh)[:MAX_LENGTH]
        if en_tokens and zh_tokens:
            processed_data.append((en_tokens, zh_tokens))
    return processed_data
```
- Build vocabularies:
  - Build separate English and Chinese vocabularies with the special tokens <unk>, <pad>, <bos>, and <eos>, and set <unk> as the default index (a quick lookup check follows the code block).
```python
def build_vocab(data: List[Tuple[List[str], List[str]]]):
    en_vocab = build_vocab_from_iterator(
        (en for en, _ in data),
        specials=['<unk>', '<pad>', '<bos>', '<eos>']
    )
    zh_vocab = build_vocab_from_iterator(
        (zh for _, zh in data),
        specials=['<unk>', '<pad>', '<bos>', '<eos>']
    )
    # Out-of-vocabulary tokens fall back to <unk>
    en_vocab.set_default_index(en_vocab['<unk>'])
    zh_vocab.set_default_index(zh_vocab['<unk>'])
    return en_vocab, zh_vocab
```
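A quick check of the lookup behavior, assuming train_processed holds the output of preprocess_data on the training pairs; the torchtext Vocab maps token lists to index lists, and unseen tokens fall back to <unk>:

```python
en_vocab, zh_vocab = build_vocab(train_processed)  # train_processed: output of preprocess_data
print(en_vocab(['hello', 'world']))                # token list -> index list
print(en_vocab['definitely-unseen-token'] == en_vocab['<unk>'])  # True, via set_default_index
```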
Dataset and Data Loading
- Custom dataset:
  - Wraps the processed pairs and converts each sentence into an index sequence framed by <bos> and <eos>.
```python
class TranslationDataset(Dataset):
    def __init__(self, data: List[Tuple[List[str], List[str]]], en_vocab, zh_vocab):
        self.data = data
        self.en_vocab = en_vocab
        self.zh_vocab = zh_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        en, zh = self.data[idx]
        # Frame each sentence with <bos>/<eos> and map tokens to indices
        en_indices = [self.en_vocab['<bos>']] + [self.en_vocab[token] for token in en] + [self.en_vocab['<eos>']]
        zh_indices = [self.zh_vocab['<bos>']] + [self.zh_vocab[token] for token in zh] + [self.zh_vocab['<eos>']]
        return en_indices, zh_indices
```
- Collate function:
  - Pads every sequence in a batch to the batch's maximum length.
```python
def collate_fn(batch):
    en_batch, zh_batch = [], []
    for en_item, zh_item in batch:
        if en_item and zh_item:
            en_batch.append(torch.tensor(en_item))
            zh_batch.append(torch.tensor(zh_item))
    if not en_batch or not zh_batch:
        return torch.tensor([]), torch.tensor([])
    # Relies on the global en_vocab/zh_vocab that load_data sets
    en_batch = pad_sequence(en_batch, batch_first=True, padding_value=en_vocab['<pad>'])
    zh_batch = pad_sequence(zh_batch, batch_first=True, padding_value=zh_vocab['<pad>'])
    return en_batch, zh_batch
```
- Load data:
  - Read the training, dev, and test sets, build the vocabularies, and wrap everything in DataLoaders.
```python
def load_data(train_path: str, dev_en_path: str, dev_zh_path: str, test_en_path: str):
    # Read the training data (tab-separated English/Chinese pairs)
    train_data = read_data(train_path)
    train_en, train_zh = zip(*(line.split('\t') for line in train_data))

    # Read the dev and test sets
    dev_en = read_data(dev_en_path)
    dev_zh = read_data(dev_zh_path)
    test_en = read_data(test_en_path)

    # Preprocess; the test set has no references, so its target side stays empty
    train_processed = preprocess_data(train_en, train_zh)
    dev_processed = preprocess_data(dev_en, dev_zh)
    test_processed = [(en_tokenizer(en.lower())[:MAX_LENGTH], [])
                      for en in test_en if en.strip()]

    # Build vocabularies from the training data only
    global en_vocab, zh_vocab
    en_vocab, zh_vocab = build_vocab(train_processed)

    # Create datasets
    train_dataset = TranslationDataset(train_processed, en_vocab, zh_vocab)
    dev_dataset = TranslationDataset(dev_processed, en_vocab, zh_vocab)
    test_dataset = TranslationDataset(test_processed, en_vocab, zh_vocab)

    # Keep only the first N training pairs (N is a global constant)
    from torch.utils.data import Subset
    indices = list(range(N))
    train_dataset = Subset(train_dataset, indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              collate_fn=collate_fn, drop_last=True)
    dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE,
                            collate_fn=collate_fn, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=1,
                             collate_fn=collate_fn, drop_last=True)

    return train_loader, dev_loader, test_loader, en_vocab, zh_vocab
```
Model Construction
- Positional encoding:
  - Adds position information to the embeddings so the model can exploit token order (the underlying formula is given after the code block).
```python
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # shape: (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model); add the encoding for each position
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
```
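For reference, this implements the sinusoidal encoding from "Attention Is All You Need" (Vaswani et al., 2017):

$$
PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right), \qquad
PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)
$$

where `div_term` in the code evaluates $10000^{-2i/d_{\text{model}}}$ via `exp` and `log` for numerical stability.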
- Transformer model:
  - A translation model built on nn.Transformer (an instantiation sketch follows the code block).
```python
class TransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model, nhead, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, dropout):
        super(TransformerModel, self).__init__()
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers,
                                          num_decoder_layers, dim_feedforward, dropout)
        self.src_embedding = nn.Embedding(len(src_vocab), d_model)
        self.tgt_embedding = nn.Embedding(len(tgt_vocab), d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        self.fc_out = nn.Linear(d_model, len(tgt_vocab))
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.d_model = d_model

    def forward(self, src, tgt):
        # DataLoader yields batch-first tensors; nn.Transformer expects (seq, batch)
        src = src.transpose(0, 1)
        tgt = tgt.transpose(0, 1)
        # Causal mask for the decoder only; the encoder attends bidirectionally
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)
        src_padding_mask = (src == self.src_vocab['<pad>']).transpose(0, 1)
        tgt_padding_mask = (tgt == self.tgt_vocab['<pad>']).transpose(0, 1)
        src_embedded = self.positional_encoding(self.src_embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
        output = self.transformer(src_embedded, tgt_embedded, None, tgt_mask, None,
                                  src_padding_mask, tgt_padding_mask, src_padding_mask)
        return self.fc_out(output)

    def encode(self, src):
        src = src.transpose(0, 1)
        src_padding_mask = (src == self.src_vocab['<pad>']).transpose(0, 1)
        src_embedded = self.positional_encoding(self.src_embedding(src) * math.sqrt(self.d_model))
        # No attention mask: the encoder may see every source position
        memory = self.transformer.encoder(src_embedded, None, src_padding_mask)
        return memory

    def decode(self, tgt, memory):
        tgt = tgt.transpose(0, 1)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)
        tgt_padding_mask = (tgt == self.tgt_vocab['<pad>']).transpose(0, 1)
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
        # memory_key_padding_mask is left as None since the source mask is not passed in
        output = self.transformer.decoder(tgt_embedded, memory, tgt_mask, None,
                                          tgt_padding_mask, None)
        return self.fc_out(output)
```
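A minimal sketch of instantiating the model and checking the output layout. The hyperparameter values are illustrative assumptions, and en_vocab/zh_vocab must already exist (they are returned by load_data). Note that the logits come back sequence-first, which the training code must account for:

```python
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerModel(en_vocab, zh_vocab, d_model=256, nhead=8,
                         num_encoder_layers=3, num_decoder_layers=3,
                         dim_feedforward=512, dropout=0.1).to(device)

# Dummy batch-first inputs: (batch=2, src_len=7) and (batch=2, tgt_len=5);
# indices start at 4 to skip the special tokens
src = torch.randint(4, len(en_vocab), (2, 7)).to(device)
tgt = torch.randint(4, len(zh_vocab), (2, 5)).to(device)
print(model(src, tgt).shape)  # torch.Size([5, 2, len(zh_vocab)]) -- sequence-first logits
```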
Model Training
- Training function:
  - One epoch of teacher-forced training: the decoder consumes the target shifted right (zh[:, :-1]) and the loss is computed against the target shifted left (zh[:, 1:]), aligned to the model's sequence-first output.
```python
def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    epoch_loss = 0
    for en, zh in data_loader:
        en, zh = en.to(device), zh.to(device)
        optimizer.zero_grad()
        # Teacher forcing: feed zh[:, :-1] and predict zh[:, 1:]
        output = model(en, zh[:, :-1])       # (tgt_len, batch, vocab), sequence-first
        tgt = zh[:, 1:].transpose(0, 1)      # align the target to the same layout
        loss = criterion(output.reshape(-1, output.shape[-1]), tgt.reshape(-1))
        loss.backward()
        clip_grad_norm_(model.parameters(), 1.0)  # guard against exploding gradients
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)
```
- Evaluation function:
  - The same forward pass without gradient updates, returning the average dev loss.
```python
def evaluate(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for en, zh in data_loader:
            en, zh = en.to(device), zh.to(device)
            output = model(en, zh[:, :-1])   # sequence-first, as in training
            tgt = zh[:, 1:].transpose(0, 1)
            loss = criterion(output.reshape(-1, output.shape[-1]), tgt.reshape(-1))
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)
```
- Training loop:
  - Trains for num_epochs epochs and reports the train and dev loss after each one (a driver sketch follows the code block).
```python
def train_model(train_loader, dev_loader, model, optimizer, criterion, device, num_epochs):
    for epoch in range(num_epochs):
        start_time = time.time()
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        dev_loss = evaluate(model, dev_loader, criterion, device)
        end_time = time.time()
        # Whole minutes and seconds for readable timing output
        epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)
        print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\t Val. Loss: {dev_loss:.3f}')
```
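A minimal end-to-end driver under the assumptions above. The file paths are placeholders, and the ignore_index on the loss is an assumption (the excerpt never shows how criterion is built); load_data must run before the model is constructed, since the embedding sizes depend on the vocabularies it returns:

```python
# Placeholder file paths; replace with the dataset's actual locations
train_loader, dev_loader, test_loader, en_vocab, zh_vocab = load_data(
    'train.txt', 'dev_en.txt', 'dev_zh.txt', 'test_en.txt')
model = TransformerModel(en_vocab, zh_vocab, d_model=256, nhead=8,
                         num_encoder_layers=3, num_decoder_layers=3,
                         dim_feedforward=512, dropout=0.1).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=zh_vocab['<pad>'])  # padding does not count toward the loss
optimizer = optim.Adam(model.parameters(), lr=1e-4)              # assumed learning rate
train_model(train_loader, dev_loader, model, optimizer, criterion, device, num_epochs=10)
```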
Model Inference
- Translation function:
  - Greedy decoding: encode the source once, then repeatedly decode the tokens generated so far and append the argmax prediction until <eos> appears or max_length is reached.
```python
def translate_sentence(model, sentence, en_vocab, zh_vocab, device, max_length=50):
    model.eval()
    tokens = [en_vocab['<bos>']] + [en_vocab[token] for token in en_tokenizer(sentence.lower())] + [en_vocab['<eos>']]
    # Batch-first (1, src_len): encode/decode transpose to sequence-first internally
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    with torch.no_grad():
        memory = model.encode(src_tensor)
        outputs = [zh_vocab['<bos>']]
        for _ in range(max_length):
            tgt_tensor = torch.LongTensor(outputs).unsqueeze(0).to(device)
            output = model.decode(tgt_tensor, memory)
            # output is (tgt_len, 1, vocab); take the prediction at the last step
            pred_token = output.argmax(2)[-1].item()
            outputs.append(pred_token)
            if pred_token == zh_vocab['<eos>']:
                break
    # Map indices back to tokens; drop <bos> and a trailing <eos> if one was produced
    itos = zh_vocab.get_itos()
    translated_tokens = [itos[i] for i in outputs[1:]]
    if translated_tokens and translated_tokens[-1] == '<eos>':
        translated_tokens = translated_tokens[:-1]
    return translated_tokens
```
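Example inference call with a made-up sentence:

```python
# Hypothetical input; the output tokens depend entirely on the trained model
print(translate_sentence(model, "The weather is nice today.", en_vocab, zh_vocab, device))
```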
- Compute the BLEU score:
  - Translates each source sentence and scores the predictions against the references to evaluate translation quality.
```python
def calculate_bleu(data, model, en_vocab, zh_vocab, device):
    # Assumes `data` yields (en_indices, zh_indices) pairs, e.g. TranslationDataset items
    en_itos, zh_itos = en_vocab.get_itos(), zh_vocab.get_itos()
    specials = {'<unk>', '<pad>', '<bos>', '<eos>'}
    trgs, pred_trgs = [], []
    for src, trg in data:
        # Rebuild the raw English sentence, dropping special tokens
        src_sentence = " ".join(t for t in (en_itos[i] for i in src) if t not in specials)
        pred_trg = translate_sentence(model, src_sentence, en_vocab, zh_vocab, device)
        pred_trgs.append(pred_trg)
        # Reference: the Chinese tokens without special tokens
        trgs.append([[t for t in (zh_itos[i] for i in trg) if t not in specials]])
    return bleu_score(pred_trgs, trgs)
```
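sacrebleu is imported at the top but never used in this excerpt; a hedged sketch of how it could score detokenized strings instead (its 'zh' tokenizer evaluates Chinese at the character level):

```python
# Hypothetical usage: hypotheses and references are plain detokenized strings
hyps = ["".join(translate_sentence(model, "Hello world", en_vocab, zh_vocab, device))]
refs = [["你好，世界"]]  # one reference stream, aligned with hyps
print(sacrebleu.corpus_bleu(hyps, refs, tokenize='zh').score)
```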
Summary
The code above implements a Transformer-based English-to-Chinese translation model, walking through data preprocessing, model construction, training, and evaluation to show how a complete machine translation system fits together.