Task1 - 速通 Baseline
导入package
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from collections import Counter
import random
from torch.utils.data import Subset, DataLoader
import time
训练
# Entry point: train the GRU seq2seq baseline on a small prefix of the corpus.
if __name__ == '__main__':
    start_time = time.time()  # wall-clock timer for the whole run

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Terminology dictionary is handed to the dataset constructor below.
    terminology = load_terminology_dictionary('./dataset/en-zh.dic')

    # Load the parallel training corpus.
    dataset = TranslationDataset('./dataset/train.txt', terminology=terminology)

    # Train on the first N samples only (quick baseline). Clamp N to the
    # dataset size: Subset does not validate indices up front, so an
    # out-of-range index would otherwise surface as an IndexError only once
    # the DataLoader starts iterating.
    N = min(1000, len(dataset))  # or a fraction, e.g. int(len(dataset) * 0.1)
    subset_indices = list(range(N))
    subset_dataset = Subset(dataset, subset_indices)
    train_loader = DataLoader(
        subset_dataset,
        batch_size=32,
        shuffle=True,
        collate_fn=collate_fn,
    )

    # Model hyper-parameters. Vocabulary sizes come from the full dataset,
    # not the subset, so the inference cell can rebuild identical shapes.
    INPUT_DIM = len(dataset.en_vocab)
    OUTPUT_DIM = len(dataset.zh_vocab)
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    HID_DIM = 512
    N_LAYERS = 2
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    # Build the encoder/decoder pair and move the wrapped model to `device`.
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)

    # Optimizer and loss; target positions equal to the '<pad>' index are
    # excluded from the loss via ignore_index.
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=dataset.zh_word2idx['<pad>'])

    # Training loop.
    N_EPOCHS = 10
    CLIP = 1  # forwarded to train(); presumably the gradient-clipping threshold -- confirm

    for epoch in range(N_EPOCHS):
        train_loss = train(model, train_loader, optimizer, criterion, CLIP)
        print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')

    # Persist the weights once training has finished.
    torch.save(model.state_dict(), 'translation_model_GRU.pth')

    end_time = time.time()

    # Report total running time in minutes.
    elapsed_time_minute = (end_time - start_time) / 60
    print(f"Total running time: {elapsed_time_minute:.2f} minutes")
输出结果如下:
Epoch: 01 | Train Loss: 6.555
Epoch: 02 | Train Loss: 6.060
Epoch: 03 | Train Loss: 6.030
Epoch: 04 | Train Loss: 5.988
Epoch: 05 | Train Loss: 5.922
Epoch: 06 | Train Loss: 5.868
Epoch: 07 | Train Loss: 5.799
Epoch: 08 | Train Loss: 5.700
Epoch: 09 | Train Loss: 5.603
Epoch: 10 | Train Loss: 5.518
Total running time: 1.31 minutes
在测试集上进行推理
# Entry point: rebuild the model, load trained weights, translate the test set.
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the terminology dictionary.
    terminology = load_terminology_dictionary('./dataset/en-zh.dic')

    # Re-create the training dataset: its vocabularies define the embedding
    # and output sizes the checkpoint was saved with.
    dataset = TranslationDataset('./dataset/train.txt', terminology=terminology)

    # Model hyper-parameters (must match the values used for training).
    INPUT_DIM = len(dataset.en_vocab)
    OUTPUT_DIM = len(dataset.zh_vocab)
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    HID_DIM = 512
    N_LAYERS = 2
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    # Build the model skeleton.
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)

    # Load the trained weights. map_location remaps tensor storages onto the
    # current device, so a checkpoint written on a CUDA machine still loads
    # when only the CPU is available.
    state_dict = torch.load('translation_model_GRU.pth', map_location=device)
    model.load_state_dict(state_dict)

    # Switch to evaluation mode before decoding (dropout is configured at 0.5
    # above). NOTE(review): redundant but harmless if inference() already
    # calls model.eval() -- confirm.
    model.eval()

    save_dir = './dataset/submit.txt'
    inference(
        model,
        dataset,
        src_file="./dataset/test_en.txt",
        save_dir=save_dir,
        terminology=terminology,
        device=device,
    )
    print(f"翻译完成!文件已保存到{save_dir}")
输出结果如下:
翻译完成!文件已保存到./dataset/submit.txt