Machine Translation Challenge with Terminology Dictionary Intervention (基于术语词典干预的机器翻译挑战赛)
Introduction
"From Zero to NLP Competitions" is a learning activity of the Datawhale 2024 AI Summer Camp, Session 2 (NLP track), run as hands-on practice around the iFLYTEK Open Platform competition "基于术语词典干预的机器翻译挑战赛" (Machine Translation Challenge with Terminology Dictionary Intervention).
It is a full-workflow guide: even complete beginners can get their own baseline running.
Who this is for
- Beginners
- Anyone interested in machine learning, artificial intelligence, or natural language processing
- Anyone who enjoys hands-on practice
What you will learn
- How to set up an AI development environment
- How to build and tune models for NLP tasks
- How to take part in NLP competitions
Overview of this note
- Introduces the Datawhale activity, the iFLYTEK competition details, and the related resource links
- A detailed walkthrough of the baseline is given in the linked material; this note focuses on setting up the environment for local deployment
Resource links
Local deployment
Environment setup
In PyCharm, create a project using a virtual environment that already has PyTorch installed, then open the terminal.
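Before installing the extra libraries, it is worth confirming that the interpreter PyCharm picked really is the PyTorch environment. A minimal sanity check (assuming PyTorch is already installed there; CUDA is optional, since the baseline falls back to CPU):

```python
# Run in the project's Python console or as a throwaway script.
import torch

print(torch.__version__)          # the installed PyTorch version
print(torch.cuda.is_available())  # True only if a GPU build of PyTorch sees a CUDA device
```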
Then enter the installation commands below in the terminal.
Standard installation (no mirror acceleration; using a mirror, as shown in the next section, is recommended)
- torchtext: a library for natural language processing (NLP) tasks. It provides data preprocessing, vocabulary construction, serialization, and batching, and is well suited to text classification, sentiment analysis, machine translation, and similar tasks.
pip install torchtext
- jieba: a Chinese word-segmentation library that splits Chinese text into meaningful tokens.
pip install jieba
- sacrebleu: a tool for evaluating machine-translation quality, mainly by computing the BLEU (Bilingual Evaluation Understudy) score between generated text and reference translations.
pip install sacrebleu
- en_core_web_trf: the transformer-based English pipeline for spaCy, used by the baseline's English tokenizer.
python -m spacy download en_core_web_trf
Mirror-accelerated installation (using the Alibaba Cloud mirror)
The libraries are the same as above; only the index URL changes:
pip install torchtext -i https://mirrors.aliyun.com/pypi/simple/
pip install jieba -i https://mirrors.aliyun.com/pypi/simple/
pip install sacrebleu -i https://mirrors.aliyun.com/pypi/simple/
en_core_web_trf cannot be downloaded through the Alibaba Cloud mirror, so install it with the regular command:
python -m spacy download en_core_web_trf
Explanation: the -i <mirror URL> option tells pip to download from that mirror. These mirrors are hosted in mainland China, so downloads are much faster than from the default overseas index.
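Once the packages are installed (with or without a mirror), a quick import check confirms the environment matches what the baseline expects. This is a minimal sketch, not part of the baseline; loading en_core_web_trf is the slow step, since it pulls in the transformer-based English pipeline:

```python
# If every import succeeds, the baseline's dependencies are in place.
import torchtext
import jieba
import sacrebleu
import spacy

nlp = spacy.load("en_core_web_trf")  # fails if the spaCy model was not downloaded
print("All baseline dependencies import correctly.")
```

Note that torchtext releases are pinned to specific PyTorch releases, so if the torchtext import fails, installing the torchtext version that matches the installed torch usually resolves it.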
Mirror site summary
- Alibaba Cloud: https://mirrors.aliyun.com/pypi/simple/
- University of Science and Technology of China (USTC): https://pypi.mirrors.ustc.edu.cn/simple/
- Douban: http://pypi.douban.com/simple/
- Tsinghua University: https://pypi.tuna.tsinghua.edu.cn/simple/
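If you would rather not append -i to every command, pip can also be pointed at a mirror permanently. A one-line example using the Tsinghua mirror from the list above (any of the others works the same way):
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/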
Deployment
- Create a Python file in the PyCharm project you just set up
- Copy the baseline2 code below into that file
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from torchtext.data.metrics import bleu_score
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import List, Tuple
import jieba
import random
from torch.nn.utils.rnn import pad_sequence
import sacrebleu
import time
import math

# !python -m spacy download en_core_web_sm

# %% md
# ## Data preprocessing

# %%
# Define tokenizers
en_tokenizer = get_tokenizer('spacy', language='en_core_web_trf')
zh_tokenizer = lambda x: list(jieba.cut(x))  # use jieba for Chinese word segmentation

# %%
# Read data from file
def read_data(file_path: str) -> List[str]:
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]

# Preprocess parallel data
def preprocess_data(en_data: List[str], zh_data: List[str]) -> List[Tuple[List[str], List[str]]]:
    processed_data = []
    for en, zh in zip(en_data, zh_data):
        en_tokens = en_tokenizer(en.lower())[:MAX_LENGTH]
        zh_tokens = zh_tokenizer(zh)[:MAX_LENGTH]
        if en_tokens and zh_tokens:  # make sure neither sequence is empty
            processed_data.append((en_tokens, zh_tokens))
    return processed_data

# Build vocabularies
def build_vocab(data: List[Tuple[List[str], List[str]]]):
    en_vocab = build_vocab_from_iterator(
        (en for en, _ in data),
        specials=['<unk>', '<pad>', '<bos>', '<eos>']
    )
    zh_vocab = build_vocab_from_iterator(
        (zh for _, zh in data),
        specials=['<unk>', '<pad>', '<bos>', '<eos>']
    )
    en_vocab.set_default_index(en_vocab['<unk>'])
    zh_vocab.set_default_index(zh_vocab['<unk>'])
    return en_vocab, zh_vocab

# %%
class TranslationDataset(Dataset):
    def __init__(self, data: List[Tuple[List[str], List[str]]], en_vocab, zh_vocab):
        self.data = data
        self.en_vocab = en_vocab
        self.zh_vocab = zh_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        en, zh = self.data[idx]
        en_indices = [self.en_vocab['<bos>']] + [self.en_vocab[token] for token in en] + [self.en_vocab['<eos>']]
        zh_indices = [self.zh_vocab['<bos>']] + [self.zh_vocab[token] for token in zh] + [self.zh_vocab['<eos>']]
        return en_indices, zh_indices

# %%
def collate_fn(batch):
    en_batch, zh_batch = [], []
    for en_item, zh_item in batch:
        if en_item and zh_item:  # make sure neither sequence is empty
            # print("both non-empty")
            en_batch.append(torch.tensor(en_item))
            zh_batch.append(torch.tensor(zh_item))
        else:
            print("found an empty sequence")
    if not en_batch or not zh_batch:  # if the whole batch is empty, return empty tensors
        return torch.tensor([]), torch.tensor([])

    # src_sequences = [item[0] for item in batch]
    # trg_sequences = [item[1] for item in batch]

    en_batch = nn.utils.rnn.pad_sequence(en_batch, batch_first=True, padding_value=en_vocab['<pad>'])
    zh_batch = nn.utils.rnn.pad_sequence(zh_batch, batch_first=True, padding_value=zh_vocab['<pad>'])

    # en_batch = pad_sequence(en_batch, batch_first=True, padding_value=en_vocab['<pad>'])
    # zh_batch = pad_sequence(zh_batch, batch_first=True, padding_value=zh_vocab['<pad>'])

    return en_batch, zh_batch

# %%
# Load all datasets
def load_data(train_path: str, dev_en_path: str, dev_zh_path: str, test_en_path: str):
    # Read training data
    train_data = read_data(train_path)
    train_en, train_zh = zip(*(line.split('\t') for line in train_data))

    # Read dev and test sets
    dev_en = read_data(dev_en_path)
    dev_zh = read_data(dev_zh_path)
    test_en = read_data(test_en_path)

    # Preprocess
    train_processed = preprocess_data(train_en, train_zh)
    dev_processed = preprocess_data(dev_en, dev_zh)
    test_processed = [(en_tokenizer(en.lower())[:MAX_LENGTH], []) for en in test_en if en.strip()]

    # Build vocabularies
    global en_vocab, zh_vocab
    en_vocab, zh_vocab = build_vocab(train_processed)

    # Create datasets
    train_dataset = TranslationDataset(train_processed, en_vocab, zh_vocab)
    dev_dataset = TranslationDataset(dev_processed, en_vocab, zh_vocab)
    test_dataset = TranslationDataset(test_processed, en_vocab, zh_vocab)

    from torch.utils.data import Subset
    # e.g. if you had 10000 samples and only wanted the first 1000 for a quick test
    indices = list(range(N))
    train_dataset = Subset(train_dataset, indices)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, drop_last=True)
    dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, drop_last=True)

    return train_loader, dev_loader, test_loader, en_vocab, zh_vocab

# %%
# def pad_sequence(batch, batch_first, padding_value):
#     # Pad the source and target sequences separately
#     src_sequences = [item[0] for item in batch]
#     trg_sequences = [item[1] for item in batch]
#     src_padded = nn.utils.rnn.pad_sequence(src_sequences, batch_first=True, padding_value=en_vocab['<pad>'])
#     trg_padded = nn.utils.rnn.pad_sequence(trg_sequences, batch_first=True, padding_value=zh_vocab['<pad>'])
#     return src_padded, trg_padded

# %% md
# ## Model construction

# %%
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [batch size, src len]
        embedded = self.dropout(self.embedding(src))
        # embedded = [batch size, src len, emb dim]

        outputs, hidden = self.gru(embedded)
        # outputs = [batch size, src len, hid dim * n directions]
        # hidden = [n layers * n directions, batch size, hid dim]
        return outputs, hidden

class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [1, batch size, hid dim]
        # encoder_outputs = [batch size, src len, hid dim]
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]

        hidden = hidden.repeat(src_len, 1, 1).transpose(0, 1)
        # hidden = [batch size, src len, hid dim]

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy = [batch size, src len, hid dim]

        attention = self.v(energy).squeeze(2)
        # attention = [batch size, src len]
        return F.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(hid_dim + emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 2 + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [batch size, 1]
        # hidden = [n layers, batch size, hid dim]
        # encoder_outputs = [batch size, src len, hid dim]
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        # embedded = [batch size, 1, emb dim]

        a = self.attention(hidden[-1:], encoder_outputs)
        # a = [batch size, src len]
        a = a.unsqueeze(1)
        # a = [batch size, 1, src len]

        weighted = torch.bmm(a, encoder_outputs)
        # weighted = [batch size, 1, hid dim]

        rnn_input = torch.cat((embedded, weighted), dim=2)
        # rnn_input = [batch size, 1, emb dim + hid dim]

        output, hidden = self.gru(rnn_input, hidden)
        # output = [batch size, 1, hid dim]
        # hidden = [n layers, batch size, hid dim]

        embedded = embedded.squeeze(1)
        output = output.squeeze(1)
        weighted = weighted.squeeze(1)

        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        # prediction = [batch size, output dim]
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [batch size, src len]
        # trg = [batch size, trg len]
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)

        input = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1
        return outputs

# %%
# Initialize the model
def initialize_model(input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout, device):
    attn = Attention(hid_dim)
    enc = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout)
    dec = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout, attn)
    model = Seq2Seq(enc, dec, device).to(device)
    return model

# %% md
# ## Training

# %%
# Define the optimizer
def initialize_optimizer(model, learning_rate=0.001):
    return optim.Adam(model.parameters(), lr=learning_rate)

# %%
# Elapsed time per epoch
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# %%
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        # print(f"Training batch {i}")
        src, trg = batch
        # print(f"Source shape before: {src.shape}, Target shape before: {trg.shape}")
        if src.numel() == 0 or trg.numel() == 0:
            # print("Empty batch detected, skipping...")
            continue  # skip empty batches

        src, trg = src.to(DEVICE), trg.to(DEVICE)

        optimizer.zero_grad()
        output = model(src, trg)

        output_dim = output.shape[-1]
        output = output[:, 1:].contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output, trg)
        loss.backward()

        clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Average loss for this epoch: {epoch_loss / len(iterator)}")
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            # print(f"Evaluating batch {i}")
            src, trg = batch
            if src.numel() == 0 or trg.numel() == 0:
                continue  # skip empty batches

            src, trg = src.to(DEVICE), trg.to(DEVICE)

            output = model(src, trg, 0)  # turn off teacher forcing

            output_dim = output.shape[-1]
            output = output[:, 1:].contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

# %%
# Translate a single sentence
def translate_sentence(src_indexes, src_vocab, tgt_vocab, model, device, max_length=50):
    model.eval()
    src_tensor = src_indexes.unsqueeze(0).to(device)  # add a batch dimension
    # with torch.no_grad():
    #     encoder_outputs = model.encoder(model.positional_encoding(model.src_embedding(src_tensor) * math.sqrt(model.d_model)))

    trg_indexes = [tgt_vocab['<bos>']]
    for i in range(max_length):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        # print("src_tensor:", src_tensor)
        # print("trg_tensor:", trg_tensor)
        with torch.no_grad():
            output = model(src_tensor, trg_tensor)

        pred_token = output.argmax(2)[:, -1].item()
        trg_indexes.append(pred_token)
        if pred_token == tgt_vocab['<eos>']:
            break

    trg_tokens = [tgt_vocab.get_itos()[i] for i in trg_indexes]
    return trg_tokens[1:-1]  # strip the <bos> and <eos> tokens

# %%
def calculate_bleu(dev_loader, src_vocab, tgt_vocab, model, device):
    model.eval()
    translations = []
    references = []
    with torch.no_grad():
        for src, tgt in dev_loader:
            src = src.to(device)
            for sentence in src:
                translated = translate_sentence(sentence, src_vocab, tgt_vocab, model, device)
                translations.append(' '.join(translated))
            for reference in tgt:
                ref_tokens = [tgt_vocab.get_itos()[idx] for idx in reference
                              if idx not in [tgt_vocab['<bos>'], tgt_vocab['<eos>'], tgt_vocab['<pad>']]]
                references.append([' '.join(ref_tokens)])
    bleu = sacrebleu.corpus_bleu(translations, references)
    return bleu.score

# %%
# Main training loop
def train_model(model, train_iterator, valid_iterator, optimizer, criterion,
                N_EPOCHS=10, CLIP=1, save_path='../model/best-model.pt'):
    best_valid_loss = float('inf')
    for epoch in range(N_EPOCHS):
        start_time = time.time()
        # print(f"Starting Epoch {epoch + 1}")
        train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
        valid_loss = evaluate(model, valid_iterator, criterion)
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), save_path)

        print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')

# %%
# Define constants
MAX_LENGTH = 100  # maximum sentence length
BATCH_SIZE = 32
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
N = 148363  # number of training samples to use

train_path = '../dataset/train.txt'
dev_en_path = '../dataset/dev_en.txt'
dev_zh_path = '../dataset/dev_zh.txt'
test_en_path = '../dataset/test_en.txt'

train_loader, dev_loader, test_loader, en_vocab, zh_vocab = load_data(
    train_path, dev_en_path, dev_zh_path, test_en_path
)

print(f"English vocabulary size: {len(en_vocab)}")
print(f"Chinese vocabulary size: {len(zh_vocab)}")
print(f"Training set size: {len(train_loader.dataset)}")
print(f"Dev set size: {len(dev_loader.dataset)}")
print(f"Test set size: {len(test_loader.dataset)}")

# %%
# Main function
if __name__ == '__main__':
    N_EPOCHS = 5
    CLIP = 1

    # Model hyperparameters
    INPUT_DIM = len(en_vocab)
    OUTPUT_DIM = len(zh_vocab)
    EMB_DIM = 128
    HID_DIM = 256
    N_LAYERS = 2
    DROPOUT = 0.5

    # Initialize the model
    model = initialize_model(INPUT_DIM, OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, DEVICE)
    print(f'The model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters')

    # Loss function
    criterion = nn.CrossEntropyLoss(ignore_index=zh_vocab['<pad>'])
    # Optimizer
    optimizer = initialize_optimizer(model)

    # Train the model
    save_path = '../model/best-model.pt'
    train_model(model, train_loader, dev_loader, optimizer, criterion, N_EPOCHS, CLIP, save_path=save_path)

    print(f"Training complete! Model saved to: {save_path}")

# %% md
# ## Evaluate on the dev set

# %%
# # Load the best model
# model.load_state_dict(torch.load('../model/best-model.pt'))
#
# # Compute the BLEU score
# bleu_score = calculate_bleu(dev_loader, en_vocab, zh_vocab, model, DEVICE)
# print(f'BLEU score = {bleu_score:.2f}')

# %% md
# ## Translate the test set

# %%
# Load the best model
# model.load_state_dict(torch.load('../model/best-model_test.pt'))

# %%
save_dir = '../results/submit_task2.txt'
with open(save_dir, 'w') as f:
    translated_sentences = []
    for batch in test_loader:  # iterate over all test batches
        src, _ = batch
        src = src.to(DEVICE)
        translated = translate_sentence(src[0], en_vocab, zh_vocab, model, DEVICE, max_length=50)  # max_length caps the generated translation length
        # print(translated)
        results = "".join(translated)
        f.write(results + '\n')  # write the result to the file

print(f"Translation complete, results saved to {save_dir}")
```
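As a quick interactive check after training, the sketch below (not part of the baseline) reloads the best checkpoint and translates a single sentence. It assumes the baseline code above has already been executed in the same session, so en_tokenizer, en_vocab, zh_vocab, model, DEVICE, MAX_LENGTH, and translate_sentence are all defined; the example sentence itself is made up:

```python
# Hypothetical usage sketch: reload the best checkpoint and translate one English
# sentence with the helpers defined in the baseline above.
import torch

model.load_state_dict(torch.load('../model/best-model.pt', map_location=DEVICE))

sentence = "the experimental results are shown in table 2."  # made-up example input
tokens = en_tokenizer(sentence.lower())[:MAX_LENGTH]
indices = torch.tensor(
    [en_vocab['<bos>']] + [en_vocab[t] for t in tokens] + [en_vocab['<eos>']]
)
print(''.join(translate_sentence(indices, en_vocab, zh_vocab, model, DEVICE, max_length=50)))
```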
- Dataset download
Download the dataset from the iFLYTEK competition website and unzip it into the project's dataset folder (create the folder yourself).
- Run
Run the script; a sketch of the directory layout the baseline expects follows below.
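The baseline hard-codes relative paths ('../dataset/...', '../model/...', '../results/...'), so the script is expected to sit one level below the project root, with dataset, model, and results as sibling folders of the folder that holds the script. Below is a hypothetical pre-flight helper (not part of the baseline) that creates the output folders and checks that the dataset files are in place; run it from the folder containing your script:

```python
# Hypothetical pre-flight check: the baseline reads ../dataset/*.txt and writes
# checkpoints to ../model/ and the submission file to ../results/.
import os

for d in ('../dataset', '../model', '../results'):
    os.makedirs(d, exist_ok=True)  # output folders must exist before training / inference

for name in ('train.txt', 'dev_en.txt', 'dev_zh.txt', 'test_en.txt'):
    path = os.path.join('../dataset', name)
    print(path, 'OK' if os.path.isfile(path) else 'MISSING')
```

Once every file reports OK, start training and inference from that same folder with, for example, python baseline2.py (assuming that is what you named the file).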