#AI夏令营 #Datawhale #夏令营
Task1是用于机器翻译任务的,主要包含以下几个部分:
- 数据预处理
核心代码:
class TranslationDataset(Dataset):
    """Parallel English-Chinese dataset with terminology-aware vocabularies.

    Reads tab-separated ``english<TAB>chinese`` sentence pairs from
    *filename*, builds a word-level English vocabulary and a character-level
    Chinese vocabulary, and guarantees every key of the *terminology*
    dictionary is present in the English vocabulary.

    NOTE(review): the vocabulary layout differs from the previous version
    (a ``<unk>`` special was added and duplicates removed), so checkpoints
    trained against the old vocabularies must be retrained.
    """

    def __init__(self, filename, terminology):
        self.data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # Fix: a malformed line without a tab used to raise
                # ValueError and abort loading; skip such lines instead.
                if not line or '\t' not in line:
                    continue
                en, zh = line.split('\t', 1)
                self.data.append((en, zh))

        self.terminology = terminology

        # English: word-level tokenization; Chinese: character-level.
        self.en_tokenizer = get_tokenizer('basic_english')
        self.zh_tokenizer = list

        en_counter = Counter(self.terminology.keys())  # ensure terms are counted
        zh_counter = Counter()
        for en, zh in self.data:
            en_counter.update(self.en_tokenizer(en))
            zh_counter.update(self.zh_tokenizer(zh))

        # Fix: out-of-vocabulary tokens previously fell back to <sos>;
        # a dedicated <unk> keeps them distinguishable from the start token.
        specials = ['<pad>', '<sos>', '<eos>', '<unk>']

        def _dedupe(words):
            # Fix: terminology words could appear twice (explicit prepend +
            # most_common), leaving dead duplicate vocab entries whose
            # word2idx mapping pointed only at the later index.
            seen = set()
            unique = []
            for w in words:
                if w not in seen:
                    seen.add(w)
                    unique.append(w)
            return unique

        self.en_vocab = _dedupe(
            specials
            + list(self.terminology.keys())
            + [w for w, _ in en_counter.most_common(10000)]
        )
        self.zh_vocab = _dedupe(
            specials + [w for w, _ in zh_counter.most_common(10000)]
        )

        self.en_word2idx = {w: i for i, w in enumerate(self.en_vocab)}
        self.zh_word2idx = {w: i for i, w in enumerate(self.zh_vocab)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(en_tensor, zh_tensor)`` index sequences, <eos>-terminated."""
        en, zh = self.data[idx]
        en_unk = self.en_word2idx['<unk>']
        zh_unk = self.zh_word2idx['<unk>']
        en_tensor = torch.tensor(
            [self.en_word2idx.get(w, en_unk) for w in self.en_tokenizer(en)]
            + [self.en_word2idx['<eos>']]
        )
        zh_tensor = torch.tensor(
            [self.zh_word2idx.get(w, zh_unk) for w in self.zh_tokenizer(zh)]
            + [self.zh_word2idx['<eos>']]
        )
        return en_tensor, zh_tensor
"\n",
def collate_fn(batch):
    """Pad a list of (en_tensor, zh_tensor) pairs into two batch tensors.

    Index 0 (the <pad> token) is used as the padding value; both returned
    tensors are batch-first.
    """
    en_seqs, zh_seqs = zip(*batch)
    pad = nn.utils.rnn.pad_sequence
    en_batch = pad(list(en_seqs), padding_value=0, batch_first=True)
    zh_batch = pad(list(zh_seqs), padding_value=0, batch_first=True)
    return en_batch, zh_batch
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2024-07-08T12:50:52.093405Z",
"iopub.status.busy": "2024-07-08T12:50:52.093120Z",
"iopub.status.idle": "2024-07-08T12:50:52.102203Z",
"shell.execute_reply": "2024-07-08T12:50:52.101497Z",
"shell.execute_reply.started": "2024-07-08T12:50:52.093388Z"
},
"tags": []
},
"outputs": [],
"source": [
代码中定义了一个TranslationDataset类,用于处理数据集。这个类主要负责以下任务:
- 加载数据集文件(例如train.txt),并将其分割为英文和中文句子对。
- 创建词汇表,包括术语词典,确保术语在词汇表中。
- 对句子进行分词处理,使用不同的分词方法(如英文使用basic_english,中文使用字符级分词)。
- 将分词后的句子转换为索引表示,以便于模型训练。
- 模型定义
核心代码:
class Encoder(nn.Module):
    """GRU encoder: embeds a batch of source token ids and encodes it."""

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` of shape [batch, src_len].

        Returns ``(outputs, hidden)`` with shapes
        [batch, src_len, hid_dim] and [n_layers, batch, hid_dim].
        """
        emb = self.dropout(self.embedding(src))  # [batch, src_len, emb_dim]
        return self.rnn(emb)
"\n",
class Decoder(nn.Module):
    """GRU decoder: predicts the next target token from a single input token."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        """One decoding step.

        ``input``: [batch, 1] token ids; ``hidden``: [n_layers, batch, hid_dim].
        Returns ``(prediction, hidden)`` where prediction is [batch, output_dim].
        """
        emb = self.dropout(self.embedding(input))  # [batch, 1, emb_dim]
        out, hidden = self.rnn(emb, hidden)        # out: [batch, 1, hid_dim]
        prediction = self.fc_out(out.squeeze(1))   # [batch, output_dim]
        return prediction, hidden
"\n",
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper with scheduled teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step conditioned on the encoded ``src``.

        ``src``: [batch, src_len]; ``trg``: [batch, trg_len].
        Returns logits of shape [batch, trg_len, output_dim]; position 0
        stays zero (it corresponds to the start token, never predicted).
        """
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)

        _, hidden = self.encoder(src)

        step_input = trg[:, 0].unsqueeze(1)  # start-token column
        for t in range(1, trg_len):
            logits, hidden = self.decoder(step_input, hidden)
            outputs[:, t, :] = logits
            use_teacher = random.random() < teacher_forcing_ratio
            best_guess = logits.argmax(1)
            # Feed either the ground-truth token or the model's own guess.
            step_input = trg[:, t].unsqueeze(1) if use_teacher else best_guess.unsqueeze(1)

        return outputs
代码中定义了Encoder、Decoder和Seq2Seq类,用于构建序列到序列(Sequence-to-Sequence)模型。这些类分别对应模型的编码器、解码器和整个模型。模型使用GRU作为循环神经网络(RNN)的基元。
- 训练和评估
核心代码:
def train(model, iterator, optimizer, criterion, clip, device=None):
    """Run one training epoch and return the mean batch loss.

    Fix: the original read a module-level ``device`` global, tying the
    function to script state; the device is now taken from the model's
    parameters (or passed explicitly via the new optional ``device`` arg,
    which keeps the call signature backward compatible).

    Args:
        model: callable ``(src, trg) -> [batch, trg_len, vocab]`` nn.Module.
        iterator: iterable of ``(src, trg)`` batch tensors.
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss over ``(logits, targets)``, e.g. CrossEntropyLoss.
        clip: max gradient norm for ``clip_grad_norm_``.
        device: optional device override; defaults to the model's device.

    Returns:
        float: average loss over all batches.
    """
    if device is None:
        device = next(model.parameters()).device
    model.train()
    epoch_loss = 0.0
    for src, trg in iterator:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        # Drop position 0 (the start token): it is never predicted.
        output = output[:, 1:].contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2024-07-08T12:50:52.108867Z",
"iopub.status.busy": "2024-07-08T12:50:52.108637Z"
},
"tags": []
},
"outputs": [],
"source": [
def evaluate_bleu(model: Seq2Seq, dataset: TranslationDataset, src_file: str, ref_file: str, terminology, device: torch.device):
    """Translate *src_file* and score it against *ref_file* with sacreBLEU.

    Returns the score object produced by ``BLEU.corpus_score`` for the
    translated sentences against the single reference set.
    """
    model.eval()
    sources = load_sentences(src_file)
    references = load_sentences(ref_file)

    hypotheses = [
        translate_sentence(sentence, model, dataset, terminology, device)
        for sentence in sources
    ]

    return BLEU().corpus_score(hypotheses, [references])
- 训练:定义了一个train函数,用于模型的训练过程。它包括模型的前向传播、损失计算、反向传播和梯度裁剪等步骤。
- 评估:定义了一个evaluate_bleu函数,用于评估模型在开发集上的BLEU分数。这个函数首先翻译开发集中的英文句子,然后计算翻译结果与参考翻译的BLEU分数。
- 模型推理
核心代码:
def inference(model: Seq2Seq, dataset: TranslationDataset, src_file: str, save_dir: str, terminology, device: torch.device):
    """Translate every sentence of *src_file* and write results to *save_dir*.

    The output file contains one translation per line, UTF-8 encoded.
    """
    model.eval()
    sources = load_sentences(src_file)

    translations = [
        translate_sentence(sentence, model, dataset, terminology, device)
        for sentence in sources
    ]

    # Join with newlines and write in one shot ('w' creates/overwrites).
    with open(save_dir, 'w', encoding='utf-8') as out:
        out.write('\n'.join(translations))
- 推理:定义了一个inference函数,用于在测试集上进行推理。这个函数读取测试集中的英文句子,使用训练好的模型进行翻译,并将结果保存到指定的文件中。
- 主函数
代码:
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Terminology dictionary and training data: the dataset also provides
    # the vocabularies the model was trained with.
    terminology = load_terminology_dictionary('../dataset/en-zh.dic')
    dataset = TranslationDataset('../dataset/train.txt', terminology=terminology)

    # Hyperparameters — must match the checkpoint being loaded below.
    INPUT_DIM = len(dataset.en_vocab)
    OUTPUT_DIM = len(dataset.zh_vocab)
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    HID_DIM = 512
    N_LAYERS = 2
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    # Assemble the seq2seq model and move it to the chosen device.
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)

    # Restore the trained weights.
    model.load_state_dict(torch.load('./translation_model_GRU.pth'))

    save_dir = '../dataset/submit.txt'
    inference(model, dataset, src_file="../dataset/test_en.txt", save_dir=save_dir, terminology=terminology, device=device)
    print(f"翻译完成!文件已保存到{save_dir}")
]
主函数中执行了以下操作:
- 初始化设备(GPU或CPU)。
- 加载术语词典。
- 创建数据集实例,并加载训练好的模型。
- (可选)调用evaluate_bleu函数来评估模型的性能——注意上面展示的主函数代码中并未实际调用该函数。
- 调用inference函数,将测试集中的英文句子翻译成中文,并将结果保存到../dataset/submit.txt文件中。
总结与思考
这段代码是一个完整的机器翻译系统,从数据预处理、模型训练、模型评估到最终的推理输出。它使用了序列到序列模型(seq2seq)结构,包括GRU作为RNN单元,以及BLEU分数作为评估模型性能的指标。通过主函数,可以方便地运行整个流程,从训练模型到最终生成翻译结果。刚开始只知道跟着步骤操作,让代码跑起来,根本不知道其中的含义,经过分析和查阅资料后对代码的作用有了一点了解。