Datawhale AI Summer Camp Machine Translation Challenge: Task 1 Study Notes

#AISummerCamp #Datawhale #SummerCamp

Task 1 is about the machine translation task. It consists of the following parts:

  1. Data preprocessing

Core code:

from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torchtext.data.utils import get_tokenizer

class TranslationDataset(Dataset):
    def __init__(self, filename, terminology):
        self.data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                en, zh = line.strip().split('\t')
                self.data.append((en, zh))

        self.terminology = terminology

        # Build the vocabularies; entries from the terminology dictionary must be included
        self.en_tokenizer = get_tokenizer('basic_english')
        self.zh_tokenizer = list  # character-level tokenization for Chinese

        en_vocab = Counter(self.terminology.keys())  # make sure the terms are counted
        zh_vocab = Counter()

        for en, zh in self.data:
            en_vocab.update(self.en_tokenizer(en))
            zh_vocab.update(self.zh_tokenizer(zh))

        # Special tokens and terminology entries go in front of the most common words
        self.en_vocab = ['<pad>', '<sos>', '<eos>'] + list(self.terminology.keys()) + [word for word, _ in en_vocab.most_common(10000)]
        self.zh_vocab = ['<pad>', '<sos>', '<eos>'] + [word for word, _ in zh_vocab.most_common(10000)]

        self.en_word2idx = {word: idx for idx, word in enumerate(self.en_vocab)}
        self.zh_word2idx = {word: idx for idx, word in enumerate(self.zh_vocab)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        en, zh = self.data[idx]
        # Out-of-vocabulary tokens fall back to the '<sos>' index (no '<unk>' token is defined)
        en_tensor = torch.tensor([self.en_word2idx.get(word, self.en_word2idx['<sos>']) for word in self.en_tokenizer(en)] + [self.en_word2idx['<eos>']])
        zh_tensor = torch.tensor([self.zh_word2idx.get(word, self.zh_word2idx['<sos>']) for word in self.zh_tokenizer(zh)] + [self.zh_word2idx['<eos>']])
        return en_tensor, zh_tensor

def collate_fn(batch):
    en_batch, zh_batch = [], []
    for en_item, zh_item in batch:
        en_batch.append(en_item)
        zh_batch.append(zh_item)

    # Pad the English and Chinese sequences separately (index 0 is '<pad>')
    en_batch = nn.utils.rnn.pad_sequence(en_batch, padding_value=0, batch_first=True)
    zh_batch = nn.utils.rnn.pad_sequence(zh_batch, padding_value=0, batch_first=True)

    return en_batch, zh_batch


The code defines a TranslationDataset class for handling the dataset. This class is responsible for the following tasks (a minimal usage sketch follows the list):

  • Load the dataset file (e.g. train.txt) and split each line into an English–Chinese sentence pair.
  • Build the vocabularies, making sure the entries of the terminology dictionary are included.
  • Tokenize the sentences, using a different scheme per language (basic_english for English, character-level tokenization for Chinese).
  • Convert the tokenized sentences into index sequences for model training.
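
To sanity-check the preprocessing, a minimal usage sketch might look like this. The terminology dictionary here is a hypothetical placeholder; in the real pipeline it is loaded from en-zh.dic via load_terminology_dictionary.

from torch.utils.data import DataLoader

# Hypothetical terminology dictionary: English term -> Chinese translation
terminology = {'neural': '神经', 'network': '网络'}

dataset = TranslationDataset('../dataset/train.txt', terminology=terminology)
loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

en_batch, zh_batch = next(iter(loader))
print(en_batch.shape)  # [32, max_src_len] -- padded English index sequences
print(zh_batch.shape)  # [32, max_trg_len] -- padded Chinese index sequences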
  2. Model definition

Core code:

"class Encoder(nn.Module):\n",

    "    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):\n",

    "        super().__init__()\n",

    "        self.embedding = nn.Embedding(input_dim, emb_dim)\n",

    "        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)\n",

    "        self.dropout = nn.Dropout(dropout)\n",

    "\n",

    "    def forward(self, src):\n",

    "        # src shape: [batch_size, src_len]\n",

    "        embedded = self.dropout(self.embedding(src))\n",

    "        # embedded shape: [batch_size, src_len, emb_dim]\n",

    "        outputs, hidden = self.rnn(embedded)\n",

    "        # outputs shape: [batch_size, src_len, hid_dim]\n",

    "        # hidden shape: [n_layers, batch_size, hid_dim]\n",

    "        return outputs, hidden\n",

    "\n",

    "class Decoder(nn.Module):\n",

    "    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):\n",

    "        super().__init__()\n",

    "        self.output_dim = output_dim\n",

    "        self.embedding = nn.Embedding(output_dim, emb_dim)\n",

    "        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)\n",

    "        self.fc_out = nn.Linear(hid_dim, output_dim)\n",

    "        self.dropout = nn.Dropout(dropout)\n",

    "\n",

    "    def forward(self, input, hidden):\n",

    "        # input shape: [batch_size, 1]\n",

    "        # hidden shape: [n_layers, batch_size, hid_dim]\n",

    "        \n",

    "        embedded = self.dropout(self.embedding(input))\n",

    "        # embedded shape: [batch_size, 1, emb_dim]\n",

    "        \n",

    "        output, hidden = self.rnn(embedded, hidden)\n",

    "        # output shape: [batch_size, 1, hid_dim]\n",

    "        # hidden shape: [n_layers, batch_size, hid_dim]\n",

    "        \n",

    "        prediction = self.fc_out(output.squeeze(1))\n",

    "        # prediction shape: [batch_size, output_dim]\n",

    "        \n",

    "        return prediction, hidden\n",

    "\n",

    "class Seq2Seq(nn.Module):\n",

    "    def __init__(self, encoder, decoder, device):\n",

    "        super().__init__()\n",

    "        self.encoder = encoder\n",

    "        self.decoder = decoder\n",

    "        self.device = device\n",

    "\n",

    "    def forward(self, src, trg, teacher_forcing_ratio=0.5):\n",

    "        # src shape: [batch_size, src_len]\n",

    "        # trg shape: [batch_size, trg_len]\n",

    "        \n",

    "        batch_size = src.shape[0]\n",

    "        trg_len = trg.shape[1]\n",

    "        trg_vocab_size = self.decoder.output_dim\n",

    "\n",

    "        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)\n",

    "        \n",

    "        _, hidden = self.encoder(src)\n",

    "        \n",

    "        input = trg[:, 0].unsqueeze(1)  # Start token\n",

    "        \n",

    "        for t in range(1, trg_len):\n",

    "            output, hidden = self.decoder(input, hidden)\n",

    "            outputs[:, t, :] = output\n",

    "            teacher_force = random.random() < teacher_forcing_ratio\n",

    "            top1 = output.argmax(1)\n",

    "            input = trg[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)\n",

    "\n",

    "        return outputs"

The code defines the Encoder, Decoder, and Seq2Seq classes, which make up a sequence-to-sequence model: the encoder, the decoder, and the full model, respectively. GRU is used as the recurrent (RNN) unit. Note that the Seq2Seq forward pass applies teacher forcing: with probability teacher_forcing_ratio, the ground-truth target token is fed to the decoder at the next step instead of the model's own prediction.
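
A quick smoke test of these classes might look like this; the dimensions below are illustrative only, not the competition settings.

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Illustrative vocabulary sizes; the real ones come from the dataset
enc = Encoder(input_dim=1000, emb_dim=256, hid_dim=512, n_layers=2, dropout=0.5)
dec = Decoder(output_dim=1200, emb_dim=256, hid_dim=512, n_layers=2, dropout=0.5)
model = Seq2Seq(enc, dec, device).to(device)

src = torch.randint(0, 1000, (4, 12)).to(device)  # [batch_size=4, src_len=12]
trg = torch.randint(0, 1200, (4, 15)).to(device)  # [batch_size=4, trg_len=15]
outputs = model(src, trg)
print(outputs.shape)  # torch.Size([4, 15, 1200]) -- per-step logits over the target vocab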

  3. Training and evaluation

Core code:

 "def train(model, iterator, optimizer, criterion, clip):\n",

    "    model.train()\n",

    "    epoch_loss = 0\n",

    "    for i, (src, trg) in enumerate(iterator):\n",

    "        src, trg = src.to(device), trg.to(device)\n",

    "        optimizer.zero_grad()\n",

    "        output = model(src, trg)\n",

    "        output_dim = output.shape[-1]\n",

    "        output = output[:, 1:].contiguous().view(-1, output_dim)\n",

    "        trg = trg[:, 1:].contiguous().view(-1)\n",

    "        loss = criterion(output, trg)\n",

    "        loss.backward()\n",

    "        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",

    "        optimizer.step()\n",

    "        epoch_loss += loss.item()\n",

    "    return epoch_loss / len(iterator)"


 "def evaluate_bleu(model: Seq2Seq, dataset: TranslationDataset, src_file: str, ref_file: str, terminology,device: torch.device):\n",

    "    model.eval()\n",

    "    src_sentences = load_sentences(src_file)\n",

    "    ref_sentences = load_sentences(ref_file)\n",

    "    \n",

    "    translated_sentences = []\n",

    "    for src in src_sentences:\n",

    "        translated = translate_sentence(src, model, dataset, terminology, device)\n",

    "        translated_sentences.append(translated)\n",

    "    \n",

    "    bleu = BLEU()\n",

    "    score = bleu.corpus_score(translated_sentences, [ref_sentences])\n",

    "    \n",

    "    return score"

  • Training: the train function implements one epoch of the training loop, including the forward pass, loss computation, backpropagation, and gradient clipping.
  • Evaluation: the evaluate_bleu function measures the model's BLEU score on the dev set. It first translates the English sentences of the dev set, then scores the translations against the reference translations. (A minimal training driver sketch follows this list.)
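
For completeness, a minimal driver tying these pieces together might look like the sketch below, reusing the dataset, model, train, and collate_fn defined above. N_EPOCHS, CLIP, the learning rate, and ignore_index=0 (matching padding_value=0 in collate_fn) are assumptions, not the competition's fixed settings.

import torch.optim as optim
from torch.utils.data import DataLoader

train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # skip '<pad>' positions in the loss

N_EPOCHS, CLIP = 10, 1.0  # assumed hyperparameters
for epoch in range(N_EPOCHS):
    loss = train(model, train_loader, optimizer, criterion, CLIP)
    print(f'Epoch {epoch + 1}: train loss = {loss:.3f}')

torch.save(model.state_dict(), './translation_model_GRU.pth')

# Hypothetical dev-set paths; evaluate_bleu returns a sacrebleu score object
# bleu = evaluate_bleu(model, dataset, '../dataset/dev_en.txt', '../dataset/dev_zh.txt', terminology, device)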
  4. Model inference

Core code:

"def inference(model: Seq2Seq, dataset: TranslationDataset, src_file: str, save_dir:str, terminology, device: torch.device):\n",

    "    model.eval()\n",

    "    src_sentences = load_sentences(src_file)\n",

    "    \n",

    "    translated_sentences = []\n",

    "    for src in src_sentences:\n",

    "        translated = translate_sentence(src, model, dataset, terminology, device)\n",

    "        #print(translated)\n",

    "        translated_sentences.append(translated)\n",

    "        #print(translated_sentences)\n",

    "\n",

    "    # 将列表元素连接成一个字符串,每个元素后换行\n",

    "    text = '\\n'.join(translated_sentences)\n",

    "\n",

    "    # 打开一个文件,如果不存在则创建,'w'表示写模式\n",

    "    with open(save_dir, 'w', encoding='utf-8') as f:\n",

    "        # 将字符串写入文件\n",

    "        f.write(text)\n",

    "\n",

    "    #return translated_sentences"

  • Inference: the inference function runs the model on the test set. It reads the English sentences of the test set, translates them with the trained model, and saves the results to the given file. It relies on the helpers load_sentences and translate_sentence, which are not shown in this excerpt; a plausible sketch follows.
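
The following greedy-decoding sketch is consistent with the interfaces used here, but it is not the baseline's exact implementation; in particular, the terminology argument is accepted for interface compatibility but left unused, whereas the real baseline presumably consults it.

def load_sentences(filepath: str):
    # One sentence per line
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]

def translate_sentence(src: str, model: Seq2Seq, dataset: TranslationDataset,
                       terminology, device: torch.device, max_len: int = 50) -> str:
    model.eval()
    # Index the source sentence the same way __getitem__ does (OOV falls back to '<sos>')
    tokens = dataset.en_tokenizer(src)
    src_ids = [dataset.en_word2idx.get(tok, dataset.en_word2idx['<sos>']) for tok in tokens]
    src_ids.append(dataset.en_word2idx['<eos>'])
    src_tensor = torch.tensor(src_ids).unsqueeze(0).to(device)  # [1, src_len]

    with torch.no_grad():
        _, hidden = model.encoder(src_tensor)

        # Greedy decoding, one Chinese character per step, starting from '<sos>'
        input = torch.tensor([[dataset.zh_word2idx['<sos>']]]).to(device)
        chars = []
        for _ in range(max_len):
            output, hidden = model.decoder(input, hidden)
            top1 = output.argmax(1).item()
            if top1 == dataset.zh_word2idx['<eos>']:
                break
            chars.append(dataset.zh_vocab[top1])
            input = torch.tensor([[top1]]).to(device)
    return ''.join(chars)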
  5. Main function

Code:

"if __name__ == '__main__':\n",

    "    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",

    "\n",

    "    # 加载术语词典\n",

    "    terminology = load_terminology_dictionary('../dataset/en-zh.dic')\n",

    "    # 加载数据集和模型\n",

    "    dataset = TranslationDataset('../dataset/train.txt',terminology = terminology)\n",

    "\n",

    "    # 定义模型参数\n",

    "    INPUT_DIM = len(dataset.en_vocab)\n",

    "    OUTPUT_DIM = len(dataset.zh_vocab)\n",

    "    ENC_EMB_DIM = 256\n",

    "    DEC_EMB_DIM = 256\n",

    "    HID_DIM = 512\n",

    "    N_LAYERS = 2\n",

    "    ENC_DROPOUT = 0.5\n",

    "    DEC_DROPOUT = 0.5\n",

    "\n",

    "    # 初始化模型\n",

    "    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)\n",

    "    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)\n",

    "    model = Seq2Seq(enc, dec, device).to(device)\n",

    "\n",

    "    # 加载训练好的模型\n",

    "    model.load_state_dict(torch.load('./translation_model_GRU.pth'))\n",

    "    \n",

    "    save_dir = '../dataset/submit.txt'\n",

    "    inference(model, dataset, src_file=\"../dataset/test_en.txt\", save_dir = save_dir, terminology = terminology, device = device)\n",

    "    print(f\"翻译完成!文件已保存到{save_dir}\")"


The main function performs the following steps:

  • Initialize the device (GPU or CPU).
  • Load the terminology dictionary.
  • Build the dataset instance and load the trained model weights.
  • Call inference to translate the English sentences of the test set into Chinese and save the results to ../dataset/submit.txt. (BLEU evaluation with evaluate_bleu belongs to a separate dev-set step; it is not invoked in this snippet.)

Summary and reflections

This code is a complete machine translation pipeline, covering data preprocessing, model training, model evaluation, and the final inference output. It uses a sequence-to-sequence (seq2seq) architecture with GRU as the RNN unit, and the BLEU score as the metric for model performance. The main function makes it convenient to run the whole flow, from loading the trained model to producing the final translations. At first I only followed the steps to get the code running, without understanding what any of it meant; after analyzing it and consulting references, I now have a basic grasp of what each part does.
