Datawhale AI Summer Camp - NLP - Task 3 Notes

These notes work through baseline 3 and put it into practice.

Importing Libraries

Next, import the required libraries:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from torchtext.data.metrics import bleu_score
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import List, Tuple
import jieba
import random
from torch.nn.utils.rnn import pad_sequence
import sacrebleu
import time
import math
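
The snippets that follow reference several globals that are defined elsewhere in the baseline notebook: MAX_LENGTH (sentence truncation length), BATCH_SIZE, the training-subset size N, and the compute device. Below is a minimal configuration sketch with illustrative values; these are assumptions, not the baseline's exact settings.

# Illustrative values only; adjust to match the actual baseline configuration.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_LENGTH = 100    # maximum number of tokens kept per sentence
BATCH_SIZE = 32     # mini-batch size for the data loaders
N = 1000            # number of training pairs to keep (illustrative; used via Subset in load_data)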

Data Preprocessing

  1. Define the tokenizers:

    • English text is tokenized with spacy (the en_core_web_sm model must be installed).
    • Chinese text is segmented with jieba.
    en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
    zh_tokenizer = lambda x: list(jieba.cut(x))
    
  2. Read the data:

    • Read each line of a file and return the stripped lines as a list.
    def read_data(file_path: str) -> List[str]:
        with open(file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]
    
  3. Preprocess the data:

    • Tokenize the English and Chinese sentences and truncate each one to at most MAX_LENGTH tokens.
    def preprocess_data(en_data: List[str], zh_data: List[str]) -> List[Tuple[List[str], List[str]]]:
        processed_data = []
        for en, zh in zip(en_data, zh_data):
            en_tokens = en_tokenizer(en.lower())[:MAX_LENGTH]
            zh_tokens = zh_tokenizer(zh)[:MAX_LENGTH]
            if en_tokens and zh_tokens:
                processed_data.append((en_tokens, zh_tokens))
        return processed_data
    
  4. Build the vocabularies:

    • Build separate English and Chinese vocabularies and set the default index to <unk> (see the toy example after this list).
    def build_vocab(data: List[Tuple[List[str], List[str]]]):
        en_vocab = build_vocab_from_iterator(
            (en for en, _ in data),
            specials=['<unk>', '<pad>', '<bos>', '<eos>']
        )
        zh_vocab = build_vocab_from_iterator(
            (zh for _, zh in data),
            specials=['<unk>', '<pad>', '<bos>', '<eos>']
        )
        en_vocab.set_default_index(en_vocab['<unk>'])
        zh_vocab.set_default_index(zh_vocab['<unk>'])
        return en_vocab, zh_vocab
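
For intuition, here is a toy run of the helpers above. The sample pair is made up for illustration, and the printed output shown in comments is approximate; in the real pipeline the vocabularies are built on the full training set.

# Hypothetical sample pair, purely for illustration.
sample_pairs = preprocess_data(["I love machine translation."], ["我喜欢机器翻译。"])
print(sample_pairs[0][0])            # roughly ['i', 'love', 'machine', 'translation', '.']
print(sample_pairs[0][1])            # the Chinese tokens produced by jieba

toy_en_vocab, toy_zh_vocab = build_vocab(sample_pairs)
print(toy_en_vocab['<pad>'])         # index of the padding token
print(toy_en_vocab['not-in-vocab'])  # unseen tokens fall back to the <unk> index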
    

Dataset and Data Loading

  1. Custom dataset:

    • Wraps the token pairs and, on access, converts each pair into index sequences framed by <bos> and <eos>.
    class TranslationDataset(Dataset):
        def __init__(self, data: List[Tuple[List[str], List[str]]], en_vocab, zh_vocab):
            self.data = data
            self.en_vocab = en_vocab
            self.zh_vocab = zh_vocab
    
        def __len__(self):
            return len(self.data)
    
        def __getitem__(self, idx):
            en, zh = self.data[idx]
            en_indices = [self.en_vocab['<bos>']] + [self.en_vocab[token] for token in en] + [self.en_vocab['<eos>']]
            zh_indices = [self.zh_vocab['<bos>']] + [self.zh_vocab[token] for token in zh] + [self.zh_vocab['<eos>']]
            return en_indices, zh_indices
    
  2. Batch collate function:

    • Pads all sequences in a batch to the same length with the <pad> index (it relies on the global en_vocab and zh_vocab set inside load_data).
    def collate_fn(batch):
        en_batch, zh_batch = [], []
        for en_item, zh_item in batch:
            if en_item and zh_item:
                en_batch.append(torch.tensor(en_item))
                zh_batch.append(torch.tensor(zh_item))
        if not en_batch or not zh_batch:
            return torch.tensor([]), torch.tensor([])
    
        en_batch = nn.utils.rnn.pad_sequence(en_batch, batch_first=True, padding_value=en_vocab['<pad>'])
        zh_batch = nn.utils.rnn.pad_sequence(zh_batch, batch_first=True, padding_value=zh_vocab['<pad>'])
        return en_batch, zh_batch
    
  3. Load the data:

    • Load the training, dev, and test sets, build the vocabularies, and construct the data loaders (a usage sketch follows this list).
    def load_data(train_path: str, dev_en_path: str, dev_zh_path: str, test_en_path: str):
        # Read the training data (each line holds an English sentence and its Chinese translation separated by a tab)
        train_data = read_data(train_path)
        train_en, train_zh = zip(*(line.split('\t') for line in train_data))
    
        # Read the dev and test sets
        dev_en = read_data(dev_en_path)
        dev_zh = read_data(dev_zh_path)
        test_en = read_data(test_en_path)
    
        # Preprocess the data
        train_processed = preprocess_data(train_en, train_zh)
        dev_processed = preprocess_data(dev_en, dev_zh)
        test_processed = [(en_tokenizer(en.lower())[:MAX_LENGTH], []) for en in test_en if en.strip()]
    
        # Build the vocabularies (kept as globals so collate_fn can use them)
        global en_vocab, zh_vocab
        en_vocab, zh_vocab = build_vocab(train_processed)
    
        # Create the datasets
        train_dataset = TranslationDataset(train_processed, en_vocab, zh_vocab)
        dev_dataset = TranslationDataset(dev_processed, en_vocab, zh_vocab)
        test_dataset = TranslationDataset(test_processed, en_vocab, zh_vocab)
    
        # Keep only the first N training samples (N is a global hyperparameter; see the configuration sketch above)
        from torch.utils.data import Subset
        indices = list(range(N))
        train_dataset = Subset(train_dataset, indices)
    
        # Create the data loaders
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, drop_last=True)
        dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, drop_last=True)
        test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, drop_last=True)
    
        return train_loader, dev_loader, test_loader, en_vocab, zh_vocab
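
Putting these pieces together looks roughly like the sketch below. The file paths are placeholders; substitute the actual dataset locations.

# Hypothetical paths, for illustration only.
train_loader, dev_loader, test_loader, en_vocab, zh_vocab = load_data(
    'train.txt', 'dev_en.txt', 'dev_zh.txt', 'test_en.txt'
)
en_batch, zh_batch = next(iter(train_loader))
print(en_batch.shape, zh_batch.shape)   # (BATCH_SIZE, src_len) and (BATCH_SIZE, tgt_len)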
    

Model Construction

  1. Positional encoding:

    • Adds sinusoidal positional encodings to the embeddings so the model retains word-order information.
    class PositionalEncoding(nn.Module):
        def __init__(self, d_model, dropout=0.1, max_len=5000):
            super(PositionalEncoding, self).__init__()
            self.dropout = nn.Dropout(p=dropout)
            pe = torch.zeros(max_len, d_model)
            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            pe = pe.unsqueeze(0).transpose(0, 1)
            self.register_buffer('pe', pe)
    
        def forward(self, x):
            x = x + self.pe[:x.size(0), :]
            return self.dropout(x)
    
  2. Transformer model:

    • A translation model built on nn.Transformer, with source/target embeddings, positional encoding, and an output projection (an instantiation sketch follows this list).
    class TransformerModel(nn.Module):
        def __init__(self, src_vocab, tgt_vocab, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
            super(TransformerModel, self).__init__()
            self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
            self.src_embedding = nn.Embedding(len(src_vocab), d_model)
            self.tgt_embedding = nn.Embedding(len(tgt_vocab), d_model)
            self.positional_encoding = PositionalEncoding(d_model, dropout)
            self.fc_out = nn.Linear(d_model, len(tgt_vocab))
            self.src_vocab = src_vocab
            self.tgt_vocab = tgt_vocab
            self.d_model = d_model
    
        def forward(self, src, tgt):
            # inputs are batch-first; nn.Transformer expects (seq_len, batch), so transpose
            src = src.transpose(0, 1)
            tgt = tgt.transpose(0, 1)
            src_mask = self.transformer.generate_square_subsequent_mask(src.size(0)).to(src.device)
            tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)
            src_padding_mask = (src == self.src_vocab['<pad>']).transpose(0, 1)
            tgt_padding_mask = (tgt == self.tgt_vocab['<pad>']).transpose(0, 1)
            src_embedded = self.positional_encoding(self.src_embedding(src) * math.sqrt(self.d_model))
            tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
            output = self.transformer(src_embedded, tgt_embedded, src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, src_padding_mask)
            return self.fc_out(output)
    
        def encode(self, src):
            src = src.transpose(0, 1)
            src_mask = self.transformer.generate_square_subsequent_mask(src.size(0)).to(src.device)
            src_padding_mask = (src == self.src_vocab['<pad>']).transpose(0, 1)
            src_embedded = self.positional_encoding(self.src_embedding(src) * math.sqrt(self.d_model))
            memory = self.transformer.encoder(src_embedded, src_mask, src_padding_mask)
            return memory
    
        def decode(self, tgt, memory):
            tgt = tgt.transpose(0, 1)
            tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)
            tgt_padding_mask = (tgt == self.tgt_vocab['<pad>']).transpose(0, 1)
            tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
            # memory_key_padding_mask must match the source length, which is not available here, so None is passed
            output = self.transformer.decoder(tgt_embedded, memory, tgt_mask, None, tgt_padding_mask, None)
            return self.fc_out(output)
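
With the vocabularies in hand, the model can be instantiated as sketched below. The hyperparameter values are illustrative assumptions, not necessarily those used by the baseline; DEVICE comes from the configuration sketch near the imports.

# Illustrative hyperparameters.
model = TransformerModel(
    src_vocab=en_vocab, tgt_vocab=zh_vocab,
    d_model=256, nhead=8,
    num_encoder_layers=3, num_decoder_layers=3,
    dim_feedforward=512, dropout=0.1,
).to(DEVICE)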
    

Model Training

  1. Training function:

    • Runs one epoch: forward pass with teacher forcing, cross-entropy loss on the shifted target, gradient clipping, and an optimizer step.
    def train_epoch(model, data_loader, criterion, optimizer, device):
        model.train()
        epoch_loss = 0
        for en, zh in data_loader:
            en, zh = en.to(device), zh.to(device)
            optimizer.zero_grad()
            output = model(en, zh[:, :-1])
            loss = criterion(output.reshape(-1, output.shape[-1]), zh[:, 1:].reshape(-1))
            loss.backward()
            clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / len(data_loader)
    
  2. Evaluation function:

    • Computes the average loss over a data loader without updating the model.
    def evaluate(model, data_loader, criterion, device):
        model.eval()
        epoch_loss = 0
        with torch.no_grad():
            for en, zh in data_loader:
                en, zh = en.to(device), zh.to(device)
                output = model(en, zh[:, :-1])
                loss = criterion(output.reshape(-1, output.shape[-1]), zh[:, 1:].reshape(-1))
                epoch_loss += loss.item()
        return epoch_loss / len(data_loader)
    
  3. Training loop:

    • Trains for num_epochs epochs and reports the training and validation loss after each one (a setup sketch follows this list).
    def train_model(train_loader, dev_loader, model, optimizer, criterion, device, num_epochs):
        for epoch in range(num_epochs):
            start_time = time.time()
            train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
            dev_loss = evaluate(model, dev_loader, criterion, device)
            end_time = time.time()
            epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)
            print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f}')
            print(f'\t Val. Loss: {dev_loss:.3f}')
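
A sketch of wiring the training pieces together; the learning rate and number of epochs are illustrative, and DEVICE again refers to the configuration sketch near the imports.

# Ignore <pad> positions when computing the loss.
criterion = nn.CrossEntropyLoss(ignore_index=zh_vocab['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=1e-4)
train_model(train_loader, dev_loader, model, optimizer, criterion, DEVICE, num_epochs=10)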
    

Model Inference

  1. Translation function:

    • Greedily decodes a Chinese translation for an English sentence, one token at a time, until <eos> is produced or max_length is reached.
    def translate_sentence(model, sentence, en_vocab, zh_vocab, device, max_length=50):
        model.eval()
        tokens = [en_vocab['<bos>']] + [en_vocab[token] for token in en_tokenizer(sentence.lower())] + [en_vocab['<eos>']]
        # the model's encode/decode expect batch-first input, hence unsqueeze(0)
        src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
        with torch.no_grad():
            memory = model.encode(src_tensor)
            outputs = [zh_vocab['<bos>']]
            for _ in range(max_length):
                tgt_tensor = torch.LongTensor(outputs).unsqueeze(0).to(device)
                output = model.decode(tgt_tensor, memory)
                pred_token = output.argmax(2)[-1].item()
                outputs.append(pred_token)
                if pred_token == zh_vocab['<eos>']:
                    break
        # map indices back to tokens and drop the special markers
        translated_tokens = zh_vocab.lookup_tokens(outputs)
        return [tok for tok in translated_tokens if tok not in ('<bos>', '<eos>')]
    
  2. Compute the BLEU score:

    • Evaluates translation quality against the reference translations (an inference sketch follows this list).
    def calculate_bleu(data, model, en_vocab, zh_vocab, device):
        trgs = []
        pred_trgs = []
        for src, trg in data:
            # src is a sequence of source-token indices; trg is assumed to already be a list of target tokens
            src = " ".join(en_vocab.lookup_tokens(list(src)))
            pred_trg = translate_sentence(model, src, en_vocab, zh_vocab, device)
            pred_trgs.append(pred_trg)
            trgs.append([trg])
        return bleu_score(pred_trgs, trgs)
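
Finally, a sketch of running inference with the trained model. The example sentence is made up, and DEVICE again refers to the configuration sketch near the imports; calculate_bleu can then be run over dev pairs prepared as (source-index sequence, target-token list) tuples.

# Hypothetical example sentence.
print(''.join(translate_sentence(model, "Thank you for your help.", en_vocab, zh_vocab, DEVICE)))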
    

Summary

The code above implements a Transformer-based English-to-Chinese translation model, walking through data preprocessing, model construction, training, and evaluation to show end to end how a machine translation system is built.
