BERT Code Implementation

Contents

1. BERT theory

2. Code implementation

  2.1 Building the input format

  2.2 Defining the BERT encoder class

  2.3 BERT's two pretraining tasks

    2.3.1 Task 1: Masked Language Modeling (MLM)

    2.3.2 Task 2: Next Sentence Prediction (NSP)

3. Putting it all together

4. Personal notes on the key ideas


 

1. BERT theory

BERT stands for Bidirectional Encoder Representations from Transformers. Paper: [1810.04805] BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding (arxiv.org)

BERT is a pretraining model proposed by Google AI in October 2018. Essentially, BERT is the encoder part of the Transformer model, with a few modifications to the encoder.

In the figure below (the Transformer architecture), the encoder part is the basic structure of BERT.

  

2. Code implementation

import torch
from torch import nn
import dltools  # helper module used throughout this post; it provides EncoderBlock (a Transformer encoder block)

2.1 Building the input format

def get_tokens_and_segments(tokens_a, tokens_b=None):
    # <cls> is the classification token
    # BERT usually takes a pair of sentences as one input sequence, but a single sentence also works;
    # sequences shorter than the maximum length can be padded later
    # First handle the first sentence tokens_a
    tokens = ['<cls>'] + tokens_a + ['<sep>']  # input to the token embedding layer
    segments = [0] * (len(tokens_a) + 2)  # segment ids mark which sentence each token belongs to; 0 = first sentence
    if tokens_b is not None:
        tokens += tokens_b + ['<sep>']
        segments += [1] * (len(tokens_b) + 1)
    return tokens, segments


# Test the function above
get_tokens_and_segments([1, 2, 3], [4, 5, 6])

(['<cls>', 1, 2, 3, '<sep>', 4, 5, 6, '<sep>'], [0, 0, 0, 0, 0, 1, 1, 1, 1])
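
For the single-sentence case (tokens_b omitted), only one <sep> is appended and all segment ids are 0:

# Single-sentence call (illustrative)
get_tokens_and_segments(['hello', 'world'])

(['<cls>', 'hello', 'world', '<sep>'], [0, 0, 0, 0])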

2.2 Defining the BERT encoder class

class BERTEncoder(nn.Module):
    # ffn_num_outputs equals num_hiddens, so it is not passed in separately
    # The __init__ parameters are supplied when the class is instantiated
    def __init__(self, vocab_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, num_layers, dropout,
                max_len=1000, key_size=768, query_size=768, value_size=768, **kwargs):
        super().__init__(**kwargs)
        # token embedding layer
        self.token_embedding = nn.Embedding(vocab_size, num_hiddens)
        # segment embedding layer (two sentences are passed in, so the first dimension is 2)
        self.segment_embedding = nn.Embedding(2, num_hiddens)
        # positional embedding layer: in BERT the positional embeddings are learnable,
        # so they are defined as a learnable parameter via nn.Parameter()
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, num_hiddens))
        
        # Stack the encoder blocks
        self.blks = nn.Sequential()  # numbers the encoder blocks in order
        for i in range(num_layers):  # one block per layer
            self.blks.add_module(f'{i}', dltools.EncoderBlock(key_size, query_size, value_size, num_hiddens, norm_shape, 
                                                              ffn_num_input, ffn_num_hiddens, num_heads, dropout))
    
    # The __init__ parameters are supplied when the class is instantiated;
    # the forward parameters are supplied when the instance is called
    def forward(self, tokens, segments, valid_lens):
        # X = token_embedding + segment_embedding + pos_embedding
        # token_embedding and segment_embedding have the same shape, so they can be added directly
        X = self.token_embedding(tokens) + self.segment_embedding(segments)
        # pos_embedding has shape (1, max_len, num_hiddens), so it cannot be added as-is;
        # slice its second dimension down to the current sequence length, then broadcasting handles the rest
        X = X + self.pos_embedding.data[:, :X.shape[1], :]
        
        for blk in self.blks:
            X = blk(X, valid_lens)
        return X  
# Test the code above


# Create a BERTEncoder instance
vocab_size, num_hiddens, ffn_num_hiddens, num_heads = 10000, 768, 1024, 4
norm_shape, ffn_num_input, num_layers, dropout = [768], 768, 2, 0.2
encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, num_layers, dropout)


tokens = torch.randint(0, vocab_size, (2, 8))  # random token ids, shape (batch_size=2, seq_len=8)
segments = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1]])
# Call the encoder
encoded_X = encoder(tokens, segments, None)


encoded_X.shape
torch.Size([2, 8, 768])

#  nn.Sequential() is a PyTorch container that chains multiple layers together in order. A deep model may consist of many kinds of layers (convolutional, pooling, fully connected, ...), and nn.Sequential() combines them into a single module that applies them one after another.
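
A minimal toy illustration of nn.Sequential (the layer sizes below are arbitrary and only for demonstration):

# Toy example: nn.Sequential applies its layers to the input in order
toy_net = nn.Sequential(nn.Linear(768, 256),  # fully connected layer
                        nn.ReLU(),            # activation
                        nn.Linear(256, 2))    # output layer
toy_net(torch.randn(4, 768)).shape            # torch.Size([4, 2])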

2.3 BERT's two pretraining tasks

2.3.1 Task 1: Masked Language Modeling (MLM)

class MaskLM(nn.Module):
    def __init__(self, vocab_size, num_hiddens, num_inputs=768, **kwargs):
        super().__init__(**kwargs)
        self.mlp = nn.Sequential(nn.Linear(num_inputs, num_hiddens),  # fully connected layer
                                nn.ReLU(),  
                                nn.LayerNorm(num_hiddens), 
                                nn.Linear(num_hiddens, vocab_size))  # output layer over the vocabulary
    
    # X is the encoder output; 15% of the tokens are randomly chosen for prediction
    # pred_positions holds the positions of the chosen tokens (80% of them were replaced with <mask>,
    # 10% with a random token, and 10% kept unchanged)
    # pred_positions is a 2-D tensor of shape (batch_size, num_pred_positions)
    def forward(self, X, pred_positions):  
        num_pred_positions = pred_positions.shape[1]  # number of positions to predict per sequence
        pred_positions = pred_positions.reshape(-1)  # flatten to 1-D
        batch_size = X.shape[0]  # batch size
        batch_idx = torch.arange(0, batch_size)  # batch indices
        # Repeat each batch index once per predicted position,
        # e.g. batch_idx = [0, 1] with 3 positions each  -->  [0, 0, 0, 1, 1, 1]
        batch_idx = torch.repeat_interleave(batch_idx, num_pred_positions)
        # Gather the hidden states at the positions to predict
        masked_X = X[batch_idx, pred_positions]
        masked_X = masked_X.reshape(batch_size, num_pred_positions, -1)  # restore (batch_size, num_pred_positions, num_hiddens)
        mlm_Y_hat = self.mlp(masked_X)
        return mlm_Y_hat
# Test the code above


mlm = MaskLM(vocab_size, num_hiddens)
mlm_positions = torch.tensor([[1, 5, 2], [6, 1, 5]])
mlm_Y_hat = mlm(encoded_X, mlm_positions)


mlm_Y_hat.shape    # 2: batch size, 3: number of predicted positions, 10000: scores over the vocabulary (vocab_size = 10000; softmax turns them into class probabilities)
torch.Size([2, 3, 10000])
mlm_Y = torch.tensor([[7, 8, 9], [10, 20, 30]])  # assumed ground-truth token ids
loss = nn.CrossEntropyLoss(reduction='none')
mlm_l = loss(mlm_Y_hat.reshape(-1, vocab_size), mlm_Y.reshape(-1))  # mlm_Y_hat reshaped to (6, 10000), mlm_Y to (6,)
mlm_l.shape

torch.Size([6])
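
The indexing inside MaskLM.forward (torch.repeat_interleave plus advanced indexing) is the trickiest part, so here is a small standalone demonstration with toy shapes (the numbers are made up purely for illustration):

# Toy demonstration of how the hidden states at the predicted positions are gathered
X_toy = torch.arange(2 * 4 * 3).reshape(2, 4, 3)          # (batch_size=2, seq_len=4, num_hiddens=3)
positions = torch.tensor([[1, 3], [0, 2]])                # 2 predicted positions per sequence
batch_idx = torch.repeat_interleave(torch.arange(2), 2)   # tensor([0, 0, 1, 1])
picked = X_toy[batch_idx, positions.reshape(-1)]          # rows 1, 3 of sample 0 and rows 0, 2 of sample 1
picked.reshape(2, 2, -1).shape                            # torch.Size([2, 2, 3])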

2.3.2 Task 2: Next Sentence Prediction (NSP)

class NextSentencePred(nn.Module):
    def __init__(self, num_inputs, **kwargs):
        super().__init__(**kwargs)
        self.output = nn.Linear(num_inputs, 2)  # predicting whether the second sentence follows the first is a binary (yes/no) classification
        
    def forward(self, X):
        # X has shape (batch_size, num_hiddens)
        return self.output(X)
# Test the code above


encoded_X = torch.flatten(encoded_X, start_dim=1)  # flatten everything after the batch dimension (equivalent to a reshape)
nsp = NextSentencePred(encoded_X.shape[-1])
nsp_Y_hat = nsp(encoded_X)

nsp_Y_hat.shape

torch.Size([2, 2])
# Compute the loss
nsp_y = torch.tensor([0, 1])   # assumed ground-truth labels
nsp_l = loss(nsp_Y_hat, nsp_y)
nsp_l.shape

torch.Size([2])
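
The test above feeds the flattened sequence into the NSP head only for illustration. In the full model of Section 3, NSP instead uses only the hidden state of the <cls> token, passed through a small hidden layer first. A sketch of that path (reusing the encoder and toy batch from Section 2.2; the 768-dimensional sizes match num_hiddens):

# Sketch: NSP on the <cls> representation, as BERTModel does in Section 3
cls_repr = encoder(tokens, segments, None)[:, 0, :]      # (batch_size, num_hiddens) = (2, 768)
hidden = nn.Sequential(nn.Linear(768, 768), nn.Tanh())   # same structure as self.hidden in BERTModel
NextSentencePred(768)(hidden(cls_repr)).shape            # torch.Size([2, 2])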

3. Putting it all together

class BERTModel(nn.Module):
    def __init__(self, vocab_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, num_layers, dropout,
                 max_len=1000, key_size=768, query_size=768, value_size=768,
                 hid_in_features=768, mlm_in_features=768, nsp_in_features=768, **kwargs):
        super().__init__(**kwargs)
        # the encoder
        self.encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, num_layers, dropout,
                                   max_len=max_len, key_size=key_size, query_size=query_size, value_size=value_size)
        # masked language modeling head
        self.mlm = MaskLM(vocab_size, num_hiddens, mlm_in_features)
        # hidden layer (linear transform + activation) applied to the <cls> representation
        self.hidden = nn.Sequential(nn.Linear(hid_in_features, num_hiddens), nn.Tanh())
        # next sentence prediction head
        self.nsp = NextSentencePred(nsp_in_features)
        
    def forward(self, tokens, segments, valid_lens=None, pred_positions=None):
        encoded_X = self.encoder(tokens, segments, valid_lens)
        if pred_positions is not None:
            mlm_Y_hat = self.mlm(encoded_X, pred_positions)
        else:
            mlm_Y_hat = None
            
        # index 0 is the <cls> token
        nsp_Y_hat = self.nsp(self.hidden(encoded_X[:, 0, :]))
        return encoded_X, mlm_Y_hat, nsp_Y_hat
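
As a quick sanity check, here is a sketch of a forward pass through the full BERTModel, reusing the hyperparameters and toy inputs from the tests above (the output shapes follow from those tests):

# Sketch: end-to-end forward pass with the Section 2.2 hyperparameters and toy batch
model = BERTModel(vocab_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
                  num_heads, num_layers, dropout)
encoded_X, mlm_Y_hat, nsp_Y_hat = model(tokens, segments, valid_lens=None, pred_positions=mlm_positions)
encoded_X.shape, mlm_Y_hat.shape, nsp_Y_hat.shape
# (torch.Size([2, 8, 768]), torch.Size([2, 3, 10000]), torch.Size([2, 2]))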

4. Personal notes on the key ideas

 
