参考论文:Attention Is All You Need
参考论文:BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding
源码地址:BERT-pytorch
介绍,源码主要完成了bert预训练的两个TASK
- Masked LM(Masked Language Model)
一个句子中每个词有15%的概率会被选中进行设置,设置的规则有三种:
- 80%的概率设置为[MASK]用于学习
- 10%的概率设置为原单词,为了进行下游任务
- 10%的概率设置为随机单词,类似负采样操作
- NSP(Next Sentence Prediction)
[CLS]用于句子的标签,[SEP]用于句子的分隔符,预训练中[CLS]标签代表两个句子是否是上下句(is_next),构成可以使用是上下句的两个句子,然后使用上句与一个不是下文的随机句子构成,这样正负样本各占50%
源码解读:
-
构建数据。
数据格式为分好词的两个上下句在一行,使用\t进行分开:
中文:欢迎 来到 中国\t 我 喜欢 这里\n
英文:Welcome to the jungle\t I can stay here all night\n
当然英文可能使用wordpiece更好。
使用命令bert-vocab -c data/corpus.small -o data/vocab.small
通过corpus语料构建vocab词表:
参数解释见图:
-s: 词表大小; -m: 词表中词出现的最小次数
构建过程:
使用collections中的Counter计数器,按照出现频率从大到小排序,建立词与索引的映射字典;当字典的大小等于词表的大小,或者当前词的出现频率小于设定的最小出现频率时,词典建立完成。
- 进行训练
使用命令bert -c data/corpus.small -v data/vocab.small -o output/bert.model
参数含义:
# CLI arguments for BERT pre-training.
parser = argparse.ArgumentParser()

# Data paths.
parser.add_argument("-c", "--train_dataset", required=True, type=str, help="train dataset for train bert")
parser.add_argument("-t", "--test_dataset", type=str, default=None, help="test set for evaluate train set")
parser.add_argument("-v", "--vocab_path", required=True, type=str, help="built vocab model path with bert-vocab")
parser.add_argument("-o", "--output_path", required=True, type=str, help="ex)output/bert.model")

# Model size.
parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model")
parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers")
parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads")
parser.add_argument("-s", "--seq_len", type=int, default=20, help="maximum sequence len")

# Training loop.
parser.add_argument("-b", "--batch_size", type=int, default=64, help="number of batch_size")
parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs")
parser.add_argument("-w", "--num_workers", type=int, default=5, help="dataloader worker size")

# NOTE(review): type=bool does not parse "false" as False (any non-empty
# string is truthy). A str-to-bool converter or action="store_true" would be
# safer, but the flags are left unchanged to keep the CLI interface identical.
parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false")
parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n")
parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
parser.add_argument("--on_memory", type=bool, default=True, help="Loading on memory: true or false")

# Adam optimizer hyper-parameters.
parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam")
parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
# BUGFIX: help text previously said "first beta" — copy-paste error.
parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam second beta value")
选几个介绍:
-c : 训练数据的路径
-t : 测试数据的路径
-v : 词表路径
-hs : 隐藏层大小(注意短选项是 -hs,-h 被 argparse 的 help 占用)
-l : 层数
-a : 注意力头(attention heads)的个数
-s : 句子长度(上句和下句拼接在一起的长度,截断从末尾截断,基本是截断下句)
剩下的参数,在代码中介绍吧
- 加载词表
# Load the pickled WordVocab produced by the `bert-vocab` step.
vocab = WordVocab.load_vocab(args.vocab_path)
##### WordVocab methods used above:
@staticmethod
def load_vocab(vocab_path: str) -> 'Vocab':
    """Deserialize a pickled Vocab instance from `vocab_path`."""
    with open(vocab_path, "rb") as fh:
        return pickle.load(fh)
def save_vocab(self, vocab_path):
    """Pickle this vocabulary object to `vocab_path`."""
    with open(vocab_path, "wb") as sink:
        pickle.dump(self, sink)
- 得到训练数据
# train_dataset: 训练数据路径
# vocab: 上个步骤加载的词表
# seq_len: 句子长度
# corpus_lines: 加载句子的总对数
# on_memory: 句子是否存在内存中
# Build the pre-training dataset (masked-LM + NSP sentence pairs) from the corpus.
train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len,
corpus_lines=args.corpus_lines, on_memory=args.on_memory)
# 主要介绍如下索引取值
def __getitem__(self, item):
    """Assemble one training example: masked input ids, MLM labels,
    segment ids and the next-sentence label, all as tensors."""
    # Draw a sentence pair (50% true next sentence, 50% random negative).
    sent_a, sent_b, is_next_label = self.random_sent(item)
    # Apply the 15% masking scheme to each sentence independently.
    a_tokens, a_labels = self.random_word(sent_a)
    b_tokens, b_labels = self.random_word(sent_b)

    # [CLS] tag = SOS tag, [SEP] tag = EOS tag:
    # sentence A gets both, sentence B only the trailing separator.
    first = [self.vocab.sos_index] + a_tokens + [self.vocab.eos_index]
    second = b_tokens + [self.vocab.eos_index]
    # Special positions carry the pad label so the loss ignores them.
    first_labels = [self.vocab.pad_index] + a_labels + [self.vocab.pad_index]
    second_labels = b_labels + [self.vocab.pad_index]

    # Segment ids: 1 for sentence A, 2 for sentence B; truncate to seq_len
    # (truncation trims from the tail, i.e. mostly sentence B).
    segment_label = ([1] * len(first) + [2] * len(second))[:self.seq_len]
    bert_input = (first + second)[:self.seq_len]
    bert_label = (first_labels + second_labels)[:self.seq_len]

    # Right-pad all three sequences up to seq_len.
    padding = [self.vocab.pad_index] * (self.seq_len - len(bert_input))
    bert_input.extend(padding)
    bert_label.extend(padding)
    segment_label.extend(padding)

    sample = {"bert_input": bert_input,
              "bert_label": bert_label,
              "segment_label": segment_label,
              "is_next": is_next_label}
    # Return a dict of tensors, one per field.
    return {name: torch.tensor(seq) for name, seq in sample.items()}
#
- 句子生成(见NSP)
# 句子生成规则,上文NSP中提到的,50%正例,50%负例
def random_sent(self, index):
    """Return (t1, t2, label): the true next sentence with label 1 half of
    the time, otherwise a random sentence with label 0 (NSP negatives)."""
    first, second = self.get_corpus_line(index)
    # label semantics: isNotNext = 0, isNext = 1
    if random.random() <= 0.5:
        # Negative sample: pair the first sentence with a random line.
        return first, self.get_random_line(), 0
    return first, second, 1
def get_corpus_line(self, item):
    """Fetch the (sentence1, sentence2) pair at `item` when the corpus is in
    memory, or the next line of the corpus file in streaming mode."""
    if self.on_memory:
        pair = self.lines[item]
        return pair[0], pair[1]

    # Streaming mode: read sequentially, reopening the file when exhausted.
    line = next(self.file)
    if line is None:
        self.file.close()
        self.file = open(self.corpus_path, "r", encoding=self.encoding)
        line = next(self.file)

    sentence_a, sentence_b = line[:-1].split("\t")
    return sentence_a, sentence_b
def get_random_line(self):
    """Return the second sentence of a randomly chosen corpus line (used as
    the negative sample for next-sentence prediction)."""
    if self.on_memory:
        return self.lines[random.randrange(len(self.lines))][1]

    line = self.file.__next__()
    if line is None:
        self.file.close()
        self.file = open(self.corpus_path, "r", encoding=self.encoding)
        # BUGFIX: random.randint requires two arguments; the original
        # single-argument call raised TypeError at runtime. randrange(n)
        # draws from [0, n), which is the intended "skip a random number
        # of lines" behavior (capped at 1000 skips).
        for _ in range(random.randrange(self.corpus_lines
                                        if self.corpus_lines < 1000 else 1000)):
            self.random_file.__next__()
        line = self.random_file.__next__()
    return line[:-1].split("\t")[1]
- 词生成规则
# 词生成规则
def random_word(self, sentence):
    """Apply BERT's masking scheme to a whitespace-tokenized sentence.

    Each token is selected with probability 0.15; a selected token becomes
    [MASK] 80% of the time, a random vocab id 10% of the time, and keeps its
    own id 10% of the time. Returns (token_ids, labels) where labels hold
    the original id at selected positions and 0 elsewhere.
    """
    tokens = sentence.split()
    output_label = []

    for pos, word in enumerate(tokens):
        roll = random.random()
        # 15% of tokens are selected for prediction.
        if roll < 0.15:
            # Rescale to [0, 1) to split the 15% into 80/10/10.
            roll /= 0.15
            if roll < 0.8:
                # 80%: replace with the mask token.
                tokens[pos] = self.vocab.mask_index
            elif roll < 0.9:
                # 10%: replace with a random token (negative-sampling-like).
                tokens[pos] = random.randrange(len(self.vocab))
            else:
                # 10%: keep the original token id.
                tokens[pos] = self.vocab.stoi.get(word, self.vocab.unk_index)
            # Selected positions are labeled with the original token id.
            output_label.append(self.vocab.stoi.get(word, self.vocab.unk_index))
        else:
            # Unselected positions keep their id and get the ignore label 0.
            tokens[pos] = self.vocab.stoi.get(word, self.vocab.unk_index)
            output_label.append(0)

    return tokens, output_label
如下图所示,bert_input中标记为4的地方,词被替换为了[MASK]。
- 训练数据转化为DataLoader
# Wrap the dataset in a PyTorch DataLoader for batched iteration.
train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
num_workers=args.num_workers)
- 使用参数建立bert模型
# len(vocab): vocabulary size for the token embedding
# hidden: hidden size of the transformer
# n_layers: number of transformer blocks
# attn_heads: number of attention heads per block
bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers,
attn_heads=args.attn_heads)
bert预训练中包括两层
- input嵌入层
# embedding for BERT: the sum of token, positional and segment embeddings
# (this line is from BERT.__init__).
self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden)
# --- inside BERTEmbedding.__init__ ---
# Token embedding: an nn.Embedding over the vocabulary.
self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
# Positional embedding: fixed sin/cos table (see PositionalEmbedding below).
self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
# Segment embedding: distinguishes sentence A (1) from sentence B (2).
self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim)
# Dropout applied to the summed embedding.
self.dropout = nn.Dropout(p=dropout)
self.embed_size = embed_size
# --- BERTEmbedding.forward ---
def forward(self, sequence, segment_label):
    """Sum the token, positional and segment embeddings, then apply dropout."""
    combined = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
    return self.dropout(combined)
- position_embedding:
位置信息,目的是处理需要绝对位置的理解方式,比如A 赢了 B,与B 赢了 A,这种需要position_embedding。
公式:
class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding from "Attention Is All You Need".

    The table is precomputed once for `max_len` positions; forward returns
    the slice matching the input's sequence length: (1, seq_len, d_model).
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        # BUGFIX: the original wrote `pe.require_grad` (a typo that merely
        # created an unused attribute); the real flag is `requires_grad`.
        # The table is registered as a buffer below, so it is non-trainable
        # either way — fixed for clarity.
        pe.requires_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        # log-then-exp formulation avoids overflow for large d_model.
        div_term = (torch.arange(0, d_model, 2).float() *
                    -(math.log(10000.0) / d_model)).exp()

        # Even feature indices get sine, odd indices cosine.
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Encodings for the first x.size(1) positions.
        return self.pe[:, :x.size(1)]
- segment_embedding
class SegmentEmbedding(nn.Embedding):
    """Embedding over segment ids {0: padding, 1: sentence A, 2: sentence B}."""

    def __init__(self, embed_size=512):
        # Three rows because padding (id 0) needs its own always-zero vector.
        super().__init__(num_embeddings=3, embedding_dim=embed_size, padding_idx=0)
嵌入个数理论上可以设置为2(那样需要把两个句子的segment标记改为0和1),但这里不能直接用2:因为补齐用的pad已经占用了索引0,所以需要3个嵌入(pad=0、句子A=1、句子B=2)。
2. transformer
# hidden: hidden size; attn_heads: number of attention heads.
# Stack of n_layers transformer blocks; the feed-forward width is 4 * hidden.
self.transformer_blocks = nn.ModuleList(
[TransformerBlock(hidden, attn_heads, hidden * 4, dropout) \
for _ in range(n_layers)])
# --- inside TransformerBlock.__init__ ---
# Multi-head self-attention sublayer.
self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
# Position-wise feed-forward sublayer.
self.feed_forward = PositionwiseFeedForward(d_model=hidden,
d_ff=feed_forward_hidden, dropout=dropout)
# Residual "add & norm" wrapper around the attention sublayer.
self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
# Residual "add & norm" wrapper around the feed-forward sublayer.
self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
self.dropout = nn.Dropout(p=dropout)
- Multi-Head Attention
class MultiHeadedAttention(nn.Module):
    """Multi-head attention: project Q/K/V, attend per head, re-combine.

    Assumes d_v == d_k == d_model / h.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        self.d_k = d_model // h
        self.h = h

        # One linear projection each for Q, K and V.
        self.linear_layers = nn.ModuleList(nn.Linear(d_model, d_model)
                                           for _ in range(3))
        # Final projection after concatenating the heads.
        self.output_linear = nn.Linear(d_model, d_model)
        # Scaled dot-product attention core.
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # 1) Linear projections, then split d_model into h heads of size d_k:
        #    (batch, seq, d_model) -> (batch, h, seq, d_k).
        projected = []
        for layer, tensor in zip(self.linear_layers, (query, key, value)):
            heads = layer(tensor).view(batch_size, -1, self.h, self.d_k)
            projected.append(heads.transpose(1, 2))
        query, key, value = projected

        # 2) Scaled dot-product attention over all heads in the batch.
        #    x: (batch, h, seq, d_k); attn: (batch, h, seq, seq)
        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) Concatenate heads back to (batch, seq, d_model) and project.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.output_linear(x)
- scaled Dot-Product Attention
class Attention(nn.Module):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V."""

    def forward(self, query, key, value, mask=None, dropout=None):
        d_k = query.size(-1)
        # (batch, h, seq, d_k) @ (batch, h, d_k, seq) -> (batch, h, seq, seq)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            # Masked positions get a large negative score so softmax ~ 0.
            scores = scores.masked_fill(mask == 0, -1e9)

        attn_weights = F.softmax(scores, dim=-1)
        if dropout is not None:
            attn_weights = dropout(attn_weights)

        # Weighted sum of values plus the attention map itself.
        return torch.matmul(attn_weights, value), attn_weights
- feed_forward
公式:
class PositionwiseFeedForward(nn.Module):
    """Position-wise feed-forward network: w_2(dropout(GELU(w_1(x))))."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        hidden = self.activation(self.w_1(x))
        return self.w_2(self.dropout(hidden))
#GELU 没有深究,之后补
class GELU(nn.Module):
    """Tanh approximation of GELU — the activation BERT uses instead of ReLU
    (paper section 3.4, last paragraph)."""

    def forward(self, x):
        inner = math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))
$d_{ff}$ 的取值为 $4 \cdot d_{model}$
- connect_layer
class SublayerConnection(nn.Module):
    """Pre-norm residual wrapper: x + dropout(sublayer(norm(x))).

    Note: for code simplicity the norm is applied *before* the sublayer,
    as opposed to the post-norm ordering of the original Transformer paper.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply the residual connection to any same-sized sublayer."""
        residual = self.dropout(sublayer(self.norm(x)))
        return x + residual
# LayerNorm
class LayerNorm(nn.Module):
    """Layer normalization with learnable gain (a_2) and bias (b_2)."""

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        # eps keeps the division stable when sigma is ~0.
        return self.a_2 * (x - mu) / (sigma + self.eps) + self.b_2
- transform的计算
def forward(self, x, mask):
    """One transformer block: self-attention then feed-forward, each wrapped
    in a pre-norm residual connection, followed by a final dropout.
    Shapes: x (batch, seq, embed); mask (batch, 1, seq, seq)."""
    # Self-attention sublayer (Q = K = V = x), wrapped in add & norm.
    attended = self.input_sublayer(
        x, lambda t: self.attention.forward(t, t, t, mask=mask))
    # Feed-forward sublayer, wrapped in add & norm.
    transformed = self.output_sublayer(attended, self.feed_forward)
    return self.dropout(transformed)
然后bert中的forward:
def forward(self, x, segment_info):
    """Run token ids `x` and segment ids through the embedding and the
    full stack of transformer blocks; returns (batch, seq, embed)."""
    # Attention mask over padded (id 0) positions:
    # (batch, seq) -> boolean (batch, 1, seq, seq).
    mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

    # Indices -> summed token/position/segment embeddings.
    x = self.embedding(x, segment_info)

    # Run every transformer block in order.
    for block in self.transformer_blocks:
        x = block.forward(x, mask)
    return x
bert的训练,重点结合源码给出相关Tensor的size大小
class BERTTrainer:
"""
Pre-training driver for BERT, jointly optimizing the two objectives:
1. Masked Language Model : 3.3.1 Task #1: Masked LM
2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction
please check the details on README.md with simple example.
"""
def __init__(self, bert: BERT, vocab_size: int,
train_dataloader: DataLoader, test_dataloader: DataLoader = None,
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01,
warmup_steps=10000,
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10):
"""
:param bert: BERT model which you want to train
:param vocab_size: total word vocab size
:param train_dataloader: train dataset data loader
:param test_dataloader: test dataset data loader [can be None]
:param lr: learning rate of optimizer
:param betas: Adam optimizer betas
:param weight_decay: Adam optimizer weight decay param
:param warmup_steps: warmup steps for the LR schedule
:param with_cuda: training with cuda
:param log_freq: logging frequency of the batch iteration
"""
# Setup cuda device for BERT training; falls back to CPU when unavailable.
cuda_condition = torch.cuda.is_available() and with_cuda
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
# This BERT model will be saved every epoch
self.bert = bert
# Initialize the BERT Language Model (NSP head + masked-LM head) around the encoder.
self.model = BERTLM(bert, vocab_size).to(self.device)
# Distributed GPU training if CUDA can detect more than 1 GPU
if with_cuda and torch.cuda.device_count() > 1:
print("Using %d GPUS for BERT" % torch.cuda.device_count())
self.model = nn.DataParallel(self.model, device_ids=cuda_devices)
# Setting the train and test data loader
self.train_data = train_dataloader
self.test_data = test_dataloader
# Setting the Adam optimizer with hyper-param
self.optim = Adam(self.model.parameters(), lr=lr, betas=betas,
weight_decay=weight_decay)
# Wrap the optimizer in the warmup LR schedule (custom step, see ScheduledOptim).
self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden,
n_warmup_steps=warmup_steps)
# Using Negative Log Likelihood Loss function for predicting the masked_token.
# ignore_index=0: padded / unmasked positions do not contribute to the loss.
self.criterion = nn.NLLLoss(ignore_index=0)
self.log_freq = log_freq
print("Total Parameters:", sum([p.nelement() \
for p in self.model.parameters()]))
def train(self, epoch):
self.iteration(epoch, self.train_data)
def test(self, epoch):
# train=False: no backward pass / optimizer step during evaluation.
self.iteration(epoch, self.test_data, train=False)
BERTLM的定义:
class BERTLM(nn.Module):
    """
    BERT language model for pre-training: wraps a BERT encoder with the
    next-sentence-prediction head and the masked-LM head.
    """

    def __init__(self, bert: BERT, vocab_size):
        """
        :param bert: BERT encoder which should be trained
        :param vocab_size: total vocab size for masked_lm
        """
        super().__init__()
        self.bert = bert
        self.next_sentence = NextSentencePrediction(self.bert.hidden)
        self.mask_lm = MaskedLanguageModel(self.bert.hidden, vocab_size)

    def forward(self, x, segment_label):
        # Encode once, then feed both heads the same hidden states.
        encoded = self.bert(x, segment_label)
        return self.next_sentence(encoded), self.mask_lm(encoded)
# NSP
class NextSentencePrediction(nn.Module):
    """Binary classifier over the [CLS] position: is_next vs is_not_next."""

    def __init__(self, hidden):
        """
        :param hidden: BERT model output size
        """
        super().__init__()
        self.linear = nn.Linear(hidden, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # Only the first token ([CLS]) feeds the 2-way classification.
        cls_state = x[:, 0]
        return self.softmax(self.linear(cls_state))
# Mask-LM
class MaskedLanguageModel(nn.Module):
    """Per-position classifier predicting the original token from the masked
    input sequence — an n-way problem with n = vocab_size."""

    def __init__(self, hidden, vocab_size):
        """
        :param hidden: output size of BERT model
        :param vocab_size: total vocab size
        """
        super().__init__()
        self.linear = nn.Linear(hidden, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        logits = self.linear(x)
        return self.softmax(logits)
训练代码
# --- body of BERTTrainer.iteration (def line not shown in this excerpt) ---
# Setting the tqdm progress bar over the epoch's batches.
data_iter = tqdm.tqdm(enumerate(data_loader),
desc="EP_%s:%d" % (str_code, epoch),
total=len(data_loader),
bar_format="{l_bar}{r_bar}")
avg_loss = 0.0
total_correct = 0
total_element = 0
for i, data in data_iter:
# 0. batch_data will be sent into the device (GPU or CPU).
data = {key: value.to(self.device) for key, value in data.items()}
# data["bert_input"]: [batch_size, seq_len]
# data["segment_label"]: [batch_size, seq_len]
# 1. forward the next_sentence_prediction and masked_lm model
next_sent_output, mask_lm_output = self.model.forward(data["bert_input"],
data["segment_label"])
# next_sent_output: [batch_size, 2]
# mask_lm_output: [batch_size, seq_len, vocab_size]
# 2-1. NLL (negative log likelihood) loss of the is_next classification result.
next_loss = self.criterion(next_sent_output, data["is_next"])
# 2-2. NLLLoss of predicting the masked token word; the transpose puts the
# class dimension second, as nn.NLLLoss expects.
mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])
# Because the criterion uses ignore_index=0, only positions where
# data["bert_label"] is non-zero (i.e. the masked words) contribute.
# 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
loss = next_loss + mask_loss
# 3. backward and optimization only in train mode.
if train:
self.optim_schedule.zero_grad()
loss.backward()
self.optim_schedule.step_and_update_lr()
# step_and_update_lr is the custom warmup step defined on ScheduledOptim.
# Next-sentence prediction accuracy for this batch.
correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
avg_loss += loss.item()
total_correct += correct
total_element += data["is_next"].nelement()
post_fix = {
"epoch": epoch,
"iter": i,
"avg_loss": avg_loss / (i + 1),
"avg_acc": total_correct / total_element * 100,
"loss": loss.item()
}
if i % self.log_freq == 0:
data_iter.write(str(post_fix))
print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter),
"total_acc=", total_correct * 100.0 / total_element)
优化器自定义的函数:
class ScheduledOptim():
    """Optimizer wrapper implementing the Transformer warmup LR schedule:
    lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)."""

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        # Base scale depends only on the model width.
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        """Update the learning rate, then step the wrapped optimizer."""
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        """Clear gradients via the wrapped optimizer."""
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        # Rises linearly during warmup, then decays as step^-0.5.
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        """Advance the step counter and write the new LR to all param groups."""
        self.n_current_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()
        for group in self._optimizer.param_groups:
            group['lr'] = new_lr
后续的文章补充为利用BERT进行NLP的相关实践任务