It's 1024 (Programmer's Day), so here's a little write-up to celebrate.
My teachers always said that beginners need depth: really digest the code, because breadth without depth gets you nowhere. Now I see what they meant: once you have learned one thing deeply, picking up other things is easy, and they even start to feel like variations of the same thing.
Below we walk through the source code of a simplified BERT.
Here we feed two sentences into BERT at once, and the task is to predict whether one sentence is the next sentence of the other (next sentence prediction). The input format is [CLS] + sentence A + [SEP] + sentence B + [SEP].
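A concrete example of that format (my own toy illustration, not from the original post):

tokens:      [CLS] hello how are you [SEP] i  am fine thanks [SEP]
segment ids:   0     0    0   0   0    0   1  1   1     1      1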
import math
import re
from random import randrange, shuffle, random, randint  # only the functions used below
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
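The make_batch function below relies on several globals that this snippet does not define: sentences, token_list, word_dict, number_dict, vocab_size, batch_size, max_pred, and maxlen. A minimal sketch of that setup, with a toy corpus and assumed values for batch_size, max_pred, and maxlen, might look like this:

# Assumed setup (not shown in the original snippet): toy corpus and vocabulary.
text = (
    'Hello, how are you? I am Romeo.\n'
    'Hello, Romeo. My name is Juliet. Nice to meet you.\n'
    'Nice to meet you too. How are you today?'
)
sentences = re.sub("[.,!?-]", '', text.lower()).split('\n')   # strip punctuation, one sentence per line
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}  # special tokens first
for i, w in enumerate(sorted(set(" ".join(sentences).split()))):
    word_dict[w] = i + 4
number_dict = {i: w for w, i in word_dict.items()}             # id -> token
vocab_size = len(word_dict)
token_list = [[word_dict[w] for w in s.split()] for s in sentences]  # sentences as id lists

batch_size = 6   # assumed; must be even so IsNext/NotNext can split evenly
max_pred = 5     # assumed; maximum number of masked tokens per sequence
maxlen = 30      # assumed; fixed sequence length after padding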
# sample so that IsNext and NotNext examples are balanced within a small batch
def make_batch():
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        # randrange returns a random number in the given range; here it picks a random index for each sentence
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]  # look up the two sentences by index
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
        # the input format BERT requires when it is fed two sentences
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        # every token of the first sentence (including [CLS] and the first [SEP]) gets segment id 0; the second sentence gets 1
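        # e.g. len(tokens_a)=4, len(tokens_b)=3 gives (1+4+1)=6 zeros and (3+1)=4 ones,
        # matching the 10 tokens [CLS] a1..a4 [SEP] b1..b3 [SEP]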
        # MASK LM
        n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15))))  # mask 15% of the tokens (at least 1, at most max_pred)
        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]  # positions eligible for masking ([CLS]/[SEP] excluded)
        shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [], []
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])  # remember the original token as the MLM label
            if random() < 0.8:    # 80%: replace with [MASK]
                input_ids[pos] = word_dict['[MASK]']
            elif random() < 0.5:  # 10% overall (half of the remaining 20%): replace with a random word
                index = randint(0, vocab_size - 1)  # random index into the vocabulary
                input_ids[pos] = word_dict[number_dict[index]]
            # otherwise (the remaining ~10%): keep the original token unchanged
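            # Note: randint draws from the whole vocabulary, so the "random word" can
            # occasionally be a special token such as [PAD] or [CLS]; a stricter
            # implementation would sample from ordinary words only.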
        # Zero Paddings: pad input_ids/segment_ids up to the fixed length maxlen
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero-pad masked_tokens/masked_pos up to max_pred (only ~15% of tokens are masked)
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
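        # (Why pad to max_pred: downstream, masked_pos is typically used to gather the
        # hidden states at the masked positions into a fixed-size tensor, which needs
        # masked_tokens/masked_pos to have the same length for every example.)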
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])   # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])  # NotNext
            negative += 1
    return batch
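As a quick sanity check (my own usage sketch, assuming the toy setup above), the batch can be unpacked column-wise and converted to tensors:

batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, is_next = map(
    torch.LongTensor, zip(*batch))        # each column becomes one LongTensor
print(input_ids.shape)   # torch.Size([6, 30]) with the assumed batch_size and maxlen
print(is_next)           # half 1s (IsNext) and half 0s (NotNext)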