A Brief Overview of BERT
BERT stands for Bidirectional Encoder Representations from Transformers. The name captures two key points: first, it is an encoder built on the transformer architecture; second, that encoder is bidirectional. The first point invites comparison with the Generative Pre-trained Transformer (GPT): GPT is based on the transformer decoder, while BERT is based on the transformer encoder. The second point, bidirectionality, can also be understood relative to GPT. GPT predicts tokens sequentially, using the preceding K tokens to predict the current one, whereas BERT considers a token's context in both directions, using the sequence on both sides of each token to predict it. If you are unfamiliar with the transformer architecture or GPT, see my two earlier posts, linked below:
transformer: https://mp.csdn.net/mp_blog/creation/editor/139606957
GPT: https://mp.csdn.net/mp_blog/creation/editor/139634738
BERT Implementation
BERT Model Structure
(Figure: BERT model structure)
As the figure shows, BERT is implemented from the encoder module of the transformer architecture. Each encoder block consists of a multi-head attention module and a feed-forward (FFN) module; I will not describe these again here, but a reference sketch follows.
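For reference, here is a minimal sketch of one encoder block built around nn.MultiheadAttention. The dimensions, head count, and dropout rate are illustrative defaults, not necessarily the exact settings of the reference implementation used later in this post.

import torch
from torch import nn

class EncoderBlockSketch(nn.Module):
    """Minimal encoder block: multi-head self-attention + feed-forward, each with residual + LayerNorm."""
    def __init__(self, model_dim=256, num_head=4, drop_rate=0.2):
        super().__init__()
        self.attn = nn.MultiheadAttention(model_dim, num_head, dropout=drop_rate, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(model_dim, model_dim * 4), nn.ReLU(), nn.Linear(model_dim * 4, model_dim))
        self.ln1 = nn.LayerNorm(model_dim)
        self.ln2 = nn.LayerNorm(model_dim)
        self.drop = nn.Dropout(drop_rate)

    def forward(self, x, key_padding_mask=None):
        # self-attention sub-layer with residual connection and layer normalization
        attn_out, _ = self.attn(x, x, x, key_padding_mask=key_padding_mask)
        x = self.ln1(x + self.drop(attn_out))
        # feed-forward sub-layer with residual connection and layer normalization
        x = self.ln2(x + self.drop(self.ffn(x)))
        return x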
PyTorch Implementation of BERT for Semantic Similarity
Reference code (highly recommended; this post only adds comments, and the implementation code is reproduced from the repository below. Many thanks to the original author! If this constitutes infringement, please contact me promptly for removal; if there are mistakes in my comments, corrections are welcome): GitHub - MorvanZhou/NLP-Tutorials: Simple implementations of NLP models. Tutorials are written in Chinese on my website https://mofanpy.com
Dataset Construction
We use the MRPC dataset. Each record contains two sentences, string1 and string2, plus a label indicating whether the two strings have the same meaning: 1 for similar, 0 for not similar. The training and test sets can be downloaded from the links below.
https://mofanpy.com/static/files/MRPC/msr_paraphrase_train.txt
https://mofanpy.com/static/files/MRPC/msr_paraphrase_test.txt
Dataset Download
import os
import requests

def maybe_download_mrpc(save_dir="./MRPC/", proxy=None):
    train_url = 'https://mofanpy.com/static/files/MRPC/msr_paraphrase_train.txt'
    test_url = 'https://mofanpy.com/static/files/MRPC/msr_paraphrase_test.txt'
    os.makedirs(save_dir, exist_ok=True)
    proxies = {"http": proxy, "https": proxy}
    for url in [train_url, test_url]:
        raw_path = os.path.join(save_dir, url.split("/")[-1])
        if not os.path.isfile(raw_path):
            print("downloading from %s" % url)
            r = requests.get(url, proxies=proxies)
            with open(raw_path, "w", encoding="utf-8") as f:
                f.write(r.text.replace('"', "<QUOTE>"))
    print("completed")
Dataset Processing
import pandas as pd

PAD_ID = 0  # module-level constant in the reference code: integer index reserved for <PAD> (consistent with padding_idx=0 below)

def _process_mrpc(dir="./MRPC", rows=None):
    # dict holding the training and test splits separately
    data = {"train": None, "test": None}
    files = os.listdir(dir)
    for f in files:
        df = pd.read_csv(os.path.join(dir, f), sep='\t', nrows=rows)
        k = "train" if "train" in f else "test"
        # nested dict: data{train: {is_same: labels, s1: string1, s2: string2}, test: {same layout as train}}
        data[k] = {"is_same": df.iloc[:, 0].values, "s1": df["#1 String"].values, "s2": df["#2 String"].values}
    vocab = set()
    for n in ["train", "test"]:
        for m in ["s1", "s2"]:
            for i in range(len(data[n][m])):
                # normalize the string
                data[n][m][i] = _text_standardize(data[n][m][i].lower())
                # collect the vocabulary
                cs = data[n][m][i].split(" ")
                vocab.update(set(cs))
    # map tokens to integer indices
    v2i = {v: i for i, v in enumerate(sorted(vocab), start=1)}
    # padding token
    v2i["<PAD>"] = PAD_ID
    v2i["<MASK>"] = len(v2i)
    # separator / end-of-sentence marker
    v2i["<SEP>"] = len(v2i)
    # start-of-sequence marker
    v2i["<GO>"] = len(v2i)
    # reverse mapping: integer index to token
    i2v = {i: v for v, i in v2i.items()}
    for n in ["train", "test"]:
        for m in ["s1", "s2"]:
            # nested dict: data{train: {is_same, s1, s2, s1id: token-id sequences, s2id: same as s1id}, test: {...}}
            data[n][m + "id"] = [[v2i[v] for v in c.split(" ")] for c in data[n][m]]
    return data, v2i, i2v
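A quick sketch of inspecting what the function returns; the printed values depend on the downloaded files:

data, v2i, i2v = _process_mrpc("./MRPC", rows=None)
print(len(v2i))                      # vocabulary size, including the special tokens
print(data["train"]["s1"][0])        # normalized first sentence of the first training pair
print(data["train"]["s1id"][0])      # the same sentence as a list of integer indices
print(data["train"]["is_same"][:5])  # the first five similarity labels (0 or 1)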
String normalization (regular-expression basics are not covered here; look them up if needed)
import re

def _text_standardize(text):
    # unify the various dash characters
    text = re.sub(r'—', '-', text)
    text = re.sub(r'–', '-', text)
    text = re.sub(r'―', '-', text)
    # replace standalone numbers with a <NUM> placeholder
    text = re.sub(r" \d+(,\d+)?(\.\d+)? ", " <NUM> ", text)
    text = re.sub(r" \d+-+?\d*", " <NUM>-", text)
    return text.strip()
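A small illustration of the normalization; the example sentence is made up:

# standalone numbers surrounded by spaces are collapsed into a <NUM> placeholder
print(_text_standardize("he paid 1,000 dollars for it"))
# -> he paid <NUM> dollars for it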
Building the MRPCData Dataset Class
import numpy as np
from torch.utils.data import Dataset as tDataset  # alias used by the reference code

class MRPCData(tDataset):
    # segment ids 0, 1, 2 mark string1, string2 and PAD respectively
    num_seg = 3
    # integer index of <PAD>
    pad_id = PAD_ID

    def __init__(self, data_dir="./MRPC/", rows=None, proxy=None):
        # download the dataset if necessary
        maybe_download_mrpc(save_dir=data_dir, proxy=proxy)
        # nested data dict, token-to-index dict v2i and index-to-token dict i2v
        data, self.v2i, self.i2v = _process_mrpc(data_dir, rows)
        # maximum input length over train and test; +3 accounts for <GO> s1id <SEP> s2id <SEP>
        self.max_len = max(
            [len(s1) + len(s2) + 3 for s1, s2 in zip(
                data["train"]["s1id"] + data["test"]["s1id"], data["train"]["s2id"] + data["test"]["s2id"])])
        # store the lengths of s1id and s2id so string1, string2 and PAD can be told apart later
        self.xlen = np.array([
            [
                len(data["train"]["s1id"][i]), len(data["train"]["s2id"][i])
            ] for i in range(len(data["train"]["s1id"]))], dtype=int)
        # build the integer-index sequences: <GO> s1id <SEP> s2id <SEP>
        x = [
            [self.v2i["<GO>"]] + data["train"]["s1id"][i] + [self.v2i["<SEP>"]] + data["train"]["s2id"][i] + [
                self.v2i["<SEP>"]]
            for i in range(len(self.xlen))
        ]
        # pad with zeros up to max_len (pad_zero is a helper from the reference code)
        self.x = pad_zero(x, max_len=self.max_len)
        # reshape the similarity labels to a 2-D array [[...]] for the cross-entropy loss
        self.nsp_y = data["train"]["is_same"][:, None]
        # segment encoding distinguishing string1, string2 and PAD (0, 1, 2)
        self.seg = np.full(self.x.shape, self.num_seg - 1, np.int32)
        for i in range(len(x)):
            # mark <GO> s1id <SEP> as segment 0
            si = self.xlen[i][0] + 2
            self.seg[i, :si] = 0
            # mark s2id <SEP> as segment 1
            si_ = si + self.xlen[i][1] + 1
            self.seg[i, si:si_] = 1
        # integer indices of all non-special tokens (used later for random replacement)
        self.word_ids = np.array(list(set(self.i2v.keys()).difference(
            [self.v2i[v] for v in ["<PAD>", "<MASK>", "<SEP>"]])))

    def __getitem__(self, idx):
        return self.x[idx], self.seg[idx], self.xlen[idx], self.nsp_y[idx]

    def sample(self, n):
        bi = np.random.randint(0, self.x.shape[0], size=n)
        bx, bs, bl, by = self.x[bi], self.seg[bi], self.xlen[bi], self.nsp_y[bi]
        return bx, bs, bl, by

    @property
    def num_word(self):
        return len(self.v2i)

    def __len__(self):
        return len(self.x)

    @property
    def mask_id(self):
        return self.v2i["<MASK>"]
Building the BERT Model
The BERT model is the encoder side of the transformer architecture; each encoder block consists of a multi-head attention module and an FFN module. The encoder block itself was covered in my earlier PyTorch implementation of the transformer, so I will not repeat it here:
https://mp.csdn.net/mp_blog/creation/editor/139606957
Now for the BERT model code. In theory BERT is built on the transformer encoder and GPT on the transformer decoder, but in essence both BERT's encoder and GPT's decoder here contain a single multi-head attention module and differ only in their masking mechanism (that is, in the boolean matrix passed to attention). From an implementation standpoint, our BERT class can therefore simply inherit from the GPT class.
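To make the difference concrete, here is a small self-contained sketch; the toy sequence and padding index 0 are made up. A GPT-style decoder combines the padding mask with a causal (upper-triangular) mask so that position i can only attend to positions at or before i, while BERT, as in the mask method below, blocks only the <PAD> positions.

import torch

seqs = torch.tensor([[3, 7, 5, 0, 0]])          # toy id sequence; 0 is the padding index
# BERT-style mask: block attention only at <PAD> positions, shape [batch, 1, 1, step]
bert_mask = torch.eq(seqs, 0)[:, None, None, :]
# GPT-style mask: additionally block all future positions, shape [batch, 1, step, step]
causal = torch.triu(torch.ones(seqs.shape[1], seqs.shape[1], dtype=torch.bool), diagonal=1)
gpt_mask = bert_mask | causal[None, None, :, :]
print(bert_mask.shape, gpt_mask.shape)          # torch.Size([1, 1, 1, 5]) torch.Size([1, 1, 5, 5])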
import torch
from torch.nn.functional import cross_entropy
# GPT is the decoder-based model class from my earlier GPT implementation post

class BERT(GPT):
    def __init__(
            self, model_dim, max_len, num_layer, num_head, n_vocab, lr,
            max_seg=3, drop_rate=0.2, padding_idx=0) -> None:
        super().__init__(model_dim, max_len, num_layer, num_head, n_vocab, lr, max_seg, drop_rate, padding_idx)

    def step(self, seqs, segs, seqs_, loss_mask, nsp_labels):
        device = next(self.parameters()).device
        self.opt.zero_grad()
        # reuse the forward pass inherited from GPT: token predictions plus the similarity prediction
        mlm_logits, nsp_logits = self(seqs, segs, training=True)  # [batch_size, step, n_vocab], [batch_size, n_cls]
        # cross-entropy loss over the token predictions
        mlm_loss = cross_entropy(
            # predictions at the masked positions only
            torch.masked_select(mlm_logits, loss_mask).reshape(-1, mlm_logits.shape[2]),
            # integer ids of the true tokens at those positions
            torch.masked_select(seqs_, loss_mask.squeeze(2))
        )
        # loss for predicting whether the two sentences have the same meaning
        nsp_loss = cross_entropy(nsp_logits, nsp_labels.reshape(-1))
        # weighted sum as the total model loss
        loss = mlm_loss + 0.2 * nsp_loss
        loss.backward()
        self.opt.step()
        return loss.cpu().data.numpy(), mlm_logits

    # override the mask function of the parent GPT class: only <PAD> positions are masked
    def mask(self, seqs):
        mask = torch.eq(seqs, self.padding_idx)
        return mask[:, None, None, :]
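To make the torch.masked_select calls in step more concrete, here is a tiny sketch with made-up shapes showing how a [batch, step, 1] boolean loss_mask picks out only the masked positions from the logits and the labels:

import torch

batch, step, n_vocab = 2, 4, 10
mlm_logits = torch.randn(batch, step, n_vocab)
seqs_ = torch.randint(0, n_vocab, (batch, step))            # ground-truth token ids
loss_mask = torch.zeros(batch, step, 1, dtype=torch.bool)
loss_mask[0, 1] = loss_mask[1, 3] = True                    # pretend these two positions were masked

picked_logits = torch.masked_select(mlm_logits, loss_mask).reshape(-1, n_vocab)  # [2, n_vocab]
picked_labels = torch.masked_select(seqs_, loss_mask.squeeze(2))                 # [2]
print(picked_logits.shape, picked_labels.shape)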
Masked language modeling is another essential part of implementing BERT. Given an input sequence, BERT randomly masks a certain proportion of the tokens and then predicts them from both directions during unsupervised pre-training (GPT is similar in spirit, except that it uses a causal sequence mask for left-to-right, autoregressive prediction). For BERT, however, the raw random-mask result is not used directly as the final model input; the masked positions also follow an 8-1-1 strategy:
- 80% of the masked tokens are replaced with the [MASK] token.
- 10% of the masked tokens are replaced with a random token from the vocabulary.
- 10% of the masked tokens are left unchanged.
The reasons for doing this include the following:
- Avoiding over-reliance on the [MASK] token: if every masked token were replaced with [MASK], the model might fixate on that token and ignore the information carried by other tokens. By randomly replacing some tokens and leaving others unchanged, the model must infer the masked token from its context rather than relying on the [MASK] marker alone.
- Improving generalization: random replacement forces the model to learn how to handle unexpected tokens, which helps it generalize when processing real text.
- Preserving context information: keeping some tokens unchanged means the model still receives a portion of the original, unmasked tokens during training, which helps it understand the context.
- Simulating real-world input: in practice the model rarely encounters tokens explicitly marked as [MASK]. The 8-1-1 strategy makes training more closely resemble real input, so the model performs better during fine-tuning and at test time.
- Balancing the training difficulty: if every masked token were left unchanged, the task would be too easy, since the model could simply copy tokens from the context. Introducing random replacement and the [MASK] token makes the model work harder, which improves training.
The random masking and the 8-1-1 strategy are implemented as follows. Note that this implementation draws one random number per batch and applies a single treatment (mask, keep, or replace, with probability 0.8/0.1/0.1) to the whole batch, rather than choosing a treatment per token:
def random_mask_or_replace(data, arange, dataset):
    # data holds: the <GO> s1id <SEP> s2id <SEP> PAD id sequences, the segment sequences [0..1..2..]
    # distinguishing the two strings and PAD, the lengths of s1id/s2id, and the similarity labels
    seqs, segs, xlen, nsp_labels = data
    # clone the original id sequences to serve as labels when computing the prediction loss
    seqs_ = seqs.data.clone()
    p = np.random.random()
    # with probability 0.8, feed the masked sequence
    if p < 0.8:
        # mask
        loss_mask = np.concatenate([
            do_mask(
                seqs[i],
                np.concatenate((arange[:xlen[i, 0]], arange[xlen[i, 0] + 1:xlen[i].sum() + 1])),
                dataset.pad_id,
                dataset.mask_id
            )
            for i in range(len(seqs))], axis=0)
    # with probability 0.1, feed the original sequence unchanged
    elif p < 0.9:
        # do nothing
        loss_mask = np.concatenate([
            do_nothing(
                seqs[i],
                np.concatenate((arange[:xlen[i, 0]], arange[xlen[i, 0] + 1:xlen[i].sum() + 1])),
                dataset.pad_id
            )
            for i in range(len(seqs))], axis=0)
    # with probability 0.1, replace the selected positions with random tokens
    else:
        # replace
        loss_mask = np.concatenate([
            do_replace(
                seqs[i],
                np.concatenate((arange[:xlen[i, 0]], arange[xlen[i, 0] + 1:xlen[i].sum() + 1])),
                dataset.pad_id,
                dataset.word_ids
            )
            for i in range(len(seqs))], axis=0)
    loss_mask = torch.from_numpy(loss_mask).unsqueeze(2)
    return seqs, segs, seqs_, loss_mask, xlen, nsp_labels
def _get_loss_mask(len_arange, seq, pad_id):
    # choose the positions to mask (MASK_RATE is a module-level constant in the reference code)
    rand_id = np.random.choice(len_arange, size=max(2, int(MASK_RATE * len(len_arange))), replace=False)
    # boolean matrix matching seq that records which positions are masked
    loss_mask = np.full_like(seq, pad_id, dtype=bool)
    # mark the masked positions as True
    loss_mask[rand_id] = True
    return loss_mask[None, :], rand_id

# replace the selected positions of the input with the <MASK> id
def do_mask(seq, len_arange, pad_id, mask_id):
    loss_mask, rand_id = _get_loss_mask(len_arange, seq, pad_id)
    seq[rand_id] = mask_id
    return loss_mask

# replace the selected positions of the input with random tokens
def do_replace(seq, len_arange, pad_id, word_ids):
    loss_mask, rand_id = _get_loss_mask(len_arange, seq, pad_id)
    seq[rand_id] = torch.from_numpy(np.random.choice(word_ids, size=len(rand_id))).type(torch.IntTensor)
    return loss_mask

# leave the selected positions of the input unchanged
def do_nothing(seq, len_arange, pad_id):
    loss_mask, _ = _get_loss_mask(len_arange, seq, pad_id)
    return loss_mask
Model Training
from torch.utils.data import DataLoader
import utils  # the reference code's module that provides MRPCData

def train():
    # hyperparameters
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4
    # build the dataset
    dataset = utils.MRPCData("./MRPC", 5000)
    print("num word: ", dataset.num_word)
    # initialize the BERT model
    model = BERT(
        model_dim=MODEL_DIM, max_len=dataset.max_len, num_layer=N_LAYER, num_head=4, n_vocab=dataset.num_word,
        lr=LEARNING_RATE, max_seg=dataset.num_seg, drop_rate=0.2, padding_idx=dataset.pad_id
    )
    # cpu or cuda
    if torch.cuda.is_available():
        print("GPU train available")
        device = torch.device("cuda")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        model = model.cpu()
    loader = DataLoader(dataset, batch_size=64, shuffle=True)
    # index range used later for random masking
    arange = np.arange(0, dataset.max_len)
    for epoch in range(1000):
        for batch_idx, batch in enumerate(loader):
            seqs, segs, seqs_, loss_mask, xlen, nsp_labels = random_mask_or_replace(batch, arange, dataset)
            seqs, segs, seqs_, nsp_labels, loss_mask = seqs.type(torch.LongTensor).to(device), segs.type(
                torch.LongTensor).to(device), seqs_.type(torch.LongTensor).to(device), nsp_labels.to(
                device), loss_mask.to(device)
            loss, pred = model.step(seqs, segs, seqs_, loss_mask, nsp_labels)
            if batch_idx % 100 == 0:
                pred = pred[0].cpu().data.numpy().argmax(axis=1)
                print(
                    "\n\nEpoch: ", epoch,
                    "|batch: ", batch_idx,
                    "| loss: %.3f" % loss,
                    "\n| tgt: ", " ".join([dataset.i2v[i] for i in seqs[0].cpu().data.numpy()[:xlen[0].sum() + 1]]),
                    "\n| prd: ", " ".join([dataset.i2v[i] for i in pred[:xlen[0].sum() + 1]]),
                    "\n| tgt word: ", [dataset.i2v[i] for i in (seqs_[0] * loss_mask[0].view(-1)).cpu().data.numpy() if
                                       i != dataset.v2i["<PAD>"]],
                    "\n| prd word: ", [dataset.i2v[i] for i in pred * (loss_mask[0].view(-1).cpu().data.numpy()) if
                                       i != dataset.v2i["<PAD>"]],
                )
    os.makedirs("./visual/models/bert", exist_ok=True)
    torch.save(model.state_dict(), "./visual/models/bert/model.pth")
    export_attention(model, device, dataset)  # attention-visualization helper from the reference code
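Since only the state_dict is saved, restoring the model later requires rebuilding it with the same hyperparameters first. A minimal sketch, assuming the same dataset object is available:

model = BERT(
    model_dim=256, max_len=dataset.max_len, num_layer=4, num_head=4, n_vocab=dataset.num_word,
    lr=1e-4, max_seg=dataset.num_seg, drop_rate=0.2, padding_idx=dataset.pad_id)
model.load_state_dict(torch.load("./visual/models/bert/model.pth"))
model.eval()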