# -*- coding: utf-8 -*-
"""Uni bert.ipynb"""

from typing import List, Optional, Tuple

import numpy as np
import random
import tensorflow as tf

"""Building the encoder"""


# Generic scaled dot-product attention
def scaled_dot_product_attention(q, k, v, mask=None):
    # Compute the raw matching scores between queries and keys
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_dot = matmul_qk / tf.math.sqrt(dk)

    # Use the mask to suppress attention to padded positions
    if mask is not None:
        scaled_attention_dot += (mask * -1e9)

    # Normalize the scores, then aggregate the values
    attention_weights = tf.nn.softmax(scaled_attention_dot, axis=-1)
    output = tf.matmul(attention_weights, v)

    # Return the aggregated 3-D tensor and the attention matrix
    return output, attention_weights


'''
Multi-head attention

The MultiHeadAttention class is the complete implementation of the attention layer.
Its constructor takes three arguments:
    d_model: the model's main dimension, which reflects its expressive capacity.
        It is the size of the last dimension of every tensor in the attention layer,
        and of most tensors throughout BERT.
    num_heads: the number of attention heads.
    seed: the random seed, which affects the quality of the initialization.
'''


def get_initializer(seed):
    return tf.keras.initializers.TruncatedNormal(mean=0., stddev=0.02, seed=seed)


class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, seed=233):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads

        # Dense projections in front of Q, K and V
        self.wq = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(seed), name='wq')
        self.wk = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(seed), name='wk')
        self.wv = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(seed), name='wv')

        # Dense layer that recombines the multi-head attention results
        self.dense = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(seed))

    def split_heads(self, x, batch_size):
        # Add an attention-head dimension
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        # Project Q, K and V
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        # Split into attention heads
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # Multi-head attention computation
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        # Concatenate the per-head results, then recombine them
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)

        return output, attention_weights


# BERT uses the GELU activation function (tanh approximation)
def gelu(x):
    cdf = 0.5 * (1.0 + tf.tanh(
        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf


# Point-wise feed-forward network
def point_wise_feed_forward_network(d_model, dff, seed=233):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation=gelu, kernel_initializer=get_initializer(seed), name='ffn_1'),
        tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(seed), name='ffn_2'),
    ])
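'''
A quick shape sanity check for the layers above (an illustrative addition, not part of
the original notebook; the _demo_* names and toy sizes are made up): run a random batch
through MultiHeadAttention and the feed-forward network. Both preserve the
(batch_size, seq_len, d_model) shape, and the attention weights carry one
(seq_len, seq_len) matrix per head.
'''
_demo_x = tf.random.uniform((2, 8, 64))  # (batch, seq_len, d_model)
_demo_out, _demo_weights = MultiHeadAttention(d_model=64, num_heads=2)(_demo_x, _demo_x, _demo_x, mask=None)
print(_demo_out.shape)       # (2, 8, 64)
print(_demo_weights.shape)   # (2, 2, 8, 8) -> (batch, num_heads, seq_len, seq_len)
print(point_wise_feed_forward_network(d_model=64, dff=256)(_demo_x).shape)  # (2, 8, 64)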
'''
Encoder layer

Now that the attention layer and the point-wise feed-forward layer are implemented,
we can combine them into an encoder layer. The encoder layer takes a 3-D tensor x and
a 4-D tensor mask as input, and its output is again a 3-D tensor. First we use the
attention layer to implement self-attention, by feeding x in as Q, K and V at the same
time:

    attention_output = Attention(x, x, x, mask)

Then we apply a residual connection, as in residual networks, followed by a
LayerNormalization layer:

    output_1 = LayerNorm(x + attention_output)

Both residual connections and LayerNormalization help stabilize gradients in networks
with many layers. Finally, we pass output_1 through the point-wise feed-forward layer
and apply another residual connection and LayerNormalization, which completes the
encoder layer.
'''


def get_layer_norm():
    return tf.keras.layers.LayerNormalization(
        epsilon=1e-6,
        beta_initializer=tf.keras.initializers.Zeros(),
        gamma_initializer=tf.keras.initializers.Ones(),
    )


class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, name, rate=0.1, seed=233):
        super(EncoderLayer, self).__init__(name=name)

        self.mha = MultiHeadAttention(d_model, num_heads, seed=seed)
        self.ffn = point_wise_feed_forward_network(d_model, dff, seed=seed)

        self.layer_norm_1 = get_layer_norm()
        self.layer_norm_2 = get_layer_norm()

        self.dropout_1 = tf.keras.layers.Dropout(rate, seed=seed)
        self.dropout_2 = tf.keras.layers.Dropout(rate, seed=seed)

    def call(self, x, mask):
        # Self-attention + residual connection + layer norm
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout_1(attn_output)
        out1 = self.layer_norm_1(x + attn_output)

        # Point-wise feed-forward network + residual connection + layer norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout_2(ffn_output)
        out2 = self.layer_norm_2(out1 + ffn_output)

        return out2


'''
[cls] my dog is cute [sep] he likes play #ing [sep] [pad] [pad] [pad] [pad] [pad]

The two special tokens [cls] and [sep] mark where the sequence begins and where each
sentence ends; since there are two sentences, there are two [sep] tokens.
[pad] appends meaningless tokens at the end (the padding operation) so that the
concatenated sentence pair always has a fixed length; in the example above, that
length is 16.
'''

'''
In the embedding layer, every token receives three embeddings:

E_{Token}: each distinct word has its own embedding, which can simply be thought of as
    that word's vector.
E_{Segment}: there are only two of these in total; the first sentence A gets one and
    the second sentence B gets the other.
E_{Position}: each position has its own embedding, through which the encoder knows
    where the token sits in the sequence.
'''
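'''
To make the fixed id layouts concrete, the toy sketch below (an illustrative addition,
not part of the original notebook; the _demo_* names are made up) builds the position
and segment id sequences the same way TransformerEncoder does, for segment_max_len = 2,
i.e. max_len = 2 * 2 + 3 = 7. Position ids simply count the slots, and segment ids
switch from 0 to 1 right after the first [sep].
'''
_demo_segment_max_len = 2
_demo_max_len = 2 * _demo_segment_max_len + 3
_demo_position_ids = list(range(_demo_max_len))
_demo_segment_ids = [0] * (2 + _demo_segment_max_len) + [1] * (1 + _demo_segment_max_len)
print(_demo_position_ids)  # [0, 1, 2, 3, 4, 5, 6]
print(_demo_segment_ids)   # [0, 0, 0, 0, 1, 1, 1]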
'''
After obtaining the 3-D tensor from the embedding layer, we first pass it through a
Dropout layer and a LayerNormalization layer, and then through a stack of encoder
layers; the result is the output of the whole encoder.
'''

# The full encoder implementation
'''
num_layers: how many encoder layers there are;
d_model: the d_model of each encoder layer;
num_heads: how many attention heads there are;
dff: how many units the first dense layer of the point-wise feed-forward network has;
segment_max_len: the maximum length of each sentence;
input_vocab_size: the size of the tokenizer's vocabulary;
rate: the drop probability of the Dropout layers;
seed: the random seed used for initialization.
'''


class TransformerEncoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, segment_max_len, input_vocab_size,
                 rate=0.1, seed=233):
        super(TransformerEncoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # The maximum length of a sentence pair is the two sentence maxima plus three special tokens
        max_len = 2 * segment_max_len + 3

        # The three embeddings
        self.embedding = tf.keras.layers.Embedding(
            input_dim=input_vocab_size,
            output_dim=d_model,
            embeddings_initializer=get_initializer(seed),
            trainable=True,
        )
        self.position_embedding = tf.keras.layers.Embedding(
            input_dim=max_len,
            output_dim=d_model,
            embeddings_initializer=get_initializer(seed),
            trainable=True,
        )
        self.segment_embedding = tf.keras.layers.Embedding(
            input_dim=2,
            output_dim=d_model,
            embeddings_initializer=get_initializer(seed),
            trainable=True,
        )

        # For every sentence pair the position id sequence is fixed: [0, 1, 2, 3, ...]
        position_ids = range(max_len)
        self.position_ids = tf.cast(np.array(position_ids).reshape((1, -1)), dtype=tf.int32)

        # Because every sentence is padded or truncated to exactly segment_max_len (the
        # simplification mentioned earlier), the segment id sequence is also fixed:
        # [0, 0, 0, ..., 1, 1, 1, ...]
        segment_ids = [0 for _ in range(2 + segment_max_len)] + [1 for _ in range(1 + segment_max_len)]
        self.segment_ids = tf.cast(np.array(segment_ids).reshape((1, -1)), dtype=tf.int32)

        # The stack of encoder layers
        self.enc_layers = []
        for i in range(num_layers):
            self.enc_layers.append(EncoderLayer(d_model, num_heads, dff, 'encoder_layer_{}'.format(i), rate, seed))

        self.layer_norm = get_layer_norm()
        self.dropout = tf.keras.layers.Dropout(rate, seed=seed)

    def call(self, x, mask):
        # The embedding layer's output is the sum of the three embeddings
        x = self.embedding(x) + self.position_embedding(self.position_ids) + self.segment_embedding(self.segment_ids)
        x = self.layer_norm(self.dropout(x))

        # Run through the encoder layers one after another
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, mask)

        return x


"""Building the model"""

'''
BERT's input consists of four 2-D tensors:
token_ids, nsp_labels, mask_token_ids and is_masked.
The masked token ids of the sentence pair, i.e. the mask_token_ids tensor, are first fed
into the Transformer encoder, producing the 3-D tensor encoder_output, which is the
shared input of BERT's two pre-training tasks.

The two pre-training tasks are:
Next Sentence Prediction (NSP): decide whether the two sentences of the input pair are
    consecutive in the original document.
Masked Language Model (MLM): a cloze task, i.e. predict the original token at each
    masked position.
'''


# Padding mask
# To let the attention layers know which positions of the sentence pair are padding,
# we build a mask tensor from the padding information.
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]
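'''
A small check of the padding mask (an illustrative addition, not part of the original
notebook; the toy sequence is made up): positions holding the padding id 0 are marked
with 1.0, and the two extra axes let the mask broadcast over heads and query positions
inside scaled_dot_product_attention.
'''
_demo_seq = tf.constant([[5, 3, 0, 0], [2, 0, 0, 0]], dtype=tf.float32)
print(create_padding_mask(_demo_seq))
# shape (2, 1, 1, 4); values [[[[0., 0., 1., 1.]]], [[[0., 1., 1., 1.]]]]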
def build_bert(segment_max_len: int, input_vocab_size: int, num_layers: int, d_model: int,
               num_heads: int, d_ff: int, rate: float, seed: int):
    # The maximum length of a sentence pair is the two sentence maxima plus three special tokens
    max_len = 2 * segment_max_len + 3

    # Input layers
    token_ids = tf.keras.layers.Input(shape=(max_len,), name='token_ids')
    target_token_ids = tf.keras.layers.Input(shape=(max_len,), name='target_token_ids')
    is_masked = tf.keras.layers.Input(shape=(max_len,), name='is_masked')

    # Build the padding mask tensor (the padding positions of the masked and the
    # original sequence are identical)
    mask = create_padding_mask(token_ids)

    # Get the output of the Transformer encoder
    encoder = TransformerEncoder(
        num_layers=num_layers,
        d_model=d_model,
        num_heads=num_heads,
        dff=d_ff,
        segment_max_len=segment_max_len,
        input_vocab_size=input_vocab_size,
        rate=rate,
        seed=seed,
    )
    # The encoder sees the masked sequence (fed through the target_token_ids input);
    # the original token_ids only serve as the MLM labels below.
    encoder_output = encoder(target_token_ids, mask=mask)

    # Network head for the NSP objective: pool the [cls] vector, then a sigmoid unit
    pooler = encoder_output[:, 0, :]
    pool_dense = tf.keras.layers.Dense(
        d_model, activation='tanh', kernel_initializer=get_initializer(seed), name='pooler_dense'
    )
    nsp_prob = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=get_initializer(seed),
                                     name='nsp_prob')
    nsp_output = nsp_prob(pool_dense(pooler))

    # Network head for the MLM objective
    mlm_dense = tf.keras.layers.Dense(
        d_model, activation=gelu, kernel_initializer=get_initializer(seed), name='mlm_dense'
    )
    mlm_norm = get_layer_norm()
    mlm_activation = tf.keras.layers.Dense(
        input_vocab_size, activation='linear', kernel_initializer=get_initializer(seed), name='mlm_activation'
    )
    mlm_norm_output = mlm_norm(mlm_dense(encoder_output))
    # Tie the output projection to the token embedding matrix, plus a learned bias
    embedding_token = mlm_norm_output @ tf.transpose(encoder.embedding.embeddings)
    bias = tf.keras.initializers.Zeros()(shape=(input_vocab_size,))
    mlm_bias = tf.Variable(tf.cast(bias, tf.float32), name="mlm_bias")
    mlm_output = mlm_activation(tf.nn.bias_add(embedding_token, mlm_bias))

    # MLM loss and accuracy, averaged only over the masked positions
    def mlm_loss(inputs):
        y_true, y_pred, mask_ = inputs
        loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        loss = tf.reduce_sum(loss * mask_) / (tf.reduce_sum(mask_) + tf.keras.backend.epsilon())
        return loss

    def mlm_acc(inputs):
        y_true, y_pred, mask_ = inputs
        acc = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = tf.reduce_sum(acc * mask_) / (tf.reduce_sum(mask_) + tf.keras.backend.epsilon())
        return acc

    mlm_loss = tf.keras.layers.Lambda(mlm_loss, name='mlm_loss')([token_ids, mlm_output, is_masked])
    mlm_acc = tf.keras.layers.Lambda(mlm_acc, name='mlm_acc')([token_ids, mlm_output, is_masked])

    # Define and return the model
    model = tf.keras.models.Model(
        inputs=[token_ids, target_token_ids, is_masked],
        outputs=[nsp_output, mlm_loss, mlm_acc],
    )
    return model


# Compiling the model
# Compiling binds the model to its training-related configuration so that it is ready to train.
# "Compiling" here means choosing BERT's optimizer and defining its losses and metrics.
def compile_bert(model: tf.keras.models.Model, learning_rate: float) -> tf.keras.models.Model:
    model.compile(
        loss={
            'mlm_loss': lambda y_true, y_pred: y_pred,
            'nsp_prob': tf.keras.losses.BinaryCrossentropy(),
        },
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-6),
        metrics={
            'mlm_acc': lambda y_true, y_pred: y_pred,
            'nsp_prob': tf.keras.metrics.AUC()
        },
    )
    return model


# Tokenizer
# The raw corpus is a sequence of texts; before entering BERT it becomes a 2-D tensor.
# That conversion is the tokenize step.
# '今天天气不错' -> '今 天 天 气 不 错'
def segment(raw_texts: List[str]) -> List[str]:
    res = []
    for text in raw_texts:
        res.append(' '.join([c for c in text]))
    return res


# "Training" the tokenizer
def build_tokenizer(texts: List[str]) -> tf.keras.preprocessing.text.Tokenizer:
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True)
    tokenizer.fit_on_texts(texts)
    return tokenizer


# Tokenize sentences with the tokenizer, truncating and padding each one to max_len
def tokenize(tokenizer: tf.keras.preprocessing.text.Tokenizer, texts: List[str],
             max_len: Optional[int] = None) -> List[List[int]]:
    seqs = tokenizer.texts_to_sequences(texts=texts)
    if max_len is not None:
        for i in range(len(seqs)):
            seqs[i] = seqs[i][0:max_len]
            while len(seqs[i]) < max_len:
                seqs[i].append(0)
    return seqs


'''
Masking

BERT's MLM task needs to hide part of the tokens in a sentence; the get_mask_token_ids
function below implements this. It takes the tokenized token id sequence (token_ids) and
returns the masked token id sequence (mask_token_ids) together with a 0/1 sequence
indicating which tokens were masked (is_masked).
'''


# Replace a token that has been selected for masking
def mask_replace(token_id: int, mask_token_id: int, vocab_size: int) -> int:
    rand = random.random()
    if rand <= 0.8:
        # 80%: replace with the [mask] token
        return mask_token_id
    elif rand <= 0.9:
        # 10%: keep the original token
        return token_id
    else:
        # 10%: replace with a random non-padding token id
        return random.randint(1, vocab_size - 1)


# Apply masking to a whole sequence
def get_mask_token_ids(token_ids: List[int], mask_token_id: int, mask_rate: float,
                       vocab_size: int) -> Tuple[List[int], List[int]]:
    mask_token_ids, is_masked = [], []
    for token_id in token_ids:
        if token_id == 0 or random.random() > mask_rate:
            mask_token_ids.append(token_id)
            is_masked.append(0)
        else:
            mask_token_ids.append(mask_replace(token_id, mask_token_id, vocab_size))
            is_masked.append(1)
    return mask_token_ids, is_masked
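'''
A quick illustration of the masking scheme (an illustrative addition, not part of the
original notebook; the toy ids, mask_token_id=99 and vocab_size=100 are made up, while
the real pipeline uses mask_token_id = vocab_size - 1): each non-padding token is
selected with probability mask_rate, and a selected token is replaced by [mask] 80% of
the time, kept unchanged 10% of the time, and swapped for a random token 10% of the
time. The printed result varies from run to run.
'''
_demo_token_ids = [7, 12, 5, 9, 0, 0]  # toy sequence, 0 = padding
_demo_masked, _demo_is_masked = get_mask_token_ids(
    _demo_token_ids, mask_token_id=99, mask_rate=0.5, vocab_size=100)
print(_demo_masked)     # e.g. [7, 99, 5, 99, 0, 0], depending on the random draws
print(_demo_is_masked)  # e.g. [0, 1, 0, 1, 0, 0]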
"""**Building the model inputs**"""

'''
The make_feature function turns the corpus into BERT's inputs. Concretely: we walk over
the documents, and over every sentence inside each document. For each sentence we pair
it with the following sentence as a positive NSP example, and with a randomly chosen
sentence as a negative NSP example. Finally every sentence pair is masked and wrapped
with [cls] and [sep].
'''


def make_feature(tokenizer: tf.keras.preprocessing.text.Tokenizer, raw_docs: List[List[str]],
                 segment_max_len: int, beg_token_id: int, sep_token_id: int, mask_token_id: int,
                 mask_rate: float, random_seed: int) -> Tuple[List[List[int]], List[List[int]],
                                                              List[List[int]], List[int]]:
    random.seed(random_seed)

    docs = []
    tot_texts = []
    for raw_doc in raw_docs:
        doc = segment(raw_doc)
        docs.append(list(range(len(tot_texts), len(tot_texts) + len(doc))))
        tot_texts.extend(doc)

    tot_token_ids = tokenize(tokenizer, tot_texts, segment_max_len)
    tot_text_idx = list(range(len(tot_texts)))

    tmp_token_ids = []
    nsp_labels = []
    for doc in docs:
        for i in range(1, len(doc)):
            pre_doc_id = doc[i - 1]
            # j = 0: a randomly sampled sentence (negative example);
            # j = 1: the true next sentence (positive example)
            cur_doc_ids = [random.choice(tot_text_idx), doc[i]]
            for j in range(2):
                cur_doc_id = cur_doc_ids[j]
                tmp_token_ids.append([tot_token_ids[pre_doc_id], tot_token_ids[cur_doc_id]])
                nsp_labels.append(j)

    token_ids = []
    mask_token_ids = []
    is_masked = []
    vocab_size = len(tokenizer.word_index) + 4
    for p in tmp_token_ids:
        first_cur_mask_token_ids, first_cur_is_masked = get_mask_token_ids(p[0], mask_token_id,
                                                                           mask_rate, vocab_size)
        second_cur_mask_token_ids, second_cur_is_masked = get_mask_token_ids(p[1], mask_token_id,
                                                                             mask_rate, vocab_size)
        token_ids.append([beg_token_id] + p[0] + [sep_token_id] + p[1] + [sep_token_id])
        mask_token_ids.append(
            [beg_token_id] + first_cur_mask_token_ids + [sep_token_id] + second_cur_mask_token_ids + [sep_token_id]
        )
        is_masked.append([0] + first_cur_is_masked + [0] + second_cur_is_masked + [0])

    return token_ids, mask_token_ids, is_masked, nsp_labels


"""**Running it**"""

# Sample data
data = [
    [
        '“时逢三五便团圆,满把晴光护玉栏。”',
        '10日是一年一度的中秋节。',
        '据天文专家介绍,今年中秋节是十五的月亮十五圆,预计最圆时刻将出现在17时59分左右。',
        '那么佳节赏月天气如何呢',
        '能看到的是皓月当空还是彩云伴月?',
        '是适合登高望月、还是泛舟赏月?',
    ],
    [
        '中新网9月10日电 综合英媒报道,当地时间9月10日,英国国王查尔斯三世已经批准,英国女王伊丽莎白二世葬礼当天将是英国的公共假日。',
        '英女王的葬礼日期目前暂未确定,但《卫报》称,葬礼预计为9月19日。',
        '9月8日,英国女王伊丽莎白二世去世,终年96岁。',
        '9月10日,查尔斯三世被英国王位授权理事会正式授权成为英国新君主。',
    ],
]

# Tokenize
raw_texts = [text for doc in data for text in doc]
texts = segment(raw_texts)
tokenizer = build_tokenizer(texts)
vocab_size = len(tokenizer.word_index) + 4

# Build the model

# Random seed
SEED = 233

# Define the model
model = build_bert(
    segment_max_len=64,
    input_vocab_size=vocab_size,
    num_layers=4,
    d_model=64,
    num_heads=2,
    d_ff=256,
    rate=0.1,
    seed=SEED
)

# Compile the model
model = compile_bert(model, 0.00176)
model.summary()

# Build the data
# Call make_feature to turn the corpus into BERT's input tensors, then pack them into a dataset.
token_ids, mask_token_ids, is_masked, nsp_labels = make_feature(
    tokenizer, data, 64, vocab_size - 3, vocab_size - 2, vocab_size - 1, 0.15, SEED
)
fake_labels = [0 for _ in range(len(nsp_labels))]
ds = tf.data.Dataset.from_tensor_slices((
    {'token_ids': token_ids, 'target_token_ids': mask_token_ids, 'is_masked': is_masked},
    {'nsp_prob': nsp_labels, 'mlm_loss': fake_labels, 'mlm_acc': fake_labels},
)).batch(4)


# Print the intermediate results, i.e. the output of tokenize
def decode(tokenizer_: tf.keras.preprocessing.text.Tokenizer, token_ids_: List[int], vocab_size_: int):
    res = []
    for token_id in token_ids_:
        if token_id == 0:
            res.append('[pad]')
        elif token_id == vocab_size_ - 3:
            res.append('[beg]')
        elif token_id == vocab_size_ - 2:
            res.append('[sep]')
        elif token_id == vocab_size_ - 1:
            res.append('[mask]')
        elif token_id in tokenizer_.index_word:
            res.append(tokenizer_.index_word[token_id])
        else:
            res.append('[unk]')
    return ''.join(res)


for i in range(len(token_ids)):
    print(token_ids[i])
    print(decode(tokenizer, token_ids[i], vocab_size))
    print(mask_token_ids[i])
    print(decode(tokenizer, mask_token_ids[i], vocab_size))
    print(is_masked[i])
    print(nsp_labels[i])
    print('')

# Training
model.fit(ds, shuffle=False, epochs=1)
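'''
After fitting, the model can also be queried directly. The lines below are an
illustrative addition (not part of the original notebook): they run one prediction pass
over the same dataset and print the predicted NSP probability next to each true label.
'''
nsp_pred, _, _ = model.predict(ds)
for prob, label in zip(nsp_pred.reshape(-1), nsp_labels):
    print('nsp_prob={:.3f}  nsp_label={}'.format(prob, label))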