之前分别用numpy实现了mlp,cnn,lstm和bert模型,这周顺带搞一下GPT-2,纯numpy实现,最重要的是可在树莓派上或其他不能安装pytorch的板子上运行,生成数据
gpt-2的mask-multi-headed-self-attention我现在才彻底的明白它是真的牛逼,比bert的multi-headed-self-attention牛的不是一点半点,提出mask的人智商也是相当高了
这次模型依然是从Hugging Face上找的一个,gpt-2 small版本,参数比bert还小,主要是gpt-2没有token-type那个2*768的矩阵和一个pooler矩阵,别的都有
gpt-2的模型结构和bert类似,只不过multi-headed-self-attention换成了mask-multi-headed-self-attention,另外
layer_normalization层放到了attention层之前和feedforward层之前
最重点的是,我的代码实现了gpt-2推理加速,以前的gpt-2的工程里经常是生成一个新的token后和原来的token序列拼接起来,再给模型输入,这会使模型大大增加计算量,同时mask失去了它的意义,每次都会把之前的token重新推理一遍,浪费时间,我的代码中已经实现使用生成的token作为模型输入去推理后面的结果,计算量大大减少,速度大大提升,试想一下向量做矩阵乘法和矩阵做矩阵乘法的差异,
同样生成100个token,在cpu上的加速效果快了一倍,如果token长度更长,则加速效果会更明显
上numpy代码
import numpy as np
import time
def top_k_sampling(probs, k):
    """Return the indices of the ``k`` largest entries of ``probs``.

    Despite the name, no sampling happens here: the function only selects the
    top-k candidate indices, ordered from highest to lowest score. Any
    renormalisation or random draw is left to the caller.
    """
    # argsort is ascending, so reverse it to get a highest-first ordering.
    ascending = np.argsort(probs)
    descending = ascending[::-1]
    return descending[:k]
def random_sampling(array, k):
    """Draw ``k`` distinct elements from ``array`` uniformly at random.

    Uses NumPy's global random state and samples without replacement.
    """
    return np.random.choice(array, size=k, replace=False)
def word_embedding(input_ids, word_embeddings):
    """Look up the rows of ``word_embeddings`` selected by ``input_ids``."""
    embedded = word_embeddings[input_ids]
    return embedded
def position_embedding(position_ids, position_embeddings):
    """Look up the rows of ``position_embeddings`` selected by ``position_ids``."""
    embedded = position_embeddings[position_ids]
    return embedded
def token_type_embedding(token_type_ids, token_type_embeddings):
    """Look up the rows of ``token_type_embeddings`` selected by ``token_type_ids``."""
    embedded = token_type_embeddings[token_type_ids]
    return embedded
def softmax(x, axis=None):
    """Numerically stable softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so large
    inputs do not overflow. With ``axis=None`` the normalisation runs over
    the whole array.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exponentials = np.exp(shifted)
    # The normaliser is cast to float32, as in the original implementation.
    normaliser = np.sum(exponentials, axis=axis, keepdims=True).astype(np.float32)
    return exponentials / normaliser
def scaled_dot_product_attention(Q, K, V, mask=None):
    """Compute scaled dot-product attention, optionally with a mask.

    Args:
        Q: Query array; the last axis is the per-head feature size.
        K: Key array with the same layout as ``Q``.
        V: Value array whose second-to-last axis matches that of ``K``.
        mask: Optional array broadcastable to the score matrix. Truthy
            entries are kept, falsy entries are suppressed. ``None`` (the
            default) applies no masking.

    Returns:
        A tuple ``(output, attention_weights)``.
    """
    d_k = Q.shape[-1]
    # Swap the last two axes of K so the product is (..., q_len, k_len).
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)
    if mask is not None:
        # Use the dtype's most negative finite value instead of -inf so that
        # masked positions get ~0 weight after softmax without risking NaNs.
        min_value = np.finfo(scores.dtype).min
        scores = np.where(mask, scores, min_value)
    # Previously `scores` was only bound inside the branch above, so calling
    # this function with the default mask=None raised NameError.
    attention_weights = softmax(scores, axis=-1)
    output = np.matmul(attention_weights, V)
    return output, attention_weights
def scaled_dot_product_attention2(Q, K, V):
    """Unmasked scaled dot-product attention, used for single-step decoding.

    No mask is applied: the caller passes the query of the newest token
    together with the keys/values it is allowed to attend to.

    Args:
        Q: Query array; the last axis is the per-head feature size.
        K: Key array, 3-D, transposed over its last two axes before the product.
        V: Value array whose second-to-last axis matches that of ``K``.

    Returns:
        A tuple ``(output, attention_weights)``.
    """
    # The stray `global att_scores` declaration was removed: that name is
    # never read or assigned anywhere in this function.
    d_k = Q.shape[-1]
    attention_scores = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(d_k)
    attention_weights = softmax(attention_scores, axis=-1)
    output = np.matmul(attention_weights, V)
    return output, attention_weights
# Per-layer caches keyed by layer index. global_k / global_v accumulate the
# keys and values of every token processed so far, which is what makes
# single-token incremental decoding possible. global_q only records the
# queries of the most recent non-incremental pass.
global_q = {}
global_k = {}
global_v = {}


def mask_multihead_attention(i, input, num_heads, W_Q, B_Q, W_K, B_K, W_V, B_V, W_O, B_O):
    """Causal multi-head self-attention for layer ``i`` with a key/value cache.

    When ``input`` holds a single position and layer ``i`` already has cached
    keys/values, the new key/value are appended to the cache and the single
    query attends to the whole cached sequence. Otherwise the caches for
    layer ``i`` are (re)initialised from ``input``; inputs longer than one
    position additionally get a lower-triangular (causal) mask.

    Args:
        i: Layer index, used as the cache key.
        input: Layer input; presumably (1, seq_len, hidden) - the 3-D layout
            is required by the attention helpers. The name shadows the
            builtin but is kept for interface compatibility.
        num_heads: Number of heads; must evenly divide the projected size.
        W_Q, B_Q, W_K, B_K, W_V, B_V: Query/key/value projection weights and biases.
        W_O, B_O: Output projection weight and bias.

    Returns:
        The attention output after the output projection.
    """
    q = np.matmul(input, W_Q) + B_Q
    k = np.matmul(input, W_K) + B_K
    v = np.matmul(input, W_V) + B_V

    single_step = q.shape[-2] == 1
    # Only extend the cache if one exists. A single-token prompt used to hit
    # `global_k[i]` before anything was cached and raised KeyError.
    if single_step and i in global_k:
        k = global_k[i] = np.concatenate([global_k[i], k], axis=-2)
        v = global_v[i] = np.concatenate([global_v[i], v], axis=-2)
    else:
        global_q[i] = q
        global_k[i] = k
        global_v[i] = v
    n = k.shape[-2]

    # Split the projections into one slice per head along the feature axis.
    q_heads = np.split(q, num_heads, axis=-1)
    k_heads = np.split(k, num_heads, axis=-1)
    v_heads = np.split(v, num_heads, axis=-1)

    if not single_step:
        # Lower-triangular mask: position t may only attend to positions <= t.
        mask = np.tril(np.ones((n, n)))
        head_outputs = [
            scaled_dot_product_attention(q_, k_, v_, mask)[0]
            for q_, k_, v_ in zip(q_heads, k_heads, v_heads)
        ]
    else:
        # A single query is the newest position, so every key is visible to
        # it and no mask is needed.
        head_outputs = [
            scaled_dot_product_attention2(q_, k_, v_)[0]
            for q_, k_, v_ in zip(q_heads, k_heads, v_heads)
        ]

    outputs = np.concatenate(head_outputs, axis=-1)
    outputs = np.matmul(outputs, W_O) + B_O
    return outputs
def layer_normalization(x, weight, bias, eps=1e-12):
    """Normalise ``x`` over its last axis, then apply a scale and a shift.

    Args:
        x: Input array.
        weight: Scale, broadcast against the normalised input.
        bias: Shift, broadcast against the normalised input.
        eps: Small constant added to the variance before the square root.

    Returns:
        ``weight * (x - mean) / sqrt(var + eps) + bias``.
    """
    centered = x - np.mean(x, axis=-1, keepdims=True)
    scale = np.sqrt(np.var(x, axis=-1, keepdims=True) + eps)
    return weight * (centered / scale) + bias
def feed_forward_layer(inputs, weight, bias=None, activation='relu'):
    """Apply a dense layer followed by an optional activation.

    Args:
        inputs: Input array, right-multiplied by ``weight``.
        weight: Weight matrix.
        bias: Optional bias added after the matrix product.
        activation: ``'relu'``, ``'gelu'`` (tanh approximation) or ``'tanh'``.
            Any other value leaves the linear output unchanged.

    Returns:
        The activated output.
    """
    linear = np.matmul(inputs, weight)
    if bias is not None:
        linear = linear + bias

    if activation == 'relu':
        return np.maximum(0, linear)
    if activation == 'gelu':
        # Tanh approximation of GELU.
        cubic_term = linear + 0.044715 * np.power(linear, 3)
        return 0.5 * linear * (1 + np.tanh(np.sqrt(2 / np.pi) * cubic_term))
    if activation == "tanh":
        return np.tanh(linear)
    # Unknown or empty activation name: identity.
    return linear
def residual_connection(inputs, residual):
    """Return the element-wise sum of ``inputs`` and ``residual`` (skip connection)."""
    return inputs + residual
# Load the vocabulary: one token per line; the line index is the token id.
with open('vocab.txt', 'r', encoding='utf-8') as f:
    vocab = [line.strip() for line in f]
# print(len(vocab))
def tokenize_sentence(sentence):
    """Convert ``sentence`` to a list of vocabulary ids, one per character.

    The sentence is split into individual characters; no special tokens
    (such as ``[CLS]``) are added. A character that is missing from ``vocab``
    is mapped to the ``[UNK]`` id when the vocabulary provides one.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list of integer token ids.

    Raises:
        ValueError: If a character is not in ``vocab`` and the vocabulary has
            no ``[UNK]`` entry to fall back on.
    """
    token_ids = []
    unk_id = None
    for token in sentence:
        try:
            token_ids.append(vocab.index(token))
        except ValueError:
            # Resolve the fallback id lazily, only once an unknown character
            # is actually encountered.
            if unk_id is None:
                if "[UNK]" not in vocab:
                    raise ValueError(
                        "character {!r} is not in the vocabulary".format(token)
                    ) from None
                unk_id = vocab.index("[UNK]")
            token_ids.append(unk_id)
    return token_ids
# Load the saved model parameters from the .npz archive; arrays are looked up
# by parameter name, e.g. model_data["transformer.wte.weight"].
model_data = np.load('gpt2_model_params.npz')
# for i in model_data:
# # print(i)
# print(i,model_data[i].shape)
def get_sentence_ids(sentence):
    """Tokenize ``sentence`` and return its vocabulary ids as a NumPy array."""
    return np.array(tokenize_sentence(sentence))
# Token embedding table; (21128, 768) according to the parameter listing.
word_embeddings = model_data["transformer.wte.weight"]
# Position embedding table; (1024, 768) according to the parameter listing,
# so position ids must stay below 1024.
position_embeddings = model_data["transformer.wpe.weight"]
def model_input(input_ids, position_ids):
    """Build the transformer input from token ids and position ids.

    Token and position embeddings are looked up in the module-level tables,
    summed, and given a leading axis of size 1.
    """
    token_vectors = word_embedding(input_ids, word_embeddings)
    # position_ids may be a range or a list; convert it before indexing.
    position_vectors = position_embedding(np.array(position_ids), position_embeddings)
    combined = token_vectors + position_vectors
    return np.expand_dims(combined, axis=0)
def gpt2(input, num_heads, num_layers=12, layer_norm_eps=1e-5):
    """Run the GPT-2 transformer stack on already-embedded input.

    Each block is pre-norm: layer norm -> causal multi-head attention ->
    residual add, then layer norm -> GELU MLP -> residual add. A final layer
    norm (``ln_f``) is applied after the last block. Weights are read from
    the module-level ``model_data`` archive by parameter name.

    Args:
        input: Embedded input as produced by ``model_input``. The name
            shadows the builtin but is kept for interface compatibility.
        num_heads: Number of attention heads.
        num_layers: Number of transformer blocks to run. Defaults to 12,
            matching the ``transformer.h.0`` .. ``transformer.h.11`` weights.
        layer_norm_eps: Epsilon used by every layer norm. Defaults to 1e-5,
            the GPT-2 ``layer_norm_epsilon``; the helper's own default of
            1e-12 is the BERT value and does not match the reference model.

    Returns:
        The final hidden states after ``ln_f``.
    """
    hidden = input
    for i in range(num_layers):
        prefix = 'transformer.h.{}.'.format(i)

        # --- attention sub-block ---
        normed = layer_normalization(
            hidden,
            model_data[prefix + 'ln_1.weight'],
            model_data[prefix + 'ln_1.bias'],
            eps=layer_norm_eps,
        )
        # c_attn stores the query, key and value projections side by side
        # along the last axis; split them into three equal parts.
        W_Q, W_K, W_V = np.split(model_data[prefix + 'attn.c_attn.weight'], 3, axis=-1)
        B_Q, B_K, B_V = np.split(model_data[prefix + 'attn.c_attn.bias'], 3, axis=-1)
        W_O = model_data[prefix + 'attn.c_proj.weight']
        B_O = model_data[prefix + 'attn.c_proj.bias']
        attn_out = mask_multihead_attention(
            i, normed, num_heads, W_Q, B_Q, W_K, B_K, W_V, B_V, W_O, B_O
        )
        after_attn = residual_connection(hidden, attn_out)

        # --- feed-forward sub-block ---
        normed = layer_normalization(
            after_attn,
            model_data[prefix + 'ln_2.weight'],
            model_data[prefix + 'ln_2.bias'],
            eps=layer_norm_eps,
        )
        mlp_hidden = feed_forward_layer(
            normed,
            model_data[prefix + 'mlp.c_fc.weight'],
            model_data[prefix + 'mlp.c_fc.bias'],
            activation='gelu',
        )
        mlp_out = feed_forward_layer(
            mlp_hidden,
            model_data[prefix + 'mlp.c_proj.weight'],
            model_data[prefix + 'mlp.c_proj.bias'],
            activation='',
        )
        hidden = residual_connection(after_attn, mlp_out)

    ln_f_weight = model_data['transformer.ln_f.weight']
    ln_f_bias = model_data['transformer.ln_f.bias']
    return layer_normalization(hidden, ln_f_weight, ln_f_bias, eps=layer_norm_eps)
# Language-model head weights, (21128, 768) according to the parameter
# listing; predict() multiplies by the transpose to obtain vocabulary logits.
classifier_weight = model_data['lm_head.weight']
def predict(sentence="今天是个好日子", accelerater=True, gen_len=100):
    """Greedily generate ``gen_len`` tokens that continue ``sentence``.

    Each generated token is printed as soon as it is produced, followed by
    the total elapsed time.

    Args:
        sentence: Prompt text; must be non-empty.
        accelerater: If True, feed only the newest token on every step after
            the first and rely on the cached keys/values (fast path). If
            False, re-run the model on the whole growing sequence each step.
        gen_len: Number of tokens to generate. Prompt length plus ``gen_len``
            presumably has to stay within the position table (1024 rows per
            the parameter listing) - not checked here.

    Returns:
        The generated continuation as a string (the prompt is not included).

    Raises:
        ValueError: If ``sentence`` is empty.
    """
    if not sentence:
        raise ValueError("sentence must contain at least one character")

    # The key/value caches are module-level state; clear them so a previous
    # generation cannot leak into this one.
    global_q.clear()
    global_k.clear()
    global_v.clear()

    start = time.time()
    sentence_ids = get_sentence_ids(sentence)
    position_ids = range(len(sentence_ids))
    print("prompt输入:", sentence)

    generated = []
    for _ in range(gen_len):
        embeddings = model_input(sentence_ids, position_ids)
        output = gpt2(embeddings, num_heads=12)
        # Project only the last position onto the vocabulary.
        logits = feed_forward_layer(output[:, -1], classifier_weight.T, activation='')
        # k=1 makes this greedy decoding: the draw below has one candidate.
        samples = top_k_sampling(logits[0], k=1)
        label_id = random_sampling(samples, k=1)
        token = vocab[label_id[0]]
        generated.append(token)
        print(token, end="")
        if accelerater:
            # Fast path: only the new token is fed next; earlier tokens are
            # represented by the cached keys and values.
            sentence_ids = label_id
            position_ids = [position_ids[-1] + 1]
        else:
            # Slow path: recompute everything from the start each step.
            sentence_ids = np.concatenate([sentence_ids, label_id], axis=-1)
            position_ids = range(len(sentence_ids))

    end = time.time()
    print("\nspend time:", end - start)
    return "".join(generated)
if __name__ == "__main__":
    # Demo run. Set accelerater to True to decode one token at a time using
    # the key/value cache instead of re-running the full sequence.
    sentence = "今天是个好日子"
    accelerater = False
    predict(sentence=sentence, accelerater=accelerater)
# embeddings = model_input(sentence_ids)
# output = gpt2(embeddings,num_heads=12)
# # print(output)
# output = feed_forward_layer(output[:,:], classifier_weight.T, activation='')
# samples = np.argmax(output,axis=-1)
# for i in samples[0]:
# print(vocab[i],end="")
结果:同样的结果,序列长度越长,时间上的差异越明显,这才生成100tokens,就已经明显看出速度的差异了
使用加速:
prompt输入: 今天是个好日子
,我们要做的就是把握住这个机会,把握住这个机会,把握住了,我相信我们的投资将会创造奇迹,而且我相信我们的投资团队一定能够创造奇迹,我相信我们的投资团队一定能够创造奇迹,我相信我们的投资团队一定能够创造
spend time: 22.19951105117798
不使用加速:
prompt输入: 今天是个好日子
,我们要做的就是把握住这个机会,把握住这个机会,把握住了,我相信我们的投资将会创造奇迹,而且我相信我们的投资团队一定能够创造奇迹,我相信我们的投资团队一定能够创造奇迹,我相信我们的投资团队一定能够创造
spend time: 42.4955039024353
模型参数:
transformer.wte.weight (21128, 768)
transformer.wpe.weight (1024, 768)
transformer.h.0.ln_1.weight (768,)
transformer.h.0.ln_1.bias (768,)
transformer.h.0.attn.c_attn.weight (768, 2304)
transformer.h.0.attn.c_attn.bias (2304,)
transformer.h.0.attn.c_proj.weight (768, 768)
transformer.h.0.attn.c_proj.bias (768,)
transformer.h.0.ln_2.weight (768,)
transformer.h.0.ln_2.bias (768,)
transformer.h.0.mlp.c_fc.weight (768, 3072)
transformer.h.0.mlp.c_fc.bias (3072,)
transformer.h.0.mlp.c_proj.weight (3072, 768)
transformer.h.0.mlp.c_proj.bias (768,)
transformer.h.1.ln_1.weight (768,)
transformer.h.1.ln_1.bias (768,)
transformer.h.1.attn.c_attn.weight (768, 2304)
transformer.h.1.attn.c_attn.bias (2304,)
transformer.h.1.attn.c_proj.weight (768, 768)
transformer.h.1.attn.c_proj.bias (768,)
transformer.h.1.ln_2.weight (768,)
transformer.h.1.ln_2.bias (768,)
transformer.h.1.mlp.c_fc.weight (768, 3072)
transformer.h.1.mlp.c_fc.bias (3072,)
transformer.h.1.mlp.c_proj.weight (3072, 768)
transformer.h.1.mlp.c_proj.bias (768,)
transformer.h.2.ln_1.weight (768,)
transformer.h.2.ln_1.bias (768,)
transformer.h.2.attn.c_attn.weight (768, 2304)
transformer.h.2.attn.c_attn.bias (2304,)
transformer.h.2.attn.c_proj.weight (768, 768)
transformer.h.2.attn.c_proj.bias (768,)
transformer.h.2.ln_2.weight (768,)
transformer.h.2.ln_2.bias (768,)
transformer.h.2.mlp.c_fc.weight (768, 3072)
transformer.h.2.mlp.c_fc.bias (3072,)
transformer.h.2.mlp.c_proj.weight (3072, 768)
transformer.h.2.mlp.c_proj.bias (768,)
transformer.h.3.ln_1.weight (768,)
transformer.h.3.ln_1.bias (768,)
transformer.h.3.attn.c_attn.weight (768, 2304)
transformer.h.3.attn.c_attn.bias (2304,)
transformer.h.3.attn.c_proj.weight (768, 768)
transformer.h.3.attn.c_proj.bias (768,)
transformer.h.3.ln_2.weight (768,)
transformer.h.3.ln_2.bias (768,)
transformer.h.3.mlp.c_fc.weight (768, 3072)
transformer.h.3.mlp.c_fc.bias (3072,)
transformer.h.3.mlp.c_proj.weight (3072, 768)
transformer.h.3.mlp.c_proj.bias (768,)
transformer.h.4.ln_1.weight (768,)
transformer.h.4.ln_1.bias (768,)
transformer.h.4.attn.c_attn.weight (768, 2304)
transformer.h.4.attn.c_attn.bias (2304,)
transformer.h.4.attn.c_proj.weight (768, 768)
transformer.h.4.attn.c_proj.bias (768,)
transformer.h.4.ln_2.weight (768,)
transformer.h.4.ln_2.bias (768,)
transformer.h.4.mlp.c_fc.weight (768, 3072)
transformer.h.4.mlp.c_fc.bias (3072,)
transformer.h.4.mlp.c_proj.weight (3072, 768)
transformer.h.4.mlp.c_proj.bias (768,)
transformer.h.5.ln_1.weight (768,)
transformer.h.5.ln_1.bias (768,)
transformer.h.5.attn.c_attn.weight (768, 2304)
transformer.h.5.attn.c_attn.bias (2304,)
transformer.h.5.attn.c_proj.weight (768, 768)
transformer.h.5.attn.c_proj.bias (768,)
transformer.h.5.ln_2.weight (768,)
transformer.h.5.ln_2.bias (768,)
transformer.h.5.mlp.c_fc.weight (768, 3072)
transformer.h.5.mlp.c_fc.bias (3072,)
transformer.h.5.mlp.c_proj.weight (3072, 768)
transformer.h.5.mlp.c_proj.bias (768,)
transformer.h.6.ln_1.weight (768,)
transformer.h.6.ln_1.bias (768,)
transformer.h.6.attn.c_attn.weight (768, 2304)
transformer.h.6.attn.c_attn.bias (2304,)
transformer.h.6.attn.c_proj.weight (768, 768)
transformer.h.6.attn.c_proj.bias (768,)
transformer.h.6.ln_2.weight (768,)
transformer.h.6.ln_2.bias (768,)
transformer.h.6.mlp.c_fc.weight (768, 3072)
transformer.h.6.mlp.c_fc.bias (3072,)
transformer.h.6.mlp.c_proj.weight (3072, 768)
transformer.h.6.mlp.c_proj.bias (768,)
transformer.h.7.ln_1.weight (768,)
transformer.h.7.ln_1.bias (768,)
transformer.h.7.attn.c_attn.weight (768, 2304)
transformer.h.7.attn.c_attn.bias (2304,)
transformer.h.7.attn.c_proj.weight (768, 768)
transformer.h.7.attn.c_proj.bias (768,)
transformer.h.7.ln_2.weight (768,)
transformer.h.7.ln_2.bias (768,)
transformer.h.7.mlp.c_fc.weight (768, 3072)
transformer.h.7.mlp.c_fc.bias (3072,)
transformer.h.7.mlp.c_proj.weight (3072, 768)
transformer.h.7.mlp.c_proj.bias (768,)
transformer.h.8.ln_1.weight (768,)
transformer.h.8.ln_1.bias (768,)
transformer.h.8.attn.c_attn.weight (768, 2304)
transformer.h.8.attn.c_attn.bias (2304,)
transformer.h.8.attn.c_proj.weight (768, 768)
transformer.h.8.attn.c_proj.bias (768,)
transformer.h.8.ln_2.weight (768,)
transformer.h.8.ln_2.bias (768,)
transformer.h.8.mlp.c_fc.weight (768, 3072)
transformer.h.8.mlp.c_fc.bias (3072,)
transformer.h.8.mlp.c_proj.weight (3072, 768)
transformer.h.8.mlp.c_proj.bias (768,)
transformer.h.9.ln_1.weight (768,)
transformer.h.9.ln_1.bias (768,)
transformer.h.9.attn.c_attn.weight (768, 2304)
transformer.h.9.attn.c_attn.bias (2304,)
transformer.h.9.attn.c_proj.weight (768, 768)
transformer.h.9.attn.c_proj.bias (768,)
transformer.h.9.ln_2.weight (768,)
transformer.h.9.ln_2.bias (768,)
transformer.h.9.mlp.c_fc.weight (768, 3072)
transformer.h.9.mlp.c_fc.bias (3072,)
transformer.h.9.mlp.c_proj.weight (3072, 768)
transformer.h.9.mlp.c_proj.bias (768,)
transformer.h.10.ln_1.weight (768,)
transformer.h.10.ln_1.bias (768,)
transformer.h.10.attn.c_attn.weight (768, 2304)
transformer.h.10.attn.c_attn.bias (2304,)
transformer.h.10.attn.c_proj.weight (768, 768)
transformer.h.10.attn.c_proj.bias (768,)
transformer.h.10.ln_2.weight (768,)
transformer.h.10.ln_2.bias (768,)
transformer.h.10.mlp.c_fc.weight (768, 3072)
transformer.h.10.mlp.c_fc.bias (3072,)
transformer.h.10.mlp.c_proj.weight (3072, 768)
transformer.h.10.mlp.c_proj.bias (768,)
transformer.h.11.ln_1.weight (768,)
transformer.h.11.ln_1.bias (768,)
transformer.h.11.attn.c_attn.weight (768, 2304)
transformer.h.11.attn.c_attn.bias (2304,)
transformer.h.11.attn.c_proj.weight (768, 768)
transformer.h.11.attn.c_proj.bias (768,)
transformer.h.11.ln_2.weight (768,)
transformer.h.11.ln_2.bias (768,)
transformer.h.11.mlp.c_fc.weight (768, 3072)
transformer.h.11.mlp.c_fc.bias (3072,)
transformer.h.11.mlp.c_proj.weight (3072, 768)
transformer.h.11.mlp.c_proj.bias (768,)
transformer.ln_f.weight (768,)
transformer.ln_f.bias (768,)
lm_head.weight (21128, 768)
原始的Hugging Face模型,保存模型参数为numpy,然后上面的numpy版的gpt-2就可以加载了
import numpy as np
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline
# Download the pretrained Chinese GPT-2 checkpoint and its tokenizer.
MODEL_NAME = "uer/gpt2-chinese-cluecorpussmall"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# Sanity check: generate a short sample with the reference implementation.
text_generator = TextGenerationPipeline(model, tokenizer)
sample = text_generator("今天是个好日子", max_length=20, do_sample=True)
print(sample)
print(model)

# Optional debugging: print the shape of every model weight.
# for name, param in model.named_parameters():
#     print(name, param.data.shape)
# print(model.lm_head.weight)
# print(model.lm_head.bias)

# Convert every named parameter to a NumPy array.
model_params = {}
for name, param in model.named_parameters():
    model_params[name] = param.data.cpu().numpy()
# Store the LM head weights explicitly under their own key.
model_params["lm_head.weight"] = model.lm_head.weight.data.cpu().numpy()
np.savez('gpt2_model_params.npz', **model_params)
# model_params