直接上干货
1.0初版代码
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# 定义Transformer模型
def transformer_model(input_shape, num_layers, d_model, num_heads, dff, dropout_rate):
    """Build a Transformer-encoder regression model.

    Args:
        input_shape: shape of one sample, e.g. (seq_len, n_features).
        num_layers: number of encoder blocks.
        d_model: model (projection) dimension.
        num_heads: number of attention heads.
        dff: hidden width of the position-wise feed-forward network.
        dropout_rate: dropout probability after attention and FFN sublayers.

    Returns:
        A `keras.Model` mapping (batch, seq_len, n_features) to a
        per-timestep scalar prediction (batch, seq_len, 1).
    """
    inputs = layers.Input(shape=input_shape)

    # Attention mask with Keras semantics: 1 = attend, 0 = ignore.
    # A timestep is treated as padding when its first feature equals 0.
    # Shape (batch, 1, seq_len) broadcasts over queries and heads.
    # (The original built the inverted mask and passed it as a list,
    # which `MultiHeadAttention` does not accept.)
    padding_mask = layers.Lambda(
        lambda t: tf.cast(tf.math.not_equal(t[:, :, 0], 0), tf.float32)[:, tf.newaxis, :]
    )(inputs)

    # Project raw features into the model dimension.
    x = layers.Dense(d_model, activation="relu")(inputs)

    for _ in range(num_layers):
        # Self-attention: query/value/key are all x. Keras'
        # MultiHeadAttention takes them as separate arguments, not a list.
        attn_out = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(
            query=x, value=x, attention_mask=padding_mask
        )
        attn_out = layers.Dropout(dropout_rate)(attn_out)
        # Residual connection + layer norm (post-norm Transformer block);
        # the original normalized the sublayer output alone.
        x = layers.LayerNormalization(epsilon=1e-6)(x + attn_out)

        # Position-wise feed-forward network.
        ffn = keras.Sequential(
            [layers.Dense(dff, activation="relu"), layers.Dense(d_model)]
        )
        ffn_out = layers.Dropout(dropout_rate)(ffn(x))
        x = layers.LayerNormalization(epsilon=1e-6)(x + ffn_out)

    # Per-timestep scalar regression head.
    outputs = layers.Dense(1)(x)
    return keras.Model(inputs=inputs, outputs=outputs)
# Toy training data: 200 sequences of length 10 with 1 feature per step.
# (The original passed symbolic `keras.Input` tensors to `fit()`, which
# cannot train — `fit()` needs concrete data.)
input_shape = (10, 1)  # sequence length 10, one feature per timestep
x = tf.random.uniform((200,) + input_shape, minval=0.1, maxval=1.0)
# Target: the last timestep of each sequence, shaped (batch, 1, 1) so it
# broadcasts against the model's per-timestep (batch, 10, 1) output in MSE.
y = tf.reshape(x[:, -1, :], (-1, 1, 1))
# Build the model
model = transformer_model(
    input_shape=input_shape,
    num_layers=2,
    d_model=32,
    num_heads=4,
    dff=64,
    dropout_rate=0.2,
)
# Compile the model
model.compile(loss="mse", optimizer="adam")
# Train the model
history = model.fit(x, y, epochs=100, batch_size=32)
我们首先定义了一个padding_mask
张量,它是一个与输入张量形状相同的张量,其中每个元素的值为0或1,表示该位置是否是填充位置(如果是填充位置,则对应的值为1)。然后,我们使用Lambda
层将输入张量转换为一个掩码张量,其中填充位置的值为1,非填充位置的值为0。接下来,我们使用Masking
层将掩码张量应用于输入张量,从而对填充位置进行掩码。最后,我们在第一个Transformer层中传递掩码张量,以便模型能够在训练和推理过程中正确地使用掩码。
在Transformer模型中,Multi-Head Attention是一个重要的组件,它允许模型同时关注输入序列的不同位置,并且可以学习输入序列中不同位置之间的关系。在Multi-Head Attention中,我们需要传递两个参数:一个查询序列和一个键值对序列。在实际实现中,我们通常使用同一个输入序列来构建这两个序列,因此在Keras中实现时,传递的参数为[x,x],其中x是输入序列。
具体来说,Multi-Head Attention包括三个线性变换,分别是查询、键和值的线性变换。在Keras中,我们可以使用一个全连接层(Dense
层)来实现这个线性变换:
query = layers.Dense(d_model)(x)
key = layers.Dense(d_model)(x)
value = layers.Dense(d_model)(x)
在实现这个线性变换后,我们将query、key和value分别传递给Multi-Head Attention层。在Keras中,我们使用MultiHeadAttention
层来实现Multi-Head Attention。这个层接受两个输入:一个查询序列和一个键值对序列。在实际应用中,我们通常使用同一个输入序列来构建这两个序列,因此在Keras中,我们需要将x复制一份,作为查询序列和键值对序列的输入:
attention_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(query, value, key)
其中,num_heads
表示头的数量,key_dim
表示键和值的维度。在这个层中,我们将查询、键和值分别按头的数量进行划分,并对每个头进行独立的注意力计算。最终,我们将每个头的输出连接起来,形成最终的输出。注意,这个输出的形状与输入序列的形状相同。
因此,在Keras中实现Multi-Head Attention时,我们需要将同一个输入序列分别作为查询、值(和键)参数传入MultiHeadAttention
层。也就是说,自注意力的调用形式是 layer(x, x),其中x是输入序列。
2.0更新
2023.10.18update,tf 版本可降为 2.0,给出大家自己手写的 attention
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.layers import Layer, Dense, LSTM, concatenate, Reshape, Flatten, Embedding
class Embedding(Layer):
    """Trainable token-embedding lookup scaled by sqrt(model_dim).

    Shadows the Keras layer of the same name on purpose: this variant
    applies the Transformer-style sqrt(d_model) scaling after lookup.
    """

    def __init__(self, vocab_size, model_dim, **kwargs):
        self._vocab_size = vocab_size
        self._model_dim = model_dim
        super(Embedding, self).__init__(**kwargs)

    def build(self, input_shape):
        # One trainable row of width model_dim per vocabulary entry.
        self.embeddings = self.add_weight(
            shape=(self._vocab_size, self._model_dim),
            initializer='glorot_uniform',
            name="embeddings")
        super(Embedding, self).build(input_shape)

    def call(self, inputs):
        # Lookup requires integer ids; cast anything else to int32.
        ids = inputs if K.dtype(inputs) == 'int32' else K.cast(inputs, 'int32')
        looked_up = K.gather(self.embeddings, ids)
        # Scale by sqrt(d_model) as in "Attention Is All You Need".
        return looked_up * self._model_dim ** 0.5

    def compute_output_shape(self, input_shape):
        # Lookup appends the embedding dimension to the input shape.
        return input_shape + (self._model_dim,)
class PositionEncoding(Layer):
    """Sinusoidal positional encodings from "Attention Is All You Need".

    Produces a (seq_len, model_dim) float32 tensor; it is added to the
    (batch, seq_len, model_dim) embeddings via broadcasting.

    Requires ``import numpy as np`` at module level (the original relied
    on numpy being imported inside the ``__main__`` block).
    """

    def __init__(self, model_dim, **kwargs):
        self._model_dim = model_dim
        super(PositionEncoding, self).__init__(**kwargs)

    def call(self, inputs):
        # Needs a static sequence length; inputs: (batch, seq_len, model_dim).
        seq_length = inputs.shape[1]
        # angle[pos, i] = pos / 10000^((i - i%2) / model_dim), i.e. the
        # 2i/d_model exponent shared by each sin/cos pair — computed
        # vectorized instead of the original O(seq_len * model_dim)
        # Python double loop executed on every forward pass.
        positions = np.arange(seq_length, dtype=np.float64)[:, np.newaxis]
        dims = np.arange(self._model_dim, dtype=np.float64)[np.newaxis, :]
        angles = positions / np.power(10000, (dims - dims % 2) / self._model_dim)
        angles[:, 0::2] = np.sin(angles[:, 0::2])  # even feature indices: sin
        angles[:, 1::2] = np.cos(angles[:, 1::2])  # odd feature indices: cos
        return K.cast(angles, 'float32')

    def compute_output_shape(self, input_shape):
        return input_shape
class Add(Layer):
    """Element-wise sum of a pair of tensors (supports broadcasting)."""

    def __init__(self, **kwargs):
        super(Add, self).__init__(**kwargs)

    def call(self, inputs):
        # Expects exactly two tensors.
        first, second = inputs
        return first + second

    def compute_output_shape(self, input_shape):
        # Output matches the first operand's shape.
        return input_shape[0]
class ScaledDotProductAttention(Layer):
    """Scaled dot-product attention: softmax(Q·Kᵀ / sqrt(d_k)) · V.

    Supports an optional key-padding mask and an optional causal
    ("future") mask. Q/K/V arrive with heads already folded into the
    batch axis by the caller (see MultiHeadAttention).

    NOTE(review): `K.dropout` is applied unconditionally in `call`, so
    attention dropout is also active at inference time — confirm this
    is intended.
    """

    def __init__(self, masking=True, future=False, dropout_rate=0., **kwargs):
        # masking: apply a key-padding mask (inputs must then include masks).
        # future: apply a causal mask so position i cannot attend to j > i.
        self._masking = masking
        self._future = future
        self._dropout_rate = dropout_rate
        # Large negative value added to masked logits so softmax ≈ 0 there.
        self._masking_num = -2**32+1
        super(ScaledDotProductAttention, self).__init__(**kwargs)

    def mask(self, inputs, masks):
        # inputs: attention logits, presumably (batch*heads, Tq, Tk);
        # masks: per-sample key mask, presumably (batch, Tk) with 1 at
        # padded positions — TODO confirm against the caller.
        masks = K.cast(masks, 'float32')
        # Repeat the per-sample mask once per head (heads were folded into
        # the batch axis, so batch*heads is a multiple of batch).
        masks = K.tile(masks, [K.shape(inputs)[0] // K.shape(masks)[0], 1])
        # Add a query axis so the mask broadcasts over all query positions.
        masks = K.expand_dims(masks, 1)
        # Push masked logits toward -inf so softmax assigns them ~0 weight.
        outputs = inputs + masks * self._masking_num
        return outputs

    def future_mask(self, inputs):
        # Causal mask: block attention to positions after the query index.
        # Lower-triangular (Tq, Tk) matrix of ones.
        diag_vals = tf.ones_like(inputs[0, :, :])
        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
        # Replicate across the (batch*heads) axis.
        future_masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])
        paddings = tf.ones_like(future_masks) * self._masking_num
        # Keep logits on/below the diagonal; replace the rest with -inf-like values.
        outputs = tf.where(tf.equal(future_masks, 0), paddings, inputs)
        return outputs

    def call(self, inputs):
        # Unpack inputs; the expected list layout depends on self._masking.
        if self._masking:
            assert len(inputs) == 4, "inputs should be set [queries, keys, values, masks]."
            queries, keys, values, masks = inputs
        else:
            assert len(inputs) == 3, "inputs should be set [queries, keys, values]."
            queries, keys, values = inputs
        # Normalize dtypes so batch_dot/softmax operate on float32.
        if K.dtype(queries) != 'float32': queries = K.cast(queries, 'float32')
        if K.dtype(keys) != 'float32': keys = K.cast(keys, 'float32')
        if K.dtype(values) != 'float32': values = K.cast(values, 'float32')
        # Q·Kᵀ attention logits.
        matmul = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1]))  # MatMul
        # Scale by sqrt(d_k) to keep logits in a softmax-friendly range.
        scaled_matmul = matmul / int(queries.shape[-1]) ** 0.5  # Scale
        if self._masking:
            scaled_matmul = self.mask(scaled_matmul, masks)  # Mask(opt.)
        if self._future:
            scaled_matmul = self.future_mask(scaled_matmul)
        # Attention weights.
        softmax_out = K.softmax(scaled_matmul)  # SoftMax
        # Dropout on the attention weights (see class-level review note).
        out = K.dropout(softmax_out, self._dropout_rate)
        # Weighted sum of values.
        outputs = K.batch_dot(out, values)
        return outputs

    def compute_output_shape(self, input_shape):
        return input_shape
'''
https://zhuanlan.zhihu.com/p/116091338
'''
class MultiHeadAttention(Layer):
    """Multi-head attention built on ScaledDotProductAttention.

    Heads are formed by splitting the projected Q/K/V along the feature
    axis and folding them into the batch axis, then undone after the
    attention step. A final Dense maps back to the queries' feature size.
    """

    def __init__(self, n_heads, head_dim, dropout_rate=0.1, masking=False, future=False, trainable=True, **kwargs):
        self._n_heads = n_heads
        self._head_dim = head_dim
        self._dropout_rate = dropout_rate
        self._masking = masking
        self._future = future
        self._trainable = trainable
        # Fix: build the (stateless) attention sub-layer once here instead
        # of instantiating a new ScaledDotProductAttention on every call.
        self._attention = ScaledDotProductAttention(
            masking=masking, future=future, dropout_rate=dropout_rate)
        super(MultiHeadAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        # Projection matrices for queries/keys/values; input_shape is the
        # list of shapes for [queries, keys, values(, masks)].
        self._weights_queries = self.add_weight(
            shape=(input_shape[0][-1], self._n_heads * self._head_dim),
            initializer='glorot_uniform',
            trainable=self._trainable,
            name='weights_queries')
        self._weights_keys = self.add_weight(
            shape=(input_shape[1][-1], self._n_heads * self._head_dim),
            initializer='glorot_uniform',
            trainable=self._trainable,
            name='weights_keys')
        self._weights_values = self.add_weight(
            shape=(input_shape[2][-1], self._n_heads * self._head_dim),
            initializer='glorot_uniform',
            trainable=self._trainable,
            name='weights_values')
        # Output projection back to the queries' feature size.
        self.out = Dense(input_shape[0][-1])
        super(MultiHeadAttention, self).build(input_shape)

    def call(self, inputs):
        if self._masking:
            assert len(inputs) == 4, "inputs should be set [queries, keys, values, masks]."
            queries, keys, values, masks = inputs
        else:
            assert len(inputs) == 3, "inputs should be set [queries, keys, values]."
            queries, keys, values = inputs
        # Linear projections to n_heads * head_dim.
        queries_linear = K.dot(queries, self._weights_queries)
        keys_linear = K.dot(keys, self._weights_keys)
        values_linear = K.dot(values, self._weights_values)
        # Split heads along the feature axis and fold them into the batch
        # axis: (B, T, H*D) -> (B*H, T, D).
        queries_multi_heads = tf.concat(tf.split(queries_linear, self._n_heads, axis=2), axis=0)
        keys_multi_heads = tf.concat(tf.split(keys_linear, self._n_heads, axis=2), axis=0)
        values_multi_heads = tf.concat(tf.split(values_linear, self._n_heads, axis=2), axis=0)
        if self._masking:
            att_inputs = [queries_multi_heads, keys_multi_heads, values_multi_heads, masks]
        else:
            att_inputs = [queries_multi_heads, keys_multi_heads, values_multi_heads]
        att_out = self._attention(att_inputs)
        # Undo the head folding: (B*H, T, D) -> (B, T, H*D).
        outputs = tf.concat(tf.split(att_out, self._n_heads, axis=0), axis=2)
        return self.out(outputs)

    def compute_output_shape(self, input_shape):
        # Fix: the output has the queries' shape (the final Dense projects
        # back to input_shape[0][-1]); the original returned the whole
        # list of input shapes.
        return input_shape[0]
if __name__ == '__main__':
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.utils import to_categorical
    import tensorflow as tf
    import numpy as np
    import os
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    vocab_size = 5000
    max_len = 180
    model_dim = 512
    batch_size = 128
    epochs = 10

    print("Data downloading and pre-processing ... ")
    # Synthetic integer token ids (0 would mean padding). The original fed
    # continuous (100, 180, 4) floats into the integer embedding lookup,
    # producing a 4-D (batch, 180, 4, 512) tensor that cannot be
    # broadcast-added to the 2-D (180, 512) positional encodings.
    # For real data, use imdb.load_data + sequence.pad_sequences instead.
    x_train = np.random.randint(1, vocab_size, size=(100, max_len))
    # Two-class one-hot targets, matching Dense(2, softmax) +
    # categorical_crossentropy (the original y_train had shape (100, 1),
    # which mismatches a 2-unit softmax output).
    y_train = to_categorical(np.random.randint(0, 2, size=(100,)), 2)
    x_train_masks = tf.equal(x_train, 0)

    print('Model building ... ')
    inputs = Input(shape=(max_len,), name="inputs")
    # NOTE: `masks` is fed to the model but not consumed by any layer; it
    # is kept only to preserve the original two-input signature.
    masks = Input(shape=(max_len,), name='masks')
    embeddings = Embedding(vocab_size, model_dim)(inputs)
    encodings = PositionEncoding(model_dim)(embeddings)
    encodings = Add()([embeddings, encodings])
    # Self-attention with 8 heads of width 64.
    x = MultiHeadAttention(8, 64)([encodings, encodings, encodings])
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.2)(x)
    x = Dense(10, activation='relu')(x)
    outputs = Dense(2, activation='softmax')(x)
    model = Model(inputs=[inputs, masks], outputs=outputs)
    model.compile(optimizer=Adam(beta_1=0.9, beta_2=0.98, epsilon=1e-9),
                  loss='categorical_crossentropy', metrics=['accuracy'])

    print("Model Training ... ")
    es = EarlyStopping(patience=5)
    model.fit([x_train, x_train_masks], y_train,
              batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[es])