# Import the models and layers used below
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import concatenate, Concatenate, Dense, Embedding
from time import time  # for measuring running time
from tensorflow.keras.layers import Conv1D, Dropout, GlobalMaxPool1D
from tensorflow.keras.layers import Dot, Lambda
from tensorflow.keras.layers import Flatten, MaxPool1D, MaxPooling1D
from tensorflow.keras.layers import Activation, Layer
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Softmax
import typing
def _make_multi_layer_perceptron_layer() -> keras.layers.Layer:
    # TODO: do not create new layers for a second call
    if not True:
        raise AttributeError('Parameter `with_multi_layer_perception` not set.')

    def _wrapper(x):
        activation = 'relu'
        # Three hidden Dense(128) layers followed by a Dense(64) output layer
        for _ in range(3):
            x = keras.layers.Dense(128, activation=activation)(x)
        return keras.layers.Dense(64, activation=activation)(x)

    return _wrapper
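# A minimal usage sketch (the feature size and variable names below are
# illustrative assumptions, not part of the original code): the factory
# returns a callable that stacks the MLP on top of any tensor.
mlp_in = Input(shape=(256,))
mlp_out = _make_multi_layer_perceptron_layer()(mlp_in)  # shape = [B, 64]
mlp_demo = Model(inputs=mlp_in, outputs=mlp_out)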
def _kernel_layer(mu: float, sigma: float) -> keras.layers.Layer:
    """
    Gaussian kernel layer in KNRM.

    :param mu: Float, mean of the kernel.
    :param sigma: Float, sigma of the kernel.
    :return: `keras.layers.Layer`.
    """
    def kernel(x):
        return tf.math.exp(-0.5 * (x - mu) * (x - mu) / sigma / sigma)

    return Activation(kernel)
def _hadamard_dot(x):
    """Element-wise (Hadamard) product of two tensors packed in a list."""
    x1 = x[0]
    x2 = x[1]
    out = x1 * x2
    return out
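# Usage sketch (shapes are illustrative assumptions): wrap the function in a
# Lambda layer so the element-wise interaction of two encodings stays a layer.
vec_a = Input(shape=(64,))
vec_b = Input(shape=(64,))
hadamard = Lambda(_hadamard_dot)([vec_a, vec_b])  # shape = [B, 64]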
def attention_layer(attention_input: typing.Any,
                    attention_mask: typing.Any = None) -> keras.layers.Layer:
    # shape = [B, L, 1]
    dense_input = Dense(1, use_bias=False)(attention_input)
    if attention_mask is not None:
        # attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, so this adds 0.0 to attended positions and
        # -10000.0 to masked positions.
        # Original inline version:
        # dense_input = keras.layers.Lambda(
        #     lambda x: x + (1.0 - attention_mask) * -10000.0,
        #     name="attention_mask")(dense_input)
        # The key change is to pull the expression out of the inline lambda,
        # write it as a named function, and use that function in its place.
        def reshape_(x):
            return x + (1.0 - attention_mask) * -10000.0

        # shape = [B, L, 1]
        dense_input = Lambda(
            reshape_,
            name="attention_mask")(dense_input)
    # shape = [B, L, 1]
    attention_probs = Lambda(
        lambda x: tf.nn.softmax(x, axis=1),
        output_shape=lambda s: (s[0], s[1], s[2]),
        name="attention_probs")(dense_input)
    return attention_probs
def _sentence_encoder(
        input_: typing.Any,
        lstm_num_units: int,
        drop_rate: float) -> typing.Any:
    """
    Stack three BiLSTM MaxPooling blocks as a hierarchical structure.

    Concatenate the output of the three blocks as the input sentence embedding.
    Each BiLSTM layer reads the input sentence as its input.
    Each BiLSTM layer except the first one is initialized (the initial
    hidden state and the cell state) with the final state of the previous
    layer.
    """
    emb1 = keras.layers.Bidirectional(
        keras.layers.LSTM(
            units=lstm_num_units,
            return_sequences=True,
            return_state=True,
            dropout=drop_rate,
            recurrent_dropout=drop_rate),
        merge_mode='concat')(input_)
    emb1_maxpooling = keras.layers.GlobalMaxPooling1D()(emb1[0])
    emb2 = keras.layers.Bidirectional(
        keras.layers.LSTM(
            units=lstm_num_units,
            return_sequences=True,
            return_state=True,
            dropout=drop_rate,
            recurrent_dropout=drop_rate),
        merge_mode='concat')(input_, initial_state=emb1[1:5])
    emb2_maxpooling = keras.layers.GlobalMaxPooling1D()(emb2[0])
    emb3 = keras.layers.Bidirectional(
        keras.layers.LSTM(
            units=lstm_num_units,
            return_sequences=True,
            return_state=True,
            dropout=drop_rate,
            recurrent_dropout=drop_rate),
        merge_mode='concat')(input_, initial_state=emb2[1:5])
    emb3_maxpooling = keras.layers.GlobalMaxPooling1D()(emb3[0])
    emb = keras.layers.Concatenate(axis=1)(
        [emb1_maxpooling, emb2_maxpooling, emb3_maxpooling])
    return emb
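# Usage sketch of the hierarchical BiLSTM encoder (vocabulary size, sequence
# length, embedding size and unit counts below are illustrative assumptions):
tokens = Input(shape=(30,), dtype='int32')
embedded = Embedding(input_dim=20000, output_dim=300)(tokens)
sent_vec = _sentence_encoder(embedded, lstm_num_units=100, drop_rate=0.2)
# sent_vec concatenates three max-pooled BiLSTM outputs: shape = [B, 3 * 2 * 100]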
def _expand_dim(inp: tf.Tensor, axis: int) -> keras.layers.Layer:
    """
    Wrap tf.expand_dims into a Lambda layer.

    :param inp: input tensor to expand the dimension
    :param axis: the axis of the new dimension
    """
    return keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=axis))(inp)
def _make_atten_mask_layer() -> keras.layers.Layer:
    """
    Make a mask layer for the attention weight matrix so that
    each word won't pay attention to <PAD> timesteps.
    """
    return keras.layers.Lambda(
        lambda weight_mask: weight_mask[0] + (1.0 - weight_mask[1]) * -1e7,
        name="atten_mask")
def _avg(texts: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
    """
    Compute the mean of each text according to its real length.

    :param texts: tensor with shape [B, T, H]
    :param mask: tensor with shape [B, T],
        where 1 means valid, 0 means pad
    """
    mask = _expand_dim(mask, axis=2)
    new_texts = keras.layers.Multiply()([texts, mask])
    # timestep-wise division, excluding the PAD positions when computing the average
    text_avg = keras.layers.Lambda(
        lambda text_mask:
            tf.reduce_sum(text_mask[0], axis=1) / tf.reduce_sum(text_mask[1], axis=1),
    )([new_texts, mask])
    return text_avg
def _max(texts: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
    """
    Compute the max of each text according to its real length.

    :param texts: tensor with shape [B, T, H]
    :param mask: tensor with shape [B, T],
        where 1 means valid, 0 means pad
    """
    mask = _expand_dim(mask, axis=2)
    new_texts = keras.layers.Multiply()([texts, mask])
    text_max = keras.layers.Lambda(lambda x: tf.reduce_max(x, axis=1))(new_texts)
    return text_max
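# Usage sketch for the length-aware pooling helpers (shapes are illustrative
# assumptions): combine mean and max pooling of an encoded sequence into one
# fixed-size representation.
enc = Input(shape=(25, 128))
valid = Input(shape=(25,))  # 1 means valid, 0 means pad
pooled = Concatenate(axis=-1)([_avg(enc, valid), _max(enc, valid)])  # shape = [B, 256]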