1 embedding
1.1 根据封装的特征创建embedding网络结构
1.1.1 用户及物品特征封装
# Feature wrapping: declare the user-side and item-side feature columns.
feature_max_idx = {'user_id': 4, 'movie_id': 208, 'gender': 3, 'age': 4, 'occupation': 4, 'zip': 4}
embedding_dim = 16

# User features: five single-valued sparse ids plus the watch-history
# sequence (which shares its embedding table with 'movie_id').
user_feature_columns = [
    SparseFeat(name, feature_max_idx[name], embedding_dim)
    for name in ('user_id', 'gender', 'age', 'occupation', 'zip')
]
user_feature_columns.append(
    VarLenSparseFeat(
        SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                   embedding_name="movie_id"),
        50, 'mean', 'hist_len'))
# Item feature: the target movie id.
item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]
1.1.2 获取单值离散特征和多值离散特征
item_feature_name = item_feature_columns[0].name                 # 'movie_id'
item_vocabulary_size = item_feature_columns[0].vocabulary_size   # 208
feature_columns = user_feature_columns + item_feature_columns
# Partition the combined columns into single-valued vs. sequence features.
sparse_feature_columns = [fc for fc in feature_columns if isinstance(fc, SparseFeat)]
varlen_sparse_feature_columns = [fc for fc in feature_columns if isinstance(fc, VarLenSparseFeat)]
1.1.3 离散特征的embedding 结构构造
# One Embedding layer per sparse feature, keyed by embedding_name so that
# features declaring the same embedding_name share a single table.
sparse_embedding_dic = {}
for fc in sparse_feature_columns:
    print(fc.embedding_name)
    sparse_embedding_dic[fc.embedding_name] = keras.layers.Embedding(
        fc.vocabulary_size, fc.embedding_dim,
        name="sparse_emb_" + fc.embedding_name, trainable=fc.trainable)
for fc in varlen_sparse_feature_columns:
    inner = fc.sparsefeat
    print(inner.embedding_name)
    # NOTE: keyed on embedding_name, so this overwrites the plain 'movie_id'
    # layer above — the history sequence and the target movie end up sharing
    # the sequence-named table (intentional weight sharing).
    sparse_embedding_dic[inner.embedding_name] = keras.layers.Embedding(
        inner.vocabulary_size, inner.embedding_dim,
        name="sparse_seq_emb_" + inner.name, trainable=inner.trainable)
print(sparse_embedding_dic)
{'user_id': <tensorflow.python.keras.layers.embeddings.Embedding at 0x17fdf4ee348>,
'gender': <tensorflow.python.keras.layers.embeddings.Embedding at 0x17fb4619ec8>,
'age': <tensorflow.python.keras.layers.embeddings.Embedding at 0x17fb4650188>,
'occupation': <tensorflow.python.keras.layers.embeddings.Embedding at 0x17fb4650488>,
'zip': <tensorflow.python.keras.layers.embeddings.Embedding at 0x17fb4650a08>,
'movie_id': <tensorflow.python.keras.layers.embeddings.Embedding at 0x17fb4651fc8>}
2 input
2.1 构造用户侧输入
# Build one Keras Input tensor per user feature, keyed by feature name.
input_features = {}
for feat in user_feature_columns:
    if isinstance(feat, SparseFeat):
        # single-valued id -> (batch, 1)
        input_features[feat.name] = keras.layers.Input(shape=(1,), name=feat.name, dtype=feat.dtype)
    elif isinstance(feat, DenseFeat):
        # dense vector -> (batch, dimension)
        input_features[feat.name] = keras.layers.Input(shape=(feat.dimension,), name=feat.name, dtype=feat.dtype)
    elif isinstance(feat, VarLenSparseFeat):
        # padded id sequence -> (batch, maxlen)
        input_features[feat.sparsefeat.name] = keras.layers.Input(shape=(feat.maxlen,), name=feat.sparsefeat.name, dtype=feat.sparsefeat.dtype)
    else:
        # BUG FIX: original referenced the undefined name `fc` here, raising
        # NameError instead of the intended TypeError.
        raise TypeError("Invalid feature column type ,got", type(feat))
print(input_features)
{'user_id': <tf.Tensor 'user_id_1:0' shape=(None, 1) dtype=int32>,
'gender': <tf.Tensor 'gender_1:0' shape=(None, 1) dtype=int32>,
'age': <tf.Tensor 'age_1:0' shape=(None, 1) dtype=int32>,
'occupation': <tf.Tensor 'occupation_1:0' shape=(None, 1) dtype=int32>,
'zip': <tf.Tensor 'zip_1:0' shape=(None, 1) dtype=int32>,
'hist_movie_id': <tf.Tensor 'hist_movie_id_1:0' shape=(None, 50) dtype=int32>}
3 Input 结合 embedding层
3.1 embedding(input)
input_features 为每个特征的输入形式,
sparse_embedding_dic为每个特征的 embedding结构
单个特征embedding 输出为 sparse_embedding_dic[embedding_name](input_features[feature_name])
user_sparse_feature_columns = [fc for fc in user_feature_columns if isinstance(fc, SparseFeat)]
user_varlen_sparse_feature_columns = [fc for fc in user_feature_columns if isinstance(fc, VarLenSparseFeat)]
embedding_dict = {}
# Single-valued lookup: emb_layer(input) -> (batch, 1, dim).
for fc in user_sparse_feature_columns:
    print(fc.name)
    embedding_dict[fc.name] = sparse_embedding_dic[fc.embedding_name](input_features[fc.name])
# Sequence lookup: (batch, maxlen) ids -> (batch, maxlen, dim).
for fc in user_varlen_sparse_feature_columns:
    seq_name = fc.sparsefeat.name
    print(seq_name)
    embedding_dict[seq_name] = sparse_embedding_dic[fc.sparsefeat.embedding_name](input_features[seq_name])
print(embedding_dict)
{'user_id': <tf.Tensor 'sparse_emb_user_id_1/embedding_lookup/Identity_1:0' shape=(None, 1, 16) dtype=float32>,
'gender': <tf.Tensor 'sparse_emb_gender_1/embedding_lookup/Identity_1:0' shape=(None, 1, 16) dtype=float32>,
'age': <tf.Tensor 'sparse_emb_age_1/embedding_lookup/Identity_1:0' shape=(None, 1, 16) dtype=float32>,
'occupation': <tf.Tensor 'sparse_emb_occupation_1/embedding_lookup/Identity_1:0' shape=(None, 1, 16) dtype=float32>,
'zip': <tf.Tensor 'sparse_emb_zip_1/embedding_lookup/Identity_1:0' shape=(None, 1, 16) dtype=float32>,
'hist_movie_id': <tf.Tensor 'sparse_seq_emb_hist_movie_id_2/embedding_lookup/Identity_1:0' shape=(None, 50, 16) dtype=float32>}
3.2 测试embedding(input)
# Sanity check: wire the user_id input straight through its embedding lookup
# and run a 2-row batch through the resulting mini-model.
modelTest = keras.models.Model(
    inputs=input_features["user_id"],
    outputs=embedding_dict["user_id"])
print(modelTest(tf.Variable([[1],[2]])))
print("\n*************************************\n")
modelTest.summary()
Tensor("model_14/sparse_emb_user_id/embedding_lookup/Identity_1:0", shape=(2, 1, 16), dtype=float32)
*************************************
Model: "model_14"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
user_id (InputLayer) [(None, 1)] 0
_________________________________________________________________
sparse_emb_user_id (Embeddin (None, 1, 16) 64
=================================================================
Total params: 64
Trainable params: 64
Non-trainable params: 0
_________________________________________________________________
3.3 SequencePoolingLayer
多值特征 embedding 后进行 pooling
class SequencePoolingLayer(keras.layers.Layer):
    """Pool a sequence of embeddings (batch, T, dim) down to (batch, 1, dim).

    FIX: the original accepted `mode`/`supports_masking` but silently ignored
    both and always averaged.  `mode` is now honored ('mean' remains the
    default, so existing callers behave identically) and the layer is
    serializable via get_config.
    """

    def __init__(self, mode='mean', supports_masking=False, **kwargs):
        if mode not in ('mean', 'sum', 'max'):
            raise ValueError("mode must be 'mean', 'sum' or 'max', got %r" % (mode,))
        self.mode = mode
        super(SequencePoolingLayer, self).__init__(**kwargs)
        # Set after super().__init__ so the base class does not clobber it.
        self.supports_masking = supports_masking

    def call(self, inputs):
        # Reduce over the time axis, then restore a length-1 time dimension.
        if self.mode == 'mean':
            pooled = tf.reduce_mean(inputs, axis=1)
        elif self.mode == 'sum':
            pooled = tf.reduce_sum(inputs, axis=1)
        else:  # 'max'
            pooled = tf.reduce_max(inputs, axis=1)
        return tf.expand_dims(pooled, axis=1)

    def get_config(self):
        config = {'mode': self.mode, 'supports_masking': self.supports_masking}
        base_config = super(SequencePoolingLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
# Pool the (batch, 50, 16) history embeddings down to (batch, 1, 16).
pooling_vec_list = {
    'hist_movie_id': SequencePoolingLayer()(embedding_dict['hist_movie_id']),
}
pooling_vec_list['hist_movie_id']
3.4 sparse_embedding_list
# Gather every (batch, 1, dim) tensor for the user tower: all single-valued
# embeddings plus the pooled history vector (which replaces the raw
# 'hist_movie_id' sequence).
# FIX: removed a stray no-op expression whose result was discarded
# (`list(embedding_dict.values())+list(pooling_vec_list.values())`).
sparse_embedding_list = [v for k, v in embedding_dict.items() if k != "hist_movie_id"]
sparse_embedding_list.extend(pooling_vec_list.values())
print(sparse_embedding_list)
[<tf.Tensor 'sparse_emb_user_id/embedding_lookup/Identity_1:0' shape=(None, 1, 16) dtype=float32>,
<tf.Tensor 'sparse_emb_gender/embedding_lookup/Identity_1:0' shape=(None, 1, 16) dtype=float32>,
<tf.Tensor 'sparse_emb_age/embedding_lookup/Identity_1:0' shape=(None, 1, 16) dtype=float32>,
<tf.Tensor 'sparse_emb_occupation/embedding_lookup/Identity_1:0' shape=(None, 1, 16) dtype=float32>,
<tf.Tensor 'sparse_emb_zip/embedding_lookup/Identity_1:0' shape=(None, 1, 16) dtype=float32>,
<tf.Tensor 'sequence_pooling_layer/ExpandDims:0' shape=(None, 1, 16) dtype=float32>]
4 拼接和打平 Concatenate and Flatten
# Concatenate on the last axis -> (batch, 1, 6*16), then flatten -> (batch, 96).
concat_embeddings = keras.layers.Concatenate(-1)(sparse_embedding_list)
user_dnn_input = keras.layers.Flatten()(concat_embeddings)
user_dnn_input
<tf.Tensor 'flatten/Reshape:0' shape=(None, 96) dtype=float32>
5 item处理
5.1 item Input
# Item-side inputs: one (batch, 1) Input per sparse item feature.
item_input_features = {}
item_feature_columns
for fc in item_feature_columns:
    if isinstance(fc, SparseFeat):
        item_input_features[fc.name] = keras.layers.Input(shape=(1,),
                                                          name=fc.name,
                                                          dtype=fc.dtype)
item_inputs_list = list(item_input_features.values())
item_inputs_list
5.2 所有物品Embedding
class EmbeddingIndex(keras.layers.Layer):
    """Emit the constant index vector [0, 1, ..., n) regardless of input.

    Feeding this through the shared item embedding layer materialises the
    full item-embedding matrix as a tensor inside the graph.  The input
    tensor only anchors the layer in the graph; the output is constant.
    """

    def __init__(self, index, **kwargs):
        # index: list of every item id, e.g. range(item_vocabulary_size).
        self.index = index
        super(EmbeddingIndex, self).__init__(**kwargs)

    def build(self, input_shape):
        super(EmbeddingIndex, self).build(
            input_shape)  # Be sure to call this somewhere!

    def call(self, x, **kwargs):
        return tf.constant(self.index)

    def get_config(self):
        # FIX: without this, a saved model could not restore `index`.
        config = {'index': self.index}
        base_config = super(EmbeddingIndex, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
# Materialise the full item-embedding matrix: (vocab_size, dim) = (208, 16).
all_item_ids = list(range(item_vocabulary_size))
item_index = EmbeddingIndex(all_item_ids)(item_input_features[item_feature_name])
item_embedding_weight = sparse_embedding_dic[item_feature_name](item_index)
item_embedding_weight
<tf.Tensor 'sparse_seq_emb_hist_movie_id_1/embedding_lookup/Identity_1:0' shape=(208, 16) dtype=float32>
6 全连接层 DNN
全连接层为两层,隐藏层为[64,16]
# Two-layer user tower with hidden sizes [64, 16].
hidden1 = keras.layers.Dense(64, activation='relu')(user_dnn_input)
# BUG FIX: the 16-unit layer must consume hidden1, not user_dnn_input —
# in the original, hidden1 was dead and the tower collapsed to one layer,
# contradicting the stated [64, 16] architecture.
user_dnn_out = keras.layers.Dense(16, activation='relu')(hidden1)
user_dnn_out
7 SampledSoftmaxLayer
由于训练的全是正样本,所以采用抽样的softmax损失函数
7.1 SampledSoftmaxLayer 输入
①全量物品embedding
② user_dnn_out
③ item Input
结构如下
# Three inputs for SampledSoftmaxLayer:
# (1) full item-embedding matrix — acts as the softmax weight matrix
embeddings=item_embedding_weight
embeddings
<tf.Tensor 'sparse_seq_emb_hist_movie_id_1/embedding_lookup/Identity_1:0' shape=(208, 16) dtype=float32>
# (2) the user-tower output vector
inputs=user_dnn_out
inputs
<tf.Tensor 'dense_1/Relu:0' shape=(None, 16) dtype=float32>
# (3) the positive item's id, used as the label class index
label_idx=item_input_features[item_feature_name]
label_idx
<tf.Tensor 'movie_id:0' shape=(None, 1) dtype=int32>
7.2 SampledSoftmaxLayer 层
class SampledSoftmaxLayer(keras.layers.Layer):
    """Sampled-softmax training loss over the full item vocabulary.

    Call input is a list of three tensors:
        embeddings: (num_items, dim) full item-embedding matrix (softmax weights)
        inputs:     (batch, dim) user-tower output
        label_idx:  (batch, 1) id of the positive item

    Output: (batch, 1) per-example sampled-softmax loss.
    """

    def __init__(self, num_sampled=5, **kwargs):
        # num_sampled: number of negative classes drawn per positive example.
        self.num_sampled = num_sampled
        super(SampledSoftmaxLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape[0] is the embedding-matrix shape, so its first entry is
        # the number of classes (item vocabulary size).
        self.size = input_shape[0][0]
        # FIX: instantiate the initializer — the original passed the class
        # object (keras.initializers.Zeros), which only works by accident in
        # some TF versions.
        self.zero_bias = self.add_weight(shape=[self.size],
                                         initializer=keras.initializers.Zeros(),
                                         dtype=tf.float32,
                                         trainable=False,  # bias stays fixed at zero
                                         name="bias")
        super(SampledSoftmaxLayer, self).build(input_shape)

    def call(self, inputs_with_label_idx, training=None, **kwargs):
        embeddings, inputs, label_idx = inputs_with_label_idx
        # Draw `num_sampled` negatives and compute softmax loss only over
        # the sampled classes plus the true class — cheap approximation of
        # the full softmax when num_classes is large.
        loss = tf.nn.sampled_softmax_loss(weights=embeddings,  # self.item_embedding.
                                          biases=self.zero_bias,
                                          labels=label_idx,
                                          inputs=inputs,
                                          num_sampled=self.num_sampled,
                                          num_classes=self.size,  # self.target_song_size
                                          )
        return tf.expand_dims(loss, axis=1)

    def compute_output_shape(self, input_shape):
        return (None, 1)

    def get_config(self, ):
        config = {'num_sampled': self.num_sampled}
        base_config = super(SampledSoftmaxLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
# Per-example sampled-softmax loss tensor, shape (batch, 1); this is the
# model's training output.
output=SampledSoftmaxLayer()([embeddings,inputs,label_idx])
output
<tf.Tensor 'sampled_softmax_layer_1/ExpandDims:0' shape=(None, 1) dtype=float32>
8 模型构建
模型输入为前面的user_inputs_list 和item_inputs_list,结构如下,模型输出为上面的output.
# Final model: all user inputs plus the item input map to the sampled-softmax
# loss computed above.
user_inputs_list = [*input_features.values()]
print(user_inputs_list)
print(item_inputs_list)
model = keras.models.Model(
    inputs=user_inputs_list + item_inputs_list, outputs=output)
[<tf.Tensor 'user_id:0' shape=(None, 1) dtype=int32>,
<tf.Tensor 'gender:0' shape=(None, 1) dtype=int32>,
<tf.Tensor 'age:0' shape=(None, 1) dtype=int32>,
<tf.Tensor 'occupation:0' shape=(None, 1) dtype=int32>,
<tf.Tensor 'zip:0' shape=(None, 1) dtype=int32>,
<tf.Tensor 'hist_movie_id:0' shape=(None, 50) dtype=int32>]
[<tf.Tensor 'movie_id:0' shape=(None, 1) dtype=int32>]