# Environment setup for the DeepMatch YoutubeDNN example:
# imports, warning suppression, TF1-style graph mode, and model construction.
import random
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import plot_model
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss

warnings.filterwarnings("ignore")

# Put Keras in training mode globally (TF1-style flag used by this example).
K.set_learning_phase(True)

# DeepMatch's SampledSoftmaxLayer requires graph mode under TF 2.x.
if tf.__version__ >= '2.0.0':
    tf.compat.v1.disable_eager_execution()

# NOTE(review): user_feature_columns / item_feature_columns are built in an
# earlier part of the tutorial not shown in this excerpt — confirm upstream.
model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5,
                   user_dnn_hidden_units=(64, 16))
WARNING:root:
DeepCTR version 0.8.3 detected. Your version is 0.8.2.
Use `pip install -U deepctr` to upgrade. Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.3
WARNING:tensorflow:From /home/gavin/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/adagrad.py:82: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
WARNING:tensorflow:From /home/gavin/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/adagrad.py:82: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Train on 908 samples
Epoch 1/5
908/908 [==============================] - 1s 556us/sample - loss: 1.4003
Epoch 2/5
908/908 [==============================] - 0s 22us/sample - loss: 1.1123
Epoch 3/5
908/908 [==============================] - 0s 24us/sample - loss: 1.0816
Epoch 4/5
908/908 [==============================] - 0s 22us/sample - loss: 1.2225
Epoch 5/5
908/908 [==============================] - 0s 25us/sample - loss: 0.9680
Generate user features for testing and full item features for retrieval
# Test phase: build the user/item sub-models from the trained two-tower model
# (via the attribute hooks set inside YoutubeDNN) and export embeddings.
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": item_profile['movie_id'].values}

# The sub-models share weights with `model`; they only re-route inputs/outputs.
user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :]  # i in [0, k_max) if using the MIND model
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)
[Optional] ANN search by faiss and evaluate the result
# Evaluate retrieval quality with an exact inner-product ANN index (faiss),
# reporting mean recall@50 and hit rate over the test users.
test_true_label = {line[0]: [line[2]] for line in test_set}

import numpy as np
import faiss
from tqdm import tqdm
from deepmatch.utils import recall_N

# Inner-product index; uncomment the normalize_L2 calls to get cosine similarity.
index = faiss.IndexFlatIP(embedding_dim)
# faiss.normalize_L2(item_embs)
index.add(item_embs)
# faiss.normalize_L2(user_embs)
D, I = index.search(np.ascontiguousarray(user_embs), 50)

s = []
hit = 0
for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
    try:
        # Map the returned row indices back to movie ids.
        pred = [item_profile['movie_id'].values[x] for x in I[i]]
        filter_item = None
        recall_score = recall_N(test_true_label[uid], pred, N=50)
        s.append(recall_score)
        if test_true_label[uid] in pred:
            hit += 1
    except Exception:
        # Best-effort evaluation: log the failing row index and keep going.
        print(i)

print("recall", np.mean(s))
print("hr", hit / len(test_user_model_input['user_id']))
3it [00:00, 769.93it/s]
recall 0.0
hr 0.0
源码分析 (Source-code analysis)
召回 (Retrieval / recall stage)
def YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5,
               user_dnn_hidden_units=(64, 32),
               dnn_activation='relu', dnn_use_bn=False,
               l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0,
               output_activation='linear', seed=1024, ):
    """Instantiates the YoutubeDNN Model architecture.

    :param user_feature_columns: An iterable containing user's features used by the model.
    :param item_feature_columns: An iterable containing item's features used by the model.
    :param num_sampled: int, the number of classes to randomly sample per batch.
    :param user_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of user tower
    :param dnn_activation: Activation function to use in deep net
    :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net
    :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    :param seed: integer ,to use as random seed.
    :param output_activation: Activation function to use in output layer
    :return: A Keras model instance.
    """
    if len(item_feature_columns) > 1:
        raise ValueError("Now YoutubeNN only support 1 item feature like item_id")
    item_feature_name = item_feature_columns[0].name
    item_vocabulary_size = item_feature_columns[0].vocabulary_size

    # Build one shared Embedding table per sparse feature.
    embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns,
                                                    l2_reg_embedding, seed=seed)

    # User tower: Input layers -> embedding lookups -> concatenated DNN input.
    user_features = build_input_features(user_feature_columns)
    user_inputs_list = list(user_features.values())
    user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(
        user_features, user_feature_columns, l2_reg_embedding, seed=seed,
        embedding_matrix_dict=embedding_matrix_dict)
    user_dnn_input = combined_dnn_input(user_sparse_embedding_list, user_dense_value_list)

    # Item tower only needs the item-id Input layer.
    item_features = build_input_features(item_feature_columns)
    item_inputs_list = list(item_features.values())

    user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                       dnn_use_bn, output_activation=output_activation, seed=seed)(user_dnn_input)

    # Materialize the embedding vector of every item id in the vocabulary so
    # the sampled-softmax layer can score the user vector against all items.
    item_index = EmbeddingIndex(list(range(item_vocabulary_size)))(item_features[item_feature_name])
    item_embedding_matrix = embedding_matrix_dict[item_feature_name]
    item_embedding_weight = NoMask()(item_embedding_matrix(item_index))
    pooling_item_embedding_weight = PoolingLayer()([item_embedding_weight])

    output = SampledSoftmaxLayer(num_sampled=num_sampled)(
        [pooling_item_embedding_weight, user_dnn_out, item_features[item_feature_name]])
    model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output)

    # Attach attribute hooks so callers can slice out the user/item sub-models
    # after training (see the embedding-extraction step in this tutorial).
    model.__setattr__("user_input", user_inputs_list)
    model.__setattr__("user_embedding", user_dnn_out)
    model.__setattr__("item_input", item_inputs_list)
    model.__setattr__("item_embedding",
                      get_item_embedding(pooling_item_embedding_weight, item_features[item_feature_name]))
    return model