为了实现视频推荐, 参照DIN 网络模型, 我做了如下修改:
DIN 是为了广告推荐而生, 但是我是视频推荐,因此将click 表示为喜欢, unclick 表示为不喜欢, 只要是没有用户操作的: 点击,收藏,评论等行为都属于不喜欢。 有相关记录的表示为click。 因此实际上采用的是二分类模型。
为了降低网络的参数, 稀疏向量映射到等长向量空间采用同样的权重(unclick and click)
代码距离部署还有需要完善的地方, 因此勿喷.....
在这篇文章中,提到了两个重要的模型, 一个是DIN 的base model, 一个是 DIN 模型。 两个模型的区别请参照论文部分:
作者进行了改进。 这篇博客主要实现了base model 模型。 同时base model 模型与cf(协同过滤)模型进行配合。
文中的embedding layer :是将候选序列进行等长向量化, 在我实现的过程中, 没有采用加权table lookup, 而是直接采用sum 操作:
关键代码如下:
如果采用内部加权并得到等长的向量结果可以参照:
两者的区别需要自行去研究。
所有代码放在两个文件中, cf 文件存放的是协同过滤相关的代码,DIN 文件存放的是base 模型。
cf.py:
======================================================================
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import numpy as np
import pandas as pd
from pathlib import Path
import os
import operator
import pickle
import sys
from multiprocessing.dummy import Pool as ThreadPool

'''
@name: kenny adelaide
@email: kenny13141314@163.com
@time: 2022/2/8
@description: this is a cf algorithm for jojo RS.
'''


def calculation_correlation_by_user_information(user_information_data):
    '''
    Calculate correlation parameters from users' basic profile data.

    NOTE(review): currently a stub -- it only echoes its input; the
    documented dict return value is not implemented yet.

    :param user_information_data: user's basic information data.
    :return: None (intended: dict)
    '''
    print(user_information_data)


class cf(object):
    '''
    User-based collaborative filtering for the RS system.

    Two modes are supported:
      * pre-calculation (``_flag=True``): build the score matrix and the
        pairwise Pearson-similarity table from CSV files and persist both
        to ``jojo.pkl``;
      * predict (``_flag=False``): load the persisted data and serve
        recommendations.

    Cold start is handled by ``video_hot_recommondation`` (most-rated
    videos), which is also the fallback when personalised prediction
    yields too few results.
    '''

    def __init__(self, _flag=True, _user_information_dir=None, _user_video_score_dir=None, plk_dir=None):
        '''
        Initialise either the pre-calculation or the prediction mode.

        :param _flag: True runs pre-calculation and persists the result;
                      False loads a previously persisted pickle for predicting.
        :param _user_information_dir: path (relative to this file) of the
                                      user basic-information CSV.
        :param _user_video_score_dir: path (relative to this file) of the
                                      user score CSV.
        :param plk_dir: path (relative to this file) of the persisted pickle.
        '''
        os_path = os.path.dirname(os.path.abspath(__file__))
        self.user_information_dir = os_path
        self.user_video_score_dir = os_path
        if _flag:
            self.threads = 6
            self.user_information_dir = os_path + _user_information_dir
            self.user_video_score_dir = os_path + _user_video_score_dir
            if Path(self.user_information_dir).is_file():
                user_information_data = pd.read_csv(self.user_information_dir)
                user_video_score_data = np.array(
                    pd.read_csv(self.user_video_score_dir)[['userid', 'videoid', 'score']])
                # matrix is indexed directly by raw ids, hence max id + 1 rows/cols
                max_userid = int(np.max(user_video_score_data[:, 0]) + 1)
                max_videoid = int(np.max(user_video_score_data[:, 1]) + 1)
                user_score_matrix = np.zeros(shape=[max_userid, max_videoid])
                for index, value in enumerate(user_video_score_data):
                    user_score_matrix[int(value[0]), int(value[1])] = value[2]
                dic = dict()
                dic_corr = self.person_correlation_matrix(socre_matrix=user_score_matrix)
                dic_corr['users'] = user_score_matrix.shape[0]
                dic['dic_corr'] = dic_corr
                dic['user_score_matrix'] = user_score_matrix
                with open(os_path + '/jojo.pkl', 'wb') as f:
                    pickle.dump(dic, f, pickle.HIGHEST_PROTOCOL)
                # self.predict(current_userid=3, n=1, dic=dic)
        else:
            # prediction mode: everything needed lives in the pickle
            plk_dir = os_path + plk_dir
            with open(plk_dir, 'rb') as f:
                self.dic = pickle.load(f)

    def nearst_user(self, userid, K=None, dic=None):
        '''
        Find the current user's most similar users; their items will be
        offered to the current user.

        :param userid: current user id.
        :param K: maximum number of neighbours to return.
        :param dic: dict mapping '{userid}_{otherid}' -> similarity, plus a
                    'users' entry holding the user count.
        :return: dict {other_userid: similarity}, top K by similarity.
        '''
        coor = {}
        for i in range(1, int(dic['users'])):
            key = '{}_{}'.format(userid, i)
            # skip missing pairs and perfect similarity (the user itself)
            if key in dic and float(dic[key]) != 1:
                coor[i] = dic[key]
        sorted_distances = sorted(coor.items(), key=operator.itemgetter(1), reverse=True)
        return dict(sorted_distances[:K])

    def determine_current_user_has_existed(self, current_user=None):
        '''
        Check whether a user id is outside the known score matrix.

        NOTE(review): despite the name, this returns True when the user
        does NOT exist (id out of range). Callers treat True as the error
        case, so the inverted semantics are kept for compatibility.

        :param current_user: user id.
        :return: True if the id is unknown, False otherwise.
        '''
        score_matrix = self.dic['user_score_matrix']
        return int(current_user) not in range(1, len(score_matrix[:, 0]))

    def predict(self, current_userid=None):
        '''
        Recommend videos for a user via user-based collaborative filtering.

        For each sufficiently similar neighbour (similarity > 0.5), every
        video the neighbour scored but the current user has not seen is
        weighted as ``neighbour_score * similarity``; candidates are
        returned sorted by that preference, best first.

        Fix: the original seeded the candidate dict with
        ``recommand[current_userid] = current_userid``, which leaked the
        user id itself into the returned video list; the seed is removed.

        :param current_userid: current user id.
        :return: list of video ids, highest preference first (may be empty).
        '''
        recommand = {}
        user_items_score = np.array(self.dic['user_score_matrix'])
        current_user_items = np.array(np.nonzero(user_items_score[current_userid, :]))[0]
        for user, coor in self.nearst_user(dic=self.dic['dic_corr'], K=20, userid=current_userid).items():
            if coor > 0.50:
                other_user_items_score = user_items_score[user, :]
                items = np.array(np.nonzero(other_user_items_score))[0]
                for item in items:
                    if item not in current_user_items:
                        # preference = neighbour's score weighted by similarity
                        recommand[item] = other_user_items_score[item] * coor
        recommand_result = sorted(recommand.items(), key=operator.itemgetter(1), reverse=True)
        return [item for item, _score in recommand_result]

    def person_correlation_matrix(self, socre_matrix=None):
        '''
        Build the pairwise Pearson-similarity table from the score matrix.

        Keys are '{i}_{j}' for every ordered user pair (row 0 is unused
        because ids start at 1).

        :param socre_matrix: user x video score matrix.
        :return: dict of pairwise similarities.
        '''
        corr_dict = dict()
        for i in range(1, socre_matrix.shape[0]):
            for j in range(1, socre_matrix.shape[0]):
                corr = self.person_correlation_calculation(X=socre_matrix[i, :], Y=socre_matrix[j, :])
                corr_dict['{}_{}'.format(i, j)] = corr
        return corr_dict

    def person_correlation_calculation(self, X=None, Y=None):
        '''
        Pearson correlation of two score vectors, mapped from [-1, 1]
        into [0, 1] via (r + 1) / 2 so it can be used as a similarity.

        A zero denominator (constant vector) is replaced by 1 to avoid
        division by zero, which yields a neutral-ish value instead of NaN.

        :param X: score vector of user A.
        :param Y: score vector of user B.
        :return: similarity in [0, 1].
        '''
        N = len(X)
        son = np.sum(X * Y) - (np.sum(X) * np.sum(Y)) / N
        mother = np.sqrt((np.sum(np.power(X, 2)) - np.power(np.sum(X), 2) / N) * (
            np.sum(np.power(Y, 2)) - np.power(np.sum(Y), 2) / N))
        mother = 1 if mother == 0 else mother
        return (son / mother + 1) / 2

    def euclidean_distance(self, X=None, Y=None, K=None):
        '''
        Distance between two vectors in Euclidean space.

        TODO(review): not implemented yet -- returns None.

        :param X: vector X.
        :param Y: vector Y.
        :param K: the count of k.
        '''
        return

    def video_hot_recommondation(self):
        '''
        Hot-video (most-rated) recommendation; the cold-start fallback.

        Fix: the original unconditionally appended '/data/00000005.csv' to
        ``self.user_video_score_dir`` on EVERY call, so a second call built
        a broken path. The path is now only extended when it does not
        already point at a CSV file, making repeated calls idempotent.
        The per-item ``list.count`` (O(n^2)) is replaced by a single
        counting pass.

        :return: up to 20 video ids ordered by rating count, descending.
        '''
        if not self.user_video_score_dir.endswith('.csv'):
            self.user_video_score_dir = self.user_video_score_dir + '/data/00000005.csv'
        df = pd.read_csv(self.user_video_score_dir)
        data = list(np.array(df[['userid', 'videoid', 'score']])[:, 1])
        recommond = dict()
        for item in data:
            recommond[int(item)] = recommond.get(int(item), 0) + 1
        recommond = sorted(recommond.items(), key=operator.itemgetter(1), reverse=True)[:20]
        resutls = [item[0] for item in recommond]
        return resutls


# demo = cf(_flag=True,
#           _user_information_dir='/data/00000008.csv',
#           _user_video_score_dir='/data/00000005.csv')
demo = cf(_flag=False, plk_dir='/jojo.pkl')


def jojo_predict_1(userid=None, flag=None):
    '''
    Predict a video list for a user, with a hot-video fallback.

    :param userid: user id to recommend for.
    :param flag: True forces the hot-video (cold-start) recommendation.
    :return: list of video ids, or None when the user id is invalid.
    '''
    if flag:
        return demo.video_hot_recommondation()
    if demo.determine_current_user_has_existed(userid):
        print('[{},{}]'.format(0, 'userid parameters is error...'))
        return
    result = demo.predict(current_userid=userid)
    if len(result) <= 1:
        # too few personalised results: fall back to the hot list
        result = demo.video_hot_recommondation()
    return result


def jojo_predict(userid=None, flag=None):
    '''
    Same as jojo_predict_1 but prints the recommendation instead of
    returning it (command-line entry point).

    Fix: the original duplicated the whole branch logic of jojo_predict_1;
    it now delegates so the two functions cannot drift apart.

    :param userid: user id to recommend for.
    :param flag: True forces the hot-video recommendation.
    '''
    result = jojo_predict_1(userid=userid, flag=flag)
    if result is not None:
        print(result)


# if __name__ == '__main__':
#     userid = int(sys.argv[1])
#     flag = False if (str(sys.argv[2]) == 'False' or str(sys.argv[2])) == 'false' else True
#     jojo_predict(userid=userid, flag=flag)
DIN.py:
=========================================================================
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras.backend as K
import os
import pickle
from jojo.cf import jojo_predict_1, cf
import operator


class DataManagermMachine(object):
    '''
    Builds the training set for the DIN base model from the score and
    user-information CSVs.

    For each user two samples are produced: the multi-hot of videos the
    user interacted with (label [1, 0] = "like") and the multi-hot of the
    remaining candidate videos (label [0, 1] = "dislike"), each prefixed
    with the user's one-hot profile features.
    '''

    def __init__(self):
        self.score_dir = './jojo/data/00000005.csv'
        self.user_info_dir = './jojo/data/00000008.csv'
        self.user_info_data = pd.read_csv(self.user_info_dir)
        self.score_data = pd.read_csv(self.score_dir)
        self.user_information_one_hot_features_dic = dict()
        self.user_click_one_hot_features_dic = dict()
        self.user_candidate_one_hot_features_dic = dict()
        # --- one-hot encode the user profile columns ------------------------
        user_infor_titles = ['userid', 'age', 'profeesion', 'sex', 'area', 'income', 'hobby']
        user_info_data = self.user_info_data[user_infor_titles]
        userids = np.array(user_info_data)[:, 0]
        self.users = userids
        user_info_temp_titles = ['age', 'profeesion', 'sex', 'area', 'income', 'hobby']
        user_info_one_hot_features = []
        for index, item in enumerate(user_info_temp_titles):
            data = np.array(user_info_data[item])
            # depth = number of distinct values observed for this column
            user_info_one_hot = tf.one_hot(data, len(set(data)))
            user_info_one_hot_features.append(user_info_one_hot)
        user_info_one_hot_features = tf.concat(user_info_one_hot_features, axis=1)
        for userid in userids:
            # rows are assumed ordered by userid starting at 1 -- TODO confirm
            self.user_information_one_hot_features_dic[userid] = tf.Variable(
                user_info_one_hot_features.numpy()[userid - 1, :])
        # --- build click / candidate multi-hot behaviour features -----------
        behavior_item_titles = ['userid', 'videoid', 'score']
        behavior_items_data = self.score_data[behavior_item_titles]
        userids = set(np.array(behavior_items_data)[:, 0])
        video_max = np.max(np.array(behavior_items_data)[:, 1]) + 1
        video_sequence = [i for i in range(1, int(video_max))]
        behavior_one_hot_features = tf.one_hot(video_sequence, int(video_max)).numpy()
        self.data = []
        self.y = []
        for user in userids:
            vedioes = np.array(behavior_items_data.loc[behavior_items_data['userid'] == user])[:, 1]
            # symmetric difference: all videos the user did NOT interact with
            candidate = list(set(vedioes) ^ set(video_sequence))
            features1 = []
            features2 = []
            for video in video_sequence:
                if video in vedioes:
                    features1.append(behavior_one_hot_features[int(video) - 1, :])
                if video in candidate:
                    features2.append(behavior_one_hot_features[int(video) - 1, :])
            # sample = user profile one-hot ++ flattened behaviour one-hots
            self.data.append(
                tf.concat([self.user_information_one_hot_features_dic[user], tf.concat(features1, axis=0)], axis=0))
            self.data.append(
                tf.concat([self.user_information_one_hot_features_dic[user], tf.concat(features2, axis=0)], axis=0))
            self.y.append(tf.Variable([1, 0], dtype=tf.float32))  # like
            self.y.append(tf.Variable([0, 1], dtype=tf.float32))  # dislike


class EmbeddingLayer(tf.keras.layers.Layer):
    '''
    Embedding layer mapping a sparse (multi-hot) vector to a dense vector.

    Click and un-click sequences deliberately share one weight table to
    reduce the parameter count. The multi-hot input is embedded by
    sum-pooling the embedding rows of its non-zero positions (instead of a
    weighted table lookup).
    '''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # shared lookup table for click and un-click features.
        # NOTE(review): trainable=False freezes the embeddings -- the blog
        # text says the shared table is a parameter-saving choice; confirm
        # freezing (not just sharing) is intended.
        self.emb_weights_click = tf.Variable(
            initial_value=tf.random.truncated_normal(shape=[99999, 128], dtype=tf.float32),
            dtype=tf.float32,
            name='embedding_weight_click',
            trainable=False)

    def build(self, input_shape):
        pass

    def call(self, inputs, training=None, mask=None):
        result = []
        for vector in inputs:
            # indices of the active (non-zero) one-hot positions
            row_index = np.nonzero(vector.numpy())
            # sum-pool the selected embedding rows into a single dense vector
            result.append(tf.reduce_sum(tf.nn.embedding_lookup(self.emb_weights_click, row_index), axis=1))
        return tf.Variable(result, dtype=tf.float32)


class MLP(tf.keras.Model):
    '''
    DIN base model: shared embedding followed by three dense layers, ending
    in a 2-way softmax (like / dislike).
    '''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.embedding_layer = EmbeddingLayer()
        self.hidden1 = tf.keras.layers.Dense(128, activation='relu', name='hidden1')
        self.hidden2 = tf.keras.layers.Dense(64, activation='relu', name='hidden2')
        self.hidden3 = tf.keras.layers.Dense(2, activation='softmax', name='hidden3')

    def build(self, input_shape):
        pass

    def call(self, inputs, training=None, mask=None):
        embedded = self.embedding_layer(inputs)
        hidden = self.hidden2(self.hidden1(embedded))
        return self.hidden3(hidden)

    def predict(self):
        pass

    def save_weight(self, dir):
        '''
        Persist all layer weights to a local pickle file.

        Fix: the original stored ``self.hidden2.trainable_weights`` under
        the 'hidden3' key (copy-paste error), so hidden3 was never saved.

        :param dir: path suffix appended to the current working directory.
        '''
        dic = dict()
        dic['embedding'] = self.embedding_layer.emb_weights_click
        dic['hidden1'] = self.hidden1.trainable_weights
        dic['hidden2'] = self.hidden2.trainable_weights
        dic['hidden3'] = self.hidden3.trainable_weights
        os_path = os.getcwd()
        with open(os_path + dir, 'wb') as f:
            pickle.dump(dic, f, pickle.HIGHEST_PROTOCOL)

    def load_weight(self, dir):
        '''
        Restore layer weights from a pickle written by save_weight.

        Fix: the original ASSIGNED to the ``set_weights`` attribute
        (``self.hidden1.set_weights = dic['hidden1']``) instead of calling
        the method, so the dense weights were never actually loaded.

        :param dir: path suffix appended to the current working directory.
        '''
        os_path = os.getcwd() + dir
        with open(os_path, 'rb') as f:
            dic = pickle.load(f)
        self.embedding_layer.emb_weights_click = dic['embedding']
        # set_weights expects plain arrays, hence .numpy() on each variable
        self.hidden1.set_weights([w.numpy() for w in dic['hidden1']])
        self.hidden2.set_weights([w.numpy() for w in dic['hidden2']])
        self.hidden3.set_weights([w.numpy() for w in dic['hidden3']])


def loss(y_true, y_pred, eps=1e-15):
    '''
    Mean binary cross-entropy (negative log-likelihood) of a batch.

    Fix: the original computed ``abs(sum(y*log(p) - (1-y)*log(1-p)))``;
    the correct log-likelihood is ``-(y*log(p) + (1-y)*log(1-p))`` and the
    ``abs`` only masked the sign error.

    :param y_true: ground-truth one-hot labels.
    :param y_pred: predicted probabilities.
    :param eps: clipping constant keeping log() away from 0 and 1.
    :return: per-sample losses divided by the batch size.
    '''
    p = K.clip(y_pred, eps, 1 - eps)
    _loss = -K.sum(y_true * K.log(p) + (1 - y_true) * K.log(1 - p), axis=1)
    return _loss / len(y_true)


data_generate = DataManagermMachine()
x = data_generate.data
y_true = data_generate.y
mlp = MLP()

# one optimizer / model / color per curve on the comparison plot
optimizers = [
    tf.keras.optimizers.SGD(learning_rate=1e-4, momentum=0.9),
    tf.keras.optimizers.RMSprop(learning_rate=1e-3),
    tf.keras.optimizers.Adagrad(learning_rate=1e-4),
    tf.keras.optimizers.Adadelta(learning_rate=1e-2),
    tf.keras.optimizers.Adam(learning_rate=1e-3),
    tf.keras.optimizers.Adamax(learning_rate=1e-3),
    tf.keras.optimizers.Nadam(learning_rate=1e-4)
]
colors = ['red', 'blue', 'yellow', 'black', 'green', 'violet', 'orange']
# Fix: labels, epochs and models appeared commented out in the original,
# which made the training loop below fail with NameError.
labels = ['sgd', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
epochs = 300
models = [MLP() for i in range(0, len(colors))]
x_data = [i for i in range(0, epochs)]

for index, color in enumerate(colors):
    losss = []
    for i in range(0, epochs):
        with tf.GradientTape(persistent=True) as tape:
            y_pred = models[index](x)
            _loss = loss(y_true, y_pred)
            losss.append(tf.reduce_mean(_loss).numpy())
        gradients = tape.gradient(_loss, models[index].trainable_variables)
        optimizers[index].apply_gradients(zip(gradients, models[index].trainable_variables))
    plt.plot(x_data, losss, color=color, linewidth=1.0, linestyle='-', label=labels[index])

plt.legend()
plt.xlabel('iteration times')
plt.ylabel('loss')
plt.show()


# demo = cf(_flag=False, plk_dir='/jojo.pkl')
class jojo(object):
    '''
    Inference wrapper: scores every candidate video for one user with the
    trained base model and prints the ids sorted by predicted preference.
    '''

    def __init__(self):
        pass

    def jojo_predict(self):
        # candidate = jojo_predict_1(1, False)
        items = [i for i in range(1, 95)]
        # hard-coded demo user 3 -- TODO parameterise the user id
        userinfo_one_hot = data_generate.user_information_one_hot_features_dic[3]
        result = []
        rank = {}
        for item in items:
            candidate_one_hot = tf.one_hot(item, 95)
            one_hot = tf.concat([userinfo_one_hot, candidate_one_hot], axis=0)
            # Fix: the original ran the forward pass twice per item
            # (once for result, once for rank); run it once and reuse.
            prediction = models[6]([one_hot]).numpy()[0]
            result.append(prediction)
            rank[item] = prediction[0][0]
        sorted_distances = sorted(rank.items(), key=operator.itemgetter(1), reverse=True)
        rank_item = [pair[0] for pair in sorted_distances]
        print(rank_item)


jo = jojo()
jo.jojo_predict()
采用了模拟文件进行实验:
=============================================================
userid,age,profeesion,sex,area,income,hobby 1,48,1,1,41,3,0 2,55,2,1,81,1,1 3,22,1,0,71,2,0 4,56,2,0,1,4,1 5,66,2,0,87,1,0 6,67,1,0,2,2,0 7,49,5,0,51,4,0 8,31,6,0,82,3,0
=========================
userid,videoid,score 1,6,2.921 1,95,1.99074 1,20,0.90122 1,13,2.56789 1,32,2.09868 1,64,1.56651 2,75,2.62655 2,87,2.42395 2,48,1.87967 2,24,1.4725 2,41,2.50778 3,7,2.77701 3,13,1.80425 3,95,2.72534 3,43,1.42114 3,79,2.73125 3,23,1.34794 3,40,1.04891 4,1,2.63128 4,85,2.62381 4,3,2.5799 4,3,1.89203 4,83,2.01583 4,85,1.92016 4,5,3.21532 5,2,2.84718 5,56,2.75968 6,22,2.8018 6,82,1.36478 6,17,2.47285 7,78,3.16126 7,14,2.33041 7,66,1.83116 8,78,1.52823
实验的最终结果: