Recommendation Strategies: ALS with TensorFlow 2

Today I'm sharing model training and update code for the case where the user base is very large; there is still plenty of room for optimization.

'''=================================================
@Function -> Collaborative-filtering matrix factorization with TensorFlow 2
@Author :郭艳丹
@Date   :2023-01-11
=================================================='''

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint

def get_all_user_ids():
    # TODO: replace with your real data
    return ["user1","user2","user3","user4","user5","user6"]

def get_all_item_ids():
    # TODO: replace with your real data
    return ["item1","item2","item3","item4","item11","item22","item33","item44","item111","item222","item333","item444"]

def add_negative_samples(item_ids: set, user_click_item_ids: set, total_len=20):
    # Pad each user's clicked (positive) items with negatives up to total_len samples
    candidate_set = list(item_ids - user_click_item_ids)  # TODO: popularity-based sampling
    # Draw negatives (with replacement) from the items the user has not clicked
    neg_list = np.random.choice(candidate_set, size=total_len - len(user_click_item_ids), replace=True)
    return neg_list
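
# Usage sketch (hypothetical numbers, not part of the original pipeline): a user
# clicked 2 of the 12 items, so this pads the remaining 4 slots (total_len=6)
# with unclicked items:
#   negs = add_negative_samples(set(get_all_item_ids()), {"item1", "item3"}, total_len=6)
#   assert len(negs) == 4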

def get_pretrained_data():
    # TODO: switch to tf.data later
    user_click_matrix = [("user1","item1"),
                         ("user2","item3"),
                         ("user1","item3"),
                         ("user2","item2"),
                         ("user4","item3"),
                         ("user4","item33"),
                         ("user3","item3"),
                         ("user5","item3"),
                         ("user5","item11"),
                         ("user1","item3"),
                         ("user1","item2"),
                         ]
    original_click_data = pd.DataFrame(user_click_matrix,columns=["user","item"])
    original_click_data["target"] = 1
    item_ids = get_all_item_ids()
    user_ids = get_all_user_ids()
    # Negative-sample construction: for each positive click, draw one random item
    # (note: a randomly drawn negative can occasionally collide with a true positive)
    negative_samples = np.random.choice(item_ids, size=original_click_data.shape[0], replace=True)
    negative_click_data = pd.DataFrame({"user": original_click_data["user"], "item": negative_samples})
    negative_click_data["target"] = 0
    result = pd.concat([original_click_data, negative_click_data], ignore_index=True, sort=False)
    return item_ids, user_ids, {"user": result["user"], "item": result["item"]}, result["target"]
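
# The tf.data TODO above could be addressed by wrapping the same in-memory frame
# in a pipeline (a sketch, not part of the original code):
#   features = {"user": result["user"].values, "item": result["item"].values}
#   ds = tf.data.Dataset.from_tensor_slices((features, result["target"].values))
#   ds = ds.shuffle(1024).batch(4)
#   model.fit(ds, epochs=2)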


def als_by_batch_train():
    item_ids, user_ids, train_data, target = get_pretrained_data()
    # shape=(None,) means each sample is a variable-length 1-D vector
    user_input = tf.keras.layers.Input(shape=(None,), name="user", dtype=tf.string)
    item_input = tf.keras.layers.Input(shape=(None,), name="item", dtype=tf.string)

    # Map string IDs to integer indices; unseen IDs fall back to the OOV index
    user_string_lookup = tf.keras.layers.StringLookup(vocabulary=user_ids)(user_input)
    item_string_lookup = tf.keras.layers.StringLookup(vocabulary=item_ids)(item_input)

    # +5 leaves headroom beyond the vocabulary size for OOV indices
    user_embedding = tf.keras.layers.Embedding(len(user_ids) + 5, 64)(user_string_lookup)
    item_embedding = tf.keras.layers.Embedding(len(item_ids) + 5, 64)(item_string_lookup)
    # Dot with normalize=True L2-normalizes both inputs first, i.e. cosine similarity
    cos_sim_result = tf.keras.layers.Dot(-1, normalize=True)([user_embedding, item_embedding])

    model = tf.keras.Model(inputs=[user_input, item_input], outputs=cos_sim_result)
    model.compile(optimizer="adam", loss=tf.keras.losses.MSE)
    callbacks = [
        ModelCheckpoint(filepath='base_path/als_models/' + '{epoch:02d}.h5')
    ]

    model.fit(train_data, target, batch_size=4, epochs=2, verbose=1,
              validation_split=0.01, callbacks=callbacks)
    # Return two sub-models that share the trained embedding tables
    return (tf.keras.Model(inputs=[user_input], outputs=user_embedding),
            tf.keras.Model(inputs=[item_input], outputs=item_embedding))

def get_embedding(model, inputs):
    # Look up embeddings for a batch of raw string IDs; unseen IDs map to the OOV row
    inputs = tf.constant(inputs)
    return model(inputs)


if __name__ == '__main__':
    user_embedding_model,item_embedding_model = als_by_batch_train()

    # "user100" and "itemk" are not in the vocabulary, so they hit the OOV embedding
    user_embedding_result = get_embedding(user_embedding_model, ["user1", "user2", "user100"])
    item_embeddings_result = get_embedding(item_embedding_model, ["item2", "item1", "itemk"])
    print(user_embedding_result)
    print(item_embeddings_result)
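
With the two sub-models in hand, a typical next step is retrieval: score every candidate item against a user's vector and keep the top k. Below is a minimal sketch under the setup above; recommend_top_k and its parameters are illustrative names, not part of the original code.

def recommend_top_k(user_model, item_model, user_id, candidate_item_ids, k=3):
    # L2-normalize both sides so the matmul yields cosine similarities,
    # matching the Dot(normalize=True) layer used during training
    user_vec = tf.math.l2_normalize(get_embedding(user_model, [user_id]), axis=-1)
    item_vecs = tf.math.l2_normalize(get_embedding(item_model, candidate_item_ids), axis=-1)
    scores = tf.squeeze(tf.matmul(user_vec, item_vecs, transpose_b=True))
    top = tf.math.top_k(scores, k=k)
    return [(candidate_item_ids[i], float(s))
            for i, s in zip(top.indices.numpy(), top.values.numpy())]

# e.g. recommend_top_k(user_embedding_model, item_embedding_model, "user1", get_all_item_ids())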

Here, tf.keras.layers.Dot multiplies the corresponding rows of its two input matrices element-wise and sums the products. Setting normalize=True additionally L2-normalizes each row before the multiplication, so the layer ends up computing cosine similarity. The code has been open-sourced at: sparkle_code_guy/rec_sys - 码云 - 开源中国 (gitee.com)
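
A quick way to sanity-check that equivalence is to compare the layer's output against a hand-rolled cosine similarity (a minimal sketch; the vectors are made up for illustration):

a = tf.constant([[1.0, 2.0, 3.0]])
b = tf.constant([[4.0, 5.0, 6.0]])

# Dot(axes=-1, normalize=True): L2-normalize each row, then take the row-wise dot product
layer_result = tf.keras.layers.Dot(axes=-1, normalize=True)([a, b])

# The same quantity computed by hand
manual_result = tf.reduce_sum(
    tf.math.l2_normalize(a, axis=-1) * tf.math.l2_normalize(b, axis=-1), axis=-1)

print(layer_result.numpy(), manual_result.numpy())  # both ≈ 0.9746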
