[pyspark]itemcf协同过滤推荐算法------应用华为比赛数据实现(包含转化为稀疏向量,lsh模型,杰卡德距离)

该博客详细介绍了如何使用PySpark进行推荐系统的实现,包括数据预处理、物品相似度计算(MinHash LSH)、兴趣值计算以及最终的推荐。博主首先展示了数据读取和预处理过程,接着通过MinHash LSH计算物品之间的相似度,然后结合用户冷却得分计算兴趣值,最后筛选出推荐物品。文中还提供了部分关键步骤的单独运行结果以帮助理解代码逻辑。
摘要由CSDN通过智能技术生成

学完以后,第一感受是:PySpark 的写法确实比纯 Python 代码来得简洁

先上代码

第一步,用 Spark SQL(pyspark.sql)处理原始数据,包含计算冷却得分(不是今天的重点,不会在后面细说了)

import findspark
findspark.init()
import pathlib
import sys
import json
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
import pyspark.sql.functions as psf
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType
from pyspark.sql.types import StructType
from pyspark.sql.types import StringType
from pyspark.sql.types import StructField
from pyspark.sql.types import IntegerType
import math
import argparse
import time
import os

projectdir = str(
    pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
sys.path.append(projectdir)
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from collections import defaultdict


def get_dataframe():
    """Load the raw behavior logs and split them into train / test sets by day.

    Reads tab-separated CSV files (with header) from the local data
    directory, registers them as the temp view ``df``, then carves out:
      * train: rows with pt_d <= '20210430' (capped at 50 rows here for
        a quick experiment),
      * test:  rows with pt_d == '20210501'.

    Relies on the module-level ``spark`` session.

    :return: (train_df, test_df) tuple of Spark DataFrames with columns
        user_id, video_id, is_watch, is_share, is_collect, is_comment,
        watch_start_time, watch_label, pt_d.
    """
    # Build the reader first, then load — see
    # https://blog.csdn.net/Aeve_imp/article/details/107520678 for
    # reading/writing DataFrames with a custom separator.
    reader = spark.read.format('csv').option('sep', '\t').option('header', True)
    behavior = reader.load(
        r"file:///home/admin/pyspark_script3/data/4days/history_behavior_data/*.csv")
    # behavior.show(2, False)
    """
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    |user_id|video_id|is_watch|is_share|is_collect|is_comment|watch_start_time|watch_label|pt_d    |
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    |2897092|41270   |0       |0       |0         |0         |null            |0          |20210428|
    |4849237|33564   |0       |0       |0         |0         |null            |0          |20210428|
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    """
    # Expose the data to SQL and split by date.
    behavior.createOrReplaceTempView('df')

    sql_for_train = """
    select *
    from df
    where pt_d<='20210430' 
    limit 50
    """

    sql_for_test = """
       select *
       from df
       where pt_d='20210501'
       """

    train_set = spark.sql(sql_for_train)
    test_set = spark.sql(sql_for_test)
    # test_set.show(2, False)
    """
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    |user_id|video_id|is_watch|is_share|is_collect|is_comment|watch_start_time|watch_label|pt_d    |
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    |4256452|30150   |0       |0       |0         |0         |null            |0          |20210501|
    |4256452|10499   |0       |0       |0         |0         |null            |0          |20210501|
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    """

    return train_set, test_set

def build_model(df):
    """Collapse per-day behavior records into one time-decayed ("cooling")
    interest score per (user_id, video_id) pair.

    :param df: Spark DataFrame with columns user_id, video_id, is_share,
        is_collect, is_comment, watch_label, pt_d (yyyymmdd string).
    :return: DataFrame(user_id, video_id, score) ordered by user_id, video_id.
    """
    df.createOrReplaceTempView("df")

    def compute_score(records):
        """UDF over collect_list(): sum the cooled daily scores.

        ``records`` holds one comma-joined string per behavior row of the
        same (user_id, video_id) group:
            "is_share,is_collect,is_comment,watch_label,pt_d"
        Daily raw score lies in [1, 9]; each cooled score lies in [1, 10].

        BUGFIX: the original implementation only read records[0], silently
        dropping every other day's behavior for the pair (the group-by
        excludes pt_d, so the list has one entry per day).  We now sum the
        cooling score over all entries.
        """
        # Decay constant chosen so a 3-day-old score is damped to 1/9.
        alpha = -(math.log(1.0 / 9.0)) / (3 * 1.0)
        total = 0.0
        for rec in records:
            share, collect, comment, watch_label, pt_d = rec.split(",")
            # Daily raw score in [1, 9]: each binary action contributes 2,
            # watch_label (0..9) contributes up to 2.
            score = (int(share) * 2 + int(collect) * 2 + int(comment) * 2
                     + 2 * float(watch_label) / 9.0) + 1
            # NOTE(review): yyyymmdd integer subtraction only yields a true
            # day count within one month — OK here, training data spans
            # 20210428-20210430.  Cooling score per day lies in [1, 10].
            day_diff = float(20210430 - int(pt_d))
            total += float(1 + score * math.exp(-alpha * day_diff))
        return float(total)

    # Register with an explicit FloatType so the SQL layer sees numbers
    # (the original registration defaulted to StringType).
    spark.udf.register("compute_score", compute_score, FloatType())

    x_sql="""
    select user_id,video_id,compute_score(collect_list(concat_ws(',',is_share,is_collect,is_comment,watch_label,pt_d))) as cooling_score
    from df
    group by user_id,video_id
    """
    # One cooling score per (user_id, video_id), aggregated over all days.
    x_df=spark.sql(x_sql)
    # x_df.show(10, False)

    x_df.createOrReplaceTempView('x_df')

    # The groups are already unique per (user_id, video_id), so this sum is
    # an identity aggregation — kept only to preserve the output schema
    # (column renamed to ``score``) and the ordering.
    y_sql="""
    select user_id,video_id,sum (cooling_score) as score
    from x_df
    group by user_id,video_id
    order by user_id,video_id
    """
    y_df=spark.sql(y_sql)
    # y_df.show()
    '''
    +-------+--------+------------------+
    |user_id|video_id|             score|
    +-------+--------+------------------+
    |1000442|   11926|1.2311204247835448|
    |1000442|   47688|1.2311204247835448|
    +-------+--------+------------------+
    '''

    return y_df

第二步,计算电影之间的相似度(包含将数据转换成稀疏向量喂入MinHashLSH内,得到物品相似度矩阵)

# 调用函数
def run_main():
    ###获取数据
    train, test = get_dataframe()

    train=build_model(train)
    #test=build_model(test)
    """
    +-------+--------+------------------+
    |user_id|video_id|             score|
    +-------+--------+------------------+
    |1000442|   11926|1.2311204247835448|
    |1000442|   47688|1.2311204247835448|
    """
    train=train.rdd
    #test=test.rdd

    def takeSecond(elem):
        return elem[2]

    def getTopN(x, k):
        x.sort(key=takeSecond, reverse=True)
        x = x[:k]
        return x

    def toCSVLine(data):
        output_str = str(data[0]) + "\t"
        return output_str + ','.join([str(d[1]) for d in data[1]])

    #unionRDD = train.union(test)
    unionRDD = train#因为没有用到验证集,所以unionRDD就是train
(注:原文代码在此处被截断,后续的 MinHashLSH 相似度计算与推荐筛选部分未完整展示。)
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值