# Compute each user's score for every news article.
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from os import environ

import pyspark.sql.functions as f
from pyspark.sql.functions import col

spark = SparkSession.builder.appName('test').enableHiveSupport().getOrCreate()
sc = spark.sparkContext

# Per-user, per-article score from view / share / like behavior.
# NOTE(review): the original selected `custNo` while every downstream step
# groups and joins on `cust_no` — aliased here so the names agree.
# `dt` is also selected because the time-decay step later reads it (as dt1/dt2).
# The predicate value '90天之前' ("90 days ago") is a placeholder — substitute
# the real cutoff date before running.
df = spark.sql(
    "select custNo as cust_no, item_id, dt, "
    "(view*1 + share*30 + like*50) as score "
    "from table where dt>'90天之前'"
)

# Number of distinct articles each user viewed.
user_df = df.groupby('cust_no').agg(f.countDistinct('item_id').alias('view_news'))

# Down-weight heavy readers: per-article weight = score / articles viewed.
res_df = df.join(user_df, on='cust_no', how='left').withColumn(
    'score', col('score') / col('view_news')
)
res_df.show()
# Fetch basic article attributes (one title per item_id).
# NOTE(review): dt='day' looks like a placeholder for the current partition
# date — confirm and substitute the real value.
new_df = spark.sql("select item_id,max(title) as title from table where dt='day' group by item_id")
# Attach the title onto the per-user score detail rows.
res_df = res_df.join(new_df, on='item_id', how='left')
# Per-article aggregates: distinct viewer count and sum of squared scores,
# keeping only articles viewed by more than 5 distinct users.
df_count = res_df.groupby('item_id').agg(
    f.countDistinct('cust_no').alias('custs'),
    f.sum(f.pow(col("score"), 2)).alias('sum_pow_score'),
).filter('custs>5')
# Inner join drops articles that failed the >5-viewers threshold.
res_df = res_df.join(df_count, on='item_id', how='inner')
# Date-handling helpers.
import datetime
from pyspark.sql import types as T


@f.udf(T.DoubleType())
def diff_days(t1, t2):
    """Absolute number of days between two date strings prefixed 'YYYYMMDD'.

    Only the first 8 characters of each argument are parsed. Returns a
    float: the UDF is declared DoubleType, and a DoubleType UDF that
    returns a Python int produces NULL in Spark (the original returned
    the int from `.days` directly — that was a bug).
    """
    d1 = datetime.datetime.strptime(t1[:8], '%Y%m%d')
    d2 = datetime.datetime.strptime(t2[:8], '%Y%m%d')
    return float(abs((d1 - d2).days))
import numpy as np


@f.udf(T.DoubleType())
def decay(x):
    """Exponential time decay: 0.9 ** (x / 2) for a gap of `x` days.

    Co-views that happened far apart in time contribute less to article
    relatedness. Cast to float so the DoubleType UDF does not return a
    numpy scalar. (The original body was unindented — a SyntaxError.)
    """
    return float(np.power(0.9, (x / 2)))
# Article relatedness: self-join the per-user score details so that every
# pair of articles viewed by the same user appears as one row.


def _suffix_columns(frame, suffix):
    """Return `frame` with the shared columns renamed with a side suffix.

    The self-join needs unambiguous names on each side. Fix: the DataFrame
    API method is `withColumnRenamed` — the original called the nonexistent
    `withColumnRename`, which raises AttributeError.
    """
    for c in ('item_id', 'cust_no', 'title', 'sum_pow_score', 'custs', 'score', 'dt'):
        frame = frame.withColumnRenamed(c, c + suffix)
    return frame


user_df_1 = _suffix_columns(res_df, '1')
user_df_2 = _suffix_columns(res_df, '2')

# Pair up articles viewed by the same user, excluding self-pairs.
test_df = user_df_1.join(
    user_df_2, on=(user_df_1.cust_no1 == user_df_2.cust_no2), how='inner'
).filter('item_id1 != item_id2')

# Relatedness is discounted by how far apart in time the two views were.
test_df = test_df.withColumn(
    'diff_days', diff_days(col('dt1'), col('dt2'))
).withColumn(
    'decay_coef', decay(col('diff_days'))
)
# Relatedness score for each article pair.
# Fix: `f.count(*)` is a Python SyntaxError — count rows with f.count('*').
test_df = test_df.groupby('item_id1', 'item_id2', 'title1', 'title2').agg(
    f.count('*').alias('custs'),
    f.max(col('custs1')).alias('custs1'),
    f.max(col('custs2')).alias('custs2'),
    f.sum(col('score1') * col('score2') * col('decay_coef')).alias('sum_score'),
    f.max(col('sum_pow_score1')).alias('sum_pow_score1'),
    f.max(col('sum_pow_score2')).alias('sum_pow_score2'),
)

# Similarity, normalized two ways:
#   sim  — co-visitation count normalization (sqrt of viewer counts)
#   sim1 — cosine-style normalization by the score-vector norms
# then keep the top-100 most similar articles per item_id1.
# Fix: the original left `.withColumn('rank_k', ...)` dangling on its own
# line after the previous statement ended — a SyntaxError; the whole chain
# is now one parenthesized expression.
from pyspark.sql.window import Window

test_df = (
    test_df
    .withColumn('sim', col('sum_score') / f.sqrt(col('custs1') * col('custs2')))
    .withColumn('sim1', col('sum_score') / f.sqrt(col('sum_pow_score1') * col('sum_pow_score2')))
    .filter('sim>0')
    .withColumn('rank_k', f.row_number().over(Window.partitionBy('item_id1').orderBy(f.desc('sim'))))
    .filter('rank_k<=100')
)