# User news-viewing behavior score (用户访问资讯行为得分) — section heading; was bare text, which is a Python SyntaxError.
# Fetch the news item pool: ids of items whose pool_code is 'news' for the given dt.
# Fix: dropped the `f` string prefix — the query has no placeholders (lint F541).
kyp_df = spark.sql("select item_id from table where dt='20221211' and pool_code in ('news')")
# Fix: the module is pyspark.sql.functions (plural); the singular
# "pyspark.sql.function" does not exist and raises ModuleNotFoundError.
import datetime

import numpy as np
import pyspark.sql.functions as f
from pyspark.sql import types as T
# Date helper: step back a number of days from a YYYYMMDD date string.
def get_date(date_str, days_ago):
    """Return the YYYYMMDD date that is `days_ago` days before `date_str`."""
    anchor = datetime.datetime.strptime(date_str, '%Y%m%d')
    shifted = anchor - datetime.timedelta(days=days_ago)
    return shifted.strftime('%Y%m%d')
# Days elapsed between a YYYYMMDD-prefixed date string and "now".
# Fix: the UDF is declared DoubleType but the original returned a Python int;
# when a UDF's return value does not match its declared type, Spark emits
# NULLs, so the result is coerced to float.
@f.udf(T.DoubleType())
def diff_days_from_now(t2):
    """Return abs number of days between now and t2[:8] (YYYYMMDD) as a float.

    NOTE(review): uses datetime.now(), so output drifts across runs; consider
    passing the batch date in explicitly for reproducible scoring.
    """
    now = datetime.datetime.now()
    then = datetime.datetime.strptime(t2[:8], '%Y%m%d')
    return float(abs((now - then).days))
# Decay weight: 0.91**n for an action n days old.
# Fix: this function is applied to a Column in the score computation below,
# but a plain Python function cannot operate on a Column (np.power on a
# Column raises). Registering it as a Spark UDF makes that call valid.
@f.udf(T.DoubleType())
def action_decay(n):
    """Return the exponential decay factor 0.91**n (n = days since the action)."""
    return float(np.power(0.91, n))
# Fetch user news-interaction rows; score weights: view=1, share=30, like=50.
# Fix: the select list was missing a comma between item_id and the score
# expression (invalid SQL); also dropped the needless `f` prefix.
user_df = spark.sql("select cust_no,item_id,(view*1+share*30+like*50) as score,dt from table where dt>'20221120'")
# Score calculation: the older the interaction date, the more the weight
# decays; total score is the sum of decayed per-row scores.
from pyspark.sql.functions import col
# Fix: the decayed total `y` was computed and then discarded — the final
# select took the raw score sum, so the decay had no effect. Alias `y` back
# to `score` so the downstream column name is unchanged.
df1 = (
    user_df
    .withColumn('diff_action_date_from_now', diff_days_from_now(col('dt')))
    .withColumn('action_decay', action_decay(col('diff_action_date_from_now')))
    .withColumn('y', col('score') * col('action_decay'))
    .groupby('cust_no', 'item_id')
    .agg(f.sum('score').alias('score'), f.sum('y').alias('y'))
)
user_action_df = df1.select('cust_no', 'item_id', col('y').alias('score'))
user_action_df.show()
# News item similarity (资讯相似度) — section heading; was bare text, which is a Python SyntaxError.
# Keep the 30 most similar items for every item.
from pyspark.sql.window import Window
# Rank similar items within each item_id by descending similarity.
sim_window = Window.partitionBy("item_id").orderBy(f.desc("sim"))
it_df = (
    spark.sql(f"select item_id,sim_item_id,sim from table")
    .withColumn('rank_k', f.row_number().over(sim_window))
    .filter('rank_k<=30')
)
# Attach each user's scored items to their similar-item candidates.
predict_df1 = user_action_df.join(it_df, on='item_id', how='inner')
# Per-user score of each related item = sum(read-item score * similarity),
# restricted to candidates present in the news pool (kyp_df).
pool_match = predict_df1.sim_item_id == kyp_df.item_id
predict_df2 = (
    predict_df1
    .join(kyp_df, on=pool_match, how='inner')
    .groupby('cust_no', 'sim_item_id')
    .agg(f.sum(col('sim') * col('score')).alias('score'))
)
# Rank the recommended items per user by score.
# Fixes: the original wrote `'rank_k'.f.row_number()` — a `.` where a `,`
# belongs (SyntaxError) — and called withColumnRename instead of the real
# API method withColumnRenamed.
predict_df3 = (
    predict_df2
    .withColumn('rank_k', f.row_number().over(Window.partitionBy("cust_no").orderBy(f.desc("score"))))
    .withColumnRenamed('sim_item_id', 'item_id')
)