from pyspark.sql import SparkSession, functions as F
import heapq
from pyspark.sql.types import StructType, StructField, LongType,FloatType, ArrayType, IntegerType
# Build a Spark session and load the diary click log from CSV.
spark = SparkSession.builder.appName("search_statistics").getOrCreate()
log = spark.read.csv("日记点击日志.csv", header=True)
print("现有的dataframe结构")
log.printSchema()
log.show(3)
# CSV columns arrive as strings; cast the three id columns to integers.
for _col in ("uid", "target_id", "device_id"):
    log = log.withColumn(_col, log[_col].cast(IntegerType()))
print("格式转换后的dataframe")
log.printSchema()
print("去重前size %d" % log.count())
# Rename target_id -> group_id and drop duplicate (uid, group_id, device_id) rows.
log = log.select("uid", log["target_id"].alias("group_id"), "device_id").distinct()
print("去重后size %d" % log.count())
log.show()
# Load the device_id blacklist of cheating users and cast the id to int.
bad_device = spark.read.csv("作弊用户device_id.csv", header=True)
bad_device = bad_device.withColumn("device_id", bad_device["device_id"].cast(IntegerType()))
print("作用用户size %d" % bad_device.count())
# left_anti join removes every row whose device_id is on the blacklist;
# anonymous rows (uid == 0) are removed as well.
df = log.join(bad_device, "device_id", "left_anti").where(F.col("uid") != 0)
print("log大小 %d df大小 %d" % (log.count(), df.count()))
# NOTE(review): an earlier variant kept uid == 0 rows by substituting -device_id as the uid.
# Collect each diary's viewers into a set, then keep only diaries with more than
# 5 distinct viewers (too few viewers makes the similarity estimate unreliable).
df = df.groupBy("group_id").agg(F.collect_set("uid").alias("uids"))
df = df.filter(F.size("uids") > 5)
df.show()
# Bring (group_id, uids) to the driver as pandas for local similarity computation.
pdf = df.select("group_id", "uids").toPandas()
group_ids = [int(i) for i in pdf["group_id"].tolist()]
uid_sets = pdf["uids"].apply(set)
# Return type of the recommendation UDF: an array of (group_id, score) structs.
TOP_N_TYPE = StructType(
    [
        StructField("group_id", LongType()),
        StructField("score", FloatType()),
    ]
)
def jaccard_similarity(a, b):
    """Return the Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.

    :param a: {set} first set of ids
    :param b: {set} second set of ids
    :return: {float} similarity in [0, 1]; 0.0 when both sets are empty
        (avoids 0/0). Always a float so the Spark FloatType UDF field is
        populated correctly.
    """
    union = a | b
    if not union:  # both sets empty -> define similarity as 0 instead of 0/0
        return 0.0
    return len(a & b) / len(union)
def item_recommend(group_ids, uid_sets, n_rec, group_id, uids):
    """Recommend the top-N most related diaries for one diary.

    :param group_ids: {list} every diary's group_id, aligned element-wise
        with uid_sets
    :param uid_sets: {pandas.core.series.Series} one viewer-uid set per
        diary, in the same order as group_ids
    :param n_rec: {int} number of most-similar diaries to keep
    :param group_id: {int} the current diary's id, excluded from the result
    :param uids: {list} viewer uids of the current diary
    :return: {list} [(group_id, jaccard_score), ...] sorted by score desc
    """
    current = set(uids)  # membership/intersection needs a set, not a list
    # Jaccard similarity between every diary's viewer set and the current one.
    scores = uid_sets.apply(lambda other: jaccard_similarity(other, current)).tolist()
    # Lazily pair ids with scores, skipping the diary being recommended for.
    candidates = ((gid, score) for gid, score in zip(group_ids, scores) if gid != group_id)
    # Heap-based partial sort: keep only the n_rec best candidates.
    return heapq.nlargest(n_rec, candidates, key=lambda pair: pair[1])
n_rec = 100  # keep the 100 most similar diaries for each diary
# Register item_recommend as a Spark UDF. The lambda's parameters are the UDF's
# column inputs; group_ids, uid_sets and n_rec are captured from the driver.
jaccard_item_rec = F.udf(lambda group_id, uids: item_recommend(group_ids, uid_sets, n_rec, group_id, uids), ArrayType(TOP_N_TYPE))
df_sim = df.select('group_id', jaccard_item_rec('group_id', 'uids').alias('recs'))
df_sim.show()
# Inputs to this job: one item view-log table and one cheating-user device_id table.
# (The line above was previously bare prose in code position, which made the
# whole script a SyntaxError; it is now a comment.)