#!/usr/bin/env python
# -*- coding: utf-8 -*-
# *******************************************************************************************
# ** Description: itemcf
# ** Author: zyf
# ** Created: 2021/09/12
# *******************************************************************************************
import os
import pathlib
import sys
import math

projectdir = str(
    pathlib.Path(os.path.abspath(__file__)).parent.parent.parent.parent)
sys.path.append(projectdir)
import findspark
findspark.init()
import json
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, functions
# from pyspark.sql.functions import *
# from pyspark.sql.types import *
from utils.arg_utils import ArgsUtils
from utils.time_tools import get_today, get_train_days
from utils.config_tools import ConfigServer
from tqdm import tqdm
from utils.ots_tools import OTS_Tools
from pandas.core.frame import DataFrame
import pandas as pd
# ******************** Parameter loading ****************************
config_server = ConfigServer()
config_dict = config_server.get_secret_config()
config_dict['today'] = get_today()
config_dict['train_days'] = '31'
config_dict['test_days'] = '1'
config_dict['nitem'] = '3'
config_dict['wordlist_k_num'] = 30
config_dict['recall'] = 'label_v6_zyf_30'
config_dict['env'] = 'dev'
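# Assumed semantics, based on how these keys are used below: train_days/test_days feed
# get_train_days() to choose the behavior-data partitions, wordlist_k_num caps how many
# recommendations are stored per user, and 'recall' names the recall written into
# ads_movie_wordlist_recall_df; nitem does not appear to be referenced in this script.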
# *************************************************************
## Fetch data & preprocessing
def get_portrait_df():
    with UsingMysql(log_time=False) as um:
        sql = """
            select map_id,emotion_label_name
            from ads_dfl_video_portrait_df
            where (item_type_name = 'movie' or item_type_name = 'content')
        """
        rows = um.fetch_list_by_filter(sql)
        data = DataFrame(rows)
        return data
''' Data exploration
# Count how many times each tag is used across items
def getTagsCount(data):
    tagUserDict = dict()
    line_list = []
    for line in data['emotion_label']:
        if len(line) > 0:
            line_list = line.split(",")
            # print(line_list)
        for i in line_list:
            # print(i)
            # count how many times each tag has been applied to an item
            if i in tagUserDict.keys():
                tagUserDict[i] += 1
            else:
                tagUserDict[i] = 1
    # tagUserDict = sorted(tagUserDict.items(), key=lambda x: x[1], reverse=True)  # sort by tag count, descending
    print('****** tagUserDict:')
    print(tagUserDict)
    return tagUserDict
def data_explore(data):
    data2 = data[data['emotion_label'].str.len() == 0]
    k1 = data2.shape[0]
    k2 = data.shape[0]
    k3 = k1/k2
    print(k1)
    print(k2)
    print('Share of items with empty labels: {}'.format(k3))
def data_explore2(data):
    k = 0
    for i in range(data.shape[0]):
        if len(data['tags_name'][i]) == 0:
            k = k+1
    print('Number of empty values: {}'.format(k))
def missing_percent(data):
    nan_percent = 100*(data.isnull().sum()/len(data))
    # data.isnull().sum() counts the missing values in each column
    # dividing by len() gives the per-column missing ratio as a fraction
    # *100 turns it into a percentage
    nan_percent = nan_percent[nan_percent > 0].sort_values()
    print('nan_percent')
    print(nan_percent)
    print('nan_percent')
    # returns the missing-value percentage per column, sorted ascending
    # the > 0 filter keeps only columns that actually have missing values
    return nan_percent
'''
## Fetch data & preprocessing
def get_dataframe(day_list):
    '''
    Load the behavior data for the given days into a Spark DataFrame.
    '''
    df = None
    for day in day_list:
        tmp_df = spark.read \
            .format("org.apache.spark.aliyun.odps.datasource") \
            .option("odpsUrl", config_dict.get('odpsUrl')) \
            .option("tunnelUrl", config_dict.get('tunnelUrl')) \
            .option("table", config_dict.get('dataphin_cross_action_table')) \
            .option("project", config_dict.get('dataphin_project')) \
            .option("accessKeySecret", config_dict.get('accessKeySecret')) \
            .option("accessKeyId", config_dict.get('accessKeyId')) \
            .option("partitionSpec", "ds=" + day) \
            .load()
        df = tmp_df if df is None else df.union(tmp_df)
    df.createOrReplaceTempView('df')
    # Behavior (action) table
train_df = spark.sql("""
select user_id,map_id,bhv_type,pos_id,play_duration,replace(substring(FROM_UNIXTIME(bhv_time),0,10),"-","") as bhv_time
from df
where bhv_type!='exposure' and (item_type='content' or item_type='movie')
""")
# train_df.show()
"""
    Raw table:
+------------------+-------+---------+--------+-------------------+-------------+----------+
| user_id|item_id|item_type|bhv_type| pos_id|play_duration| bhv_time|
+------------------+-------+---------+--------+-------------------+-------------+----------+
|570330358550605910| 104| movie|exposure| 为你推荐$$2| -1|1631602033|
|570330358550605910| 9| movie|exposure| 为你推荐$$1| -1|1631602033|
|613135404484767797| 7214| movie|exposure| 轮播banner$$1| -1|1631602049|
|639224497597558818| 7985| movie|exposure| 轮播banner$$5| -1|1631602033|
|634845977937162293| 104| movie|exposure| 为你推荐$$2| -1|1631602037|
+------------------+-------+---------+--------+-------------------+-------------+----------+
tmp_df:
+------------------+--------------------+--------+------+-------------+--------+
| user_id| item_id|bhv_type|pos_id|play_duration|bhv_time|
+------------------+--------------------+--------+------+-------------+--------+
|644283194841477214| movie_3428| play| -1| 902|20210913|
|644324976857952347|movie_m_39bb6e9c-...| play| -1| 9|20210913|
|641008005299023894| shortvideo_267943| play| -1| 493|20210913|
"""
    # Data processing
train_df.createOrReplaceTempView("train_df")
    ### Compute score
    def compute_score(bhv_type, bhv_time, play_duration):
        score = 1
        if bhv_type == 'click':
            score = 1
        elif bhv_type in ('like', 'favorite', 'forward'):  # raw score range: [1:3]
            score = 5
        elif bhv_type == 'comment':
            score = 3
        elif bhv_type == 'play' and int(play_duration) <= 60:
            score = 2
        elif bhv_type == 'play' and 60 < int(play_duration) <= 600:
            score = 3
        elif bhv_type == 'play' and int(play_duration) > 600:
            score = 4
        alpha = -math.log(1.0 / 5.0) / 7.0  # alpha = -ln(min(score)/max(score)) / (max(date)-min(date))
        day_diff = float(int(day_list[-1]) - int(bhv_time))
        cooling_score = round(float(1 + score * math.exp(-alpha * day_diff)), 4)  # time-decayed ("cooled") score
        return cooling_score
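    # Illustrative check of the decay, not executed here: with alpha = ln(5)/7, a 'like'
    # (score 5) with day_diff = 7 cools to 1 + 5 * exp(-alpha * 7) = 1 + 5 * (1/5) = 2.0,
    # while the same 'like' with day_diff = 0 keeps 1 + 5 * exp(0) = 6.0.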
spark.udf.register("compute_score", compute_score)
score_df = spark.sql("""
        select user_id,map_id,compute_score(bhv_type,bhv_time,play_duration) as cooling_score
from train_df
""")
score_df.createOrReplaceTempView("score_df")
    # user-item cumulative score
sum_score_df = spark.sql("""
select user_id,map_id,round(sum(cooling_score),4) as score
from score_df
group by user_id,map_id
""")
# sum_score_df.show()
"""
+------------------+-----------------+---------+
| user_id| item_id| score|
+------------------+-----------------+---------+
|566962862275670074| movie_5711|26.654316|
|644260591896604689|shortvideo_267802| 3.386723|
|592763115331366953|shortvideo_267803| 1.693361|
|563094176263549003| movie_7194| 6.773445|
|638887111374123051| movie_6266|20.320335|
"""
sum_score_df.createOrReplaceTempView("sum_score_df")
    # Cap the score based on its observed distribution (clip at 100)
sum_score_df2 = spark.sql("""
select user_id,map_id,(case when score>100 then 100 else score end) as score
from sum_score_df
""")
sum_score_df2.show(10)
'''
+------------------+------+-----+
| user_id|map_id|score|
+------------------+------+-----+
|653729358015868937| 58637|111.0|
|639394095877828653|200144| 70.0|
|620723639054479437|117398| 3.0|
|593180045921988699|146266| 62.0|
|604821613770682439|219806| 11.0|
|653194075591651337|207707| 11.0|
|652874328022954013| 74236| 11.0|
|653612868163842140|164081| 12.0|
|653345499818471433|315063| 24.0|
|639172479914848335| 73647| 12.0|
|650409670359887949|147351| 12.0|
|558990665330438170|123653| 2.0|
|561919984893669386|229934| 1.0|
|630768821523165275|320646| 2.0|
|653631725008039967|213237| 12.0|
|653247613499846692| 58950| 11.0|
|571817245001887823|213502| 21.0|
|653681559714902070|230890| 12.0|
| 885280|156834| 21.0|
|626406779018260489| 82507| 88.0|
+------------------+------+-----+
'''
return sum_score_df2
def getUserTagNum(sum_score_df2, data):
user_map_score = sum_score_df2.toPandas()
user_map_score['map_id'] = user_map_score['map_id'].astype(int)
    df = pd.merge(user_map_score, data, on='map_id')  # scores and emotion labels for the items each user has interacted with
'''
user_id map_id score emotion_label
0 653729358015868937 58637 23.2787 名侦探柯南,异次元的狙击手
1 652926852776046634 58637 2.0050 名侦探柯南,异次元的狙击手
2 652813330373128221 58637 3.0276 名侦探柯南,异次元的狙击手
3 649688320297119803 58637 25.2888 名侦探柯南,异次元的狙击手
4 652232959969439795 58637 4.0301 名侦探柯南,异次元的狙击手
... ... ... ... ...
1875405 581753979378577490 247331 8.0000 生活
1875406 182750 260651 8.0000
1875407 627947765628383281 218244 8.0000
1875408 578368978175897635 124396 2.0000 历史,古代,世界,帝国,文明,空中课堂
'''
    # Count how many times each user has encountered each tag
    userTagDict = {}
    u_t_rate = {}  # total score each user has accumulated per tag
    user_map_dict = {}  # which items each user has watched
for index, row in df.iterrows():
user_id = row["user_id"]
map = row["map_id"]
emotion_label = row["emotion_label_name"]
user_map_dict.setdefault(user_id, set())
user_map_dict[user_id].add(map)
if len(emotion_label) > 0:
emotion_list = emotion_label.split(",")
for i in emotion_list:
userTagDict.setdefault(user_id, {})
if i in userTagDict[user_id].keys():
userTagDict[user_id][i] += 1
else:
userTagDict[user_id][i] = 1
u_t_rate.setdefault(user_id, {})
if i in u_t_rate[user_id].keys():
u_t_rate[user_id][i] += int(row["score"])
else:
u_t_rate[user_id][i] = int(row["score"])
    # tagUserDict = sorted(tagUserDict.items(), key=lambda x: x[1], reverse=True)  # sort by tag count, descending
    # Number of distinct tags among the items each user has watched
userTagCount = {}
for i in userTagDict:
userTagCount[i] = len(userTagDict[i])
    # Total count of tag t over all users
TagUserCount = {}
for user in userTagDict:
for t in userTagDict[user]:
TagUserCount.setdefault(t, 0)
TagUserCount[t] += userTagDict[user][t]
    # u_t_rate: user u's preference for tag t
# u_t_rate = {}
# for index, row in df.iterrows():
# user_id = int(row["user_id"])
# emotion_label = row["emotion_label"]
# if len(emotion_label) > 0:
# emotion_list = emotion_label.split(",")
# for i in emotion_list:
# u_t_rate.setdefault(user_id, {})
# if i in u_t_rate[user_id].keys():
# u_t_rate[user_id][i] += int(row["score"])
# else:
# u_t_rate[user_id][i] = int(row["score"])
print('*************************')
# print(u_t_rate)
    u_t_rate2 = {}  # user's average score per tag
for user in u_t_rate:
u_t_rate2.setdefault(user, {})
for t in u_t_rate[user]:
m1 = u_t_rate[user][t]
m2 = userTagDict[user][t]
u_t_rate2[user][t] = round(m1 / m2,4)
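    # Worked example (hypothetical numbers): if a user accumulated a total score of 12
    # over 3 watched items carrying tag "悬疑", then u_t_rate2[user]["悬疑"] = 12 / 3 = 4.0,
    # i.e. the user's average score over items with that tag.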
print('************************')
# print(u_t_rate2)
    # Total number of tag occurrences per user
u_tagCount = {}
for i in userTagDict:
u_tagCount[i] = sum(userTagDict[i].values())
    # u_tagCount = sorted(u_tagCount.items(), key=lambda x: x[1], reverse=True)  # sort by tag count, descending
print('***********u_tagCount*************')
# print(u_tagCount)
    # TF(u, t): how strongly user u depends on tag t
tf_ut = {}
for user in userTagDict:
tf_ut.setdefault(user, {})
for i in userTagDict[user]:
k1 = userTagDict[user][i]
k2 = u_tagCount[user]
tf_ut[user][i] = round(k1 / k2,4)
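    # Worked example (hypothetical numbers): if a user's watched items carry 20 tag
    # occurrences in total and 5 of them are "喜剧", then tf_ut[user]["喜剧"] = 5 / 20 = 0.25.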
print("*******tf_ut是")
# print(tf_ut)
    # IDF(u, t) measures how popular tag t is, i.e. the probability that a tag is used by different users.
    # Numerator: total tag count over all users and all tags; denominator: total count of tag t over all users (+1 smoothing).
IDF_ut = {}
for i in TagUserCount:
IDF_ut[i] = round(math.log(sum(u_tagCount.values()) / (TagUserCount[i] + 1)),4)
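    # Worked example (hypothetical numbers): if all users together produce 10000 tag
    # occurrences and tag t accounts for 499 of them, IDF_ut[t] = ln(10000 / (499 + 1)) ≈ 2.9957,
    # so rarer tags get larger IDF weights.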
print("##### IDF_ut#######")
# print(IDF_ut)
    # Total number of times each tag has been applied
    # TF-IDF: the user's dependence on each tag
TF_IDF = {}
for user in tf_ut:
TF_IDF.setdefault(user, {})
for t in tf_ut[user]:
TF_IDF[user][t] = round(tf_ut[user][t] * IDF_ut[t], 4)
print("**********TF_IDF")
# print(TF_IDF)
    # u_t_rate2 captures the user's subjective preference for a tag and TF_IDF captures the user's dependence on it; combining them gives the user's interest in the tag, Pre_ut
Pre_ut = {}
for user in u_t_rate2:
Pre_ut.setdefault(user, {})
for t in u_t_rate2[user]:
Pre_ut[user][t] = round(u_t_rate2[user][t] * TF_IDF[user][t], 4)
Pre_ut[user] = dict(sorted(Pre_ut[user].items(), key=lambda x: x[1], reverse=True)[0:10])
print("$$$$$$$$$$$$$")
# print(Pre_ut)
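    # Putting it together: Pre_ut[u][t] = u_t_rate2[u][t] * TF_IDF[u][t]
    #                                   = u_t_rate2[u][t] * tf_ut[u][t] * IDF_ut[t],
    # truncated to each user's top-10 tags. With hypothetical values of 4.0 (average score),
    # 0.25 (TF) and 2.9957 (IDF), the interest score is 4.0 * 0.25 * 2.9957 ≈ 2.9957.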
return Pre_ut, user_map_dict
# Recommendation
def user_rec(data, Pre_ut, user_map_dict, nitems):
    portrait_dict = {}  # predicted preference of each user for each item
    map_label_dict = {}  # which tags each item carries
for index2, row2 in data.iterrows():
map2 = row2['map_id']
label = row2['emotion_label_name']
if len(label) > 0:
label_list = label.split(",")
map_label_dict[map2] = label_list
print("***********map_label_dict********")
# print(map_label_dict)
for user in tqdm(Pre_ut):
portrait_dict.setdefault(user, {})
for map in map_label_dict:
            if map not in user_map_dict[user]:  # only score items the user has not watched yet
#portrait_dict[user].setdefault(map, 0)
                emo_list = map_label_dict[map]  # tags of this item
k = 0
for i in emo_list:
                    if i in Pre_ut[user].keys():  # if the user has an interest score for this tag
                        k += Pre_ut[user][i]
                portrait_dict[user][map] = round(k, 4)  # the user's score for this item, accumulated over its tags
# sorted(portrait_dict[user].items(), key=lambda dict: dict[1], reverse=True)
portrait_dict[user] = dict(sorted(portrait_dict[user].items(), key=lambda x: x[1], reverse=True)[0:nitems])
print("***********portrait_dict********")
#print(portrait_dict)
return portrait_dict
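# Sketch of what user_rec computes, on hypothetical data: an unwatched item tagged
# ["悬疑", "推理"] for a user whose Pre_ut contains {"悬疑": 2.99, "喜剧": 1.2} gets
# score 2.99 (only tags present in the user's top interests contribute); each user then
# keeps their nitems highest-scoring unwatched items.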
''' This variant does the recommendation in a single pass over all items, but took 4603s for just 50 behavior records
for user in Pre_ut:
    portrait_dict.setdefault(user, {})
    for map in data['map_id']:
        portrait_dict[user].setdefault(map, 0)
        emo_list = data.loc[data['map_id'] == map, 'emotion_label'].iloc[0]  # tags of this item
        if len(emo_list) > 0:  # if the item has tags
            emo_list = emo_list.split(",")  # turn the tag string into a list
            k = 0
            for i in emo_list:
                if i in Pre_ut[user].keys():  # if the user has an interest score for this tag
                    k += Pre_ut[user][i]
            portrait_dict[user][map] = k  # the user's score for this item, accumulated over its tags
    print("&&&&&&&&&&&&&&&&&& portrait_dict")
    print(portrait_dict)'''
def save_data(portrait_dict):
    # Write the per-user recommendation dict into table ads_movie_wordlist_recall_df
    # TODO: also save to OTS
    mode = '1'
    print('Start saving data')
print("*************")
print(config_dict['recall'])
with UsingMysql(log_time=False) as sql_target:
for id, value in tqdm(portrait_dict.items()):
value = {k: v for k, v in
sorted(value.items(), key=lambda d: d[1], reverse=True)[:int(config_dict['wordlist_k_num'])]}
id_sql = "select id from ads_movie_wordlist_recall_df where id =%s and mode=%s and recall=%s"
id_params = (id, mode, config_dict['recall'])
tmp_id_target = sql_target.fetch_one(id_sql, id_params)
if not tmp_id_target:
insert_sql = "insert into ads_movie_wordlist_recall_df (id,mode,recall,sim) values(%s,%s,%s,%s)"
insert_params = (id, mode, config_dict['recall'], json.dumps(value))
sql_target.cursor.execute(insert_sql, insert_params)
else:
update_sql = "update ads_movie_wordlist_recall_df set sim='{sim}' where id='{id}' and mode ='{mode}' and recall='{recall}'".format(
id=id, mode=mode, recall=config_dict['recall'], sim=json.dumps(value))
sql_target.update_by_pk(update_sql)
    print('Data saved')
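    # Assumed row shape, inferred from the upsert above: one row per user id and (mode, recall),
    # with sim stored as a JSON dict of map_id -> interest score truncated to the top
    # wordlist_k_num entries, e.g. {"58637": 23.2787, "315063": 11.03, ...}.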
if __name__ == '__main__':
    if len(sys.argv) > 1:
ArgsUtils().deal_args(sys.argv, config_dict)
if config_dict['env'] == 'prod':
from utils.pymysql_tools import UsingMysql
else:
from utils.pymysql_tools_ceshi import UsingMysql
config_dict['ots_movie_wordlist_recall'] = 'ceshi_' + config_dict['ots_movie_wordlist_recall']
train_day_list = get_train_days(config_dict.get('today'), config_dict.get('test_days'),
config_dict.get('train_days'))
# print(train_day_list)
conf = SparkConf().setMaster("local[4]").setAppName("train_itemcf_model")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark.conf.set("spark.sql.crossJoin.enabled", "true")
spark.conf.set("spark.sql.shuffle.partitions", "4")
spark.sparkContext.setLogLevel("ERROR")
sc = spark.sparkContext
print(config_dict)
    ### 1. Fetch data
portrait_df = get_portrait_df()
# getTagsCount(portrait_df)
# missing_percent(portrait_df)
# show_detail(portrait_df)
# data_explore(portrait_df)
# data_explore2(portrait_df)
train_df = get_dataframe(train_day_list)
Pre_ut, user_map_dict = getUserTagNum(train_df, portrait_df)
portrait_dict = user_rec(portrait_df, Pre_ut, user_map_dict, 30)
save_data(portrait_dict)
spark.stop()
# Reference: 推荐系统(3)-基于标签的推荐系统 - 知乎 ("Recommender Systems (3): Tag-based Recommendation", Zhihu)