After working through it, my first impression is that it really is more concise than the equivalent plain Python code.
Let's start with the code.
Step 1: process the raw data with Spark SQL, including computing the cooling (time-decay) score (this is not today's focus, so I won't go into detail on it later).
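(For reference, since the post doesn't dwell on it, here is the scoring rule reconstructed from the code below. Each behavior record gets a daily score

$$\text{score} = 2(\text{is\_share} + \text{is\_collect} + \text{is\_comment}) + \tfrac{2}{9}\,\text{watch\_label} + 1 \in [1, 9],$$

which is then cooled by an exponential decay over the number of days $\Delta d$ elapsed before 20210430:

$$\text{cooling\_score} = 1 + \text{score}\cdot e^{-\alpha\,\Delta d}, \qquad \alpha = \tfrac{\ln 9}{3},$$

so a day's weight drops to $1/9$ of itself after 3 days, and the cooled score lies in $(1, 10]$.)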
import findspark
findspark.init()

import os
import sys
import math
import json
import time
import argparse
import pathlib
from collections import defaultdict

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as psf
from pyspark.sql.types import (FloatType, IntegerType, StringType,
                               StructField, StructType)
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors

# make the project root importable
projectdir = str(
    pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
sys.path.append(projectdir)

# the functions below rely on a global SparkSession (the app name is arbitrary)
spark = SparkSession.builder.appName("minhash_item_similarity").getOrCreate()
def get_dataframe():
    # read the tab-separated csv files into a DataFrame
    # Spark -- reading/writing a DataFrame with a custom separator: https://blog.csdn.net/Aeve_imp/article/details/107520678
    df = spark.read.format('csv').option('sep', '\t').option('header', True).load(
        r"file:///home/admin/pyspark_script3/data/4days/history_behavior_data/*.csv")
    # df.show(2, False)
    """
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    |user_id|video_id|is_watch|is_share|is_collect|is_comment|watch_start_time|watch_label|pt_d    |
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    |2897092|41270   |0       |0       |0         |0         |null            |0          |20210428|
    |4849237|33564   |0       |0       |0         |0         |null            |0          |20210428|
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    """
    # split into training and test sets by partition date
    df.createOrReplaceTempView('df')
    train_sql = """
        select *
        from df
        where pt_d <= '20210430'
        limit 50
    """
    test_sql = """
        select *
        from df
        where pt_d = '20210501'
    """
    train_df = spark.sql(train_sql)  # note: limit 50 caps the training sample
    test_df = spark.sql(test_sql)
    # test_df.show(2, False)
    """
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    |user_id|video_id|is_watch|is_share|is_collect|is_comment|watch_start_time|watch_label|pt_d    |
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    |4256452|30150   |0       |0       |0         |0         |null            |0          |20210501|
    |4256452|10499   |0       |0       |0         |0         |null            |0          |20210501|
    +-------+--------+--------+--------+----------+----------+----------------+-----------+--------+
    """
    return train_df, test_df
def build_model(df):
    df.createOrReplaceTempView("df")

    def compute_score(records):
        """
        :param records: list of strings "is_share,is_collect,is_comment,watch_label,pt_d",
                        collected for one (user_id, video_id, pt_d) group
        :return: sum of the time-decayed scores of the records in the list
        """
        # decay rate chosen so a score falls to 1/9 of its weight after 3 days
        alpha = -(math.log(1.0 / 9.0)) / (3 * 1.0)
        total = 0.0
        for record in records:
            # assumes all five fields are non-null (concat_ws silently drops nulls)
            line = record.split(",")
            is_share = int(line[0])
            is_collect = int(line[1])
            is_comment = int(line[2])
            watch_label = float(line[3])
            pt_d = int(line[4])
            # daily score, range [1, 9]
            score = (is_share * 2 + is_collect * 2 + is_comment * 2 + 2 * watch_label / 9.0) + 1
            # days before the last training day (integer-date arithmetic is fine within one month)
            day_diff = float(20210430 - pt_d)
            # cooled score, range (1, 10]
            total += 1 + score * math.exp(-alpha * day_diff)
        return float(total)

    spark.udf.register("compute_score", compute_score, FloatType())
    x_sql = """
        select user_id, video_id, pt_d,
               compute_score(collect_list(concat_ws(',', is_share, is_collect, is_comment, watch_label, pt_d))) as cooling_score
        from df
        group by user_id, video_id, pt_d
    """
    x_df = spark.sql(x_sql)  # x_df: one time-decayed score per (user_id, video_id, pt_d)
    # x_df.show(10, False)
    """
    +-------+--------+--------+-------------+
    |user_id|video_id|pt_d    |cooling_score|
    +-------+--------+--------+-------------+
    |100    |15077   |20210429|0.5          |
    |100    |26061   |20210430|1.0          |
    |100    |32054   |20210430|1.0          |
    |100    |41270   |20210429|0.5          |
    |100    |45295   |20210428|0.25         |
    |100000 |46900   |20210429|0.5          |
    |1000001|12968   |20210428|0.25         |
    |1000003|2946    |20210430|1.0          |
    |1000004|29808   |20210430|1.0          |
    |1000006|25416   |20210428|0.25         |
    +-------+--------+--------+-------------+
    """
    x_df.createOrReplaceTempView('x_df')
    y_sql = """
        select user_id, video_id, sum(cooling_score) as score
        from x_df
        group by user_id, video_id
        order by user_id, video_id
    """
    y_df = spark.sql(y_sql)  # y_df: final score per (user_id, video_id), summed over days
    # y_df.show()
    '''
    +-------+--------+------------------+
    |user_id|video_id|             score|
    +-------+--------+------------------+
    |1000442|   11926|1.2311204247835448|
    |1000442|   47688|1.2311204247835448|
    |1004457|   13497|1.2311204247835448|
    |1004457|   20202|1.2311204247835448|
    |1004457|   23551|1.2311204247835448|
    '''
    return y_df
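To make the decay concrete, here is a quick sanity check (my own, with made-up behavior records) of what compute_score returns if you lift it out of build_model and call it as a plain Python function:

compute_score(["0,0,0,0,20210430"])    # 1 + 1 * e^0        = 2.0
compute_score(["0,0,0,0,20210428"])    # 1 + 1 * 9**(-2/3)  ≈ 1.2311, the value seen in the y_df sample above
compute_score(["1,0,0,5.0,20210430"])  # 1 + (2 + 10/9 + 1) ≈ 5.1111

An all-zero record contributes its baseline score of 1, decayed by two days of cooling, which is exactly where the repeated 1.2311204247835448 in the sample output comes from.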
Step 2: compute the similarity between videos (convert the data into sparse vectors, feed them into MinHashLSH, and obtain the item-similarity matrix).
# driver function
def run_main():
    ### load the data
    train, test = get_dataframe()
    train = build_model(train)
    # test = build_model(test)
    """
    +-------+--------+------------------+
    |user_id|video_id|             score|
    +-------+--------+------------------+
    |1000442|   11926|1.2311204247835448|
    |1000442|   47688|1.2311204247835448|
    """
    train = train.rdd
    # test = test.rdd

    def takeSecond(elem):
        # sort key: the score field of a (user_id, video_id, score) tuple
        return elem[2]

    def getTopN(x, k):
        # sort by score descending and keep the top k entries
        x.sort(key=takeSecond, reverse=True)
        x = x[:k]
        return x

    def toCSVLine(data):
        # format (key, list_of_pairs) as "key\tsecond_field,second_field,..."
        output_str = str(data[0]) + "\t"
        return output_str + ','.join([str(d[1]) for d in data[1]])

    # unionRDD = train.union(test)
    unionRDD = train  # the validation set is unused, so unionRDD is just train
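The original listing is cut off at this point. To round out Step 2, here is a minimal sketch of the part the heading describes, picking up from unionRDD. This is my own reconstruction, not the author's code: the helper to_sparse_vector, the numHashTables=5 setting, and the 0.9 distance threshold are all assumptions. Each video becomes a sparse binary vector over the users who interacted with it, and approxSimilarityJoin returns approximate Jaccard distances between video pairs.

from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors

def to_sparse_vector(video_id, user_ids, num_users):
    # SparseVector requires sorted, de-duplicated indices
    indices = sorted(set(user_ids))
    return (video_id, Vectors.sparse(num_users, indices, [1.0] * len(indices)))

# unionRDD holds Row(user_id, video_id, score); size the vectors by the max user id
num_users = unionRDD.map(lambda r: int(r[0])).max() + 1
item_rdd = (unionRDD
            .map(lambda r: (int(r[1]), [int(r[0])]))   # (video_id, [user_id])
            .reduceByKey(lambda a, b: a + b)            # gather each video's users
            .map(lambda kv: to_sparse_vector(kv[0], kv[1], num_users)))
item_df = spark.createDataFrame(item_rdd, ["video_id", "features"])

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(item_df)
# all video pairs with approximate Jaccard distance below 0.9 (similarity above 0.1)
sim_df = (model.approxSimilarityJoin(item_df, item_df, 0.9, distCol="jaccard_dist")
               .filter("datasetA.video_id < datasetB.video_id"))
sim_df.show(5, False)

Note that MinHash operates on sets, so this step only uses which users interacted with each video; the graded cooling scores from build_model are not consumed by the LSH itself.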