# Locate the local Spark installation and make pyspark importable
# before any pyspark import is attempted.
import findspark
findspark.init()
##############################################
from pyspark.sql import SparkSession
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import from_unixtime, to_timestamp

# Build (or reuse) a SparkSession running locally on all available cores.
# (Removed the stray trailing semicolon — not idiomatic Python.)
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("PySpark ETL") \
    .getOrCreate()
#############################################
# Load the MovieLens ratings file; header=True takes column names from the
# first row, so every column starts out as a string. cache() keeps the frame
# in memory because it is reused below.
df = spark.read.csv('./ml-small/ratings.csv',header=True).cache()
# Use cast to convert the rating column from string to double.
df = df.withColumn("rating", df.rating.cast("double"))
# Add a new "date" column: the timestamp column holds Unix epoch seconds,
# so cast it to bigint and format it as yyyy-MM-dd, then cast to a true date.
df = df.withColumn("date",from_unixtime(df.timestamp.cast("bigint"), 'yyyy-MM-dd'))
df = df.withColumn("date",df.date.cast("date"))
# Drop the raw timestamp column now that "date" replaces it.
df = df.drop("timestamp")
# Load the movies file. NOTE(review): only movieId and title are used below;
# any further columns are assumed from the standard MovieLens layout.
df2 = spark.read.csv('./ml-small/movies.csv',header=True).cache()
# Inner-join ratings with movies on movieId. Both frames carry a movieId
# column, so df.movieId in the select disambiguates which copy to keep.
df3 = df.join(df2, df.movieId == df2.movieId,"inner") \
.select("userId",df.movieId,"title","date","rating").cache()
from pyspark.sql.functions import udf
# Plain Python predicate used below as a UDF:
# a movie counts as "liked" when its rating is strictly greater than 4.
def isLike(v):
    """Return True when rating v is strictly greater than 4, else False."""
    # The comparison already yields a bool — no need for an if/else that
    # returns True/False explicitly.
    return v > 4
from pyspark.sql.types import BooleanType

# Wrap the plain Python predicate isLike as a Spark user-defined function,
# declaring BooleanType as its return type, then apply it to the rating
# column to derive a new boolean column "isLike" on df3.
udf_isLike = udf(isLike, BooleanType())
df3 = df3.withColumn("isLike", udf_isLike(df3.rating))
from pyspark.sql.functions import pandas_udf, PandasUDFType
# Grouped-aggregate pandas UDF: when used inside groupBy(...).agg(...),
# Spark hands the UDF each group's column values as a pandas Series and
# expects a single scalar back per group.
@pandas_udf("string", PandasUDFType.GROUPED_AGG)
def fmerge(v):
    # v: pandas Series of strings for one group -> one comma-joined string.
    return ','.join(v)
# Load the tags file and drop the unused timestamp column.
df5 = spark.read.csv('./ml-small/tags.csv', header=True).cache()
df5 = df5.drop("timestamp")
# Group by (userId, movieId) and merge each group's tag values into a single
# comma-separated string via the fmerge grouped-aggregate UDF.
# Name the result column with .alias("tags") directly in the agg call instead
# of renaming the auto-generated "fmerge(tag)" column afterwards — the old
# rename silently breaks if the UDF is ever renamed.
df7 = df5.groupBy(["userId", "movieId"]) \
    .agg(fmerge(df5["tag"]).alias("tags")) \
    .cache()
# Join the rated movies (df3) with the merged tags (df7) on both userId and
# movieId, keep the user-facing columns, and sort by date descending.
df6 = df3.join(df7,(df3.movieId == df7.movieId) & (df3.userId == df7.userId))\
.select(df3.userId,df3.movieId,"title","date","tags","rating","isLike") \
.orderBy(["date"], ascending=[0]).cache()
# Keep only recent ratings. Fix: reference df6's own date column instead of
# reaching back to the original ratings frame df — the old df.date only
# resolved through lineage and is fragile if the upstream plan changes.
df6 = df6.filter(df6.date > '2015-10-25').cache()
# Preview the first 20 rows of the final dataset.
df6.show(20)
# Save as plain text: each output line is the str() of a Row object.
# coalesce(1) forces a single output part file.
df6.rdd.coalesce(1).saveAsTextFile("movie-out")
# NOTE: saveAsTextFile raises an error if the output directory already exists.
# Save as CSV with a header row (single part file).
df6.coalesce(1).write.format("csv").option("header","true").save("movie-out-csv")
# Save as Parquet (columnar, schema-preserving).
df6.write.format('parquet').save("movie-out-parquet")
# Save as JSON (one JSON object per line, single part file).
df6.coalesce(1).write.format("json").save("movie-out-json")
##############################################
# Release all Spark resources for this session.
spark.stop()