# 4、打卡分析
# 4-1 统计每年的打卡次数
df22 = checkin_df \
.select(col('business_id'), explode(split(col('date'), ',')).alias('datetime')) \
.select('business_id', year(to_timestamp(trim(col('datetime')))).alias('year')) \
.groupBy('year') \
.count() \
.orderBy(col("count").desc()) \
.show()
# 4-2 统计24小时每小时打卡次数
df23 = checkin_df.select(col('business_id'), explode(split(col('date'), ',')).alias('datetime')) \
.select('business_id', hour(to_timestamp(trim(col('datetime')))).alias('hour')) \
.groupBy('hour') \
.count()\
.orderBy(col("count").desc()) \
.show()
# 4-3 统计最喜欢打卡的城市
df24 = checkin_df \
.select(col('business_id'), explode(split(col('date'), ',')).alias('datetime')) \
.join(df, checkin_df['business_id'] == df['business_id']) \
.select(df['business_id'], df['city']) \
.groupBy('city') \
.agg(count('city').alias('cnt')) \
.orderBy(col('cnt').desc()) \
.limit(20) \
.show()
# 4-4 全部商家的打卡排行榜
df25 = checkin_df \
.select('business_id', explode(split(col('date'), ',')).alias('time')) \
.groupBy("business_id") \
.count() \
.orderBy(col("count").desc()) \
.limit(5) \
.show()