/*
 * 标记一个 DEMO,以便于后面复习。
 * DEMO 背景:阿里天池竞赛系列 —— 口碑商家客流量预测。
 * 第二步:数据整理。目标表结构:
 *
 * 星期几 | 第几天 | 商家id | 浏览量 | 购买量
 * ---|---|---|---|---
 * 星期一 | 1 | 1024 | 600 | 342
 */
package com.huadian.bigdata.ijcai
import java.util.Date
import org.apache.spark.sql.{SaveMode, SparkSession}
object IJCAIUserCountSpark {

  /**
   * Step 2 of the Tianchi "Koubei" merchant customer-flow prediction demo:
   * for each shop and each day in the training data, count user page views
   * and user payments, tag each date with its weekday name and a day-of-week
   * index, and write the merged result out as a single CSV file.
   *
   * Output columns: weekday, day_week, shop_id, count_visit, count_pay
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("SparkSQLDemo")
      .master("local[5]")
      // small demo data set: 2 shuffle partitions instead of the default 200
      .config("spark.sql.shuffle.partitions", 2)
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    // --- daily view counts per shop (SQL flavour) ---
    val trainUserViewDF = spark.read
      .option("header", "true")
      .csv("file:///F:/IJCAI/train_user_view")
    trainUserViewDF.createOrReplaceTempView("view_tmp_train_user_view")
    // time_stamp presumably looks like "yyyy-MM-dd HH:mm:ss"; the first 10
    // chars are the date. (Spark SQL substring is 1-based, but a start
    // position of 0 is treated the same as 1.)
    val trainPerDayUserVisitCountDF = spark.sql(
      """
        |select
        |  substring(time_stamp,0,10) as date_str,
        |  shop_id,
        |  count(1) as count_visit
        |from
        |  view_tmp_train_user_view
        |group by
        |  substring(time_stamp,0,10),shop_id
      """.stripMargin)
    trainPerDayUserVisitCountDF.show(20, false)

    // --- daily payment counts per shop (DataFrame DSL flavour) ---
    val trainUserPayDF = spark.read
      .option("header", "true")
      .csv("file:///F:/IJCAI/train_user_pay")
    val trainPerDayUserPayCountDF = trainUserPayDF
      .selectExpr("substring(time_stamp,0,10) as date_str", "shop_id")
      .groupBy($"date_str", $"shop_id").count()
      .selectExpr("date_str", "shop_id", "count as count_pay")
    trainPerDayUserPayCountDF.show(20)

    // UDF: "yyyy-MM-dd" -> weekday name (e.g. "Monday" / "星期一").
    // NOTE(review): "EEEE" output depends on the JVM default Locale; pin a
    // Locale explicitly if downstream consumers expect a fixed language.
    spark.udf.register(
      "get_weekday",
      (dateTime: String) => {
        import java.text.SimpleDateFormat
        // SimpleDateFormat is not thread-safe, so build fresh instances per call
        val dateParser = new SimpleDateFormat("yyyy-MM-dd")
        val weekdayFormat = new SimpleDateFormat("EEEE")
        weekdayFormat.format(dateParser.parse(dateTime))
      }
    )

    // UDF: "yyyy-MM-dd" -> day-of-week index.
    // Calendar.DAY_OF_WEEK is 1 (Sunday) .. 7 (Saturday), so after the -1 the
    // result is Monday = 1 .. Saturday = 6, Sunday = 0.
    spark.udf.register(
      "get_day_week",
      (dateTime: String) => {
        import java.text.SimpleDateFormat
        import java.util.Calendar
        import scala.util.control.NonFatal
        val dateParser = new SimpleDateFormat("yyyy-MM-dd")
        val cal = Calendar.getInstance // calendar initialized to "now"
        try {
          cal.setTime(dateParser.parse(dateTime))
        } catch {
          // The original code silently ignored parse failures (the `Nil` it
          // produced was discarded) and fell through to the calendar's current
          // time. Keep that best-effort fallback, but make it explicit and
          // swallow only non-fatal errors.
          case NonFatal(_) => // unparseable input: cal stays at "now"
        }
        cal.get(Calendar.DAY_OF_WEEK) - 1
      }
    )

    // --- merge: per day & shop, view count + payment count, plus the
    //     weekday feature columns ---
    val trainPerDayUserDF = trainPerDayUserVisitCountDF
      .join(trainPerDayUserPayCountDF, Seq("date_str", "shop_id"))
      .selectExpr(
        "get_weekday(date_str) as weekday",   // date -> weekday name
        "get_day_week(date_str) as day_week", // date -> day-of-week index
        "shop_id", "count_visit", "count_pay"
      )

    trainPerDayUserDF
      .coalesce(1) // single output file for easy inspection
      .write
      .mode(SaveMode.Overwrite)
      .option("header", "true")
      .csv("file:///F:/IJCAI/train_user_visit_pay")

    spark.close()
  }
}